diff --git a/airbyte-cdk/python/.coveragerc b/airbyte-cdk/python/.coveragerc new file mode 100644 index 000000000000..e83ca1d70fa3 --- /dev/null +++ b/airbyte-cdk/python/.coveragerc @@ -0,0 +1,11 @@ +[report] +# show lines missing coverage +show_missing = true + +[run] +omit = + # omit the models package as it's auto-generated + airbyte_cdk/models/* + + # omit as unimplemented + airbyte_cdk/base_python/cdk/streams/auth/jwt.py diff --git a/airbyte-cdk/python/.dockerignore b/airbyte-cdk/python/.dockerignore new file mode 100644 index 000000000000..378eac25d311 --- /dev/null +++ b/airbyte-cdk/python/.dockerignore @@ -0,0 +1 @@ +build diff --git a/airbyte-cdk/python/.gitignore b/airbyte-cdk/python/.gitignore new file mode 100644 index 000000000000..3d99a021c606 --- /dev/null +++ b/airbyte-cdk/python/.gitignore @@ -0,0 +1,11 @@ +.coverage + +# TODO: these are tmp files generated by unit tests. They should go to the /tmp directory. +` + +## 0.56.1 + +no-op to verify pypi publish flow + +## 0.56.0 + +Allow for connectors to continue syncing when a stream fails + +## 0.55.5 + +File-based CDK: hide source-defined primary key; users can define primary keys in the connection's configuration + +## 0.55.4 + +Source Integration tests: decoupling entrypoint wrapper from pytest + +## 0.55.3 + +First iteration of integration tests tooling (http mocker and response builder) + +## 0.55.2 + +concurrent-cdk: factory method initializes concurrent source with default number of max tasks + +## 0.55.1 + +Vector DB CDK: Add omit_raw_text flag + +## 0.55.0 + +concurrent cdk: read multiple streams concurrently + +## 0.54.0 + +low-code: fix injection of page token if first request + +## 0.53.9 + +Fix of generate the error message using \_try_get_error based on list of errors + +## 0.53.8 + +Vector DB CDK: Remove CDC records, File CDK: Update unstructured parser + +## 0.53.7 + +low-code: fix debug logging when using --debug flag + +## 0.53.6 + +Increase maximum_attempts_to_acquire to avoid crashing in acquire_call + +## 0.53.5 + +File CDK: Improve stream config appearance + +## 0.53.4 + +Concurrent CDK: fix futures pruning + +## 0.53.3 + +Fix spec schema generation for File CDK and Vector DB CDK and allow skipping invalid files in document file parser + +## 0.53.2 + +Concurrent CDK: Increase connection pool size to allow for 20 max workers + +## 0.53.1 + +Concurrent CDK: Improve handling of future to avoid memory leak and improve performances + +## 0.53.0 + +Add call rate functionality + +## 0.52.10 + +Fix class SessionTokenAuthenticator for CLASS_TYPES_REGISTRY mapper + +## 0.52.9 + +File CDK: Improve file type detection in document file type parser + +## 0.52.8 + +Concurrent CDK: incremental (missing state conversion). Outside of concurrent specific work, this includes the following changes: + +- Checkpointing state was acting on the number of records per slice. 
This has been changed to consider the number of records per syncs +- `Source.read_state` and `Source._emit_legacy_state_format` are now classmethods to allow for developers to have access to the state before instantiating the source + +## 0.52.7 + +File CDK: Add pptx support + +## 0.52.6 + +make parameter as not required for default backoff handler + +## 0.52.5 + +use in-memory cache if no file path is provided + +## 0.52.4 + +File CDK: Add unstructured parser + +## 0.52.3 + +Update source-declarative-manifest base image to update Linux alpine and Python + +## 0.52.2 + +## 0.52.1 + +Add max time for backoff handler + +## 0.52.0 + +File CDK: Add CustomFileBasedException for custom errors + +## 0.51.44 + +low-code: Allow connector developers to specify the type of an added field + +## 0.51.43 + +concurrent cdk: fail fast if a partition raises an exception + +## 0.51.42 + +File CDK: Avoid listing all files for check command + +## 0.51.41 + +Vector DB CDK: Expose stream identifier logic, add field remapping to processing | File CDK: Emit analytics message for used streams + +## 0.51.40 + +Add filters for base64 encode and decode in Jinja Interpolation + +## 0.51.39 + +Few bug fixes for concurrent cdk + +## 0.51.38 + +Add ability to wrap HTTP errors with specific status codes occurred during access token refresh into AirbyteTracedException + +## 0.51.37 + +Enable debug logging when running availability check + +## 0.51.36 + +Enable debug logging when running availability check + +## 0.51.35 + +File CDK: Allow configuring number of tested files for schema inference and parsability check + +## 0.51.34 + +Vector DB CDK: Fix OpenAI compatible embedder when used without api key + +## 0.51.33 + +Vector DB CDK: Improve batching process + +## 0.51.32 + +Introduce experimental ThreadBasedConcurrentStream + +## 0.51.31 + +Fix initialize of token_expiry_is_time_of_expiration field + +## 0.51.30 + +Add new token_expiry_is_time_of_expiration property for AbstractOauth2Authenticator for indicate that token's expiry_in is a time of expiration + +## 0.51.29 + +Coerce read_records to iterable in http availabilty strategy + +## 0.51.28 + +Add functionality enabling Page Number/Offset to be set on the first request + +## 0.51.27 + +Fix parsing of UUID fields in avro files + +## 0.51.26 + +Vector DB CDK: Fix OpenAI embedder batch size + +## 0.51.25 + +Add configurable OpenAI embedder to cdk and add cloud environment helper + +## 0.51.24 + +Fix previous version of request_cache clearing + +## 0.51.23 + +Fix request_cache clearing and move it to tmp folder + +## 0.51.22 + +Vector DB CDK: Adjust batch size for Azure embedder to current limits + +## 0.51.21 + +Change Error message if Stream is not found + +## 0.51.20 + +Vector DB CDK: Add text splitting options to document processing + +## 0.51.19 + +Ensuring invalid user-provided urls does not generate sentry issues + +## 0.51.18 + +Vector DB CDK adjustments: Prevent failures with big records and OpenAI embedder + +## 0.51.17 + +[ISSUE #30353] File-Based CDK: remove file_type from stream config + +## 0.51.16 + +Connector Builder: fix datetime format inference for str parsable as int but not isdecimal + +## 0.51.15 + +Vector DB CDK: Add Azure OpenAI embedder + +## 0.51.14 + +File-based CDK: improve error message for CSV parsing error + +## 0.51.13 + +File-based CDK: migrated parsing error to config error to avoid sentry alerts + +## 0.51.12 + +Add from-field embedder to vector db CDK + +## 0.51.11 + +FIle-based CDK: Update spec and fix autogenerated headers with skip 
after + +## 0.51.10 + +Vector DB CDK adjustments: Fix id generation, improve config spec, add base test case + +## 0.51.9 + +[Issue #29660] Support empty keys with record selection + +## 0.51.8 + +Add vector db CDK helpers + +## 0.51.7 + +File-based CDK: allow user to provided column names for CSV files + +## 0.51.6 + +File-based CDK: allow for extension mismatch + +## 0.51.5 + +File-based CDK: Remove CSV noisy log + +## 0.51.4 + +Source-S3 V4: feature parity rollout + +## 0.51.3 + +File-based CDK: Do not stop processing files in slice on error + +## 0.51.2 + +Check config against spec in embedded sources and remove list endpoint from connector builder module + +## 0.51.1 + +low-code: allow formatting datetime as milliseconds since unix epoch + +## 0.51.0 + +File-based CDK: handle legacy options + +## 0.50.2 + +Fix title and description of datetime_format fields + +## 0.50.1 + +File-based CDK cursor and entrypoint updates + +## 0.50.0 + +Low code CDK: Decouple SimpleRetriever and HttpStream + +## 0.49.0 + +Add utils for embedding sources in other Python applications + +## 0.48.0 + +Relax pydantic version requirement and update to protocol models version 0.4.0 + +## 0.47.5 + +Support many format for cursor datetime + +## 0.47.4 + +File-based CDK updates + +## 0.47.3 + +Connector Builder: Ensure we return when there are no slices + +## 0.47.2 + +low-code: deduplicate query params if they are already encoded in the URL + +## 0.47.1 + +Fix RemoveFields transformation issue + +## 0.47.0 + +Breaking change: Rename existing SessionTokenAuthenticator to LegacySessionTokenAuthenticator and make SessionTokenAuthenticator more generic + +## 0.46.1 + +Connector builder: warn if the max number of records was reached + +## 0.46.0 + +Remove pyarrow from main dependency and add it to extras + +## 0.45.0 + +Fix pyyaml and cython incompatibility + +## 0.44.4 + +Connector builder: Show all request/responses as part of the testing panel + +## 0.44.3 + +[ISSUE #27494] allow for state to rely on transformed field + +## 0.44.2 + +Ensuring the state value format matches the cursor value from the record + +## 0.44.1 + +Fix issue with incremental sync following data feed release + +## 0.44.0 + +Support data feed like incremental syncs + +## 0.43.3 + +Fix return type of RecordFilter: changed from generator to list + +## 0.43.2 + +Connector builder module: serialize request body as string + +## 0.43.1 + +Fix availability check to handle HttpErrors which happen during slice extraction + +## 0.43.0 + +Refactoring declarative state management + +## 0.42.1 + +Error message on state per partition state discrepancy + +## 0.42.0 + +Supporting state per partition given incremental sync and partition router + +## 0.41.0 + +Use x-www-urlencoded for access token refresh requests + +## 0.40.5 + +Replace with when making oauth calls + +## 0.40.4 + +Emit messages using message repository + +## 0.40.3 + +Add utils for inferring datetime formats + +## 0.40.2 + +Add a metadata field to the declarative component schema + +## 0.40.1 + +make DatetimeBasedCursor.end_datetime optional + +## 0.40.0 + +Remove SingleUseRefreshTokenOAuthAuthenticator from low code CDK and add generic injection capabilities to ApiKeyAuthenticator + +## 0.39.4 + +Connector builder: add latest connector config control message to read calls + +## 0.39.3 + +Add refresh token update capabilities to OAuthAuthenticator + +## 0.39.2 + +Make step and cursor_granularity optional + +## 0.39.1 + +Improve connector builder error messages + +## 0.39.0 + +Align schema 
generation in SchemaInferrer with Airbyte platform capabilities + +## 0.38.0 + +Allow nested objects in request_body_json + +## 0.37.0 + +low-code: Make refresh token in oauth authenticator optional + +## 0.36.5 + +Unfreeze requests version and test new pipeline + +## 0.36.4 + +low-code: use jinja sandbox and restrict some methods + +## 0.36.3 + +pin the version of the requests library + +## 0.36.2 + +Support parsing non UTC dates and Connector Builder set slice descriptor + +## 0.36.1 + +low-code: fix add field transformation when running from the connector builder + +## 0.36.0 + +Emit stream status messages + +## 0.35.4 + +low-code: remove now_local() macro because it's too unpredictable + +## 0.35.3 + +low-code: alias stream_interval and stream_partition to stream_slice in jinja context + +## 0.35.2 + +Connector builder scrubs secrets from raw request and response + +## 0.35.1 + +low-code: Add title, description, and examples for all fields in the manifest schema + +## 0.35.0 + +low-code: simplify session token authenticator interface + +## 0.34.3 + +low-code: fix typo in ManifestDeclarativeSource + +## 0.34.2 + +Emit slice log messages when running the connector builder + +## 0.34.1 + +set slice and pages limit when reading from the connector builder module + +## 0.34.0 + +Low-Code CDK: Enable use of SingleUseRefreshTokenAuthenticator + +## 0.33.2 + +low-code: fix duplicate stream slicer update + +## 0.33.1 + +Low-Code CDK: make RecordFilter.filter_records as generator + +## 0.33.0 + +Enable oauth flow for low-code connectors + +## 0.32.0 + +Remove unexpected error swallowing on abstract source's check method + +## 0.31.1 + +connector builder: send stacktrace when error on read + +## 0.31.0 + +Add connector builder module for handling Connector Builder server requests + +## 0.30.4 + +CDK's read command handler supports Connector Builder list_streams requests + +## 0.30.3 + +Fix reset pagination issue on test reads + +## 0.30.2 + +- Low-code CDK: Override refresh_access_token logic DeclarativeOAuthAuthenticator + +## 0.30.1 + +Releasing using the new release flow. No change to the CDK per se + +## 0.30.0 + +OAuth: retry refresh access token requests + +## 0.29.3 + +Low-Code CDK: duration macro added + +## 0.29.2 + +support python3.8 + +## 0.29.1 + +Publishing Docker image for source-declarative-manifest + +## 0.29.0 + +**Breaking changes: We have promoted the low-code CDK to Beta. This release contains a number of breaking changes intended to improve the overall usability of the language by reorganizing certain concepts, renaming, reducing some field duplication, and removal of fields that are seldom used.** + +The changes are: + +- Deprecated the concept of Stream Slicers in favor of two individual concepts: Incremental Syncs, and Partition Routers: + - Stream will define an `incremental_sync` field which is responsible for defining how the connector should support incremental syncs using a cursor field. `DatetimeStreamSlicer` has been renamed to `DatetimeBasedCursor` and can be used for this field. + - `Retriever`s will now define a `partition_router` field. The remaining slicers are now called `SubstreamPartitionRouter` and `ListPartitionRouter`, both of which can be used here as they already have been. + - The `CartesianProductStreamSlicer` because `partition_router` can accept a list of values and will generate that same cartesian product by default. 
+- `$options` have been renamed to `$parameters` +- Changed the notation for component references to the JSON schema notation (`$ref: "#/definitions/requester"`) +- `DefaultPaginator` no longer has a `url_base` field. Moving forward, paginators will derive the `url_base` from the `HttpRequester`. There are some unique cases for connectors that implement a custom `Retriever`. +- `primary_key` and `name` no longer need to be defined on `Retriever`s or `Requester`s. They will be derived from the stream’s definition +- Streams no longer define a `stream_cursor_field` and will derive it from the `incremental_sync` component. `checkpoint_interval` has also been deprecated +- DpathExtractor `field_pointer` has been renamed to `field_path` +- `RequestOption` can no longer be used with with `inject_into` set to `path`. There is now a dedicated `RequestPath` component moving forward. + +## 0.28.1 + +Low-Code CDK: fix signature \_parse_records_and_emit_request_and_responses + +## 0.28.0 + +Low-Code: improve day_delta macro and MinMaxDatetime component + +## 0.27.0 + +Make HttpAvailabilityStrategy default for HttpStreams + +## 0.26.0 + +Low-Code CDK: make DatetimeStreamSlicer.step as InterpolatedString + +## 0.25.2 + +Low-Code: SubstreamSlicer.parent_key - dpath support added + +## 0.25.1 + +Fix issue when trying to log stream slices that are non-JSON-serializable + +## 0.25.0 + +Use dpath.util.values method to parse response with nested lists + +## 0.24.0 + +Use dpath.util.values method to parse response with nested lists + +## 0.23.0 + +Limiting the number of HTTP requests during a test read + +## 0.22.0 + +Surface the resolved manifest in the CDK + +## 0.21.0 + +Add AvailabilityStrategy concept and use check_availability within CheckStream + +## 0.20.2 + +Add missing package in previous patch release + +## 0.20.1 + +Handle edge cases for CheckStream - checking connection to empty stream, and checking connection to substream with no parent records + +## 0.20.0 + +Low-Code: Refactor low-code to use Pydantic model based manifest parsing and component creation + +## 0.19.1 + +Low-code: Make documentation_url in the Spec be optional + +## 0.19.0 + +Low-Code: Handle forward references in manifest + +## 0.18.1 + +Allow for CustomRequester to be defined within declarative manifests + +## 0.18.0 + +Adding `cursor_granularity` to the declarative API of DatetimeStreamSlicer + +## 0.17.0 + +Add utility class to infer schemas from real records + +## 0.16.3 + +Do not eagerly refresh access token in `SingleUseRefreshTokenOauth2Authenticator` [#20923](https://github.com/airbytehq/airbyte/pull/20923) + +## 0.16.2 + +Fix the naming of OAuthAuthenticator + +## 0.16.1 + +Include declarative_component_schema.yaml in the publish to PyPi + +## 0.16.0 + +Start validating low-code manifests using the declarative_component_schema.yaml file + +## 0.15.0 + +Reverts additions from versions 0.13.0 and 0.13.3. + +## 0.14.0 + +Low-code: Add token_expiry_date_format to OAuth Authenticator. Resolve ref schema + +## 0.13.3 + +Fixed `StopIteration` exception for empty streams while `check_availability` runs. + +## 0.13.2 + +Low-code: Enable low-code CDK users to specify schema inline in the manifest + +## 0.13.1 + +Low-code: Add `SessionTokenAuthenticator` + +## 0.13.0 + +Add `Stream.check_availability` and `Stream.AvailabilityStrategy`. Make `HttpAvailabilityStrategy` the default `HttpStream.AvailabilityStrategy`. 
+ +## 0.12.4 + +Lookback window should applied when a state is supplied as well + +## 0.12.3 + +Low-code: Finally, make `OffsetIncrement.page_size` interpolated string or int + +## 0.12.2 + +Revert breaking change on `read_config` while keeping the improvement on the error message + +## 0.12.0 + +Improve error readability when reading JSON config files + +## 0.11.3 + +Low-code: Log response error message on failure + +## 0.11.2 + +Low-code: Include the HTTP method used by the request in logging output of the `airbyte-cdk` + +## 0.11.1 + +Low-code: Fix the component manifest schema to and validate check instead of checker + +## 0.11.0 + +Declare a new authenticator `SingleUseRefreshTokenOauth2Authenticator` that can perform connector configuration mutation and emit `AirbyteControlMessage.ConnectorConfig`. + +## 0.10.0 + +Low-code: Add `start_from_page` option to a PageIncrement class + +## 0.9.5 + +Low-code: Add jinja macro `format_datetime` + +## 0.9.4 + +Low-code: Fix reference resolution for connector builder + +## 0.9.3 + +Low-code: Avoid duplicate HTTP query in `simple_retriever` + +## 0.9.2 + +Low-code: Make `default_paginator.page_token_option` optional + +## 0.9.1 + +Low-code: Fix filtering vars in `InterpolatedRequestInputProvider.eval_request_inputs` + +## 0.9.0 + +Low-code: Allow `grant_type` to be specified for OAuthAuthenticator + +## 0.8.1 + +Low-code: Don't update cursor for non-record messages and fix default loader for connector builder manifests + +## 0.8.0 + +Low-code: Allow for request and response to be emitted as log messages + +## 0.7.1 + +Low-code: Decouple yaml manifest parsing from the declarative source implementation + +## 0.7.0 + +Low-code: Allow connector specifications to be defined in the manifest + +## 0.6.0 + +Low-code: Add support for monthly and yearly incremental updates for `DatetimeStreamSlicer` + +## 0.5.4 + +Low-code: Get response.json in a safe way + +## 0.5.3 + +Low-code: Replace EmptySchemaLoader with DefaultSchemaLoader to retain backwards compatibility +Low-code: Evaluate backoff strategies at runtime + +## 0.5.2 + +Low-code: Allow for read even when schemas are not defined for a connector yet + +## 0.4.2 + +Low-code: Fix off by one error with the stream slicers + +## 0.4.1 + +Low-code: Fix a few bugs with the stream slicers + +## 0.4.0 + +Low-code: Add support for custom error messages on error response filters + +## 0.3.0 + +Publish python typehints via `py.typed` file. + +## 0.2.3 + +- Propagate options to InterpolatedRequestInputProvider + +## 0.2.2 + +- Report config validation errors as failed connection status during `check`. +- Report config validation errors as `config_error` failure type. + +## 0.2.1 + +- Low-code: Always convert stream slices output to an iterator + +## 0.2.0 + +- Replace caching method: VCR.py -> requests-cache with SQLite backend + +## 0.1.104 + +- Protocol change: `supported_sync_modes` is now a required properties on AirbyteStream. 
[#15591](https://github.com/airbytehq/airbyte/pull/15591) + +## 0.1.103 + +- Low-code: added hash filter to jinja template + +## 0.1.102 + +- Low-code: Fix check for streams that do not define a stream slicer + +## 0.1.101 + +- Low-code: $options do not overwrite parameters that are already set + +## 0.1.100 + +- Low-code: Pass stream_slice to read_records when reading from CheckStream + +## 0.1.99 + +- Low-code: Fix default stream schema loader + +## 0.1.98 + +- Low-code: Expose WaitUntilTimeFromHeader strategy and WaitTimeFromHeader as component type + +## 0.1.97 + +- Revert 0.1.96 + +## 0.1.96 + +- Improve error for returning non-iterable from connectors parse_response + +## 0.1.95 + +- Low-code: Expose PageIncrement strategy as component type + +## 0.1.94 + +- Low-code: Stream schema loader has a default value and can be omitted + +## 0.1.93 + +- Low-code: Standardize slashes in url_base and path + +## 0.1.92 + +- Low-code: Properly propagate $options to array items +- Low-code: Log request and response when running check operation in debug mode + +## 0.1.91 + +- Low-code: Rename LimitPaginator to DefaultPaginator and move page_size field to PaginationStrategy + +## 0.1.90 + +- Fix error when TypeTransformer tries to warn about invalid transformations in arrays + +## 0.1.89 + +- Fix: properly emit state when a stream has empty slices, provided by an iterator + +## 0.1.88 + +- Bugfix: Evaluate `response.text` only in debug mode + +## 0.1.87 + +- During incremental syncs allow for streams to emit state messages in the per-stream format + +## 0.1.86 + +- TypeTransformer now converts simple types to array of simple types +- TypeTransformer make warning message more informative + +## 0.1.85 + +- Make TypeTransformer more robust to incorrect incoming records + +## 0.1.84 + +- Emit legacy format when state is unspecified for read override connectors + +## 0.1.83 + +- Fix per-stream to send legacy format for connectors that override read + +## 0.1.82 + +- Freeze dataclasses-jsonschema to 2.15.1 + +## 0.1.81 + +- Fix regression in `_checkpoint_state` arg + +## Unreleased + +- Update Airbyte Protocol model to support protocol_version + +## 0.1.80 + +- Add NoAuth to declarative registry and auth parse bug fix + +## 0.1.79 + +- Fix yaml schema parsing when running from docker container + +## 0.1.78 + +- Fix yaml config parsing when running from docker container + +## 0.1.77 + +- Add schema validation for declarative YAML connector configs + +## 0.1.76 + +- Bugfix: Correctly set parent slice stream for sub-resource streams + +## 0.1.75 + +- Improve `filter_secrets` skip empty secret + +## 0.1.74 + +- Replace JelloRecordExtractor with DpathRecordExtractor + +## 0.1.73 + +- Bugfix: Fix bug in DatetimeStreamSlicer's parsing method + +## 0.1.72 + +- Bugfix: Fix bug in DatetimeStreamSlicer's format method + +## 0.1.71 + +- Refactor declarative package to dataclasses +- Bugfix: Requester header always converted to string +- Bugfix: Reset paginator state between stream slices +- Bugfix: Record selector handles single records + +## 0.1.70 + +- Bugfix: DatetimeStreamSlicer cast interpolated result to string before converting to datetime +- Bugfix: Set stream slicer's request options in SimpleRetriever + +## 0.1.69 + +- AbstractSource emits a state message when reading incremental even if there were no stream slices to process. + +## 0.1.68 + +- Replace parse-time string interpolation with run-time interpolation in YAML-based sources + +## 0.1.67 + +- Add support declarative token authenticator. 
+ +## 0.1.66 + +- Call init_uncaught_exception_handler from AirbyteEntrypoint.**init** and Destination.run_cmd +- Add the ability to remove & add records in YAML-based sources + +## 0.1.65 + +- Allow for detailed debug messages to be enabled using the --debug command. + +## 0.1.64 + +- Add support for configurable oauth request payload and declarative oauth authenticator. + +## 0.1.63 + +- Define `namespace` property on the `Stream` class inside `core.py`. + +## 0.1.62 + +Bugfix: Correctly obfuscate nested secrets and secrets specified inside oneOf blocks inside the connector's spec. + +## 0.1.61 + +- Remove legacy sentry code + +## 0.1.60 + +- Add `requests.exceptions.ChunkedEncodingError` to transient errors so it could be retried + +## 0.1.59 + +- Add `Stream.get_error_display_message()` to retrieve user-friendly messages from exceptions encountered while reading streams. +- Add default error error message retrieval logic for `HTTPStream`s following common API patterns. + +## 0.1.58 + +`TypeTransformer.default_convert` catch `TypeError` + +## 0.1.57 + +Update protocol models to support per-stream state: [#12829](https://github.com/airbytehq/airbyte/pull/12829). + +## 0.1.56 + +- Update protocol models to include `AirbyteTraceMessage` +- Emit an `AirbyteTraceMessage` on uncaught exceptions +- Add `AirbyteTracedException` + +## 0.1.55 + +Add support for reading the spec from a YAML file (`spec.yaml`) + +## 0.1.54 + +- Add ability to import `IncrementalMixin` from `airbyte_cdk.sources.streams`. +- Bumped minimum supported Python version to 3.9. + +## 0.1.53 + +Remove a false positive error logging during the send process. + +## 0.1.52 + +Fix BaseBackoffException constructor + +## 0.1.50 + +Improve logging for Error handling during send process. + +## 0.1.49 + +Add support for streams with explicit state attribute. + +## 0.1.48 + +Fix type annotations. + +## 0.1.47 + +Fix typing errors. + +## 0.1.45 + +Integrate Sentry for performance and errors tracking. + +## 0.1.44 + +Log http response status code and its content. + +## 0.1.43 + +Fix logging of unhandled exceptions: print stacktrace. + +## 0.1.42 + +Add base pydantic model for connector config and schemas. + +## 0.1.41 + +Fix build error + +## 0.1.40 + +Filter airbyte_secrets values at logger and other logging refactorings. + +## 0.1.39 + +Add `__init__.py` to mark the directory `airbyte_cdk/utils` as a package. + +## 0.1.38 + +Improve URL-creation in CDK. Changed to using `urllib.parse.urljoin()`. + +## 0.1.37 + +Fix `emitted_at` from `seconds * 1000` to correct milliseconds. + +## 0.1.36 + +Fix broken logger in streams: add logger inheritance for streams from `airbyte`. + +## 0.1.35 + +Fix false warnings on record transform. + +## 0.1.34 + +Fix logging inside source and streams + +## 0.1.33 + +Resolve $ref fields for discover json schema. + +## 0.1.32 + +- Added Sphinx docs `airbyte-cdk/python/reference_docs` module. +- Added module documents at `airbyte-cdk/python/sphinx-docs.md`. +- Added Read the Docs publishing configuration at `.readthedocs.yaml`. 
+ +## 0.1.31 + +Transforming Python log levels to Airbyte protocol log levels + +## 0.1.30 + +Updated OAuth2Specification.rootObject type in airbyte_protocol to allow string or int + +## 0.1.29 + +Fix import logger error + +## 0.1.28 + +Added `check_config_against_spec` parameter to `Connector` abstract class +to allow skipping validating the input config against the spec for non-`check` calls + +## 0.1.27 + +Improving unit test for logger + +## 0.1.26 + +Use python standard logging instead of custom class + +## 0.1.25 + +Modified `OAuth2Specification` model, added new fields: `rootObject` and `oauthFlowOutputParameters` + +## 0.1.24 + +Added Transform class to use for mutating record value types so they adhere to jsonschema definition. + +## 0.1.23 + +Added the ability to use caching for efficient synchronization of nested streams. + +## 0.1.22 + +Allow passing custom headers to request in `OAuth2Authenticator.refresh_access_token()`: https://github.com/airbytehq/airbyte/pull/6219 + +## 0.1.21 + +Resolve nested schema references and move external references to single schema definitions. + +## 0.1.20 + +- Allow using `requests.auth.AuthBase` as authenticators instead of custom CDK authenticators. +- Implement Oauth2Authenticator, MultipleTokenAuthenticator and TokenAuthenticator authenticators. +- Add support for both legacy and requests native authenticator to HttpStream class. + +## 0.1.19 + +No longer prints full config files on validation error to prevent exposing secrets to log file: https://github.com/airbytehq/airbyte/pull/5879 + +## 0.1.18 + +Fix incremental stream not saved state when internal limit config set. + +## 0.1.17 + +Fix mismatching between number of records actually read and number of records in logs by 1: https://github.com/airbytehq/airbyte/pull/5767 + +## 0.1.16 + +Update generated AirbyteProtocol models to contain [Oauth changes](https://github.com/airbytehq/airbyte/pull/5776). + +## 0.1.15 + +Add \_limit and \_page_size as internal config parameters for SAT + +## 0.1.14 + +If the input config file does not comply with spec schema, raise an exception instead of `system.exit`. + +## 0.1.13 + +Fix defect with user defined backoff time retry attempts, number of retries logic fixed + +## 0.1.12 + +Add raise_on_http_errors, max_retries, retry_factor properties to be able to ignore http status errors and modify retry time in HTTP stream + +## 0.1.11 + +Add checking specified config againt spec for read, write, check and discover commands + +## 0.1.10 + +Add `MultipleTokenAuthenticator` class to allow cycling through a list of API tokens when making HTTP requests + +## 0.1.8 + +Allow to fetch primary key info from singer catalog + +## 0.1.7 + +Allow to use non-JSON payloads in request body for http source + +## 0.1.6 + +Add abstraction for creating destinations. + +Fix logging of the initial state. 
+
+## 0.1.5
+
+Allow specifying keyword arguments to be sent on a request made by an HTTP stream: https://github.com/airbytehq/airbyte/pull/4493
+
+## 0.1.4
+
+Allow to use Python 3.7.0: https://github.com/airbytehq/airbyte/pull/3566
+
+## 0.1.2
+
+Fix an issue that caused infinite pagination: https://github.com/airbytehq/airbyte/pull/3366
+
+## 0.1.1
+
+Initial Release
diff --git a/airbyte-cdk/python/LICENSE.txt b/airbyte-cdk/python/LICENSE.txt
new file mode 100644
index 000000000000..06a0065fd011
--- /dev/null
+++ b/airbyte-cdk/python/LICENSE.txt
@@ -0,0 +1,19 @@
+Copyright (c) 2020 Airbyte
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/airbyte-cdk/python/README.md b/airbyte-cdk/python/README.md
index b35713584f58..4df178963b3c 100644
--- a/airbyte-cdk/python/README.md
+++ b/airbyte-cdk/python/README.md
@@ -1,3 +1,222 @@
-# The Airbyte Python CDK has Moved!
+# Airbyte Python CDK and Low-Code CDK
-The CDK's new home is at: https://github.com/airbytehq/airbyte-python-cdk
+The Airbyte Python CDK is a framework for building Airbyte API Source Connectors. It provides a set of
+classes and helpers that make it easy to build a connector against an HTTP API (REST, GraphQL, etc.),
+or a generic Python source connector.
+
+## Usage
+
+If you're looking to build a connector, we highly recommend that you
+[start with the Connector Builder](https://docs.airbyte.com/connector-development/connector-builder-ui/overview).
+It should be enough for 90% of the connectors out there. For more flexible and complex connectors, use the
+[low-code CDK and `SourceDeclarativeManifest`](https://docs.airbyte.com/connector-development/config-based/low-code-cdk-overview).
+
+If that doesn't work, then consider building on top of the
+[lower-level Python CDK itself](https://docs.airbyte.com/connector-development/cdk-python/).
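+
+For a sense of what the lower-level Python CDK looks like, here is a minimal sketch of a source built
+directly on `AbstractSource` and `HttpStream`. The API host, the `customers` endpoint, and the field
+names are hypothetical placeholders, not a real connector:
+
+```python
+from typing import Any, Iterable, List, Mapping, Optional, Tuple
+
+import requests
+from airbyte_cdk.sources import AbstractSource
+from airbyte_cdk.sources.streams import Stream
+from airbyte_cdk.sources.streams.http import HttpStream
+
+
+class Customers(HttpStream):
+    url_base = "https://api.example.com/v1/"  # hypothetical API base URL
+    primary_key = "id"
+
+    def path(self, **kwargs) -> str:
+        return "customers"
+
+    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
+        return None  # single page only; a real connector would inspect the response
+
+    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping[str, Any]]:
+        yield from response.json().get("data", [])
+
+
+class SourceExample(AbstractSource):
+    def check_connection(self, logger, config) -> Tuple[bool, Optional[Any]]:
+        return True, None  # a real connector would make a cheap API call here
+
+    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
+        return [Customers()]
+```
+
+The Connector Builder and the low-code CDK generate this kind of plumbing for you from a declarative
+manifest, which is why they are the recommended starting points.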
+ +### Quick Start + +To get started on a Python CDK based connector or a low-code connector, you can generate a connector +project from a template: + +```bash +# from the repo root +cd airbyte-integrations/connector-templates/generator +./generate.sh +``` + +### Example Connectors + +**HTTP Connectors**: + +- [Stripe](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-stripe/) +- [Salesforce](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-salesforce/) + +**Python connectors using the bare-bones `Source` abstraction**: + +- [Google Sheets](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-google-sheets/google_sheets_source/google_sheets_source.py) + +This will generate a project with a type and a name of your choice and put it in +`airbyte-integrations/connectors`. Open the directory with your connector in an editor and follow +the `TODO` items. + +## Python CDK Overview + +Airbyte CDK code is within `airbyte_cdk` directory. Here's a high level overview of what's inside: + +- `connector_builder`. Internal wrapper that helps the Connector Builder platform run a declarative + manifest (low-code connector). You should not use this code directly. If you need to run a + `SourceDeclarativeManifest`, take a look at + [`source-declarative-manifest`](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-declarative-manifest) + connector implementation instead. +- `destinations`. Basic Destination connector support! If you're building a Destination connector in + Python, try that. Some of our vector DB destinations like `destination-pinecone` are using that + code. +- `models` expose `airbyte_protocol.models` as a part of `airbyte_cdk` package. +- `sources/concurrent_source` is the Concurrent CDK implementation. It supports reading data from + streams concurrently per slice / partition, useful for connectors with high throughput and high + number of records. +- `sources/declarative` is the low-code CDK. It works on top of Airbyte Python CDK, but provides a + declarative manifest language to define streams, operations, etc. This makes it easier to build + connectors without writing Python code. +- `sources/file_based` is the CDK for file-based sources. Examples include S3, Azure, GCS, etc. + +## Contributing + +Thank you for being interested in contributing to Airbyte Python CDK! Here are some guidelines to +get you started: + +- We adhere to the [code of conduct](/CODE_OF_CONDUCT.md). +- You can contribute by reporting bugs, posting github discussions, opening issues, improving + [documentation](/docs/), and submitting pull requests with bugfixes and new features alike. +- If you're changing the code, please add unit tests for your change. +- When submitting issues or PRs, please add a small reproduction project. Using the changes in your + connector and providing that connector code as an example (or a satellite PR) helps! + +### First time setup + +Install the project dependencies and development tools: + +```bash +poetry install --all-extras +``` + +Installing all extras is required to run the full suite of unit tests. + +#### Running tests locally + +- Iterate on the CDK code locally +- Run tests via `poetry run poe unit-test-with-cov`, or `python -m pytest -s unit_tests` if you want + to pass pytest options. +- Run `poetry run poe check-local` to lint all code, type-check modified code, and run unit tests + with coverage in one command. 
+
+To see all available scripts, run `poetry run poe`.
+
+##### Autogenerated files
+
+Low-code CDK models are generated from `sources/declarative/declarative_component_schema.yaml`. If
+the iteration you are working on includes changes to the models or the connector generator, you
+might want to regenerate them. In order to do that, you can run:
+
+```bash
+poetry run poe build
+```
+
+This will generate the code generator docker image and the component manifest files based on the
+schemas and templates.
+
+#### Testing
+
+All tests are located in the `unit_tests` directory. Run `poetry run poe unit-test-with-cov` to run
+them. This also presents a test coverage report. For faster iteration with no coverage report and
+more options, `python -m pytest -s unit_tests` is a good place to start.
+
+#### Building and testing a connector with your local CDK
+
+When developing a new feature in the CDK, you may find it helpful to run a connector that uses that
+new feature. You can test this in one of two ways:
+
+- Running a connector locally
+- Building and running a source via Docker
+
+##### Installing your local CDK into a local Python connector
+
+Open the connector's `pyproject.toml` file and replace the `airbyte_cdk` dependency line with the
+following:
+
+```toml
+airbyte_cdk = { path = "../../../airbyte-cdk/python/airbyte_cdk", develop = true }
+```
+
+Then, running `poetry update` should reinstall `airbyte_cdk` from your local working directory.
+
+##### Building a Python connector in Docker with your local CDK installed
+
+_Pre-requisite: Install the
+[`airbyte-ci` CLI](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/pipelines/README.md)_
+
+You can build your connector image with the local CDK using:
+
+```bash
+# from the airbytehq/airbyte base directory
+airbyte-ci connectors --use-local-cdk --name=<CONNECTOR_NAME> build
+```
+
+Note that the local CDK is injected at build time, so if you make changes, you will have to run the
+build command again to see them reflected.
+
+##### Running Connector Acceptance Tests for a single connector in Docker with your local CDK installed
+
+_Pre-requisite: Install the
+[`airbyte-ci` CLI](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/pipelines/README.md)_
+
+To run acceptance tests for a single connector using the local CDK, run the following from the
+connector directory:
+
+```bash
+airbyte-ci connectors --use-local-cdk --name=<CONNECTOR_NAME> test
+```
+
+#### When you don't have access to the API
+
+There may be a time when you do not have access to the API (because you don't have the credentials,
+network access, etc.). You will probably still want to do end-to-end testing at least once. In order
+to do so, you can emulate the server you would be reaching using a server stubbing tool.
+
+For example, using [mockserver](https://www.mock-server.com/), you can set up an expectation file
+like this:
+
+```json
+{
+  "httpRequest": {
+    "method": "GET",
+    "path": "/data"
+  },
+  "httpResponse": {
+    "body": "{\"data\": [{\"record_key\": 1}, {\"record_key\": 2}]}"
+  }
+}
+```
+
+Assuming this file has been created at `secrets/mock_server_config/expectations.json`, running the
+following command will match any request on path `/data` and return the response defined in the
+expectation file:
+
+```bash
+docker run -d --rm -v $(pwd)/secrets/mock_server_config:/config -p 8113:8113 --env MOCKSERVER_LOG_LEVEL=TRACE --env MOCKSERVER_SERVER_PORT=8113 --env MOCKSERVER_WATCH_INITIALIZATION_JSON=true --env MOCKSERVER_PERSISTED_EXPECTATIONS_PATH=/config/expectations.json --env MOCKSERVER_INITIALIZATION_JSON_PATH=/config/expectations.json mockserver/mockserver:5.15.0
+```
+
+HTTP requests to `localhost:8113/data` should now return the body defined in the expectations file.
+To test this, either change the code that defines the base URL for the Python source, or update the
+`url_base` in the low-code manifest. With the Connector Builder running in Docker, you will have to
+use the domain `host.docker.internal` instead of `localhost`, as the requests are executed within
+Docker.
+
+#### Publishing a new version to PyPI
+
+The Python CDK has a
+[GitHub workflow](https://github.com/airbytehq/airbyte/actions/workflows/publish-cdk-command-manually.yml)
+that manages the CDK changelog, making a new release for `airbyte_cdk`, publishing it to PyPI, and
+then making a commit to update (and subsequently auto-release)
+[`source-declarative-manifest`](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-declarative-manifest)
+and Connector Builder (in the platform repository).
+
+> [!Note]: The workflow will handle the `CHANGELOG.md` entry for you. You should not add changelog
+> lines in your PRs to the CDK itself.
+
+> [!Warning]: The workflow bumps the version on its own; please don't change the CDK version in
+> `pyproject.toml` manually.
+
+1. Only trigger the release workflow once all the PRs that you want included are already merged
+   into the `master` branch.
+2. Run the
+   [`Publish CDK Manually`](https://github.com/airbytehq/airbyte/actions/workflows/publish-cdk-command-manually.yml)
+   workflow from master using `release-type=major|minor|patch` and set the changelog message.
+3. When the workflow runs, it will commit a new version directly to the master branch.
+4. The workflow will bump the version of `source-declarative-manifest` according to the
+   `release-type` of the CDK, then commit these changes back to master. The commit to master will
+   kick off a publish of the new version of `source-declarative-manifest`.
+5. The workflow will also add a pull request to the `airbyte-platform-internal` repo to bump the
+   dependency in Connector Builder.
diff --git a/airbyte-cdk/python/airbyte_cdk/__init__.py b/airbyte-cdk/python/airbyte_cdk/__init__.py
new file mode 100644
index 000000000000..84501c6f37f9
--- /dev/null
+++ b/airbyte-cdk/python/airbyte_cdk/__init__.py
@@ -0,0 +1,284 @@
+# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
+"""
+# Welcome to the Airbyte Python CDK!
+
+The Airbyte Python CDK is a Python library that provides a set of tools to help you build
+connectors for the Airbyte platform.
+ +## Building Source Connectors + +To build a source connector, you will want to refer to +the following classes and modules: + +- `airbyte_cdk.sources` +- `airbyte_cdk.sources.concurrent_source` +- `airbyte_cdk.sources.config` +- `airbyte_cdk.sources.file_based` +- `airbyte_cdk.sources.streams` + +## Building Destination Connectors + +To build a destination connector, you will want to refer to +the following classes and modules: + +- `airbyte_cdk.destinations` +- `airbyte_cdk.destinations.Destination` +- `airbyte_cdk.destinations.vector_db_based` + +## Working with Airbyte Protocol Models + +The Airbyte CDK provides a set of classes that help you work with the Airbyte protocol models: + +- `airbyte_cdk.models.airbyte_protocol` +- `airbyte_cdk.models.airbyte_protocol_serializers` + +--- + +API Reference + +--- + +""" + +# Warning: The below imports are not stable and will cause circular +# dependencies if auto-sorted with isort. Please keep them in the same order. +# TODO: Submodules should import from lower-level modules, rather than importing from here. +# Imports should also be placed in `if TYPE_CHECKING` blocks if they are only used as type +# hints - again, to avoid circular dependencies. +# Once those issues are resolved, the below can be sorted with isort. +from importlib import metadata + +from .destinations import Destination +from .models import AirbyteConnectionStatus, AirbyteMessage, ConfiguredAirbyteCatalog, Status, Type, FailureType, AirbyteStream, AdvancedAuth, DestinationSyncMode, ConnectorSpecification, OAuthConfigSpecification, OrchestratorType, ConfiguredAirbyteStream, SyncMode, AirbyteLogMessage, Level, AirbyteRecordMessage + +from .sources import Source +from .config_observation import create_connector_config_control_message, emit_configuration_as_airbyte_control_message +from .connector import BaseConnector, Connector + +from .entrypoint import launch, AirbyteEntrypoint + +from .logger import AirbyteLogFormatter, init_logger +from .sources import AbstractSource +from .sources.concurrent_source.concurrent_source import ConcurrentSource +from .sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter +from .sources.config import BaseConfig +from .sources.types import Config, Record, StreamSlice +from .sources.connector_state_manager import ConnectorStateManager +from .sources.declarative.auth import DeclarativeOauth2Authenticator +from .sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator +from .sources.declarative.auth.declarative_authenticator import NoAuth +from .sources.declarative.auth.oauth import DeclarativeSingleUseRefreshTokenOauth2Authenticator +from .sources.declarative.auth.token import BasicHttpAuthenticator, BearerAuthenticator, ApiKeyAuthenticator +from .sources.declarative.datetime.min_max_datetime import MinMaxDatetime +from .sources.declarative.declarative_stream import DeclarativeStream +from .sources.declarative.decoders import Decoder, JsonDecoder +from .sources.declarative.exceptions import ReadException +from .sources.declarative.extractors import DpathExtractor, RecordSelector +from .sources.declarative.extractors.record_extractor import RecordExtractor +from .sources.declarative.extractors.record_filter import RecordFilter +from .sources.declarative.incremental import DatetimeBasedCursor +from .sources.declarative.interpolation import InterpolatedString, InterpolatedBoolean +from .sources.declarative.manifest_declarative_source import ManifestDeclarativeSource +from 
.sources.declarative.migrations.legacy_to_per_partition_state_migration import LegacyToPerPartitionStateMigration + +from .sources.declarative.partition_routers import CartesianProductStreamSlicer, SinglePartitionRouter, SubstreamPartitionRouter +from .sources.declarative.partition_routers.substream_partition_router import ParentStreamConfig +from .sources.declarative.requesters import Requester, HttpRequester + +from .sources.declarative.requesters.error_handlers import BackoffStrategy +from .sources.declarative.requesters.paginators import DefaultPaginator, PaginationStrategy +from .sources.declarative.requesters.paginators.strategies import OffsetIncrement, CursorPaginationStrategy, PageIncrement, StopConditionPaginationStrategyDecorator + +from .sources.declarative.requesters.request_option import RequestOption, RequestOptionType + +from .sources.declarative.requesters.request_options.default_request_options_provider import DefaultRequestOptionsProvider +from .sources.declarative.requesters.request_options.interpolated_request_input_provider import InterpolatedRequestInputProvider +from .sources.declarative.requesters.requester import HttpMethod +from .sources.declarative.retrievers import SimpleRetriever +from .sources.declarative.schema import JsonFileSchemaLoader +from .sources.declarative.transformations.add_fields import AddFields, AddedFieldDefinition +from .sources.declarative.transformations.transformation import RecordTransformation +from .sources.declarative.types import FieldPointer +from .sources.declarative.yaml_declarative_source import YamlDeclarativeSource +from .sources.message import InMemoryMessageRepository, MessageRepository +from .sources.source import TState +from .sources.streams.availability_strategy import AvailabilityStrategy +from .sources.streams.call_rate import AbstractAPIBudget, HttpAPIBudget, HttpRequestMatcher, MovingWindowCallRatePolicy, Rate, CachedLimiterSession, LimiterSession +from .sources.streams.checkpoint import Cursor as LegacyCursor +from .sources.streams.checkpoint import ResumableFullRefreshCursor +from .sources.streams.concurrent.adapters import StreamFacade +from .sources.streams.concurrent.cursor import ConcurrentCursor, CursorField, FinalStateCursor +from .sources.streams.concurrent.cursor import Cursor +from .sources.streams.concurrent.state_converters.datetime_stream_state_converter import EpochValueConcurrentStreamStateConverter, IsoMillisConcurrentStreamStateConverter +from .sources.streams.core import Stream, IncrementalMixin, package_name_from_class +from .sources.streams.http import HttpStream, HttpSubStream +from .sources.streams.http.availability_strategy import HttpAvailabilityStrategy +from .sources.streams.http.exceptions import BaseBackoffException, DefaultBackoffException, UserDefinedBackoffException +from .sources.streams.http.rate_limiting import default_backoff_handler +from .sources.streams.http.requests_native_auth import Oauth2Authenticator, TokenAuthenticator, SingleUseRefreshTokenOauth2Authenticator +from .sources.streams.http.requests_native_auth.abstract_token import AbstractHeaderAuthenticator +from .sources.utils import casing +from .sources.utils.schema_helpers import InternalConfig, ResourceSchemaLoader, check_config_against_spec_or_exit, split_config, expand_refs +from .sources.utils.transform import TransformConfig, TypeTransformer +from .utils import AirbyteTracedException, is_cloud_environment +from .utils.constants import ENV_REQUEST_CACHE_PATH +from .utils.event_timing import create_timer +from 
.utils.oneof_option_config import OneOfOptionConfig +from .utils.spec_schema_transformations import resolve_refs +from .utils.stream_status_utils import as_airbyte_message + + +__all__ = [ + # Availability strategy + "AvailabilityStrategy", + "HttpAvailabilityStrategy", + # Checkpoint + "LegacyCursor", + "ResumableFullRefreshCursor", + # Concurrent + "ConcurrentCursor", + "ConcurrentSource", + "ConcurrentSourceAdapter", + "Cursor", + "CursorField", + "DEFAULT_CONCURRENCY", + "EpochValueConcurrentStreamStateConverter", + "FinalStateCursor", + "IsoMillisConcurrentStreamStateConverter", + "StreamFacade", + # Config observation + "create_connector_config_control_message", + "emit_configuration_as_airbyte_control_message", + # Connector + "AbstractSource", + "BaseConfig", + "BaseConnector", + "Connector", + "Destination", + "Source", + "TState", + # Declarative + "AddFields", + "AddedFieldDefinition", + "ApiKeyAuthenticator", + "BackoffStrategy", + "BasicHttpAuthenticator", + "BearerAuthenticator", + "CartesianProductStreamSlicer", + "CursorPaginationStrategy", + "DatetimeBasedCursor", + "DeclarativeAuthenticator", + "DeclarativeOauth2Authenticator", + "DeclarativeSingleUseRefreshTokenOauth2Authenticator", + "DeclarativeStream", + "Decoder", + "DefaultPaginator", + "DefaultRequestOptionsProvider", + "DpathExtractor", + "FieldPointer", + "HttpMethod", + "HttpRequester", + "InterpolatedBoolean", + "InterpolatedRequestInputProvider", + "InterpolatedString", + "JsonDecoder", + "JsonFileSchemaLoader", + "LegacyToPerPartitionStateMigration", + "ManifestDeclarativeSource", + "MinMaxDatetime", + "NoAuth", + "OffsetIncrement", + "PageIncrement", + "PaginationStrategy", + "ParentStreamConfig", + "ReadException", + "RecordExtractor", + "RecordFilter", + "RecordSelector", + "RecordTransformation", + "RequestOption", + "RequestOptionType", + "Requester", + "ResponseStatus", + "SimpleRetriever", + "SinglePartitionRouter", + "StopConditionPaginationStrategyDecorator", + "StreamSlice", + "SubstreamPartitionRouter", + "YamlDeclarativeSource", + # Entrypoint + "launch", + "AirbyteEntrypoint", + # HTTP + "AbstractAPIBudget", + "AbstractHeaderAuthenticator", + "BaseBackoffException", + "CachedLimiterSession", + "DefaultBackoffException", + "default_backoff_handler", + "HttpAPIBudget", + "HttpAuthenticator", + "HttpRequestMatcher", + "HttpStream", + "HttpSubStream", + "LimiterSession", + "MovingWindowCallRatePolicy", + "MultipleTokenAuthenticator", + "Oauth2Authenticator", + "Rate", + "SingleUseRefreshTokenOauth2Authenticator", + "TokenAuthenticator", + "UserDefinedBackoffException", + # Logger + "AirbyteLogFormatter", + "init_logger", + # Protocol classes + "AirbyteStream", + "AirbyteConnectionStatus", + "AirbyteMessage", + "ConfiguredAirbyteCatalog", + "Status", + "Type", + "OrchestratorType", + "ConfiguredAirbyteStream", + "DestinationSyncMode", + "SyncMode", + "FailureType", + "AdvancedAuth", + "AirbyteLogMessage", + "OAuthConfigSpecification", + "ConnectorSpecification", + "Level", + "AirbyteRecordMessage", + # Repository + "InMemoryMessageRepository", + "MessageRepository", + # State management + "ConnectorStateManager", + # Stream + "IncrementalMixin", + "Stream", + "StreamData", + "package_name_from_class", + # Utils + "AirbyteTracedException", + "is_cloud_environment", + "casing", + "InternalConfig", + "ResourceSchemaLoader", + "check_config_against_spec_or_exit", + "split_config", + "TransformConfig", + "TypeTransformer", + "ENV_REQUEST_CACHE_PATH", + "create_timer", + "OneOfOptionConfig", + 
"resolve_refs", + "as_airbyte_message", + # Types + "Config", + "Record", + "Source", + "StreamSlice", +] +__version__ = metadata.version("airbyte_cdk") diff --git a/airbyte-cdk/python/airbyte_cdk/config_observation.py b/airbyte-cdk/python/airbyte_cdk/config_observation.py new file mode 100644 index 000000000000..94a3d64a511b --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/config_observation.py @@ -0,0 +1,96 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from __future__ import ( # Used to evaluate type hints at runtime, a NameError: name 'ConfigObserver' is not defined is thrown otherwise + annotations, +) + +import time +from copy import copy +from typing import Any, List, MutableMapping + +from airbyte_cdk.models import ( + AirbyteControlConnectorConfigMessage, + AirbyteControlMessage, + AirbyteMessage, + AirbyteMessageSerializer, + OrchestratorType, + Type, +) +from orjson import orjson + + +class ObservedDict(dict): # type: ignore # disallow_any_generics is set to True, and dict is equivalent to dict[Any] + def __init__( + self, non_observed_mapping: MutableMapping[Any, Any], observer: ConfigObserver, update_on_unchanged_value: bool = True + ) -> None: + non_observed_mapping = copy(non_observed_mapping) + self.observer = observer + self.update_on_unchanged_value = update_on_unchanged_value + for item, value in non_observed_mapping.items(): + # Observe nested dicts + if isinstance(value, MutableMapping): + non_observed_mapping[item] = ObservedDict(value, observer) + + # Observe nested list of dicts + if isinstance(value, List): + for i, sub_value in enumerate(value): + if isinstance(sub_value, MutableMapping): + value[i] = ObservedDict(sub_value, observer) + super().__init__(non_observed_mapping) + + def __setitem__(self, item: Any, value: Any) -> None: + """Override dict.__setitem__ by: + 1. Observing the new value if it is a dict + 2. Call observer update if the new value is different from the previous one + """ + previous_value = self.get(item) + if isinstance(value, MutableMapping): + value = ObservedDict(value, self.observer) + if isinstance(value, List): + for i, sub_value in enumerate(value): + if isinstance(sub_value, MutableMapping): + value[i] = ObservedDict(sub_value, self.observer) + super(ObservedDict, self).__setitem__(item, value) + if self.update_on_unchanged_value or value != previous_value: + self.observer.update() + + +class ConfigObserver: + """This class is made to track mutations on ObservedDict config. + When update is called a CONNECTOR_CONFIG control message is emitted on stdout. + """ + + def set_config(self, config: ObservedDict) -> None: + self.config = config + + def update(self) -> None: + emit_configuration_as_airbyte_control_message(self.config) + + +def observe_connector_config(non_observed_connector_config: MutableMapping[str, Any]) -> ObservedDict: + if isinstance(non_observed_connector_config, ObservedDict): + raise ValueError("This connector configuration is already observed") + connector_config_observer = ConfigObserver() + observed_connector_config = ObservedDict(non_observed_connector_config, connector_config_observer) + connector_config_observer.set_config(observed_connector_config) + return observed_connector_config + + +def emit_configuration_as_airbyte_control_message(config: MutableMapping[str, Any]) -> None: + """ + WARNING: deprecated - emit_configuration_as_airbyte_control_message is being deprecated in favor of the MessageRepository mechanism. 
+ See the airbyte_cdk.sources.message package + """ + airbyte_message = create_connector_config_control_message(config) + print(orjson.dumps(AirbyteMessageSerializer.dump(airbyte_message)).decode()) + + +def create_connector_config_control_message(config: MutableMapping[str, Any]) -> AirbyteMessage: + control_message = AirbyteControlMessage( + type=OrchestratorType.CONNECTOR_CONFIG, + emitted_at=time.time() * 1000, + connectorConfig=AirbyteControlConnectorConfigMessage(config=config), + ) + return AirbyteMessage(type=Type.CONTROL, control=control_message) diff --git a/airbyte-cdk/python/airbyte_cdk/connector.py b/airbyte-cdk/python/airbyte_cdk/connector.py new file mode 100644 index 000000000000..658a0b167077 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/connector.py @@ -0,0 +1,112 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +import json +import logging +import os +import pkgutil +from abc import ABC, abstractmethod +from typing import Any, Generic, Mapping, Optional, Protocol, TypeVar + +import yaml +from airbyte_cdk.models import AirbyteConnectionStatus, ConnectorSpecification, ConnectorSpecificationSerializer + + +def load_optional_package_file(package: str, filename: str) -> Optional[bytes]: + """Gets a resource from a package, returning None if it does not exist""" + try: + return pkgutil.get_data(package, filename) + except FileNotFoundError: + return None + + +TConfig = TypeVar("TConfig", bound=Mapping[str, Any]) + + +class BaseConnector(ABC, Generic[TConfig]): + # configure whether the `check_config_against_spec_or_exit()` needs to be called + check_config_against_spec: bool = True + + @abstractmethod + def configure(self, config: Mapping[str, Any], temp_dir: str) -> TConfig: + """ + Persist config in temporary directory to run the Source job + """ + + @staticmethod + def read_config(config_path: str) -> Mapping[str, Any]: + config = BaseConnector._read_json_file(config_path) + if isinstance(config, Mapping): + return config + else: + raise ValueError( + f"The content of {config_path} is not an object and therefore is not a valid config. Please ensure the file represent a config." + ) + + @staticmethod + def _read_json_file(file_path: str) -> Any: + with open(file_path, "r") as file: + contents = file.read() + + try: + return json.loads(contents) + except json.JSONDecodeError as error: + raise ValueError(f"Could not read json file {file_path}: {error}. Please ensure that it is a valid JSON.") + + @staticmethod + def write_config(config: TConfig, config_path: str) -> None: + with open(config_path, "w") as fh: + fh.write(json.dumps(config)) + + def spec(self, logger: logging.Logger) -> ConnectorSpecification: + """ + Returns the spec for this integration. The spec is a JSON-Schema object describing the required configurations (e.g: username and password) + required to run this integration. By default, this will be loaded from a "spec.yaml" or a "spec.json" in the package root. + """ + + package = self.__class__.__module__.split(".")[0] + + yaml_spec = load_optional_package_file(package, "spec.yaml") + json_spec = load_optional_package_file(package, "spec.json") + + if yaml_spec and json_spec: + raise RuntimeError("Found multiple spec files in the package. Only one of spec.yaml or spec.json should be provided.") + + if yaml_spec: + spec_obj = yaml.load(yaml_spec, Loader=yaml.SafeLoader) + elif json_spec: + try: + spec_obj = json.loads(json_spec) + except json.JSONDecodeError as error: + raise ValueError(f"Could not read json spec file: {error}. 
Please ensure that it is a valid JSON.") + else: + raise FileNotFoundError("Unable to find spec.yaml or spec.json in the package.") + + return ConnectorSpecificationSerializer.load(spec_obj) + + @abstractmethod + def check(self, logger: logging.Logger, config: TConfig) -> AirbyteConnectionStatus: + """ + Tests if the input configuration can be used to successfully connect to the integration e.g: if a provided Stripe API token can be used to connect + to the Stripe API. + """ + + +class _WriteConfigProtocol(Protocol): + @staticmethod + def write_config(config: Mapping[str, Any], config_path: str) -> None: + ... + + +class DefaultConnectorMixin: + # can be overridden to change an input config + def configure(self: _WriteConfigProtocol, config: Mapping[str, Any], temp_dir: str) -> Mapping[str, Any]: + config_path = os.path.join(temp_dir, "config.json") + self.write_config(config, config_path) + return config + + +class Connector(DefaultConnectorMixin, BaseConnector[Mapping[str, Any]], ABC): + ... diff --git a/airbyte-cdk/python/airbyte_cdk/connector_builder/README.md b/airbyte-cdk/python/airbyte_cdk/connector_builder/README.md new file mode 100644 index 000000000000..cc2cf7064e29 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/connector_builder/README.md @@ -0,0 +1,53 @@ +# Connector Builder Backend + +This is the backend for requests from the +[Connector Builder](https://docs.airbyte.com/connector-development/connector-builder-ui/overview/). + +## Local development + +### Locally running the Connector Builder backend + +```bash +python main.py read --config path/to/config --catalog path/to/catalog +``` + +Note: + +- Requires the keys `__injected_declarative_manifest` and `__command` in its config, where + `__injected_declarative_manifest` is a JSON manifest and `__command` is one of the commands + handled by the ConnectorBuilderHandler (`stream_read` or `resolve_manifest`), i.e. + +```json +{ + "config": , + "__injected_declarative_manifest": {...}, + "__command": <"resolve_manifest" | "test_read"> +} +``` + +\*See +[ConnectionSpecification](https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#actor-specification) +for details on the `"config"` key if needed. + +- When the `__command` is `resolve_manifest`, the argument to `catalog` should be an empty string. +- The config can optionally contain an object under the `__test_read_config` key which can define + custom test read limits with `max_records`, `max_slices`, and `max_pages_per_slice` properties. + All custom limits are optional; a default will be used for any limit that is not provided. + +### Locally running the docker image + +#### Build + +First, make sure you build the latest Docker image: + +```bash +docker build -t airbyte/source-declarative-manifest:dev . +``` + +#### Run + +Then run any of the connector commands as follows: + +```bash +docker run --rm -v $(pwd)/secrets:/secrets airbyte/source-declarative-manifest:dev read --config /secrets/config.json +``` diff --git a/airbyte-cdk/python/airbyte_cdk/connector_builder/__init__.py b/airbyte-cdk/python/airbyte_cdk/connector_builder/__init__.py new file mode 100644 index 000000000000..c941b3045795 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/connector_builder/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# diff --git a/airbyte-cdk/python/airbyte_cdk/connector_builder/connector_builder_handler.py b/airbyte-cdk/python/airbyte_cdk/connector_builder/connector_builder_handler.py new file mode 100644 index 000000000000..b3cfd9a0503b --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/connector_builder/connector_builder_handler.py @@ -0,0 +1,96 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import dataclasses +from datetime import datetime +from typing import Any, List, Mapping + +from airbyte_cdk.connector_builder.message_grouper import MessageGrouper +from airbyte_cdk.models import AirbyteMessage, AirbyteRecordMessage, AirbyteStateMessage, ConfiguredAirbyteCatalog +from airbyte_cdk.models import Type +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.declarative.declarative_source import DeclarativeSource +from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource +from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ModelToComponentFactory +from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + +DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE = 5 +DEFAULT_MAXIMUM_NUMBER_OF_SLICES = 5 +DEFAULT_MAXIMUM_RECORDS = 100 + +MAX_PAGES_PER_SLICE_KEY = "max_pages_per_slice" +MAX_SLICES_KEY = "max_slices" +MAX_RECORDS_KEY = "max_records" + + +@dataclasses.dataclass +class TestReadLimits: + max_records: int = dataclasses.field(default=DEFAULT_MAXIMUM_RECORDS) + max_pages_per_slice: int = dataclasses.field(default=DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE) + max_slices: int = dataclasses.field(default=DEFAULT_MAXIMUM_NUMBER_OF_SLICES) + + +def get_limits(config: Mapping[str, Any]) -> TestReadLimits: + command_config = config.get("__test_read_config", {}) + max_pages_per_slice = command_config.get(MAX_PAGES_PER_SLICE_KEY) or DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE + max_slices = command_config.get(MAX_SLICES_KEY) or DEFAULT_MAXIMUM_NUMBER_OF_SLICES + max_records = command_config.get(MAX_RECORDS_KEY) or DEFAULT_MAXIMUM_RECORDS + return TestReadLimits(max_records, max_pages_per_slice, max_slices) + + +def create_source(config: Mapping[str, Any], limits: TestReadLimits) -> ManifestDeclarativeSource: + manifest = config["__injected_declarative_manifest"] + return ManifestDeclarativeSource( + emit_connector_builder_messages=True, + source_config=manifest, + component_factory=ModelToComponentFactory( + emit_connector_builder_messages=True, + limit_pages_fetched_per_slice=limits.max_pages_per_slice, + limit_slices_fetched=limits.max_slices, + disable_retries=True, + disable_cache=True, + ), + ) + + +def read_stream( + source: DeclarativeSource, + config: Mapping[str, Any], + configured_catalog: ConfiguredAirbyteCatalog, + state: List[AirbyteStateMessage], + limits: TestReadLimits, +) -> AirbyteMessage: + try: + handler = MessageGrouper(limits.max_pages_per_slice, limits.max_slices, limits.max_records) + stream_name = configured_catalog.streams[0].stream.name # The connector builder only supports a single stream + stream_read = handler.get_message_groups(source, config, configured_catalog, state, limits.max_records) + return AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage(data=dataclasses.asdict(stream_read), stream=stream_name, emitted_at=_emitted_at()), + ) + except Exception as exc: + error = AirbyteTracedException.from_exception( + exc, message=filter_secrets(f"Error reading stream with 
config={config} and catalog={configured_catalog}: {str(exc)}") + ) + return error.as_airbyte_message() + + +def resolve_manifest(source: ManifestDeclarativeSource) -> AirbyteMessage: + try: + return AirbyteMessage( + type=Type.RECORD, + record=AirbyteRecordMessage( + data={"manifest": source.resolved_manifest}, + emitted_at=_emitted_at(), + stream="resolve_manifest", + ), + ) + except Exception as exc: + error = AirbyteTracedException.from_exception(exc, message=f"Error resolving manifest: {str(exc)}") + return error.as_airbyte_message() + + +def _emitted_at() -> int: + return int(datetime.now().timestamp()) * 1000 diff --git a/airbyte-cdk/python/airbyte_cdk/connector_builder/main.py b/airbyte-cdk/python/airbyte_cdk/connector_builder/main.py new file mode 100644 index 000000000000..54e0b1e0be41 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/connector_builder/main.py @@ -0,0 +1,86 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +import sys +from typing import Any, List, Mapping, Optional, Tuple + +from airbyte_cdk.connector import BaseConnector +from airbyte_cdk.connector_builder.connector_builder_handler import TestReadLimits, create_source, get_limits, read_stream, resolve_manifest +from airbyte_cdk.entrypoint import AirbyteEntrypoint +from airbyte_cdk.models import ( + AirbyteMessage, + AirbyteMessageSerializer, + AirbyteStateMessage, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteCatalogSerializer, +) +from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource +from airbyte_cdk.sources.source import Source +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from orjson import orjson + + +def get_config_and_catalog_from_args(args: List[str]) -> Tuple[str, Mapping[str, Any], Optional[ConfiguredAirbyteCatalog], Any]: + # TODO: Add functionality for the `debug` logger. + # Currently, no one `debug` level log will be displayed during `read` a stream for a connector created through `connector-builder`. + parsed_args = AirbyteEntrypoint.parse_args(args) + config_path, catalog_path, state_path = parsed_args.config, parsed_args.catalog, parsed_args.state + if parsed_args.command != "read": + raise ValueError("Only read commands are allowed for Connector Builder requests.") + + config = BaseConnector.read_config(config_path) + + if "__command" not in config: + raise ValueError( + f"Invalid config: `__command` should be provided at the root of the config but config only has keys {list(config.keys())}" + ) + + command = config["__command"] + if command == "test_read": + catalog = ConfiguredAirbyteCatalogSerializer.load(BaseConnector.read_config(catalog_path)) + state = Source.read_state(state_path) + else: + catalog = None + state = [] + + if "__injected_declarative_manifest" not in config: + raise ValueError( + f"Invalid config: `__injected_declarative_manifest` should be provided at the root of the config but config only has keys {list(config.keys())}" + ) + + return command, config, catalog, state + + +def handle_connector_builder_request( + source: ManifestDeclarativeSource, + command: str, + config: Mapping[str, Any], + catalog: Optional[ConfiguredAirbyteCatalog], + state: List[AirbyteStateMessage], + limits: TestReadLimits, +) -> AirbyteMessage: + if command == "resolve_manifest": + return resolve_manifest(source) + elif command == "test_read": + assert catalog is not None, "`test_read` requires a valid `ConfiguredAirbyteCatalog`, got None." 
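# `read_stream` performs the test read for the single configured stream and wraps the grouped slices/pages into one RECORD message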
+ return read_stream(source, config, catalog, state, limits) + else: + raise ValueError(f"Unrecognized command {command}.") + + +def handle_request(args: List[str]) -> str: + command, config, catalog, state = get_config_and_catalog_from_args(args) + limits = get_limits(config) + source = create_source(config, limits) + return orjson.dumps(AirbyteMessageSerializer.dump(handle_connector_builder_request(source, command, config, catalog, state, limits))).decode() # type: ignore[no-any-return] # Serializer.dump() always returns AirbyteMessage + + +if __name__ == "__main__": + try: + print(handle_request(sys.argv[1:])) + except Exception as exc: + error = AirbyteTracedException.from_exception(exc, message=f"Error handling request: {str(exc)}") + m = error.as_airbyte_message() + print(orjson.dumps(AirbyteMessageSerializer.dump(m)).decode()) diff --git a/airbyte-cdk/python/airbyte_cdk/connector_builder/message_grouper.py b/airbyte-cdk/python/airbyte_cdk/connector_builder/message_grouper.py new file mode 100644 index 000000000000..e21ffd61abd8 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/connector_builder/message_grouper.py @@ -0,0 +1,378 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json +import logging +from copy import deepcopy +from json import JSONDecodeError +from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Union + +from airbyte_cdk.connector_builder.models import ( + AuxiliaryRequest, + HttpRequest, + HttpResponse, + LogMessage, + StreamRead, + StreamReadPages, + StreamReadSlices, +) +from airbyte_cdk.entrypoint import AirbyteEntrypoint +from airbyte_cdk.models import ( + AirbyteControlMessage, + AirbyteLogMessage, + AirbyteMessage, + AirbyteStateMessage, + AirbyteTraceMessage, + ConfiguredAirbyteCatalog, + OrchestratorType, + TraceType, +) +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.declarative.declarative_source import DeclarativeSource +from airbyte_cdk.sources.utils.slice_logger import SliceLogger +from airbyte_cdk.sources.utils.types import JsonType +from airbyte_cdk.utils import AirbyteTracedException +from airbyte_cdk.utils.datetime_format_inferrer import DatetimeFormatInferrer +from airbyte_cdk.utils.schema_inferrer import SchemaInferrer, SchemaValidationException + + +class MessageGrouper: + logger = logging.getLogger("airbyte.connector-builder") + + def __init__(self, max_pages_per_slice: int, max_slices: int, max_record_limit: int = 1000): + self._max_pages_per_slice = max_pages_per_slice + self._max_slices = max_slices + self._max_record_limit = max_record_limit + + def _pk_to_nested_and_composite_field(self, field: Optional[Union[str, List[str], List[List[str]]]]) -> List[List[str]]: + if not field: + return [[]] + + if isinstance(field, str): + return [[field]] + + is_composite_key = isinstance(field[0], str) + if is_composite_key: + return [[i] for i in field] # type: ignore # the type of field is expected to be List[str] here + + return field # type: ignore # the type of field is expected to be List[List[str]] here + + def _cursor_field_to_nested_and_composite_field(self, field: Union[str, List[str]]) -> List[List[str]]: + if not field: + return [[]] + + if isinstance(field, str): + return [[field]] + + is_nested_key = isinstance(field[0], str) + if is_nested_key: + return [field] # type: ignore # the type of field is expected to be List[str] here + + raise ValueError(f"Unknown type for cursor field `{field}") + + def get_message_groups( + self, + source: DeclarativeSource, + 
config: Mapping[str, Any], + configured_catalog: ConfiguredAirbyteCatalog, + state: List[AirbyteStateMessage], + record_limit: Optional[int] = None, + ) -> StreamRead: + if record_limit is not None and not (1 <= record_limit <= self._max_record_limit): + raise ValueError(f"Record limit must be between 1 and {self._max_record_limit}. Got {record_limit}") + stream = source.streams(config)[0] # The connector builder currently only supports reading from a single stream at a time + schema_inferrer = SchemaInferrer( + self._pk_to_nested_and_composite_field(stream.primary_key), + self._cursor_field_to_nested_and_composite_field(stream.cursor_field), + ) + datetime_format_inferrer = DatetimeFormatInferrer() + + if record_limit is None: + record_limit = self._max_record_limit + else: + record_limit = min(record_limit, self._max_record_limit) + + slices = [] + log_messages = [] + latest_config_update: AirbyteControlMessage = None + auxiliary_requests = [] + for message_group in self._get_message_groups( + self._read_stream(source, config, configured_catalog, state), + schema_inferrer, + datetime_format_inferrer, + record_limit, + ): + if isinstance(message_group, AirbyteLogMessage): + log_messages.append(LogMessage(**{"message": message_group.message, "level": message_group.level.value})) + elif isinstance(message_group, AirbyteTraceMessage): + if message_group.type == TraceType.ERROR: + log_messages.append( + LogMessage( + **{ + "message": message_group.error.message, + "level": "ERROR", + "internal_message": message_group.error.internal_message, + "stacktrace": message_group.error.stack_trace, + } + ) + ) + elif isinstance(message_group, AirbyteControlMessage): + if not latest_config_update or latest_config_update.emitted_at <= message_group.emitted_at: + latest_config_update = message_group + elif isinstance(message_group, AuxiliaryRequest): + auxiliary_requests.append(message_group) + elif isinstance(message_group, StreamReadSlices): + slices.append(message_group) + else: + raise ValueError(f"Unknown message group type: {type(message_group)}") + + try: + # The connector builder currently only supports reading from a single stream at a time + configured_stream = configured_catalog.streams[0] + schema = schema_inferrer.get_stream_schema(configured_stream.stream.name) + except SchemaValidationException as exception: + for validation_error in exception.validation_errors: + log_messages.append(LogMessage(validation_error, "ERROR")) + schema = exception.schema + + return StreamRead( + logs=log_messages, + slices=slices, + test_read_limit_reached=self._has_reached_limit(slices), + auxiliary_requests=auxiliary_requests, + inferred_schema=schema, + latest_config_update=self._clean_config(latest_config_update.connectorConfig.config) if latest_config_update else None, + inferred_datetime_formats=datetime_format_inferrer.get_inferred_datetime_formats(), + ) + + def _get_message_groups( + self, + messages: Iterator[AirbyteMessage], + schema_inferrer: SchemaInferrer, + datetime_format_inferrer: DatetimeFormatInferrer, + limit: int, + ) -> Iterable[Union[StreamReadPages, AirbyteControlMessage, AirbyteLogMessage, AirbyteTraceMessage, AuxiliaryRequest]]: + """ + Message groups are partitioned according to when request log messages are received. Subsequent response log messages + and record messages belong to the prior request log message and when we encounter another request, append the latest + message group, until records have been read. 
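In other words: each request log message opens a new page; the response log message and any record messages that follow are attached to that page until the next request or slice boundary is encountered.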
+ + Messages received from the CDK read operation will always arrive in the following order: + {type: LOG, log: {message: "request: ..."}} + {type: LOG, log: {message: "response: ..."}} + ... 0 or more record messages + {type: RECORD, record: {data: ...}} + {type: RECORD, record: {data: ...}} + Repeats for each request/response made + + Note: The exception is that normal log messages can be received at any time which are not incorporated into grouping + """ + records_count = 0 + at_least_one_page_in_group = False + current_page_records: List[Mapping[str, Any]] = [] + current_slice_descriptor: Optional[Dict[str, Any]] = None + current_slice_pages: List[StreamReadPages] = [] + current_page_request: Optional[HttpRequest] = None + current_page_response: Optional[HttpResponse] = None + latest_state_message: Optional[Dict[str, Any]] = None + + while records_count < limit and (message := next(messages, None)): + json_object = self._parse_json(message.log) if message.type == MessageType.LOG else None + if json_object is not None and not isinstance(json_object, dict): + raise ValueError(f"Expected log message to be a dict, got {json_object} of type {type(json_object)}") + json_message: Optional[Dict[str, JsonType]] = json_object + if self._need_to_close_page(at_least_one_page_in_group, message, json_message): + self._close_page(current_page_request, current_page_response, current_slice_pages, current_page_records) + current_page_request = None + current_page_response = None + + if ( + at_least_one_page_in_group + and message.type == MessageType.LOG + and message.log.message.startswith(SliceLogger.SLICE_LOG_PREFIX) # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message + ): + yield StreamReadSlices( + pages=current_slice_pages, + slice_descriptor=current_slice_descriptor, + state=[latest_state_message] if latest_state_message else [], + ) + current_slice_descriptor = self._parse_slice_description(message.log.message) # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message + current_slice_pages = [] + at_least_one_page_in_group = False + elif message.type == MessageType.LOG and message.log.message.startswith(SliceLogger.SLICE_LOG_PREFIX): # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message + # parsing the first slice + current_slice_descriptor = self._parse_slice_description(message.log.message) # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message + elif message.type == MessageType.LOG: + if json_message is not None and self._is_http_log(json_message): + if self._is_auxiliary_http_request(json_message): + airbyte_cdk = json_message.get("airbyte_cdk", {}) + if not isinstance(airbyte_cdk, dict): + raise ValueError(f"Expected airbyte_cdk to be a dict, got {airbyte_cdk} of type {type(airbyte_cdk)}") + stream = airbyte_cdk.get("stream", {}) + if not isinstance(stream, dict): + raise ValueError(f"Expected stream to be a dict, got {stream} of type {type(stream)}") + title_prefix = "Parent stream: " if stream.get("is_substream", False) else "" + http = json_message.get("http", {}) + if not isinstance(http, dict): + raise ValueError(f"Expected http to be a dict, got {http} of type {type(http)}") + yield AuxiliaryRequest( + title=title_prefix + str(http.get("title", None)), + description=str(http.get("description", None)), + request=self._create_request_from_log_message(json_message), + response=self._create_response_from_log_message(json_message), + ) + else: + at_least_one_page_in_group = True + 
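# A non-auxiliary HTTP log opens a new page: capture its request/response pair so the records that follow can be attached to it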
current_page_request = self._create_request_from_log_message(json_message) + current_page_response = self._create_response_from_log_message(json_message) + else: + yield message.log + elif message.type == MessageType.TRACE: + if message.trace.type == TraceType.ERROR: # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has trace.type + yield message.trace + elif message.type == MessageType.RECORD: + current_page_records.append(message.record.data) # type: ignore[union-attr] # AirbyteMessage with MessageType.RECORD has record.data + records_count += 1 + schema_inferrer.accumulate(message.record) + datetime_format_inferrer.accumulate(message.record) + elif message.type == MessageType.CONTROL and message.control.type == OrchestratorType.CONNECTOR_CONFIG: # type: ignore[union-attr] # AirbyteMessage with MessageType.CONTROL has control.type + yield message.control + elif message.type == MessageType.STATE: + latest_state_message = message.state # type: ignore[assignment] + else: + if current_page_request or current_page_response or current_page_records: + self._close_page(current_page_request, current_page_response, current_slice_pages, current_page_records) + yield StreamReadSlices( + pages=current_slice_pages, + slice_descriptor=current_slice_descriptor, + state=[latest_state_message] if latest_state_message else [], + ) + + @staticmethod + def _need_to_close_page(at_least_one_page_in_group: bool, message: AirbyteMessage, json_message: Optional[Dict[str, Any]]) -> bool: + return ( + at_least_one_page_in_group + and message.type == MessageType.LOG + and (MessageGrouper._is_page_http_request(json_message) or message.log.message.startswith("slice:")) # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message + ) + + @staticmethod + def _is_page_http_request(json_message: Optional[Dict[str, Any]]) -> bool: + if not json_message: + return False + else: + return MessageGrouper._is_http_log(json_message) and not MessageGrouper._is_auxiliary_http_request(json_message) + + @staticmethod + def _is_http_log(message: Dict[str, JsonType]) -> bool: + return bool(message.get("http", False)) + + @staticmethod + def _is_auxiliary_http_request(message: Optional[Dict[str, Any]]) -> bool: + """ + A auxiliary request is a request that is performed and will not directly lead to record for the specific stream it is being queried. + A couple of examples are: + * OAuth authentication + * Substream slice generation + """ + if not message: + return False + + is_http = MessageGrouper._is_http_log(message) + return is_http and message.get("http", {}).get("is_auxiliary", False) + + @staticmethod + def _close_page( + current_page_request: Optional[HttpRequest], + current_page_response: Optional[HttpResponse], + current_slice_pages: List[StreamReadPages], + current_page_records: List[Mapping[str, Any]], + ) -> None: + """ + Close a page when parsing message groups + """ + current_slice_pages.append( + StreamReadPages(request=current_page_request, response=current_page_response, records=deepcopy(current_page_records)) # type: ignore + ) + current_page_records.clear() + + def _read_stream( + self, + source: DeclarativeSource, + config: Mapping[str, Any], + configured_catalog: ConfiguredAirbyteCatalog, + state: List[AirbyteStateMessage], + ) -> Iterator[AirbyteMessage]: + # the generator can raise an exception + # iterate over the generated messages. 
if next raise an exception, catch it and yield it as an AirbyteLogMessage + try: + yield from AirbyteEntrypoint(source).read(source.spec(self.logger), config, configured_catalog, state) + except AirbyteTracedException as traced_exception: + # Look for this message which indicates that it is the "final exception" raised by AbstractSource. + # If it matches, don't yield this as we don't need to show this in the Builder. + # This is somewhat brittle as it relies on the message string, but if they drift then the worst case + # is that this message will be shown in the Builder. + if ( + traced_exception.message is not None + and "During the sync, the following streams did not sync successfully" in traced_exception.message + ): + return + yield traced_exception.as_airbyte_message() + except Exception as e: + error_message = f"{e.args[0] if len(e.args) > 0 else str(e)}" + yield AirbyteTracedException.from_exception(e, message=error_message).as_airbyte_message() + + @staticmethod + def _parse_json(log_message: AirbyteLogMessage) -> JsonType: + # TODO: As a temporary stopgap, the CDK emits request/response data as a log message string. Ideally this should come in the + # form of a custom message object defined in the Airbyte protocol, but this unblocks us in the immediate while the + # protocol change is worked on. + try: + json_object: JsonType = json.loads(log_message.message) + return json_object + except JSONDecodeError: + return None + + @staticmethod + def _create_request_from_log_message(json_http_message: Dict[str, Any]) -> HttpRequest: + url = json_http_message.get("url", {}).get("full", "") + request = json_http_message.get("http", {}).get("request", {}) + return HttpRequest( + url=url, + http_method=request.get("method", ""), + headers=request.get("headers"), + body=request.get("body", {}).get("content", ""), + ) + + @staticmethod + def _create_response_from_log_message(json_http_message: Dict[str, Any]) -> HttpResponse: + response = json_http_message.get("http", {}).get("response", {}) + body = response.get("body", {}).get("content", "") + return HttpResponse(status=response.get("status_code"), body=body, headers=response.get("headers")) + + def _has_reached_limit(self, slices: List[StreamReadSlices]) -> bool: + if len(slices) >= self._max_slices: + return True + + record_count = 0 + + for _slice in slices: + if len(_slice.pages) >= self._max_pages_per_slice: + return True + for page in _slice.pages: + record_count += len(page.records) + if record_count >= self._max_record_limit: + return True + return False + + def _parse_slice_description(self, log_message: str) -> Dict[str, Any]: + return json.loads(log_message.replace(SliceLogger.SLICE_LOG_PREFIX, "", 1)) # type: ignore + + @staticmethod + def _clean_config(config: Dict[str, Any]) -> Dict[str, Any]: + cleaned_config = deepcopy(config) + for key in config.keys(): + if key.startswith("__"): + del cleaned_config[key] + return cleaned_config diff --git a/airbyte-cdk/python/airbyte_cdk/connector_builder/models.py b/airbyte-cdk/python/airbyte_cdk/connector_builder/models.py new file mode 100644 index 000000000000..50eb8eb95530 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/connector_builder/models.py @@ -0,0 +1,71 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + + +@dataclass +class HttpResponse: + status: int + body: Optional[str] = None + headers: Optional[Dict[str, Any]] = None + + +@dataclass +class HttpRequest: + url: str + headers: Optional[Dict[str, Any]] + http_method: str + body: Optional[str] = None + + +@dataclass +class StreamReadPages: + records: List[object] + request: Optional[HttpRequest] = None + response: Optional[HttpResponse] = None + + +@dataclass +class StreamReadSlices: + pages: List[StreamReadPages] + slice_descriptor: Optional[Dict[str, Any]] + state: Optional[List[Dict[str, Any]]] = None + + +@dataclass +class LogMessage: + message: str + level: str + internal_message: Optional[str] = None + stacktrace: Optional[str] = None + + +@dataclass +class AuxiliaryRequest: + title: str + description: str + request: HttpRequest + response: HttpResponse + + +@dataclass +class StreamRead(object): + logs: List[LogMessage] + slices: List[StreamReadSlices] + test_read_limit_reached: bool + auxiliary_requests: List[AuxiliaryRequest] + inferred_schema: Optional[Dict[str, Any]] + inferred_datetime_formats: Optional[Dict[str, str]] + latest_config_update: Optional[Dict[str, Any]] + + +@dataclass +class StreamReadRequestBody: + manifest: Dict[str, Any] + stream: str + config: Dict[str, Any] + state: Optional[Dict[str, Any]] + record_limit: Optional[int] diff --git a/airbyte-cdk/python/airbyte_cdk/destinations/__init__.py b/airbyte-cdk/python/airbyte_cdk/destinations/__init__.py new file mode 100644 index 000000000000..3a641025b565 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/destinations/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +"""The destinations module provides classes for building destination connectors.""" + +from .destination import Destination + +__all__ = [ + "Destination", +] diff --git a/airbyte-cdk/python/airbyte_cdk/destinations/destination.py b/airbyte-cdk/python/airbyte_cdk/destinations/destination.py new file mode 100644 index 000000000000..336a54a94e8f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/destinations/destination.py @@ -0,0 +1,120 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import argparse +import io +import logging +import sys +from abc import ABC, abstractmethod +from typing import Any, Iterable, List, Mapping + +from airbyte_cdk.connector import Connector +from airbyte_cdk.exception_handler import init_uncaught_exception_handler +from airbyte_cdk.models import AirbyteMessage, AirbyteMessageSerializer, ConfiguredAirbyteCatalog, ConfiguredAirbyteCatalogSerializer, Type +from airbyte_cdk.sources.utils.schema_helpers import check_config_against_spec_or_exit +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from orjson import orjson + +logger = logging.getLogger("airbyte") + + +class Destination(Connector, ABC): + VALID_CMDS = {"spec", "check", "write"} + + @abstractmethod + def write( + self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage] + ) -> Iterable[AirbyteMessage]: + """Implement to define how the connector writes data to the destination""" + + def _run_check(self, config: Mapping[str, Any]) -> AirbyteMessage: + check_result = self.check(logger, config) + return AirbyteMessage(type=Type.CONNECTION_STATUS, connectionStatus=check_result) + + def _parse_input_stream(self, input_stream: io.TextIOWrapper) -> Iterable[AirbyteMessage]: + """Reads from stdin, converting to Airbyte messages""" + for line in input_stream: + try: + yield AirbyteMessageSerializer.load(orjson.loads(line)) + except orjson.JSONDecodeError: + logger.info(f"ignoring input which can't be deserialized as Airbyte Message: {line}") + + def _run_write( + self, config: Mapping[str, Any], configured_catalog_path: str, input_stream: io.TextIOWrapper + ) -> Iterable[AirbyteMessage]: + catalog = ConfiguredAirbyteCatalogSerializer.load(orjson.loads(open(configured_catalog_path).read())) + input_messages = self._parse_input_stream(input_stream) + logger.info("Begin writing to the destination...") + yield from self.write(config=config, configured_catalog=catalog, input_messages=input_messages) + logger.info("Writing complete.") + + def parse_args(self, args: List[str]) -> argparse.Namespace: + """ + :param args: commandline arguments + :return: + """ + + parent_parser = argparse.ArgumentParser(add_help=False) + main_parser = argparse.ArgumentParser() + subparsers = main_parser.add_subparsers(title="commands", dest="command") + + # spec + subparsers.add_parser("spec", help="outputs the json configuration specification", parents=[parent_parser]) + + # check + check_parser = subparsers.add_parser("check", help="checks the config can be used to connect", parents=[parent_parser]) + required_check_parser = check_parser.add_argument_group("required named arguments") + required_check_parser.add_argument("--config", type=str, required=True, help="path to the json configuration file") + + # write + write_parser = subparsers.add_parser("write", help="Writes data to the destination", parents=[parent_parser]) + write_required = write_parser.add_argument_group("required named arguments") + write_required.add_argument("--config", type=str, required=True, help="path to the JSON configuration file") + write_required.add_argument("--catalog", type=str, required=True, help="path to the configured catalog JSON file") + + parsed_args = main_parser.parse_args(args) + cmd = parsed_args.command + if not cmd: + raise Exception("No command entered. 
") + elif cmd not in ["spec", "check", "write"]: + # This is technically dead code since parse_args() would fail if this was the case + # But it's non-obvious enough to warrant placing it here anyways + raise Exception(f"Unknown command entered: {cmd}") + + return parsed_args + + def run_cmd(self, parsed_args: argparse.Namespace) -> Iterable[AirbyteMessage]: + + cmd = parsed_args.command + if cmd not in self.VALID_CMDS: + raise Exception(f"Unrecognized command: {cmd}") + + spec = self.spec(logger) + if cmd == "spec": + yield AirbyteMessage(type=Type.SPEC, spec=spec) + return + config = self.read_config(config_path=parsed_args.config) + if self.check_config_against_spec or cmd == "check": + try: + check_config_against_spec_or_exit(config, spec) + except AirbyteTracedException as traced_exc: + connection_status = traced_exc.as_connection_status_message() + if connection_status and cmd == "check": + yield connection_status + return + raise traced_exc + + if cmd == "check": + yield self._run_check(config=config) + elif cmd == "write": + # Wrap in UTF-8 to override any other input encodings + wrapped_stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8") + yield from self._run_write(config=config, configured_catalog_path=parsed_args.catalog, input_stream=wrapped_stdin) + + def run(self, args: List[str]) -> None: + init_uncaught_exception_handler(logger) + parsed_args = self.parse_args(args) + output_messages = self.run_cmd(parsed_args) + for message in output_messages: + print(orjson.dumps(AirbyteMessageSerializer.dump(message)).decode()) diff --git a/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/README.md b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/README.md new file mode 100644 index 000000000000..09668b61e963 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/README.md @@ -0,0 +1,37 @@ +# Vector DB based destinations + +## Note: All helpers in this directory are experimental and subject to change + +This directory contains several helpers that can be used to create a destination that processes and chunks records, embeds their text part and loads them into a vector database. +The specific loading behavior is defined by the destination connector itself, but chunking and embedding behavior is handled by the helpers. + +To use these helpers, install the CDK with the `vector-db-based` extra: + +```bash +pip install airbyte-cdk[vector-db-based] +``` + +The helpers can be used in the following way: + +- Add the config models to the spec of the connector +- Implement the `Indexer` interface for your specific database +- In the check implementation of the destination, initialize the indexer and the embedder and call `check` on them +- In the write implementation of the destination, initialize the indexer, the embedder and pass them to a new instance of the writer. Then call the writers `write` method with the iterable for incoming messages + +If there are no connector-specific embedders, the `airbyte_cdk.destinations.vector_db_based.embedder.create_from_config` function can be used to get an embedder instance from the config. 
+ +This is how the components interact: + +```text +┌─────────────┐ +│MyDestination│ +└┬────────────┘ +┌▽───────────────────────────────┐ +│Writer │ +└┬─────────┬──────────┬──────────┘ +┌▽───────┐┌▽────────┐┌▽────────────────┐ +│Embedder││MyIndexer││DocumentProcessor│ +└────────┘└─────────┘└─────────────────┘ +``` + +Normally, only the `MyDestination` class and the `MyIndexer` class has to be implemented specifically for the destination. The other classes are provided as is by the helpers. diff --git a/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/__init__.py b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/__init__.py new file mode 100644 index 000000000000..86ae207f69d8 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/__init__.py @@ -0,0 +1,38 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# + +from .config import ( + AzureOpenAIEmbeddingConfigModel, + CohereEmbeddingConfigModel, + FakeEmbeddingConfigModel, + FromFieldEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, + OpenAIEmbeddingConfigModel, + ProcessingConfigModel, +) +from .document_processor import Chunk, DocumentProcessor +from .embedder import CohereEmbedder, Embedder, FakeEmbedder, OpenAIEmbedder +from .indexer import Indexer +from .writer import Writer + +__all__ = [ + "AzureOpenAIEmbedder", + "AzureOpenAIEmbeddingConfigModel", + "Chunk", + "CohereEmbedder", + "CohereEmbeddingConfigModel", + "DocumentProcessor", + "Embedder", + "FakeEmbedder", + "FakeEmbeddingConfigModel", + "FromFieldEmbedder", + "FromFieldEmbeddingConfigModel", + "Indexer", + "OpenAICompatibleEmbedder", + "OpenAICompatibleEmbeddingConfigModel", + "OpenAIEmbedder", + "OpenAIEmbeddingConfigModel", + "ProcessingConfigModel", + "Writer", +] diff --git a/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/config.py b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/config.py new file mode 100644 index 000000000000..90de6b777e97 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/config.py @@ -0,0 +1,275 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Any, Dict, List, Literal, Optional, Union + +import dpath +from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig +from airbyte_cdk.utils.spec_schema_transformations import resolve_refs +from pydantic.v1 import BaseModel, Field + + +class SeparatorSplitterConfigModel(BaseModel): + mode: Literal["separator"] = Field("separator", const=True) + separators: List[str] = Field( + default=['"\\n\\n"', '"\\n"', '" "', '""'], + title="Separators", + description='List of separator strings to split text fields by. The separator itself needs to be wrapped in double quotes, e.g. to split by the dot character, use ".". To split by a newline, use "\\n".', + ) + keep_separator: bool = Field(default=False, title="Keep separator", description="Whether to keep the separator in the resulting chunks") + + class Config(OneOfOptionConfig): + title = "By Separator" + description = "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc." + discriminator = "mode" + + +class MarkdownHeaderSplitterConfigModel(BaseModel): + mode: Literal["markdown"] = Field("markdown", const=True) + split_level: int = Field( + default=1, + title="Split level", + description="Level of markdown headers to split text fields by. 
Headings down to the specified level will be used as split points", + le=6, + ge=1, + ) + + class Config(OneOfOptionConfig): + title = "By Markdown header" + description = "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk." + discriminator = "mode" + + +class CodeSplitterConfigModel(BaseModel): + mode: Literal["code"] = Field("code", const=True) + language: str = Field( + title="Language", + description="Split code in suitable places based on the programming language", + enum=[ + "cpp", + "go", + "java", + "js", + "php", + "proto", + "python", + "rst", + "ruby", + "rust", + "scala", + "swift", + "markdown", + "latex", + "html", + "sol", + ], + ) + + class Config(OneOfOptionConfig): + title = "By Programming Language" + description = ( + "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks." + ) + discriminator = "mode" + + +TextSplitterConfigModel = Union[SeparatorSplitterConfigModel, MarkdownHeaderSplitterConfigModel, CodeSplitterConfigModel] + + +class FieldNameMappingConfigModel(BaseModel): + from_field: str = Field(title="From field name", description="The field name in the source") + to_field: str = Field(title="To field name", description="The field name to use in the destination") + + +class ProcessingConfigModel(BaseModel): + chunk_size: int = Field( + ..., + title="Chunk size", + maximum=8191, + minimum=1, + description="Size of chunks in tokens to store in vector store (make sure it is not too big for the context if your LLM)", + ) + chunk_overlap: int = Field( + title="Chunk overlap", + description="Size of overlap between chunks in tokens to store in vector store to better capture relevant context", + default=0, + ) + text_fields: Optional[List[str]] = Field( + default=[], + title="Text fields to embed", + description="List of fields in the record that should be used to calculate the embedding. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered text fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. `user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. `users.*.name` will access all `names` fields in all entries of the `users` array.", + always_show=True, + examples=["text", "user.name", "users.*.name"], + ) + metadata_fields: Optional[List[str]] = Field( + default=[], + title="Fields to store as metadata", + description="List of fields in the record that should be stored as metadata. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered metadata fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. `user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. `users.*.name` will access all `names` fields in all entries of the `users` array. 
When specifying nested paths, all matching values are flattened into an array set to a field named by the path.", + always_show=True, + examples=["age", "user", "user.name"], + ) + text_splitter: TextSplitterConfigModel = Field( + default=None, + title="Text splitter", + discriminator="mode", + type="object", + description="Split text fields into chunks based on the specified method.", + ) + field_name_mappings: Optional[List[FieldNameMappingConfigModel]] = Field( + default=[], + title="Field name mappings", + description="List of fields to rename. Not applicable for nested fields, but can be used to rename fields already flattened via dot notation.", + ) + + class Config: + schema_extra = {"group": "processing"} + + +class OpenAIEmbeddingConfigModel(BaseModel): + mode: Literal["openai"] = Field("openai", const=True) + openai_key: str = Field(..., title="OpenAI API key", airbyte_secret=True) + + class Config(OneOfOptionConfig): + title = "OpenAI" + description = ( + "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions." + ) + discriminator = "mode" + + +class OpenAICompatibleEmbeddingConfigModel(BaseModel): + mode: Literal["openai_compatible"] = Field("openai_compatible", const=True) + api_key: str = Field(title="API key", default="", airbyte_secret=True) + base_url: str = Field( + ..., title="Base URL", description="The base URL for your OpenAI-compatible service", examples=["https://your-service-name.com"] + ) + model_name: str = Field( + title="Model name", + description="The name of the model to use for embedding", + default="text-embedding-ada-002", + examples=["text-embedding-ada-002"], + ) + dimensions: int = Field( + title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384] + ) + + class Config(OneOfOptionConfig): + title = "OpenAI-compatible" + description = "Use a service that's compatible with the OpenAI API to embed text." + discriminator = "mode" + + +class AzureOpenAIEmbeddingConfigModel(BaseModel): + mode: Literal["azure_openai"] = Field("azure_openai", const=True) + openai_key: str = Field( + ..., + title="Azure OpenAI API key", + airbyte_secret=True, + description="The API key for your Azure OpenAI resource. You can find this in the Azure portal under your Azure OpenAI resource", + ) + api_base: str = Field( + ..., + title="Resource base URL", + description="The base URL for your Azure OpenAI resource. You can find this in the Azure portal under your Azure OpenAI resource", + examples=["https://your-resource-name.openai.azure.com"], + ) + deployment: str = Field( + ..., + title="Deployment", + description="The deployment for your Azure OpenAI resource. You can find this in the Azure portal under your Azure OpenAI resource", + examples=["your-resource-name"], + ) + + class Config(OneOfOptionConfig): + title = "Azure OpenAI" + description = "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions." + discriminator = "mode" + + +class FakeEmbeddingConfigModel(BaseModel): + mode: Literal["fake"] = Field("fake", const=True) + + class Config(OneOfOptionConfig): + title = "Fake" + description = "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs." 
+ discriminator = "mode" + + +class FromFieldEmbeddingConfigModel(BaseModel): + mode: Literal["from_field"] = Field("from_field", const=True) + field_name: str = Field( + ..., title="Field name", description="Name of the field in the record that contains the embedding", examples=["embedding", "vector"] + ) + dimensions: int = Field( + ..., title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384] + ) + + class Config(OneOfOptionConfig): + title = "From Field" + description = "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store." + discriminator = "mode" + + +class CohereEmbeddingConfigModel(BaseModel): + mode: Literal["cohere"] = Field("cohere", const=True) + cohere_key: str = Field(..., title="Cohere API key", airbyte_secret=True) + + class Config(OneOfOptionConfig): + title = "Cohere" + description = "Use the Cohere API to embed text." + discriminator = "mode" + + +class VectorDBConfigModel(BaseModel): + """ + The configuration model for the Vector DB based destinations. This model is used to generate the UI for the destination configuration, + as well as to provide type safety for the configuration passed to the destination. + + The configuration model is composed of four parts: + * Processing configuration + * Embedding configuration + * Indexing configuration + * Advanced configuration + + Processing, embedding and advanced configuration are provided by this base class, while the indexing configuration is provided by the destination connector in the sub class. + """ + + embedding: Union[ + OpenAIEmbeddingConfigModel, + CohereEmbeddingConfigModel, + FakeEmbeddingConfigModel, + AzureOpenAIEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, + ] = Field(..., title="Embedding", description="Embedding configuration", discriminator="mode", group="embedding", type="object") + processing: ProcessingConfigModel + omit_raw_text: bool = Field( + default=False, + title="Do not store raw text", + group="advanced", + description="Do not store the text that gets embedded along with the vector and the metadata in the destination. 
If set to true, only the vector and the metadata will be stored - in this case raw text for LLM use cases needs to be retrieved from another source.", + ) + + class Config: + title = "Destination Config" + schema_extra = { + "groups": [ + {"id": "processing", "title": "Processing"}, + {"id": "embedding", "title": "Embedding"}, + {"id": "indexing", "title": "Indexing"}, + {"id": "advanced", "title": "Advanced"}, + ] + } + + @staticmethod + def remove_discriminator(schema: Dict[str, Any]) -> None: + """pydantic adds "discriminator" to the schema for oneOfs, which is not treated right by the platform as we inline all references""" + dpath.delete(schema, "properties/**/discriminator") + + @classmethod + def schema(cls, by_alias: bool = True, ref_template: str = "") -> Dict[str, Any]: + """we're overriding the schema classmethod to enable some post-processing""" + schema: Dict[str, Any] = super().schema() + schema = resolve_refs(schema) + cls.remove_discriminator(schema) + return schema diff --git a/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/document_processor.py b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/document_processor.py new file mode 100644 index 000000000000..45b6e4d7bc52 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/document_processor.py @@ -0,0 +1,184 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json +import logging +from dataclasses import dataclass +from typing import Any, Dict, List, Mapping, Optional, Tuple + +import dpath +from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel, SeparatorSplitterConfigModel, TextSplitterConfigModel +from airbyte_cdk.destinations.vector_db_based.utils import create_stream_identifier +from airbyte_cdk.models import AirbyteRecordMessage, ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, DestinationSyncMode +from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType +from langchain.text_splitter import Language, RecursiveCharacterTextSplitter +from langchain.utils import stringify_dict +from langchain_core.documents.base import Document + +METADATA_STREAM_FIELD = "_ab_stream" +METADATA_RECORD_ID_FIELD = "_ab_record_id" + +CDC_DELETED_FIELD = "_ab_cdc_deleted_at" + + +@dataclass +class Chunk: + page_content: Optional[str] + metadata: Dict[str, Any] + record: AirbyteRecordMessage + embedding: Optional[List[float]] = None + + +headers_to_split_on = ["(?:^|\n)# ", "(?:^|\n)## ", "(?:^|\n)### ", "(?:^|\n)#### ", "(?:^|\n)##### ", "(?:^|\n)###### "] + + +class DocumentProcessor: + """ + DocumentProcessor is a helper class that generates documents from Airbyte records. + + It is used to generate documents from records before writing them to the destination: + * The text fields are extracted from the record and concatenated to a single string. + * The metadata fields are extracted from the record and added to the document metadata. + * The document is split into chunks of a given size using a langchain text splitter. + + The Writer class uses the DocumentProcessor class to internally generate documents from records - in most cases you don't need to use it directly, + except if you want to implement a custom writer. + + The config parameters specified by the ProcessingConfigModel has to be made part of the connector spec to allow the user to configure the document processor. + Calling DocumentProcessor.check_config(config) will validate the config and return an error message if the config is invalid. 
+ """ + + streams: Mapping[str, ConfiguredAirbyteStream] + + @staticmethod + def check_config(config: ProcessingConfigModel) -> Optional[str]: + if config.text_splitter is not None and config.text_splitter.mode == "separator": + for s in config.text_splitter.separators: + try: + separator = json.loads(s) + if not isinstance(separator, str): + return f"Invalid separator: {s}. Separator needs to be a valid JSON string using double quotes." + except json.decoder.JSONDecodeError: + return f"Invalid separator: {s}. Separator needs to be a valid JSON string using double quotes." + return None + + def _get_text_splitter( + self, chunk_size: int, chunk_overlap: int, splitter_config: Optional[TextSplitterConfigModel] + ) -> RecursiveCharacterTextSplitter: + if splitter_config is None: + splitter_config = SeparatorSplitterConfigModel(mode="separator") + if splitter_config.mode == "separator": + return RecursiveCharacterTextSplitter.from_tiktoken_encoder( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + separators=[json.loads(s) for s in splitter_config.separators], + keep_separator=splitter_config.keep_separator, + disallowed_special=(), + ) + if splitter_config.mode == "markdown": + return RecursiveCharacterTextSplitter.from_tiktoken_encoder( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + separators=headers_to_split_on[: splitter_config.split_level], + is_separator_regex=True, + keep_separator=True, + disallowed_special=(), + ) + if splitter_config.mode == "code": + return RecursiveCharacterTextSplitter.from_tiktoken_encoder( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + separators=RecursiveCharacterTextSplitter.get_separators_for_language(Language(splitter_config.language)), + disallowed_special=(), + ) + + def __init__(self, config: ProcessingConfigModel, catalog: ConfiguredAirbyteCatalog): + self.streams = {create_stream_identifier(stream.stream): stream for stream in catalog.streams} + + self.splitter = self._get_text_splitter(config.chunk_size, config.chunk_overlap, config.text_splitter) + self.text_fields = config.text_fields + self.metadata_fields = config.metadata_fields + self.field_name_mappings = config.field_name_mappings + self.logger = logging.getLogger("airbyte.document_processor") + + def process(self, record: AirbyteRecordMessage) -> Tuple[List[Chunk], Optional[str]]: + """ + Generate documents from records. + :param records: List of AirbyteRecordMessages + :return: Tuple of (List of document chunks, record id to delete if a stream is in dedup mode to avoid stale documents in the vector store) + """ + if CDC_DELETED_FIELD in record.data and record.data[CDC_DELETED_FIELD]: + return [], self._extract_primary_key(record) + doc = self._generate_document(record) + if doc is None: + text_fields = ", ".join(self.text_fields) if self.text_fields else "all fields" + raise AirbyteTracedException( + internal_message="No text fields found in record", + message=f"Record {str(record.data)[:250]}... does not contain any of the configured text fields: {text_fields}. 
Please check your processing configuration, there has to be at least one text field set in each record.", + failure_type=FailureType.config_error, + ) + chunks = [ + Chunk(page_content=chunk_document.page_content, metadata=chunk_document.metadata, record=record) + for chunk_document in self._split_document(doc) + ] + id_to_delete = doc.metadata[METADATA_RECORD_ID_FIELD] if METADATA_RECORD_ID_FIELD in doc.metadata else None + return chunks, id_to_delete + + def _generate_document(self, record: AirbyteRecordMessage) -> Optional[Document]: + relevant_fields = self._extract_relevant_fields(record, self.text_fields) + if len(relevant_fields) == 0: + return None + text = stringify_dict(relevant_fields) + metadata = self._extract_metadata(record) + return Document(page_content=text, metadata=metadata) + + def _extract_relevant_fields(self, record: AirbyteRecordMessage, fields: Optional[List[str]]) -> Dict[str, Any]: + relevant_fields = {} + if fields and len(fields) > 0: + for field in fields: + values = dpath.values(record.data, field, separator=".") + if values and len(values) > 0: + relevant_fields[field] = values if len(values) > 1 else values[0] + else: + relevant_fields = record.data + return self._remap_field_names(relevant_fields) + + def _extract_metadata(self, record: AirbyteRecordMessage) -> Dict[str, Any]: + metadata = self._extract_relevant_fields(record, self.metadata_fields) + metadata[METADATA_STREAM_FIELD] = create_stream_identifier(record) + primary_key = self._extract_primary_key(record) + if primary_key: + metadata[METADATA_RECORD_ID_FIELD] = primary_key + return metadata + + def _extract_primary_key(self, record: AirbyteRecordMessage) -> Optional[str]: + stream_identifier = create_stream_identifier(record) + current_stream: ConfiguredAirbyteStream = self.streams[stream_identifier] + # if the sync mode is deduping, use the primary key to upsert existing records instead of appending new ones + if not current_stream.primary_key or current_stream.destination_sync_mode != DestinationSyncMode.append_dedup: + return None + + primary_key = [] + for key in current_stream.primary_key: + try: + primary_key.append(str(dpath.get(record.data, key))) + except KeyError: + primary_key.append("__not_found__") + stringified_primary_key = "_".join(primary_key) + return f"{stream_identifier}_{stringified_primary_key}" + + def _split_document(self, doc: Document) -> List[Document]: + chunks: List[Document] = self.splitter.split_documents([doc]) + return chunks + + def _remap_field_names(self, fields: Dict[str, Any]) -> Dict[str, Any]: + if not self.field_name_mappings: + return fields + + new_fields = fields.copy() + for mapping in self.field_name_mappings: + if mapping.from_field in new_fields: + new_fields[mapping.to_field] = new_fields.pop(mapping.from_field) + + return new_fields diff --git a/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/embedder.py b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/embedder.py new file mode 100644 index 000000000000..7fb880fadaae --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/embedder.py @@ -0,0 +1,261 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import os +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import List, Optional, Union, cast + +from airbyte_cdk.destinations.vector_db_based.config import ( + AzureOpenAIEmbeddingConfigModel, + CohereEmbeddingConfigModel, + FakeEmbeddingConfigModel, + FromFieldEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, + OpenAIEmbeddingConfigModel, + ProcessingConfigModel, +) +from airbyte_cdk.destinations.vector_db_based.utils import create_chunks, format_exception +from airbyte_cdk.models import AirbyteRecordMessage +from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType +from langchain.embeddings.cohere import CohereEmbeddings +from langchain.embeddings.fake import FakeEmbeddings +from langchain.embeddings.localai import LocalAIEmbeddings +from langchain.embeddings.openai import OpenAIEmbeddings + + +@dataclass +class Document: + page_content: str + record: AirbyteRecordMessage + + +class Embedder(ABC): + """ + Embedder is an abstract class that defines the interface for embedding text. + + The Indexer class uses the Embedder class to internally embed text - each indexer is responsible to pass the text of all documents to the embedder and store the resulting embeddings in the destination. + The destination connector is responsible to create an embedder instance and pass it to the writer. + The CDK defines basic embedders that should be supported in each destination. It is possible to implement custom embedders for special destinations if needed. + """ + + def __init__(self) -> None: + pass + + @abstractmethod + def check(self) -> Optional[str]: + pass + + @abstractmethod + def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]: + """ + Embed the text of each chunk and return the resulting embedding vectors. + If a chunk cannot be embedded or is configured to not be embedded, return None for that chunk. + """ + pass + + @property + @abstractmethod + def embedding_dimensions(self) -> int: + pass + + +OPEN_AI_VECTOR_SIZE = 1536 + +OPEN_AI_TOKEN_LIMIT = 150_000 # limit of tokens per minute + + +class BaseOpenAIEmbedder(Embedder): + def __init__(self, embeddings: OpenAIEmbeddings, chunk_size: int): + super().__init__() + self.embeddings = embeddings + self.chunk_size = chunk_size + + def check(self) -> Optional[str]: + try: + self.embeddings.embed_query("test") + except Exception as e: + return format_exception(e) + return None + + def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]: + """ + Embed the text of each chunk and return the resulting embedding vectors. + + As the OpenAI API will fail if more than the per-minute limit worth of tokens is sent at once, we split the request into batches and embed each batch separately. + It's still possible to run into the rate limit between each embed call because the available token budget hasn't recovered between the calls, + but the built-in retry mechanism of the OpenAI client handles that. 
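+
+        Illustrative example (the chunk_size value is hypothetical, not taken from a specific connector): with the
+        OPEN_AI_TOKEN_LIMIT of 150_000 tokens per minute defined above and a configured chunk_size of 1_000 tokens,
+        embedding_batch_size works out to 150_000 // 1_000 == 150, i.e. at most 150 chunks are handed to the OpenAI
+        client per call.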
+ """ + # Each chunk can hold at most self.chunk_size tokens, so tokens-per-minute by maximum tokens per chunk is the number of documents that can be embedded at once without exhausting the limit in a single request + embedding_batch_size = OPEN_AI_TOKEN_LIMIT // self.chunk_size + batches = create_chunks(documents, batch_size=embedding_batch_size) + embeddings: List[Optional[List[float]]] = [] + for batch in batches: + embeddings.extend(self.embeddings.embed_documents([chunk.page_content for chunk in batch])) + return embeddings + + @property + def embedding_dimensions(self) -> int: + # vector size produced by text-embedding-ada-002 model + return OPEN_AI_VECTOR_SIZE + + +class OpenAIEmbedder(BaseOpenAIEmbedder): + def __init__(self, config: OpenAIEmbeddingConfigModel, chunk_size: int): + super().__init__(OpenAIEmbeddings(openai_api_key=config.openai_key, max_retries=15, disallowed_special=()), chunk_size) # type: ignore + + +class AzureOpenAIEmbedder(BaseOpenAIEmbedder): + def __init__(self, config: AzureOpenAIEmbeddingConfigModel, chunk_size: int): + # Azure OpenAI API has — as of 20230927 — a limit of 16 documents per request + super().__init__(OpenAIEmbeddings(openai_api_key=config.openai_key, chunk_size=16, max_retries=15, openai_api_type="azure", openai_api_version="2023-05-15", openai_api_base=config.api_base, deployment=config.deployment, disallowed_special=()), chunk_size) # type: ignore + + +COHERE_VECTOR_SIZE = 1024 + + +class CohereEmbedder(Embedder): + def __init__(self, config: CohereEmbeddingConfigModel): + super().__init__() + # Client is set internally + self.embeddings = CohereEmbeddings(cohere_api_key=config.cohere_key, model="embed-english-light-v2.0") # type: ignore + + def check(self) -> Optional[str]: + try: + self.embeddings.embed_query("test") + except Exception as e: + return format_exception(e) + return None + + def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]: + return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents])) + + @property + def embedding_dimensions(self) -> int: + # vector size produced by text-embedding-ada-002 model + return COHERE_VECTOR_SIZE + + +class FakeEmbedder(Embedder): + def __init__(self, config: FakeEmbeddingConfigModel): + super().__init__() + self.embeddings = FakeEmbeddings(size=OPEN_AI_VECTOR_SIZE) + + def check(self) -> Optional[str]: + try: + self.embeddings.embed_query("test") + except Exception as e: + return format_exception(e) + return None + + def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]: + return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents])) + + @property + def embedding_dimensions(self) -> int: + # use same vector size as for OpenAI embeddings to keep it realistic + return OPEN_AI_VECTOR_SIZE + + +CLOUD_DEPLOYMENT_MODE = "cloud" + + +class OpenAICompatibleEmbedder(Embedder): + def __init__(self, config: OpenAICompatibleEmbeddingConfigModel): + super().__init__() + self.config = config + # Client is set internally + # Always set an API key even if there is none defined in the config because the validator will fail otherwise. Embedding APIs that don't require an API key don't fail if one is provided, so this is not breaking usage. 
+ self.embeddings = LocalAIEmbeddings(model=config.model_name, openai_api_key=config.api_key or "dummy-api-key", openai_api_base=config.base_url, max_retries=15, disallowed_special=()) # type: ignore + + def check(self) -> Optional[str]: + deployment_mode = os.environ.get("DEPLOYMENT_MODE", "") + if deployment_mode.casefold() == CLOUD_DEPLOYMENT_MODE and not self.config.base_url.startswith("https://"): + return "Base URL must start with https://" + + try: + self.embeddings.embed_query("test") + except Exception as e: + return format_exception(e) + return None + + def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]: + return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents])) + + @property + def embedding_dimensions(self) -> int: + # vector size produced by the model + return self.config.dimensions + + +class FromFieldEmbedder(Embedder): + def __init__(self, config: FromFieldEmbeddingConfigModel): + super().__init__() + self.config = config + + def check(self) -> Optional[str]: + return None + + def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]: + """ + From each chunk, pull the embedding from the field specified in the config. + Check that the field exists, is a list of numbers and is the correct size. If not, raise an AirbyteTracedException explaining the problem. + """ + embeddings: List[Optional[List[float]]] = [] + for document in documents: + data = document.record.data + if self.config.field_name not in data: + raise AirbyteTracedException( + internal_message="Embedding vector field not found", + failure_type=FailureType.config_error, + message=f"Record {str(data)[:250]}... in stream {document.record.stream} does not contain embedding vector field {self.config.field_name}. Please check your embedding configuration, the embedding vector field has to be set correctly on every record.", + ) + field = data[self.config.field_name] + if not isinstance(field, list) or not all(isinstance(x, (int, float)) for x in field): + raise AirbyteTracedException( + internal_message="Embedding vector field not a list of numbers", + failure_type=FailureType.config_error, + message=f"Record {str(data)[:250]}... in stream {document.record.stream} does contain embedding vector field {self.config.field_name}, but it is not a list of numbers. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.", + ) + if len(field) != self.config.dimensions: + raise AirbyteTracedException( + internal_message="Embedding vector field has wrong length", + failure_type=FailureType.config_error, + message=f"Record {str(data)[:250]}... in stream {document.record.stream} does contain embedding vector field {self.config.field_name}, but it has length {len(field)} instead of the configured {self.config.dimensions}. 
Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.", + ) + embeddings.append(field) + + return embeddings + + @property + def embedding_dimensions(self) -> int: + return self.config.dimensions + + +embedder_map = { + "openai": OpenAIEmbedder, + "cohere": CohereEmbedder, + "fake": FakeEmbedder, + "azure_openai": AzureOpenAIEmbedder, + "from_field": FromFieldEmbedder, + "openai_compatible": OpenAICompatibleEmbedder, +} + + +def create_from_config( + embedding_config: Union[ + AzureOpenAIEmbeddingConfigModel, + CohereEmbeddingConfigModel, + FakeEmbeddingConfigModel, + FromFieldEmbeddingConfigModel, + OpenAIEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, + ], + processing_config: ProcessingConfigModel, +) -> Embedder: + + if embedding_config.mode == "azure_openai" or embedding_config.mode == "openai": + return cast(Embedder, embedder_map[embedding_config.mode](embedding_config, processing_config.chunk_size)) + else: + return cast(Embedder, embedder_map[embedding_config.mode](embedding_config)) diff --git a/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/indexer.py b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/indexer.py new file mode 100644 index 000000000000..c49f576a6709 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/indexer.py @@ -0,0 +1,78 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import itertools +from abc import ABC, abstractmethod +from typing import Any, Generator, Iterable, List, Optional, Tuple, TypeVar + +from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk +from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog + + +class Indexer(ABC): + """ + Indexer is an abstract class that defines the interface for indexing documents. + + The Writer class uses the Indexer class to internally index documents generated by the document processor. + In a destination connector, implement a custom indexer by extending this class and implementing the abstract methods. + """ + + def __init__(self, config: Any): + self.config = config + pass + + def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None: + """ + Run before the sync starts. This method should be used to make sure all records in the destination that belong to streams with a destination mode of overwrite are deleted. + + Each record has a metadata field with the name airbyte_cdk.destinations.vector_db_based.document_processor.METADATA_STREAM_FIELD which can be used to filter documents for deletion. + Use the airbyte_cdk.destinations.vector_db_based.utils.create_stream_identifier method to create the stream identifier based on the stream definition to use for filtering. + """ + pass + + def post_sync(self) -> List[AirbyteMessage]: + """ + Run after the sync finishes. This method should be used to perform any cleanup operations and can return a list of AirbyteMessages to be logged. + """ + return [] + + @abstractmethod + def index(self, document_chunks: List[Chunk], namespace: str, stream: str) -> None: + """ + Index a list of document chunks. + + This method should be used to index the documents in the destination. If page_content is None, the document should be indexed without the raw text. + All chunks belong to the stream and namespace specified in the parameters. 
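+
+        A minimal implementation sketch (illustrative only; self.client and its upsert call are hypothetical and not part of the CDK):
+
+            def index(self, document_chunks, namespace, stream):
+                for chunk in document_chunks:
+                    self.client.upsert(vector=chunk.embedding, payload=chunk.metadata, text=chunk.page_content)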
+ """ + pass + + @abstractmethod + def delete(self, delete_ids: List[str], namespace: str, stream: str) -> None: + """ + Delete document chunks belonging to certain record ids. + + This method should be used to delete documents from the destination. + The delete_ids parameter contains a list of record ids - all chunks with a record id in this list should be deleted from the destination. + All ids belong to the stream and namespace specified in the parameters. + """ + pass + + @abstractmethod + def check(self) -> Optional[str]: + """ + Check if the indexer is configured correctly. This method should be used to check if the indexer is configured correctly and return an error message if it is not. + """ + pass + + +T = TypeVar("T") + + +def chunks(iterable: Iterable[T], batch_size: int) -> Generator[Tuple[T, ...], None, None]: + """A helper function to break an iterable into chunks of size batch_size.""" + it = iter(iterable) + chunk = tuple(itertools.islice(it, batch_size)) + while chunk: + yield chunk + chunk = tuple(itertools.islice(it, batch_size)) diff --git a/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/test_utils.py b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/test_utils.py new file mode 100644 index 000000000000..7f8cfe5fbd8a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/test_utils.py @@ -0,0 +1,53 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json +import unittest +from typing import Any, Dict + +from airbyte_cdk.models import ( + AirbyteMessage, + AirbyteRecordMessage, + AirbyteStateMessage, + AirbyteStream, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + DestinationSyncMode, + SyncMode, + Type, +) + + +class BaseIntegrationTest(unittest.TestCase): + """ + BaseIntegrationTest is a base class for integration tests for vector db destinations. + + It provides helper methods to create Airbyte catalogs, records and state messages. + """ + + def _get_configured_catalog(self, destination_mode: DestinationSyncMode) -> ConfiguredAirbyteCatalog: + stream_schema = {"type": "object", "properties": {"str_col": {"type": "str"}, "int_col": {"type": "integer"}}} + + overwrite_stream = ConfiguredAirbyteStream( + stream=AirbyteStream( + name="mystream", json_schema=stream_schema, supported_sync_modes=[SyncMode.incremental, SyncMode.full_refresh] + ), + primary_key=[["int_col"]], + sync_mode=SyncMode.incremental, + destination_sync_mode=destination_mode, + ) + + return ConfiguredAirbyteCatalog(streams=[overwrite_stream]) + + def _state(self, data: Dict[str, Any]) -> AirbyteMessage: + return AirbyteMessage(type=Type.STATE, state=AirbyteStateMessage(data=data)) + + def _record(self, stream: str, str_value: str, int_value: int) -> AirbyteMessage: + return AirbyteMessage( + type=Type.RECORD, record=AirbyteRecordMessage(stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0) + ) + + def setUp(self) -> None: + with open("secrets/config.json", "r") as f: + self.config = json.loads(f.read()) diff --git a/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/utils.py b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/utils.py new file mode 100644 index 000000000000..b0d4edebf890 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/utils.py @@ -0,0 +1,29 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+#
+
+import itertools
+import traceback
+from typing import Any, Iterable, Iterator, Tuple, Union
+
+from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStream
+
+
+def format_exception(exception: Exception) -> str:
+    return str(exception) + "\n" + "".join(traceback.TracebackException.from_exception(exception).format())
+
+
+def create_chunks(iterable: Iterable[Any], batch_size: int) -> Iterator[Tuple[Any, ...]]:
+    """A helper function to break an iterable into chunks of size batch_size."""
+    it = iter(iterable)
+    chunk = tuple(itertools.islice(it, batch_size))
+    while chunk:
+        yield chunk
+        chunk = tuple(itertools.islice(it, batch_size))
+
+
+def create_stream_identifier(stream: Union[AirbyteStream, AirbyteRecordMessage]) -> str:
+    if isinstance(stream, AirbyteStream):
+        return str(stream.name if stream.namespace is None else f"{stream.namespace}_{stream.name}")
+    else:
+        return str(stream.stream if stream.namespace is None else f"{stream.namespace}_{stream.stream}")
diff --git a/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/writer.py b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/writer.py
new file mode 100644
index 000000000000..0f764c366b54
--- /dev/null
+++ b/airbyte-cdk/python/airbyte_cdk/destinations/vector_db_based/writer.py
@@ -0,0 +1,85 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+
+from collections import defaultdict
+from typing import Dict, Iterable, List, Tuple
+
+from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
+from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk, DocumentProcessor
+from airbyte_cdk.destinations.vector_db_based.embedder import Document, Embedder
+from airbyte_cdk.destinations.vector_db_based.indexer import Indexer
+from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog, Type
+
+
+class Writer:
+    """
+    The Writer class orchestrates the document processor, the embedder and the indexer:
+    * Incoming records are passed through the document processor to generate chunks
+    * Once the configured batch size is reached, the chunks are passed to the embedder to generate embeddings
+    * The embedder embeds the chunks
+    * The indexer deletes old chunks by the associated record id before indexing the new ones
+
+    The destination connector is responsible for creating a writer instance and passing the input messages iterable to the write method.
+    The batch size can be configured by the destination connector to give the freedom of either letting the user configure it or hardcoding it to a sensible value depending on the destination.
+    The omit_raw_text parameter can be used to omit the raw text from the chunks. This can be useful if the raw text is very large and not needed for the destination.
+    """
+
+    def __init__(
+        self, processing_config: ProcessingConfigModel, indexer: Indexer, embedder: Embedder, batch_size: int, omit_raw_text: bool
+    ) -> None:
+        self.processing_config = processing_config
+        self.indexer = indexer
+        self.embedder = embedder
+        self.batch_size = batch_size
+        self.omit_raw_text = omit_raw_text
+        self._init_batch()
+
+    def _init_batch(self) -> None:
+        self.chunks: Dict[Tuple[str, str], List[Chunk]] = defaultdict(list)
+        self.ids_to_delete: Dict[Tuple[str, str], List[str]] = defaultdict(list)
+        self.number_of_chunks = 0
+
+    def _convert_to_document(self, chunk: Chunk) -> Document:
+        """
+        Convert a chunk to a document for the embedder.
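+
+        Note: in _process_batch the conversion (and embedding) happens before omit_raw_text strips page_content,
+        so chunks reaching this method are expected to still carry their raw text.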
+ """ + if chunk.page_content is None: + raise ValueError("Cannot embed a chunk without page content") + return Document(page_content=chunk.page_content, record=chunk.record) + + def _process_batch(self) -> None: + for (namespace, stream), ids in self.ids_to_delete.items(): + self.indexer.delete(ids, namespace, stream) + + for (namespace, stream), chunks in self.chunks.items(): + embeddings = self.embedder.embed_documents([self._convert_to_document(chunk) for chunk in chunks]) + for i, document in enumerate(chunks): + document.embedding = embeddings[i] + if self.omit_raw_text: + document.page_content = None + self.indexer.index(chunks, namespace, stream) + + self._init_batch() + + def write(self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]) -> Iterable[AirbyteMessage]: + self.processor = DocumentProcessor(self.processing_config, configured_catalog) + self.indexer.pre_sync(configured_catalog) + for message in input_messages: + if message.type == Type.STATE: + # Emitting a state message indicates that all records which came before it have been written to the destination. So we flush + # the queue to ensure writes happen, then output the state message to indicate it's safe to checkpoint state + self._process_batch() + yield message + elif message.type == Type.RECORD: + record_chunks, record_id_to_delete = self.processor.process(message.record) + self.chunks[(message.record.namespace, message.record.stream)].extend(record_chunks) + if record_id_to_delete is not None: + self.ids_to_delete[(message.record.namespace, message.record.stream)].append(record_id_to_delete) + self.number_of_chunks += len(record_chunks) + if self.number_of_chunks >= self.batch_size: + self._process_batch() + + self._process_batch() + yield from self.indexer.post_sync() diff --git a/airbyte-cdk/python/airbyte_cdk/entrypoint.py b/airbyte-cdk/python/airbyte_cdk/entrypoint.py new file mode 100644 index 000000000000..945d28405336 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/entrypoint.py @@ -0,0 +1,334 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import argparse +import importlib +import ipaddress +import logging +import os.path +import socket +import sys +import tempfile +from collections import defaultdict +from functools import wraps +from typing import Any, DefaultDict, Iterable, List, Mapping, Optional +from urllib.parse import urlparse + +import requests +from airbyte_cdk.connector import TConfig +from airbyte_cdk.exception_handler import init_uncaught_exception_handler +from airbyte_cdk.logger import init_logger +from airbyte_cdk.models import ( # type: ignore [attr-defined] + AirbyteConnectionStatus, + AirbyteMessage, + AirbyteMessageSerializer, + AirbyteStateStats, + ConnectorSpecification, + FailureType, + Status, + Type, +) +from airbyte_cdk.sources import Source +from airbyte_cdk.sources.connector_state_manager import HashableStreamDescriptor +from airbyte_cdk.sources.utils.schema_helpers import check_config_against_spec_or_exit, split_config + +# from airbyte_cdk.utils import PrintBuffer, is_cloud_environment, message_utils # add PrintBuffer back once fixed +from airbyte_cdk.utils import is_cloud_environment, message_utils +from airbyte_cdk.utils.airbyte_secrets_utils import get_secrets, update_secrets +from airbyte_cdk.utils.constants import ENV_REQUEST_CACHE_PATH +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from orjson import orjson +from requests import PreparedRequest, Response, Session + +logger = init_logger("airbyte") + +VALID_URL_SCHEMES = ["https"] +CLOUD_DEPLOYMENT_MODE = "cloud" + + +class AirbyteEntrypoint(object): + def __init__(self, source: Source): + init_uncaught_exception_handler(logger) + + # Deployment mode is read when instantiating the entrypoint because it is the common path shared by syncs and connector builder test requests + if is_cloud_environment(): + _init_internal_request_filter() + + self.source = source + self.logger = logging.getLogger(f"airbyte.{getattr(source, 'name', '')}") + + @staticmethod + def parse_args(args: List[str]) -> argparse.Namespace: + # set up parent parsers + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser.add_argument("--debug", action="store_true", help="enables detailed debug logs related to the sync") + main_parser = argparse.ArgumentParser() + subparsers = main_parser.add_subparsers(title="commands", dest="command") + + # spec + subparsers.add_parser("spec", help="outputs the json configuration specification", parents=[parent_parser]) + + # check + check_parser = subparsers.add_parser("check", help="checks the config can be used to connect", parents=[parent_parser]) + required_check_parser = check_parser.add_argument_group("required named arguments") + required_check_parser.add_argument("--config", type=str, required=True, help="path to the json configuration file") + + # discover + discover_parser = subparsers.add_parser( + "discover", help="outputs a catalog describing the source's schema", parents=[parent_parser] + ) + required_discover_parser = discover_parser.add_argument_group("required named arguments") + required_discover_parser.add_argument("--config", type=str, required=True, help="path to the json configuration file") + + # read + read_parser = subparsers.add_parser("read", help="reads the source and outputs messages to STDOUT", parents=[parent_parser]) + + read_parser.add_argument("--state", type=str, required=False, help="path to the json-encoded state file") + required_read_parser = read_parser.add_argument_group("required named arguments") + required_read_parser.add_argument("--config", 
type=str, required=True, help="path to the json configuration file") + required_read_parser.add_argument( + "--catalog", type=str, required=True, help="path to the catalog used to determine which data to read" + ) + + return main_parser.parse_args(args) + + def run(self, parsed_args: argparse.Namespace) -> Iterable[str]: + cmd = parsed_args.command + if not cmd: + raise Exception("No command passed") + + if hasattr(parsed_args, "debug") and parsed_args.debug: + self.logger.setLevel(logging.DEBUG) + logger.setLevel(logging.DEBUG) + self.logger.debug("Debug logs enabled") + else: + self.logger.setLevel(logging.INFO) + + source_spec: ConnectorSpecification = self.source.spec(self.logger) + try: + with tempfile.TemporaryDirectory() as temp_dir: + os.environ[ENV_REQUEST_CACHE_PATH] = temp_dir # set this as default directory for request_cache to store *.sqlite files + if cmd == "spec": + message = AirbyteMessage(type=Type.SPEC, spec=source_spec) + yield from [ + self.airbyte_message_to_string(queued_message) for queued_message in self._emit_queued_messages(self.source) + ] + yield self.airbyte_message_to_string(message) + else: + raw_config = self.source.read_config(parsed_args.config) + config = self.source.configure(raw_config, temp_dir) + + yield from [ + self.airbyte_message_to_string(queued_message) for queued_message in self._emit_queued_messages(self.source) + ] + if cmd == "check": + yield from map(AirbyteEntrypoint.airbyte_message_to_string, self.check(source_spec, config)) + elif cmd == "discover": + yield from map(AirbyteEntrypoint.airbyte_message_to_string, self.discover(source_spec, config)) + elif cmd == "read": + config_catalog = self.source.read_catalog(parsed_args.catalog) + state = self.source.read_state(parsed_args.state) + + yield from map(AirbyteEntrypoint.airbyte_message_to_string, self.read(source_spec, config, config_catalog, state)) + else: + raise Exception("Unexpected command " + cmd) + finally: + yield from [self.airbyte_message_to_string(queued_message) for queued_message in self._emit_queued_messages(self.source)] + + def check(self, source_spec: ConnectorSpecification, config: TConfig) -> Iterable[AirbyteMessage]: + self.set_up_secret_filter(config, source_spec.connectionSpecification) + try: + self.validate_connection(source_spec, config) + except AirbyteTracedException as traced_exc: + connection_status = traced_exc.as_connection_status_message() + # The platform uses the exit code to surface unexpected failures so we raise the exception if the failure type not a config error + # If the failure is not exceptional, we'll emit a failed connection status message and return + if traced_exc.failure_type != FailureType.config_error: + raise traced_exc + if connection_status: + yield from self._emit_queued_messages(self.source) + yield connection_status + return + + try: + check_result = self.source.check(self.logger, config) + except AirbyteTracedException as traced_exc: + yield traced_exc.as_airbyte_message() + # The platform uses the exit code to surface unexpected failures so we raise the exception if the failure type not a config error + # If the failure is not exceptional, we'll emit a failed connection status message and return + if traced_exc.failure_type != FailureType.config_error: + raise traced_exc + else: + yield AirbyteMessage( + type=Type.CONNECTION_STATUS, connectionStatus=AirbyteConnectionStatus(status=Status.FAILED, message=traced_exc.message) + ) + return + if check_result.status == Status.SUCCEEDED: + self.logger.info("Check succeeded") + else: + 
self.logger.error("Check failed") + + yield from self._emit_queued_messages(self.source) + yield AirbyteMessage(type=Type.CONNECTION_STATUS, connectionStatus=check_result) + + def discover(self, source_spec: ConnectorSpecification, config: TConfig) -> Iterable[AirbyteMessage]: + self.set_up_secret_filter(config, source_spec.connectionSpecification) + if self.source.check_config_against_spec: + self.validate_connection(source_spec, config) + catalog = self.source.discover(self.logger, config) + + yield from self._emit_queued_messages(self.source) + yield AirbyteMessage(type=Type.CATALOG, catalog=catalog) + + def read(self, source_spec: ConnectorSpecification, config: TConfig, catalog: Any, state: list[Any]) -> Iterable[AirbyteMessage]: + self.set_up_secret_filter(config, source_spec.connectionSpecification) + if self.source.check_config_against_spec: + self.validate_connection(source_spec, config) + + # The Airbyte protocol dictates that counts be expressed as float/double to better protect against integer overflows + stream_message_counter: DefaultDict[HashableStreamDescriptor, float] = defaultdict(float) + for message in self.source.read(self.logger, config, catalog, state): + yield self.handle_record_counts(message, stream_message_counter) + for message in self._emit_queued_messages(self.source): + yield self.handle_record_counts(message, stream_message_counter) + + @staticmethod + def handle_record_counts(message: AirbyteMessage, stream_message_count: DefaultDict[HashableStreamDescriptor, float]) -> AirbyteMessage: + match message.type: + case Type.RECORD: + stream_message_count[HashableStreamDescriptor(name=message.record.stream, namespace=message.record.namespace)] += 1.0 # type: ignore[union-attr] # record has `stream` and `namespace` + case Type.STATE: + stream_descriptor = message_utils.get_stream_descriptor(message) + + # Set record count from the counter onto the state message + message.state.sourceStats = message.state.sourceStats or AirbyteStateStats() # type: ignore[union-attr] # state has `sourceStats` + message.state.sourceStats.recordCount = stream_message_count.get(stream_descriptor, 0.0) # type: ignore[union-attr] # state has `sourceStats` + + # Reset the counter + stream_message_count[stream_descriptor] = 0.0 + return message + + @staticmethod + def validate_connection(source_spec: ConnectorSpecification, config: TConfig) -> None: + # Remove internal flags from config before validating so + # jsonschema's additionalProperties flag won't fail the validation + connector_config, _ = split_config(config) + check_config_against_spec_or_exit(connector_config, source_spec) + + @staticmethod + def set_up_secret_filter(config: TConfig, connection_specification: Mapping[str, Any]) -> None: + # Now that we have the config, we can use it to get a list of ai airbyte_secrets + # that we should filter in logging to avoid leaking secrets + config_secrets = get_secrets(connection_specification, config) + update_secrets(config_secrets) + + @staticmethod + def airbyte_message_to_string(airbyte_message: AirbyteMessage) -> str: + return orjson.dumps(AirbyteMessageSerializer.dump(airbyte_message)).decode() # type: ignore[no-any-return] # orjson.dumps(message).decode() always returns string + + @classmethod + def extract_state(cls, args: List[str]) -> Optional[Any]: + parsed_args = cls.parse_args(args) + if hasattr(parsed_args, "state"): + return parsed_args.state + return None + + @classmethod + def extract_catalog(cls, args: List[str]) -> Optional[Any]: + parsed_args = cls.parse_args(args) + 
if hasattr(parsed_args, "catalog"): + return parsed_args.catalog + return None + + @classmethod + def extract_config(cls, args: List[str]) -> Optional[Any]: + parsed_args = cls.parse_args(args) + if hasattr(parsed_args, "config"): + return parsed_args.config + return None + + def _emit_queued_messages(self, source: Source) -> Iterable[AirbyteMessage]: + if hasattr(source, "message_repository") and source.message_repository: + yield from source.message_repository.consume_queue() + return + + +def launch(source: Source, args: List[str]) -> None: + source_entrypoint = AirbyteEntrypoint(source) + parsed_args = source_entrypoint.parse_args(args) + # temporarily removes the PrintBuffer because we're seeing weird print behavior for concurrent syncs + # Refer to: https://github.com/airbytehq/oncall/issues/6235 + # with PrintBuffer(): + for message in source_entrypoint.run(parsed_args): + # simply printing is creating issues for concurrent CDK as Python uses different two instructions to print: one for the message and + # the other for the break line. Adding `\n` to the message ensure that both are printed at the same time + print(f"{message}\n", end="", flush=True) + + +def _init_internal_request_filter() -> None: + """ + Wraps the Python requests library to prevent sending requests to internal URL endpoints. + """ + wrapped_fn = Session.send + + @wraps(wrapped_fn) + def filtered_send(self: Any, request: PreparedRequest, **kwargs: Any) -> Response: + parsed_url = urlparse(request.url) + + if parsed_url.scheme not in VALID_URL_SCHEMES: + raise requests.exceptions.InvalidSchema( + "Invalid Protocol Scheme: The endpoint that data is being requested from is using an invalid or insecure " + + f"protocol {parsed_url.scheme!r}. Valid protocol schemes: {','.join(VALID_URL_SCHEMES)}" + ) + + if not parsed_url.hostname: + raise requests.exceptions.InvalidURL("Invalid URL specified: The endpoint that data is being requested from is not a valid URL") + + try: + is_private = _is_private_url(parsed_url.hostname, parsed_url.port) # type: ignore [arg-type] + if is_private: + raise AirbyteTracedException( + internal_message=f"Invalid URL endpoint: `{parsed_url.hostname!r}` belongs to a private network", + failure_type=FailureType.config_error, + message="Invalid URL endpoint: The endpoint that data is being requested from belongs to a private network. Source connectors only support requesting data from public API endpoints.", + ) + except socket.gaierror as exception: + # This is a special case where the developer specifies an IP address string that is not formatted correctly like trailing + # whitespace which will fail the socket IP lookup. This only happens when using IP addresses and not text hostnames. + # Knowing that this is a request using the requests library, we will mock the exception without calling the lib + raise requests.exceptions.InvalidURL(f"Invalid URL {parsed_url}: {exception}") + + return wrapped_fn(self, request, **kwargs) + + Session.send = filtered_send # type: ignore [method-assign] + + +def _is_private_url(hostname: str, port: int) -> bool: + """ + Helper method that checks if any of the IP addresses associated with a hostname belong to a private network. + """ + address_info_entries = socket.getaddrinfo(hostname, port) + for entry in address_info_entries: + # getaddrinfo() returns entries in the form of a 5-tuple where the IP is stored as the sockaddr. For IPv4 this + # is a 2-tuple and for IPv6 it is a 4-tuple, but the address is always the first value of the tuple at 0. 
+ # See https://docs.python.org/3/library/socket.html#socket.getaddrinfo for more details. + ip_address = entry[4][0] + if ipaddress.ip_address(ip_address).is_private: + return True + return False + + +def main() -> None: + impl_module = os.environ.get("AIRBYTE_IMPL_MODULE", Source.__module__) + impl_class = os.environ.get("AIRBYTE_IMPL_PATH", Source.__name__) + module = importlib.import_module(impl_module) + impl = getattr(module, impl_class) + + # set up and run entrypoint + source = impl() + + if not isinstance(source, Source): + raise Exception("Source implementation provided does not implement Source class!") + + launch(source, sys.argv[1:]) diff --git a/airbyte-cdk/python/airbyte_cdk/exception_handler.py b/airbyte-cdk/python/airbyte_cdk/exception_handler.py new file mode 100644 index 000000000000..77fa88989378 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/exception_handler.py @@ -0,0 +1,46 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +import sys +from types import TracebackType +from typing import Any, List, Mapping, Optional + +from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + + +def assemble_uncaught_exception(exception_type: type[BaseException], exception_value: BaseException) -> AirbyteTracedException: + if issubclass(exception_type, AirbyteTracedException): + return exception_value # type: ignore # validated as part of the previous line + return AirbyteTracedException.from_exception(exception_value) + + +def init_uncaught_exception_handler(logger: logging.Logger) -> None: + """ + Handles uncaught exceptions by emitting an AirbyteTraceMessage and making sure they are not + printed to the console without having secrets removed. + """ + + def hook_fn(exception_type: type[BaseException], exception_value: BaseException, traceback_: Optional[TracebackType]) -> Any: + # For developer ergonomics, we want to see the stack trace in the logs when we do a ctrl-c + if issubclass(exception_type, KeyboardInterrupt): + sys.__excepthook__(exception_type, exception_value, traceback_) + return + + logger.fatal(exception_value, exc_info=exception_value) + + # emit an AirbyteTraceMessage for any exception that gets to this spot + traced_exc = assemble_uncaught_exception(exception_type, exception_value) + + traced_exc.emit_message() + + sys.excepthook = hook_fn + + +def generate_failed_streams_error_message(stream_failures: Mapping[str, List[Exception]]) -> str: + failures = "\n".join( + [f"{stream}: {filter_secrets(exception.__repr__())}" for stream, exceptions in stream_failures.items() for exception in exceptions] + ) + return f"During the sync, the following streams did not sync successfully: {failures}" diff --git a/airbyte-cdk/python/airbyte_cdk/logger.py b/airbyte-cdk/python/airbyte_cdk/logger.py new file mode 100644 index 000000000000..59d4d7dd68d3 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/logger.py @@ -0,0 +1,97 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import json +import logging +import logging.config +from typing import Any, Callable, Mapping, Optional, Tuple + +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteMessageSerializer, Level, Type +from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets +from orjson import orjson + +LOGGING_CONFIG = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "airbyte": {"()": "airbyte_cdk.logger.AirbyteLogFormatter", "format": "%(message)s"}, + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "stream": "ext://sys.stdout", + "formatter": "airbyte", + }, + }, + "root": { + "handlers": ["console"], + }, +} + + +def init_logger(name: Optional[str] = None) -> logging.Logger: + """Initial set up of logger""" + logger = logging.getLogger(name) + logger.setLevel(logging.INFO) + logging.config.dictConfig(LOGGING_CONFIG) + return logger + + +def lazy_log(logger: logging.Logger, level: int, lazy_log_provider: Callable[[], str]) -> None: + """ + This method ensure that the processing of the log message is only done if the logger is enabled for the log level. + """ + if logger.isEnabledFor(level): + logger.log(level, lazy_log_provider()) + + +class AirbyteLogFormatter(logging.Formatter): + """Output log records using AirbyteMessage""" + + # Transforming Python log levels to Airbyte protocol log levels + level_mapping = { + logging.FATAL: Level.FATAL, + logging.ERROR: Level.ERROR, + logging.WARNING: Level.WARN, + logging.INFO: Level.INFO, + logging.DEBUG: Level.DEBUG, + } + + def format(self, record: logging.LogRecord) -> str: + """Return a JSON representation of the log message""" + airbyte_level = self.level_mapping.get(record.levelno, "INFO") + if airbyte_level == Level.DEBUG: + extras = self.extract_extra_args_from_record(record) + debug_dict = {"type": "DEBUG", "message": record.getMessage(), "data": extras} + return filter_secrets(json.dumps(debug_dict)) + else: + message = super().format(record) + message = filter_secrets(message) + log_message = AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=airbyte_level, message=message)) + return orjson.dumps(AirbyteMessageSerializer.dump(log_message)).decode() # type: ignore[no-any-return] # orjson.dumps(message).decode() always returns string + + @staticmethod + def extract_extra_args_from_record(record: logging.LogRecord) -> Mapping[str, Any]: + """ + The python logger conflates default args with extra args. We use an empty log record and set operations + to isolate fields passed to the log record via extra by the developer. 
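+
+        For example (illustrative), logger.debug("fetching page", extra={"page": 2}) yields {"page": "2"} here,
+        while built-in attributes such as levelname or msg are filtered out.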
+ """ + default_attrs = logging.LogRecord("", 0, "", 0, None, None, None).__dict__.keys() + extra_keys = set(record.__dict__.keys()) - default_attrs + return {k: str(getattr(record, k)) for k in extra_keys if hasattr(record, k)} + + +def log_by_prefix(msg: str, default_level: str) -> Tuple[int, str]: + """Custom method, which takes log level from first word of message""" + valid_log_types = ["FATAL", "ERROR", "WARN", "INFO", "DEBUG", "TRACE"] + split_line = msg.split() + first_word = next(iter(split_line), None) + if first_word in valid_log_types: + log_level = logging.getLevelName(first_word) + rendered_message = " ".join(split_line[1:]) + else: + log_level = logging.getLevelName(default_level) + rendered_message = msg + + return log_level, rendered_message diff --git a/airbyte-cdk/python/airbyte_cdk/models/__init__.py b/airbyte-cdk/python/airbyte_cdk/models/__init__.py new file mode 100644 index 000000000000..c56df9adc43a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/models/__init__.py @@ -0,0 +1,70 @@ +# The earlier versions of airbyte-cdk (0.28.0<=) had the airbyte_protocol python classes +# declared inline in the airbyte-cdk code. However, somewhere around Feb 2023 the +# Airbyte Protocol moved to its own repo/PyPi package, called airbyte-protocol-models. +# This directory including the airbyte_protocol.py and well_known_types.py files +# are just wrappers on top of that stand-alone package which do some namespacing magic +# to make the airbyte_protocol python classes available to the airbyte-cdk consumer as part +# of airbyte-cdk rather than a standalone package. +from .airbyte_protocol import ( + AdvancedAuth, + AirbyteStateStats, + AirbyteAnalyticsTraceMessage, + AirbyteCatalog, + AirbyteConnectionStatus, + AirbyteControlConnectorConfigMessage, + AirbyteControlMessage, + AirbyteErrorTraceMessage, + AirbyteEstimateTraceMessage, + AirbyteGlobalState, + AirbyteLogMessage, + AirbyteMessage, + AirbyteProtocol, + AirbyteRecordMessage, + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStateType, + AirbyteStream, + AirbyteStreamState, + AirbyteStreamStatus, + AirbyteStreamStatusTraceMessage, + AirbyteStreamStatusReason, + AirbyteStreamStatusReasonType, + AirbyteTraceMessage, + AuthFlowType, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + ConnectorSpecification, + DestinationSyncMode, + EstimateType, + FailureType, + Level, + OAuthConfigSpecification, + OrchestratorType, + Status, + StreamDescriptor, + SyncMode, + TraceType, + Type, +) +from .well_known_types import ( + BinaryData, + Boolean, + Date, + Integer, + Model, + Number, + String, + TimestampWithoutTimezone, + TimestampWithTimezone, + TimeWithoutTimezone, + TimeWithTimezone, +) + +from .airbyte_protocol_serializers import ( +AirbyteStreamStateSerializer, +AirbyteStateMessageSerializer, +AirbyteMessageSerializer, +ConfiguredAirbyteCatalogSerializer, +ConfiguredAirbyteStreamSerializer, +ConnectorSpecificationSerializer, +) \ No newline at end of file diff --git a/airbyte-cdk/python/airbyte_cdk/models/airbyte_protocol.py b/airbyte-cdk/python/airbyte_cdk/models/airbyte_protocol.py new file mode 100644 index 000000000000..6c0cdbb1bac5 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/models/airbyte_protocol.py @@ -0,0 +1,83 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from dataclasses import InitVar, dataclass +from typing import Annotated, Any, Dict, List, Mapping, Optional, Union + +from airbyte_cdk.models.file_transfer_record_message import AirbyteFileTransferRecordMessage +from airbyte_protocol_dataclasses.models import * +from serpyco_rs.metadata import Alias + + +@dataclass +class AirbyteStateBlob: + """ + A dataclass that dynamically sets attributes based on provided keyword arguments and positional arguments. + Used to "mimic" pydantic Basemodel with ConfigDict(extra='allow') option. + + The `AirbyteStateBlob` class allows for flexible instantiation by accepting any number of keyword arguments + and positional arguments. These are used to dynamically update the instance's attributes. This class is useful + in scenarios where the attributes of an object are not known until runtime and need to be set dynamically. + + Attributes: + kwargs (InitVar[Mapping[str, Any]]): A dictionary of keyword arguments used to set attributes dynamically. + + Methods: + __init__(*args: Any, **kwargs: Any) -> None: + Initializes the `AirbyteStateBlob` by setting attributes from the provided arguments. + + __eq__(other: object) -> bool: + Checks equality between two `AirbyteStateBlob` instances based on their internal dictionaries. + Returns `False` if the other object is not an instance of `AirbyteStateBlob`. + """ + + kwargs: InitVar[Mapping[str, Any]] + + def __init__(self, *args: Any, **kwargs: Any) -> None: + # Set any attribute passed in through kwargs + for arg in args: + self.__dict__.update(arg) + for key, value in kwargs.items(): + setattr(self, key, value) + + def __eq__(self, other: object) -> bool: + return False if not isinstance(other, AirbyteStateBlob) else bool(self.__dict__ == other.__dict__) + + +# The following dataclasses have been redeclared to include the new version of AirbyteStateBlob +@dataclass +class AirbyteStreamState: + stream_descriptor: StreamDescriptor # type: ignore [name-defined] + stream_state: Optional[AirbyteStateBlob] = None + + +@dataclass +class AirbyteGlobalState: + stream_states: List[AirbyteStreamState] + shared_state: Optional[AirbyteStateBlob] = None + + +@dataclass +class AirbyteStateMessage: + type: Optional[AirbyteStateType] = None # type: ignore [name-defined] + stream: Optional[AirbyteStreamState] = None + global_: Annotated[ + AirbyteGlobalState | None, Alias("global") + ] = None # "global" is a reserved keyword in python ⇒ Alias is used for (de-)serialization + data: Optional[Dict[str, Any]] = None + sourceStats: Optional[AirbyteStateStats] = None # type: ignore [name-defined] + destinationStats: Optional[AirbyteStateStats] = None # type: ignore [name-defined] + + +@dataclass +class AirbyteMessage: + type: Type # type: ignore [name-defined] + log: Optional[AirbyteLogMessage] = None # type: ignore [name-defined] + spec: Optional[ConnectorSpecification] = None # type: ignore [name-defined] + connectionStatus: Optional[AirbyteConnectionStatus] = None # type: ignore [name-defined] + catalog: Optional[AirbyteCatalog] = None # type: ignore [name-defined] + record: Optional[Union[AirbyteFileTransferRecordMessage, AirbyteRecordMessage]] = None # type: ignore [name-defined] + state: Optional[AirbyteStateMessage] = None + trace: Optional[AirbyteTraceMessage] = None # type: ignore [name-defined] + control: Optional[AirbyteControlMessage] = None # type: ignore [name-defined] diff --git a/airbyte-cdk/python/airbyte_cdk/models/airbyte_protocol_serializers.py 
b/airbyte-cdk/python/airbyte_cdk/models/airbyte_protocol_serializers.py new file mode 100644 index 000000000000..aeac43f794ce --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/models/airbyte_protocol_serializers.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +from typing import Any, Dict + +from serpyco_rs import CustomType, Serializer + +from .airbyte_protocol import ( # type: ignore[attr-defined] # all classes are imported to airbyte_protocol via * + AirbyteMessage, + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStreamState, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + ConnectorSpecification, +) + + +class AirbyteStateBlobType(CustomType[AirbyteStateBlob, Dict[str, Any]]): + def serialize(self, value: AirbyteStateBlob) -> Dict[str, Any]: + # cant use orjson.dumps() directly because private attributes are excluded, e.g. "__ab_full_refresh_sync_complete" + return {k: v for k, v in value.__dict__.items()} + + def deserialize(self, value: Dict[str, Any]) -> AirbyteStateBlob: + return AirbyteStateBlob(value) + + def get_json_schema(self) -> Dict[str, Any]: + return {"type": "object"} + + +def custom_type_resolver(t: type) -> CustomType[AirbyteStateBlob, Dict[str, Any]] | None: + return AirbyteStateBlobType() if t is AirbyteStateBlob else None + + +AirbyteStreamStateSerializer = Serializer(AirbyteStreamState, omit_none=True, custom_type_resolver=custom_type_resolver) +AirbyteStateMessageSerializer = Serializer(AirbyteStateMessage, omit_none=True, custom_type_resolver=custom_type_resolver) +AirbyteMessageSerializer = Serializer(AirbyteMessage, omit_none=True, custom_type_resolver=custom_type_resolver) +ConfiguredAirbyteCatalogSerializer = Serializer(ConfiguredAirbyteCatalog, omit_none=True) +ConfiguredAirbyteStreamSerializer = Serializer(ConfiguredAirbyteStream, omit_none=True) +ConnectorSpecificationSerializer = Serializer(ConnectorSpecification, omit_none=True) diff --git a/airbyte-cdk/python/airbyte_cdk/models/file_transfer_record_message.py b/airbyte-cdk/python/airbyte_cdk/models/file_transfer_record_message.py new file mode 100644 index 000000000000..dcc1b7a92cf1 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/models/file_transfer_record_message.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from dataclasses import dataclass +from typing import Any, Dict, Optional + + +@dataclass +class AirbyteFileTransferRecordMessage: + stream: str + file: Dict[str, Any] + emitted_at: int + namespace: Optional[str] = None + data: Optional[Dict[str, Any]] = None diff --git a/airbyte-cdk/python/airbyte_cdk/models/well_known_types.py b/airbyte-cdk/python/airbyte_cdk/models/well_known_types.py new file mode 100644 index 000000000000..a063ad7db03a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/models/well_known_types.py @@ -0,0 +1,5 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_protocol_dataclasses.models.well_known_types import * diff --git a/airbyte-cdk/python/airbyte_cdk/py.typed b/airbyte-cdk/python/airbyte_cdk/py.typed new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/airbyte_cdk/sources/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/__init__.py new file mode 100644 index 000000000000..a6560a50310e --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/__init__.py @@ -0,0 +1,26 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
+# + +import dpath.options + +from .abstract_source import AbstractSource +from .config import BaseConfig +from .source import Source + +# As part of the CDK sources, we do not control what the APIs return and it is possible that a key is empty. +# Reasons why we are doing this at the airbyte_cdk level: +# * As of today, all the use cases should allow for empty keys +# * Cases as of 2023-08-31: oauth/session token provider responses, extractor, transformation and substream) +# * The behavior is explicit at the package level and not hidden in every package that needs dpath.options.ALLOW_EMPTY_STRING_KEYS = True +# There is a downside in enforcing this option preemptively in the module __init__.py: the runtime code will import dpath even though the it +# might not need dpath leading to longer initialization time. +# There is a downside in using dpath as a library since the options are global: if we have two pieces of code that want different options, +# this will not be thread-safe. +dpath.options.ALLOW_EMPTY_STRING_KEYS = True + +__all__ = [ + "AbstractSource", + "BaseConfig", + "Source", +] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/abstract_source.py b/airbyte-cdk/python/airbyte_cdk/sources/abstract_source.py new file mode 100644 index 000000000000..3656a88c3157 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/abstract_source.py @@ -0,0 +1,280 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from abc import ABC, abstractmethod +from typing import Any, Dict, Iterable, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union + +from airbyte_cdk.exception_handler import generate_failed_streams_error_message +from airbyte_cdk.models import ( + AirbyteCatalog, + AirbyteConnectionStatus, + AirbyteMessage, + AirbyteStateMessage, + AirbyteStreamStatus, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + FailureType, + Status, + StreamDescriptor, +) +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository +from airbyte_cdk.sources.source import Source +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.core import StreamData +from airbyte_cdk.sources.streams.http.http import HttpStream +from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message +from airbyte_cdk.sources.utils.schema_helpers import InternalConfig, split_config +from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger +from airbyte_cdk.utils.event_timing import create_timer +from airbyte_cdk.utils.stream_status_utils import as_airbyte_message as stream_status_as_airbyte_message +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + +_default_message_repository = InMemoryMessageRepository() + + +class AbstractSource(Source, ABC): + """ + Abstract base class for an Airbyte Source. Consumers should implement any abstract methods + in this class to create an Airbyte Specification compliant Source. + """ + + @abstractmethod + def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]: + """ + :param logger: source logger + :param config: The user-provided configuration as specified by the source's spec. + This usually contains information required to check connection e.g. tokens, secrets and keys etc. + :return: A tuple of (boolean, error). 
If boolean is true, then the connection check is successful + and we can connect to the underlying data source using the provided configuration. + Otherwise, the input config cannot be used to connect to the underlying data source, + and the "error" object should describe what went wrong. + The error object will be cast to string to display the problem to the user. + """ + + @abstractmethod + def streams(self, config: Mapping[str, Any]) -> List[Stream]: + """ + :param config: The user-provided configuration as specified by the source's spec. + Any stream construction related operation should happen here. + :return: A list of the streams in this source connector. + """ + + # Stream name to instance map for applying output object transformation + _stream_to_instance_map: Dict[str, Stream] = {} + _slice_logger: SliceLogger = DebugSliceLogger() + + def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteCatalog: + """Implements the Discover operation from the Airbyte Specification. + See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#discover. + """ + streams = [stream.as_airbyte_stream() for stream in self.streams(config=config)] + return AirbyteCatalog(streams=streams) + + def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus: + """Implements the Check Connection operation from the Airbyte Specification. + See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#check. + """ + check_succeeded, error = self.check_connection(logger, config) + if not check_succeeded: + return AirbyteConnectionStatus(status=Status.FAILED, message=repr(error)) + return AirbyteConnectionStatus(status=Status.SUCCEEDED) + + def read( + self, + logger: logging.Logger, + config: Mapping[str, Any], + catalog: ConfiguredAirbyteCatalog, + state: Optional[List[AirbyteStateMessage]] = None, + ) -> Iterator[AirbyteMessage]: + """Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/.""" + logger.info(f"Starting syncing {self.name}") + config, internal_config = split_config(config) + # TODO assert all streams exist in the connector + # get the streams once in case the connector needs to make any queries to generate them + stream_instances = {s.name: s for s in self.streams(config)} + state_manager = ConnectorStateManager(state=state) + self._stream_to_instance_map = stream_instances + + stream_name_to_exception: MutableMapping[str, AirbyteTracedException] = {} + + with create_timer(self.name) as timer: + for configured_stream in catalog.streams: + stream_instance = stream_instances.get(configured_stream.stream.name) + is_stream_exist = bool(stream_instance) + try: + # Used direct reference to `stream_instance` instead of `is_stream_exist` to avoid mypy type checking errors + if not stream_instance: + if not self.raise_exception_on_missing_stream: + yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.INCOMPLETE) + continue + + error_message = ( + f"The stream '{configured_stream.stream.name}' in your connection configuration was not found in the source. " + f"Refresh the schema in your replication settings and remove this stream from future sync attempts." + ) + + # Use configured_stream as stream_instance to support references in error handling. + stream_instance = configured_stream.stream + + raise AirbyteTracedException( + message="A stream listed in your configuration was not found in the source. 
Please check the logs for more " + "details.", + internal_message=error_message, + failure_type=FailureType.config_error, + ) + + timer.start_event(f"Syncing stream {configured_stream.stream.name}") + logger.info(f"Marking stream {configured_stream.stream.name} as STARTED") + yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.STARTED) + yield from self._read_stream( + logger=logger, + stream_instance=stream_instance, + configured_stream=configured_stream, + state_manager=state_manager, + internal_config=internal_config, + ) + logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED") + yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.COMPLETE) + + except Exception as e: + yield from self._emit_queued_messages() + logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}") + logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED") + yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.INCOMPLETE) + + stream_descriptor = StreamDescriptor(name=configured_stream.stream.name) + + if isinstance(e, AirbyteTracedException): + traced_exception = e + info_message = f"Stopping sync on error from stream {configured_stream.stream.name} because {self.name} does not support continuing syncs on error." + else: + traced_exception = self._serialize_exception(stream_descriptor, e, stream_instance=stream_instance) + info_message = f"{self.name} does not support continuing syncs on error from stream {configured_stream.stream.name}" + + yield traced_exception.as_sanitized_airbyte_message(stream_descriptor=stream_descriptor) + stream_name_to_exception[stream_instance.name] = traced_exception # type: ignore # use configured_stream if stream_instance is None + if self.stop_sync_on_stream_failure: + logger.info(info_message) + break + finally: + # Finish read event only if the stream instance exists; + # otherwise, there's no need as it never started + if is_stream_exist: + timer.finish_event() + logger.info(f"Finished syncing {configured_stream.stream.name}") + logger.info(timer.report()) + + if len(stream_name_to_exception) > 0: + error_message = generate_failed_streams_error_message({key: [value] for key, value in stream_name_to_exception.items()}) # type: ignore # for some reason, mypy can't figure out the types for key and value + logger.info(error_message) + # We still raise at least one exception when a stream raises an exception because the platform currently relies + # on a non-zero exit code to determine if a sync attempt has failed. We also raise the exception as a config_error + # type because this combined error isn't actionable, but rather the previously emitted individual errors. 
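+            # At this point stream_name_to_exception maps each failed stream to its traced exception, e.g. (illustrative)
+            # {"orders": AirbyteTracedException(...)}; the message built above summarizes all of them in one raised error.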
+ raise AirbyteTracedException(message=error_message, failure_type=FailureType.config_error) + logger.info(f"Finished syncing {self.name}") + + @staticmethod + def _serialize_exception( + stream_descriptor: StreamDescriptor, e: Exception, stream_instance: Optional[Stream] = None + ) -> AirbyteTracedException: + display_message = stream_instance.get_error_display_message(e) if stream_instance else None + if display_message: + return AirbyteTracedException.from_exception(e, message=display_message, stream_descriptor=stream_descriptor) + return AirbyteTracedException.from_exception(e, stream_descriptor=stream_descriptor) + + @property + def raise_exception_on_missing_stream(self) -> bool: + return False + + def _read_stream( + self, + logger: logging.Logger, + stream_instance: Stream, + configured_stream: ConfiguredAirbyteStream, + state_manager: ConnectorStateManager, + internal_config: InternalConfig, + ) -> Iterator[AirbyteMessage]: + if internal_config.page_size and isinstance(stream_instance, HttpStream): + logger.info(f"Setting page size for {stream_instance.name} to {internal_config.page_size}") + stream_instance.page_size = internal_config.page_size + logger.debug( + f"Syncing configured stream: {configured_stream.stream.name}", + extra={ + "sync_mode": configured_stream.sync_mode, + "primary_key": configured_stream.primary_key, + "cursor_field": configured_stream.cursor_field, + }, + ) + stream_instance.log_stream_sync_configuration() + + stream_name = configured_stream.stream.name + stream_state = state_manager.get_stream_state(stream_name, stream_instance.namespace) + + # This is a hack. Existing full refresh streams that are converted into resumable full refresh need to discard + # the state because the terminal state for a full refresh sync is not compatible with substream resumable full + # refresh state. This is only required when running live traffic regression testing since the platform normally + # handles whether to pass state + if stream_state == {"__ab_no_cursor_state_message": True}: + stream_state = {} + + if "state" in dir(stream_instance): + stream_instance.state = stream_state # type: ignore # we check that state in the dir(stream_instance) + logger.info(f"Setting state of {self.name} stream to {stream_state}") + + record_iterator = stream_instance.read( + configured_stream, + logger, + self._slice_logger, + stream_state, + state_manager, + internal_config, + ) + + record_counter = 0 + logger.info(f"Syncing stream: {stream_name} ") + for record_data_or_message in record_iterator: + record = self._get_message(record_data_or_message, stream_instance) + if record.type == MessageType.RECORD: + record_counter += 1 + if record_counter == 1: + logger.info(f"Marking stream {stream_name} as RUNNING") + # If we just read the first record of the stream, emit the transition to the RUNNING state + yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.RUNNING) + yield from self._emit_queued_messages() + yield record + + logger.info(f"Read {record_counter} records from {stream_name} stream") + + def _emit_queued_messages(self) -> Iterable[AirbyteMessage]: + if self.message_repository: + yield from self.message_repository.consume_queue() + return + + def _get_message(self, record_data_or_message: Union[StreamData, AirbyteMessage], stream: Stream) -> AirbyteMessage: + """ + Converts the input to an AirbyteMessage if it is a StreamData. 
Returns the input as is if it is already an AirbyteMessage + """ + match record_data_or_message: + case AirbyteMessage(): + return record_data_or_message + case _: + return stream_data_to_airbyte_message(stream.name, record_data_or_message, stream.transformer, stream.get_json_schema()) + + @property + def message_repository(self) -> Union[None, MessageRepository]: + return _default_message_repository + + @property + def stop_sync_on_stream_failure(self) -> bool: + """ + WARNING: This function is in-development which means it is subject to change. Use at your own risk. + + By default, when a source encounters an exception while syncing a stream, it will emit an error trace message and then + continue syncing the next stream. This can be overwritten on a per-source basis so that the source will stop the sync + on the first error seen and emit a single error trace message for that stream. + """ + return False diff --git a/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/__init__.py new file mode 100644 index 000000000000..f3b7d7f33013 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""The concurrent source model replaces the legacy Source model. + +The concurrent source model is a new way to build sources in the Airbyte CDK. It is designed to +be more ergonomic and performant than the legacy Source model. + +To implement a source using the concurrent source model, check out the submodules in this package. +""" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py new file mode 100644 index 000000000000..0356f211d80f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -0,0 +1,221 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# +import logging +from typing import Dict, Iterable, List, Optional, Set + +from airbyte_cdk.exception_handler import generate_failed_streams_error_message +from airbyte_cdk.models import AirbyteMessage, AirbyteStreamStatus, FailureType, StreamDescriptor +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel +from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException +from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer +from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record +from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel +from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message +from airbyte_cdk.sources.utils.slice_logger import SliceLogger +from airbyte_cdk.utils import AirbyteTracedException +from airbyte_cdk.utils.stream_status_utils import as_airbyte_message as stream_status_as_airbyte_message + + +class ConcurrentReadProcessor: + def __init__( + self, + stream_instances_to_read_from: List[AbstractStream], + partition_enqueuer: PartitionEnqueuer, + thread_pool_manager: ThreadPoolManager, + logger: logging.Logger, + slice_logger: SliceLogger, + message_repository: MessageRepository, + partition_reader: PartitionReader, + ): + """ + This class is responsible for handling items from a concurrent stream read process. + :param stream_instances_to_read_from: List of streams to read from + :param partition_enqueuer: PartitionEnqueuer instance + :param thread_pool_manager: ThreadPoolManager instance + :param logger: Logger instance + :param slice_logger: SliceLogger instance + :param message_repository: MessageRepository instance + :param partition_reader: PartitionReader instance + """ + self._stream_name_to_instance = {s.name: s for s in stream_instances_to_read_from} + self._record_counter = {} + self._streams_to_running_partitions: Dict[str, Set[Partition]] = {} + for stream in stream_instances_to_read_from: + self._streams_to_running_partitions[stream.name] = set() + self._record_counter[stream.name] = 0 + self._thread_pool_manager = thread_pool_manager + self._partition_enqueuer = partition_enqueuer + self._stream_instances_to_start_partition_generation = stream_instances_to_read_from + self._streams_currently_generating_partitions: List[str] = [] + self._logger = logger + self._slice_logger = slice_logger + self._message_repository = message_repository + self._partition_reader = partition_reader + self._streams_done: Set[str] = set() + self._exceptions_per_stream_name: dict[str, List[Exception]] = {} + + def on_partition_generation_completed(self, sentinel: PartitionGenerationCompletedSentinel) -> Iterable[AirbyteMessage]: + """ + This method is called when a partition generation is completed. + 1. Remove the stream from the list of streams currently generating partitions + 2. If the stream is done, mark it as such and return a stream status message + 3. 
If there are more streams to read from, start the next partition generator + """ + stream_name = sentinel.stream.name + self._streams_currently_generating_partitions.remove(sentinel.stream.name) + # It is possible for the stream to already be done if no partitions were generated + # If the partition generation process was completed and there are no partitions left to process, the stream is done + if self._is_stream_done(stream_name) or len(self._streams_to_running_partitions[stream_name]) == 0: + yield from self._on_stream_is_done(stream_name) + if self._stream_instances_to_start_partition_generation: + yield self.start_next_partition_generator() # type:ignore # None may be yielded + + def on_partition(self, partition: Partition) -> None: + """ + This method is called when a partition is generated. + 1. Add the partition to the set of partitions for the stream + 2. Log the slice if necessary + 3. Submit the partition to the thread pool manager + """ + stream_name = partition.stream_name() + self._streams_to_running_partitions[stream_name].add(partition) + if self._slice_logger.should_log_slice_message(self._logger): + self._message_repository.emit_message(self._slice_logger.create_slice_log_message(partition.to_slice())) + self._thread_pool_manager.submit(self._partition_reader.process_partition, partition) + + def on_partition_complete_sentinel(self, sentinel: PartitionCompleteSentinel) -> Iterable[AirbyteMessage]: + """ + This method is called when a partition is completed. + 1. Close the partition + 2. If the stream is done, mark it as such and return a stream status message + 3. Emit messages that were added to the message repository + """ + partition = sentinel.partition + + try: + if sentinel.is_successful: + partition.close() + except Exception as exception: + self._flag_exception(partition.stream_name(), exception) + yield AirbyteTracedException.from_exception( + exception, stream_descriptor=StreamDescriptor(name=partition.stream_name()) + ).as_sanitized_airbyte_message() + finally: + partitions_running = self._streams_to_running_partitions[partition.stream_name()] + if partition in partitions_running: + partitions_running.remove(partition) + # If all partitions were generated and this was the last one, the stream is done + if partition.stream_name() not in self._streams_currently_generating_partitions and len(partitions_running) == 0: + yield from self._on_stream_is_done(partition.stream_name()) + yield from self._message_repository.consume_queue() + + def on_record(self, record: Record) -> Iterable[AirbyteMessage]: + """ + This method is called when a record is read from a partition. + 1. Convert the record to an AirbyteMessage + 2. If this is the first record for the stream, mark the stream as RUNNING + 3. Increment the record counter for the stream + 4. Ensures the cursor knows the record has been successfully emitted + 5. Emit the message + 6. Emit messages that were added to the message repository + """ + # Do not pass a transformer or a schema + # AbstractStreams are expected to return data as they are expected. 
+ # Any transformation on the data should be done before reaching this point + message = stream_data_to_airbyte_message( + stream_name=record.partition.stream_name(), + data_or_message=record.data, + is_file_transfer_message=record.is_file_transfer_message, + ) + stream = self._stream_name_to_instance[record.partition.stream_name()] + + if message.type == MessageType.RECORD: + if self._record_counter[stream.name] == 0: + self._logger.info(f"Marking stream {stream.name} as RUNNING") + yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING) + self._record_counter[stream.name] += 1 + stream.cursor.observe(record) + yield message + yield from self._message_repository.consume_queue() + + def on_exception(self, exception: StreamThreadException) -> Iterable[AirbyteMessage]: + """ + This method is called when an exception is raised. + 1. Stop all running streams + 2. Raise the exception + """ + self._flag_exception(exception.stream_name, exception.exception) + self._logger.exception(f"Exception while syncing stream {exception.stream_name}", exc_info=exception.exception) + + stream_descriptor = StreamDescriptor(name=exception.stream_name) + if isinstance(exception.exception, AirbyteTracedException): + yield exception.exception.as_airbyte_message(stream_descriptor=stream_descriptor) + else: + yield AirbyteTracedException.from_exception(exception, stream_descriptor=stream_descriptor).as_airbyte_message() + + def _flag_exception(self, stream_name: str, exception: Exception) -> None: + self._exceptions_per_stream_name.setdefault(stream_name, []).append(exception) + + def start_next_partition_generator(self) -> Optional[AirbyteMessage]: + """ + Start the next partition generator. + 1. Pop the next stream to read from + 2. Submit the partition generator to the thread pool manager + 3. Add the stream to the list of streams currently generating partitions + 4. Return a stream status message + """ + if self._stream_instances_to_start_partition_generation: + stream = self._stream_instances_to_start_partition_generation.pop(0) + self._thread_pool_manager.submit(self._partition_enqueuer.generate_partitions, stream) + self._streams_currently_generating_partitions.append(stream.name) + self._logger.info(f"Marking stream {stream.name} as STARTED") + self._logger.info(f"Syncing stream: {stream.name} ") + return stream_status_as_airbyte_message( + stream.as_airbyte_stream(), + AirbyteStreamStatus.STARTED, + ) + else: + return None + + def is_done(self) -> bool: + """ + This method is called to check if the sync is done. + The sync is done when: + 1. There are no more streams generating partitions + 2. There are no more streams to read from + 3. All partitions for all streams are closed + """ + is_done = all([self._is_stream_done(stream_name) for stream_name in self._stream_name_to_instance.keys()]) + if is_done and self._exceptions_per_stream_name: + error_message = generate_failed_streams_error_message(self._exceptions_per_stream_name) + self._logger.info(error_message) + # We still raise at least one exception when a stream raises an exception because the platform currently relies + # on a non-zero exit code to determine if a sync attempt has failed. We also raise the exception as a config_error + # type because this combined error isn't actionable, but rather the previously emitted individual errors. 
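+            # Note: is_done() is polled by ConcurrentSource._consume_from_queue, so this combined error only surfaces
+            # once every stream has reached a terminal status.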
+            raise AirbyteTracedException(
+                message=error_message, internal_message="Concurrent read failure", failure_type=FailureType.config_error
+            )
+        return is_done
+
+    def _is_stream_done(self, stream_name: str) -> bool:
+        return stream_name in self._streams_done
+
+    def _on_stream_is_done(self, stream_name: str) -> Iterable[AirbyteMessage]:
+        self._logger.info(f"Read {self._record_counter[stream_name]} records from {stream_name} stream")
+        self._logger.info(f"Marking stream {stream_name} as STOPPED")
+        stream = self._stream_name_to_instance[stream_name]
+        stream.cursor.ensure_at_least_one_state_emitted()
+        yield from self._message_repository.consume_queue()
+        self._logger.info(f"Finished syncing {stream.name}")
+        self._streams_done.add(stream_name)
+        stream_status = (
+            AirbyteStreamStatus.INCOMPLETE if self._exceptions_per_stream_name.get(stream_name, []) else AirbyteStreamStatus.COMPLETE
+        )
+        yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), stream_status)
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/concurrent_source.py b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/concurrent_source.py
new file mode 100644
index 000000000000..8e49f66a19c8
--- /dev/null
+++ b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/concurrent_source.py
@@ -0,0 +1,147 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+import concurrent
+import logging
+from queue import Queue
+from typing import Iterable, Iterator, List
+
+from airbyte_cdk.models import AirbyteMessage
+from airbyte_cdk.sources.concurrent_source.concurrent_read_processor import ConcurrentReadProcessor
+from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel
+from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException
+from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
+from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
+from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
+from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer
+from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
+from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
+from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
+from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel, QueueItem
+from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger
+
+
+class ConcurrentSource:
+    """
+    A Source that reads data from multiple AbstractStreams concurrently.
+    It does so by submitting partition generation and partition read tasks to a thread pool.
+    The tasks asynchronously add their output to a shared queue.
+    The read is done when all partitions for all streams were generated and read.
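+
+    A minimal usage sketch (illustrative; assumes `streams` is a list of concrete AbstractStream implementations):
+
+        source = ConcurrentSource.create(
+            num_workers=4,
+            initial_number_of_partitions_to_generate=2,
+            logger=logging.getLogger("airbyte"),
+            slice_logger=DebugSliceLogger(),
+            message_repository=InMemoryMessageRepository(),
+        )
+        for message in source.read(streams):
+            print(message)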
+ """ + + DEFAULT_TIMEOUT_SECONDS = 900 + + @staticmethod + def create( + num_workers: int, + initial_number_of_partitions_to_generate: int, + logger: logging.Logger, + slice_logger: SliceLogger, + message_repository: MessageRepository, + timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, + ) -> "ConcurrentSource": + is_single_threaded = initial_number_of_partitions_to_generate == 1 and num_workers == 1 + too_many_generator = not is_single_threaded and initial_number_of_partitions_to_generate >= num_workers + assert not too_many_generator, "It is required to have more workers than threads generating partitions" + threadpool = ThreadPoolManager( + concurrent.futures.ThreadPoolExecutor(max_workers=num_workers, thread_name_prefix="workerpool"), + logger, + ) + return ConcurrentSource( + threadpool, logger, slice_logger, message_repository, initial_number_of_partitions_to_generate, timeout_seconds + ) + + def __init__( + self, + threadpool: ThreadPoolManager, + logger: logging.Logger, + slice_logger: SliceLogger = DebugSliceLogger(), + message_repository: MessageRepository = InMemoryMessageRepository(), + initial_number_partitions_to_generate: int = 1, + timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, + ) -> None: + """ + :param threadpool: The threadpool to submit tasks to + :param logger: The logger to log to + :param slice_logger: The slice logger used to create messages on new slices + :param message_repository: The repository to emit messages to + :param initial_number_partitions_to_generate: The initial number of concurrent partition generation tasks. Limiting this number ensures will limit the latency of the first records emitted. While the latency is not critical, emitting the records early allows the platform and the destination to process them as early as possible. + :param timeout_seconds: The maximum number of seconds to wait for a record to be read from the queue. If no record is read within this time, the source will stop reading and return. + """ + self._threadpool = threadpool + self._logger = logger + self._slice_logger = slice_logger + self._message_repository = message_repository + self._initial_number_partitions_to_generate = initial_number_partitions_to_generate + self._timeout_seconds = timeout_seconds + + def read( + self, + streams: List[AbstractStream], + ) -> Iterator[AirbyteMessage]: + self._logger.info("Starting syncing") + + # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less + # threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating + # partitions which would fill the queue. 
This number is arbitrarily set to 10_000 but will probably need to be changed given more + # information and might even need to be configurable depending on the source + queue: Queue[QueueItem] = Queue(maxsize=10_000) + concurrent_stream_processor = ConcurrentReadProcessor( + streams, + PartitionEnqueuer(queue, self._threadpool), + self._threadpool, + self._logger, + self._slice_logger, + self._message_repository, + PartitionReader(queue), + ) + + # Enqueue initial partition generation tasks + yield from self._submit_initial_partition_generators(concurrent_stream_processor) + + # Read from the queue until all partitions were generated and read + yield from self._consume_from_queue( + queue, + concurrent_stream_processor, + ) + self._threadpool.check_for_errors_and_shutdown() + self._logger.info("Finished syncing") + + def _submit_initial_partition_generators(self, concurrent_stream_processor: ConcurrentReadProcessor) -> Iterable[AirbyteMessage]: + for _ in range(self._initial_number_partitions_to_generate): + status_message = concurrent_stream_processor.start_next_partition_generator() + if status_message: + yield status_message + + def _consume_from_queue( + self, + queue: Queue[QueueItem], + concurrent_stream_processor: ConcurrentReadProcessor, + ) -> Iterable[AirbyteMessage]: + while airbyte_message_or_record_or_exception := queue.get(): + yield from self._handle_item( + airbyte_message_or_record_or_exception, + concurrent_stream_processor, + ) + if concurrent_stream_processor.is_done() and queue.empty(): + # all partitions were generated and processed. we're done here + break + + def _handle_item( + self, + queue_item: QueueItem, + concurrent_stream_processor: ConcurrentReadProcessor, + ) -> Iterable[AirbyteMessage]: + # handle queue item and call the appropriate handler depending on the type of the queue item + if isinstance(queue_item, StreamThreadException): + yield from concurrent_stream_processor.on_exception(queue_item) + elif isinstance(queue_item, PartitionGenerationCompletedSentinel): + yield from concurrent_stream_processor.on_partition_generation_completed(queue_item) + elif isinstance(queue_item, Partition): + concurrent_stream_processor.on_partition(queue_item) + elif isinstance(queue_item, PartitionCompleteSentinel): + yield from concurrent_stream_processor.on_partition_complete_sentinel(queue_item) + elif isinstance(queue_item, Record): + yield from concurrent_stream_processor.on_record(queue_item) + else: + raise ValueError(f"Unknown queue item type: {type(queue_item)}") diff --git a/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py new file mode 100644 index 000000000000..bbffe8f88f71 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py @@ -0,0 +1,126 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import logging +from abc import ABC +from datetime import timedelta +from typing import Any, Callable, Iterator, List, Mapping, MutableMapping, Optional, Tuple + +from airbyte_cdk.models import AirbyteMessage, AirbyteStateMessage, ConfiguredAirbyteCatalog +from airbyte_cdk.sources import AbstractSource +from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade +from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade +from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, Cursor, CursorField, CursorValueType, FinalStateCursor, GapType +from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import AbstractStreamStateConverter + +DEFAULT_LOOKBACK_SECONDS = 0 + + +class ConcurrentSourceAdapter(AbstractSource, ABC): + def __init__(self, concurrent_source: ConcurrentSource, **kwargs: Any) -> None: + """ + ConcurrentSourceAdapter is a Source that wraps a concurrent source and exposes it as a regular source. + + The source's streams are still defined through the streams() method. + Streams wrapped in a StreamFacade will be processed concurrently. + Other streams will be processed sequentially as a later step. + """ + self._concurrent_source = concurrent_source + super().__init__(**kwargs) + + def read( + self, + logger: logging.Logger, + config: Mapping[str, Any], + catalog: ConfiguredAirbyteCatalog, + state: Optional[List[AirbyteStateMessage]] = None, + ) -> Iterator[AirbyteMessage]: + abstract_streams = self._select_abstract_streams(config, catalog) + concurrent_stream_names = {stream.name for stream in abstract_streams} + configured_catalog_for_regular_streams = ConfiguredAirbyteCatalog( + streams=[stream for stream in catalog.streams if stream.stream.name not in concurrent_stream_names] + ) + if abstract_streams: + yield from self._concurrent_source.read(abstract_streams) + if configured_catalog_for_regular_streams.streams: + yield from super().read(logger, config, configured_catalog_for_regular_streams, state) + + def _select_abstract_streams(self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog) -> List[AbstractStream]: + """ + Selects streams that can be processed concurrently and returns their abstract representations. + """ + all_streams = self.streams(config) + stream_name_to_instance: Mapping[str, Stream] = {s.name: s for s in all_streams} + abstract_streams: List[AbstractStream] = [] + for configured_stream in configured_catalog.streams: + stream_instance = stream_name_to_instance.get(configured_stream.stream.name) + if not stream_instance: + continue + + if isinstance(stream_instance, AbstractStreamFacade): + abstract_streams.append(stream_instance.get_underlying_stream()) + return abstract_streams + + def convert_to_concurrent_stream( + self, logger: logging.Logger, stream: Stream, state_manager: ConnectorStateManager, cursor: Optional[Cursor] = None + ) -> Stream: + """ + Prepares a stream for concurrent processing by initializing or assigning a cursor, + managing the stream's state, and returning an updated Stream instance. 
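+
+        Illustrative sketch (assumes `stream`, `state_manager`, and an optional `cursor` built via `initialize_cursor`):
+
+            concurrent_stream = self.convert_to_concurrent_stream(logger, stream, state_manager, cursor)
+
+        Streams wrapped this way are picked up by _select_abstract_streams and read concurrently, while plain
+        Stream instances are still read sequentially by AbstractSource.read.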
+ """ + state: MutableMapping[str, Any] = {} + + if cursor: + state = state_manager.get_stream_state(stream.name, stream.namespace) + + stream.cursor = cursor # type: ignore[assignment] # cursor is of type ConcurrentCursor, which inherits from Cursor + if hasattr(stream, "parent"): + stream.parent.cursor = cursor + else: + cursor = FinalStateCursor( + stream_name=stream.name, + stream_namespace=stream.namespace, + message_repository=self.message_repository, # type: ignore[arg-type] # _default_message_repository will be returned in the worst case + ) + return StreamFacade.create_from_stream(stream, self, logger, state, cursor) + + def initialize_cursor( + self, + stream: Stream, + state_manager: ConnectorStateManager, + converter: AbstractStreamStateConverter, + slice_boundary_fields: Optional[Tuple[str, str]], + start: Optional[CursorValueType], + end_provider: Callable[[], CursorValueType], + lookback_window: Optional[GapType] = None, + slice_range: Optional[GapType] = None, + ) -> Optional[ConcurrentCursor]: + lookback_window = lookback_window or timedelta(seconds=DEFAULT_LOOKBACK_SECONDS) + + cursor_field_name = stream.cursor_field + + if cursor_field_name: + if not isinstance(cursor_field_name, str): + raise ValueError(f"Cursor field type must be a string, but received {type(cursor_field_name).__name__}.") + + return ConcurrentCursor( + stream.name, + stream.namespace, + state_manager.get_stream_state(stream.name, stream.namespace), + self.message_repository, # type: ignore[arg-type] # _default_message_repository will be returned in the worst case + state_manager, + converter, + CursorField(cursor_field_name), + slice_boundary_fields, + start, + end_provider, + lookback_window, + slice_range, + ) + + return None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py new file mode 100644 index 000000000000..b6643042b24c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py @@ -0,0 +1,24 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +from typing import Any + +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream + + +class PartitionGenerationCompletedSentinel: + """ + A sentinel object indicating all partitions for a stream were produced. + Includes a pointer to the stream that was processed. + """ + + def __init__(self, stream: AbstractStream): + """ + :param stream: The stream that was processed + """ + self.stream = stream + + def __eq__(self, other: Any) -> bool: + if isinstance(other, PartitionGenerationCompletedSentinel): + return self.stream == other.stream + return False diff --git a/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/stream_thread_exception.py b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/stream_thread_exception.py new file mode 100644 index 000000000000..c865bef59732 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/stream_thread_exception.py @@ -0,0 +1,25 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ +from typing import Any + + +class StreamThreadException(Exception): + def __init__(self, exception: Exception, stream_name: str): + self._exception = exception + self._stream_name = stream_name + + @property + def stream_name(self) -> str: + return self._stream_name + + @property + def exception(self) -> Exception: + return self._exception + + def __str__(self) -> str: + return f"Exception while syncing stream {self._stream_name}: {self._exception}" + + def __eq__(self, other: Any) -> bool: + if isinstance(other, StreamThreadException): + return self._exception == other._exception and self._stream_name == other._stream_name + return False diff --git a/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/thread_pool_manager.py b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/thread_pool_manager.py new file mode 100644 index 000000000000..b6933e6bc3d2 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/thread_pool_manager.py @@ -0,0 +1,109 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import logging +import threading +from concurrent.futures import Future, ThreadPoolExecutor +from typing import Any, Callable, List, Optional + + +class ThreadPoolManager: + """ + Wrapper to abstract away the threadpool and the logic to wait for pending tasks to be completed. + """ + + DEFAULT_MAX_QUEUE_SIZE = 10_000 + + def __init__( + self, + threadpool: ThreadPoolExecutor, + logger: logging.Logger, + max_concurrent_tasks: int = DEFAULT_MAX_QUEUE_SIZE, + ): + """ + :param threadpool: The threadpool to use + :param logger: The logger to use + :param max_concurrent_tasks: The maximum number of tasks that can be pending at the same time + """ + self._threadpool = threadpool + self._logger = logger + self._max_concurrent_tasks = max_concurrent_tasks + self._futures: List[Future[Any]] = [] + self._lock = threading.Lock() + self._most_recently_seen_exception: Optional[Exception] = None + + self._logging_threshold = max_concurrent_tasks * 2 + + def prune_to_validate_has_reached_futures_limit(self) -> bool: + self._prune_futures(self._futures) + if len(self._futures) > self._logging_threshold: + self._logger.warning(f"ThreadPoolManager: The list of futures is getting bigger than expected ({len(self._futures)})") + return len(self._futures) >= self._max_concurrent_tasks + + def submit(self, function: Callable[..., Any], *args: Any) -> None: + self._futures.append(self._threadpool.submit(function, *args)) + + def _prune_futures(self, futures: List[Future[Any]]) -> None: + """ + Take a list in input and remove the futures that are completed. If a future has an exception, it'll raise and kill the stream + operation. + + We are using a lock here as without it, the algorithm would not be thread safe + """ + with self._lock: + if len(futures) < self._max_concurrent_tasks: + return + + for index in reversed(range(len(futures))): + future = futures[index] + + if future.done(): + # Only call future.exception() if the future is known to be done because it will block until the future is done. + # See https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.Future.exception + optional_exception = future.exception() + if optional_exception: + # Exception handling should be done in the main thread. Hence, we only store the exception and expect the main + # thread to call raise_if_exception + # We do not expect this error to happen. The futures created during concurrent syncs should catch the exception and + # push it to the queue. 
If this exception occurs, please review the futures and how they handle exceptions. + self._most_recently_seen_exception = RuntimeError( + f"Failed processing a future: {optional_exception}. Please contact the Airbyte team." + ) + futures.pop(index) + + def _shutdown(self) -> None: + # Without a way to stop the threads that have already started, this will not stop the Python application. We are fine today with + # this imperfect approach because we only do this in case of `self._most_recently_seen_exception` which we don't expect to happen + self._threadpool.shutdown(wait=False, cancel_futures=True) + + def is_done(self) -> bool: + return all([f.done() for f in self._futures]) + + def check_for_errors_and_shutdown(self) -> None: + """ + Check if any of the futures have an exception, and raise it if so. If all futures are done, shutdown the threadpool. + If the futures are not done, raise an exception. + :return: + """ + if self._most_recently_seen_exception: + self._logger.exception( + "An unknown exception has occurred while reading concurrently", + exc_info=self._most_recently_seen_exception, + ) + self._stop_and_raise_exception(self._most_recently_seen_exception) + + exceptions_from_futures = [f for f in [future.exception() for future in self._futures] if f is not None] + if exceptions_from_futures: + exception = RuntimeError(f"Failed reading with errors: {exceptions_from_futures}") + self._stop_and_raise_exception(exception) + else: + futures_not_done = [f for f in self._futures if not f.done()] + if futures_not_done: + exception = RuntimeError(f"Failed reading with futures not done: {futures_not_done}") + self._stop_and_raise_exception(exception) + else: + self._shutdown() + + def _stop_and_raise_exception(self, exception: BaseException) -> None: + self._shutdown() + raise exception diff --git a/airbyte-cdk/python/airbyte_cdk/sources/config.py b/airbyte-cdk/python/airbyte_cdk/sources/config.py new file mode 100644 index 000000000000..8ea2b6400db4 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/config.py @@ -0,0 +1,26 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Any, Dict + +from airbyte_cdk.sources.utils.schema_helpers import expand_refs, rename_key +from pydantic.v1 import BaseModel + + +class BaseConfig(BaseModel): + """Base class for connector spec, adds the following behaviour: + + - resolve $ref and replace it with definition + - replace all occurrences of anyOf with oneOf + - drop description + """ + + @classmethod + def schema(cls, *args: Any, **kwargs: Any) -> Dict[str, Any]: + """We're overriding the schema classmethod to enable some post-processing""" + schema = super().schema(*args, **kwargs) + rename_key(schema, old_key="anyOf", new_key="oneOf") # UI supports only oneOf + expand_refs(schema) + schema.pop("description", None) # description added from the docstring + return schema # type: ignore[no-any-return] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/connector_state_manager.py b/airbyte-cdk/python/airbyte_cdk/sources/connector_state_manager.py new file mode 100644 index 000000000000..547f4bb23dca --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/connector_state_manager.py @@ -0,0 +1,134 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import copy +from dataclasses import dataclass +from typing import Any, List, Mapping, MutableMapping, Optional, Tuple, Union + +from airbyte_cdk.models import AirbyteMessage, AirbyteStateBlob, AirbyteStateMessage, AirbyteStateType, AirbyteStreamState, StreamDescriptor +from airbyte_cdk.models import Type as MessageType + + +@dataclass(frozen=True) +class HashableStreamDescriptor: + """ + Helper class that overrides the existing StreamDescriptor class that is auto generated from the Airbyte Protocol and + freezes its fields so that it be used as a hash key. This is only marked public because we use it outside for unit tests. + """ + + name: str + namespace: Optional[str] = None + + +class ConnectorStateManager: + """ + ConnectorStateManager consolidates the various forms of a stream's incoming state message (STREAM / GLOBAL) under a common + interface. It also provides methods to extract and update state + """ + + def __init__(self, state: Optional[List[AirbyteStateMessage]] = None): + shared_state, per_stream_states = self._extract_from_state_message(state) + + # We explicitly throw an error if we receive a GLOBAL state message that contains a shared_state because API sources are + # designed to checkpoint state independently of one another. API sources should never be emitting a state message where + # shared_state is populated. Rather than define how to handle shared_state without a clear use case, we're opting to throw an + # error instead and if/when we find one, we will then implement processing of the shared_state value. + if shared_state: + raise ValueError( + "Received a GLOBAL AirbyteStateMessage that contains a shared_state. This library only ever generates per-STREAM " + "STATE messages so this was not generated by this connector. This must be an orchestrator or platform error. GLOBAL " + "state messages with shared_state will not be processed correctly. " + ) + self.per_stream_states = per_stream_states + + def get_stream_state(self, stream_name: str, namespace: Optional[str]) -> MutableMapping[str, Any]: + """ + Retrieves the state of a given stream based on its descriptor (name + namespace). 
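+        Example (illustrative): if the incoming state contained a STREAM message for ("orders", None) with the blob
+        {"created_at": "2024-01-01"}, get_stream_state("orders", None) returns a deep copy of that mapping; streams
+        without any incoming state get an empty dict.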
+ :param stream_name: Name of the stream being fetched + :param namespace: Namespace of the stream being fetched + :return: The per-stream state for a stream + """ + stream_state: AirbyteStateBlob | None = self.per_stream_states.get(HashableStreamDescriptor(name=stream_name, namespace=namespace)) + if stream_state: + return copy.deepcopy({k: v for k, v in stream_state.__dict__.items()}) + return {} + + def update_state_for_stream(self, stream_name: str, namespace: Optional[str], value: Mapping[str, Any]) -> None: + """ + Overwrites the state blob of a specific stream based on the provided stream name and optional namespace + :param stream_name: The name of the stream whose state is being updated + :param namespace: The namespace of the stream if it exists + :param value: A stream state mapping that is being updated for a stream + """ + stream_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace) + self.per_stream_states[stream_descriptor] = AirbyteStateBlob(value) + + def create_state_message(self, stream_name: str, namespace: Optional[str]) -> AirbyteMessage: + """ + Generates an AirbyteMessage using the current per-stream state of a specified stream + :param stream_name: The name of the stream for the message that is being created + :param namespace: The namespace of the stream for the message that is being created + :return: The Airbyte state message to be emitted by the connector during a sync + """ + hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace) + stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob() + + return AirbyteMessage( + type=MessageType.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name=stream_name, namespace=namespace), stream_state=stream_state + ), + ), + ) + + @classmethod + def _extract_from_state_message( + cls, + state: Optional[List[AirbyteStateMessage]], + ) -> Tuple[Optional[AirbyteStateBlob], MutableMapping[HashableStreamDescriptor, Optional[AirbyteStateBlob]]]: + """ + Takes an incoming list of state messages or a global state message and extracts state attributes according to + type which can then be assigned to the new state manager being instantiated + :param state: The incoming state input + :return: A tuple of shared state and per stream state assembled from the incoming state list + """ + if state is None: + return None, {} + + is_global = cls._is_global_state(state) + + if is_global: + global_state = state[0].global_ # type: ignore # We verified state is a list in _is_global_state + shared_state = copy.deepcopy(global_state.shared_state, {}) # type: ignore[union-attr] # global_state has shared_state + streams = { + HashableStreamDescriptor( + name=per_stream_state.stream_descriptor.name, namespace=per_stream_state.stream_descriptor.namespace + ): per_stream_state.stream_state + for per_stream_state in global_state.stream_states # type: ignore[union-attr] # global_state has shared_state + } + return shared_state, streams + else: + streams = { + HashableStreamDescriptor( + name=per_stream_state.stream.stream_descriptor.name, namespace=per_stream_state.stream.stream_descriptor.namespace # type: ignore[union-attr] # stream has stream_descriptor + ): per_stream_state.stream.stream_state # type: ignore[union-attr] # stream has stream_state + for per_stream_state in state + if per_stream_state.type == AirbyteStateType.STREAM and hasattr(per_stream_state, "stream") # type: ignore # 
state is always a list of AirbyteStateMessage if is_per_stream is True + } + return None, streams + + @staticmethod + def _is_global_state(state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]]) -> bool: + return ( + isinstance(state, List) + and len(state) == 1 + and isinstance(state[0], AirbyteStateMessage) + and state[0].type == AirbyteStateType.GLOBAL + ) + + @staticmethod + def _is_per_stream_state(state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]]) -> bool: + return isinstance(state, List) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/__init__.py new file mode 100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/job.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/job.py new file mode 100644 index 000000000000..5b4f1c7ab0ce --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/job.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + + +from datetime import timedelta +from typing import Optional + +from airbyte_cdk.sources.declarative.async_job.timer import Timer +from airbyte_cdk.sources.types import StreamSlice + +from .status import AsyncJobStatus + + +class AsyncJob: + """ + Description of an API job. + + Note that the timer will only stop once `update_status` is called so the job might be completed on the API side but until we query for + it and call `ApiJob.update_status`, `ApiJob.status` will not reflect the actual API side status. + """ + + def __init__(self, api_job_id: str, job_parameters: StreamSlice, timeout: Optional[timedelta] = None) -> None: + self._api_job_id = api_job_id + self._job_parameters = job_parameters + self._status = AsyncJobStatus.RUNNING + + timeout = timeout if timeout else timedelta(minutes=60) + self._timer = Timer(timeout) + self._timer.start() + + def api_job_id(self) -> str: + return self._api_job_id + + def status(self) -> AsyncJobStatus: + if self._timer.has_timed_out(): + return AsyncJobStatus.TIMED_OUT + return self._status + + def job_parameters(self) -> StreamSlice: + return self._job_parameters + + def update_status(self, status: AsyncJobStatus) -> None: + if self._status != AsyncJobStatus.RUNNING and status == AsyncJobStatus.RUNNING: + self._timer.start() + elif status.is_terminal(): + self._timer.stop() + + self._status = status + + def __repr__(self) -> str: + return f"AsyncJob(api_job_id={self.api_job_id()}, job_parameters={self.job_parameters()}, status={self.status()})" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/job_orchestrator.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/job_orchestrator.py new file mode 100644 index 000000000000..ddd7b8b3a7c9 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/job_orchestrator.py @@ -0,0 +1,443 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ +import logging +import threading +import time +import traceback +import uuid +from datetime import timedelta +from typing import Any, Generator, Generic, Iterable, List, Mapping, Optional, Set, Tuple, Type, TypeVar + +from airbyte_cdk import StreamSlice +from airbyte_cdk.logger import lazy_log +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.declarative.async_job.job import AsyncJob +from airbyte_cdk.sources.declarative.async_job.job_tracker import ConcurrentJobLimitReached, JobTracker +from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository +from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus +from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + +LOGGER = logging.getLogger("airbyte") +_NO_TIMEOUT = timedelta.max +_API_SIDE_RUNNING_STATUS = {AsyncJobStatus.RUNNING, AsyncJobStatus.TIMED_OUT} + + +class AsyncPartition: + """ + This bucket of api_jobs is a bit useless for this iteration but should become interesting when we will be able to split jobs + """ + + _MAX_NUMBER_OF_ATTEMPTS = 3 + + def __init__(self, jobs: List[AsyncJob], stream_slice: StreamSlice) -> None: + self._attempts_per_job = {job: 1 for job in jobs} + self._stream_slice = stream_slice + + def has_reached_max_attempt(self) -> bool: + return any(map(lambda attempt_count: attempt_count >= self._MAX_NUMBER_OF_ATTEMPTS, self._attempts_per_job.values())) + + def replace_job(self, job_to_replace: AsyncJob, new_jobs: List[AsyncJob]) -> None: + current_attempt_count = self._attempts_per_job.pop(job_to_replace, None) + if current_attempt_count is None: + raise ValueError("Could not find job to replace") + elif current_attempt_count >= self._MAX_NUMBER_OF_ATTEMPTS: + raise ValueError(f"Max attempt reached for job in partition {self._stream_slice}") + + new_attempt_count = current_attempt_count + 1 + for job in new_jobs: + self._attempts_per_job[job] = new_attempt_count + + def should_split(self, job: AsyncJob) -> bool: + """ + Not used right now but once we support job split, we should split based on the number of attempts + """ + return False + + @property + def jobs(self) -> Iterable[AsyncJob]: + return self._attempts_per_job.keys() + + @property + def stream_slice(self) -> StreamSlice: + return self._stream_slice + + @property + def status(self) -> AsyncJobStatus: + """ + Given different job statuses, the priority is: FAILED, TIMED_OUT, RUNNING. Else, it means everything is completed. 
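+        For example (illustrative): {COMPLETED, FAILED} resolves to FAILED, {COMPLETED, TIMED_OUT} to TIMED_OUT,
+        {COMPLETED, RUNNING} to RUNNING, and only {COMPLETED} resolves to COMPLETED.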
+ """ + statuses = set(map(lambda job: job.status(), self.jobs)) + if statuses == {AsyncJobStatus.COMPLETED}: + return AsyncJobStatus.COMPLETED + elif AsyncJobStatus.FAILED in statuses: + return AsyncJobStatus.FAILED + elif AsyncJobStatus.TIMED_OUT in statuses: + return AsyncJobStatus.TIMED_OUT + else: + return AsyncJobStatus.RUNNING + + def __repr__(self) -> str: + return f"AsyncPartition(stream_slice={self._stream_slice}, attempt_per_job={self._attempts_per_job})" + + def __json_serializable__(self) -> Any: + return self._stream_slice + + +T = TypeVar("T") + + +class LookaheadIterator(Generic[T]): + def __init__(self, iterable: Iterable[T]) -> None: + self._iterator = iter(iterable) + self._buffer: List[T] = [] + + def __iter__(self) -> "LookaheadIterator[T]": + return self + + def __next__(self) -> T: + if self._buffer: + return self._buffer.pop() + else: + return next(self._iterator) + + def has_next(self) -> bool: + if self._buffer: + return True + + try: + self._buffer = [next(self._iterator)] + except StopIteration: + return False + else: + return True + + def add_at_the_beginning(self, item: T) -> None: + self._buffer = [item] + self._buffer + + +class AsyncJobOrchestrator: + _WAIT_TIME_BETWEEN_STATUS_UPDATE_IN_SECONDS = 5 + _KNOWN_JOB_STATUSES = {AsyncJobStatus.COMPLETED, AsyncJobStatus.FAILED, AsyncJobStatus.RUNNING, AsyncJobStatus.TIMED_OUT} + _RUNNING_ON_API_SIDE_STATUS = {AsyncJobStatus.RUNNING, AsyncJobStatus.TIMED_OUT} + + def __init__( + self, + job_repository: AsyncJobRepository, + slices: Iterable[StreamSlice], + job_tracker: JobTracker, + message_repository: MessageRepository, + exceptions_to_break_on: Iterable[Type[Exception]] = tuple(), + has_bulk_parent: bool = False, + ) -> None: + """ + If the stream slices provided as a parameters relies on a async job streams that relies on the same JobTracker, `has_bulk_parent` + needs to be set to True as jobs creation needs to be prioritized on the parent level. Doing otherwise could lead to a situation + where the child has taken up all the job budget without room to the parent to create more which would lead to an infinite loop of + "trying to start a parent job" and "ConcurrentJobLimitReached". + """ + if {*AsyncJobStatus} != self._KNOWN_JOB_STATUSES: + # this is to prevent developers updating the possible statuses without updating the logic of this class + raise ValueError( + "An AsyncJobStatus has been either removed or added which means the logic of this class needs to be reviewed. Once the logic has been updated, please update _KNOWN_JOB_STATUSES" + ) + + self._job_repository: AsyncJobRepository = job_repository + self._slice_iterator = LookaheadIterator(slices) + self._running_partitions: List[AsyncPartition] = [] + self._job_tracker = job_tracker + self._message_repository = message_repository + self._exceptions_to_break_on: Tuple[Type[Exception], ...] = tuple(exceptions_to_break_on) + self._has_bulk_parent = has_bulk_parent + + self._non_breaking_exceptions: List[Exception] = [] + + def _replace_failed_jobs(self, partition: AsyncPartition) -> None: + failed_status_jobs = (AsyncJobStatus.FAILED, AsyncJobStatus.TIMED_OUT) + jobs_to_replace = [job for job in partition.jobs if job.status() in failed_status_jobs] + for job in jobs_to_replace: + new_job = self._start_job(job.job_parameters(), job.api_job_id()) + partition.replace_job(job, [new_job]) + + def _start_jobs(self) -> None: + """ + Retry failed jobs and start jobs for each slice in the slice iterator. 
+        This method iterates over the running partitions and the slice iterator and starts a job for each slice.
+        The started jobs are added to the running partitions.
+
+        Returns:
+            None
+
+        Note: in this first iteration (driven by the Sendgrid use case), each partition holds a single job.
+        """
+        at_least_one_slice_consumed_from_slice_iterator_during_current_iteration = False
+        _slice = None
+        try:
+            for partition in self._running_partitions:
+                self._replace_failed_jobs(partition)
+
+            if self._has_bulk_parent and self._running_partitions and self._slice_iterator.has_next():
+                LOGGER.debug(
+                    "This AsyncJobOrchestrator is operating as a child of a bulk stream hence we limit the number of concurrent jobs on the child until there are no more parent slices to avoid the child taking all the API job budget"
+                )
+                return
+
+            for _slice in self._slice_iterator:
+                at_least_one_slice_consumed_from_slice_iterator_during_current_iteration = True
+                job = self._start_job(_slice)
+                self._running_partitions.append(AsyncPartition([job], _slice))
+                if self._has_bulk_parent and self._slice_iterator.has_next():
+                    break
+        except ConcurrentJobLimitReached:
+            if at_least_one_slice_consumed_from_slice_iterator_during_current_iteration:
+                # this means a slice has been consumed but the job couldn't be created, therefore we need to put it back at the beginning of the _slice_iterator
+                self._slice_iterator.add_at_the_beginning(_slice)  # type: ignore  # we know it's not None here because `ConcurrentJobLimitReached` happens during the for loop
+            LOGGER.debug("Waiting before creating more jobs as the limit of concurrent jobs has been reached. Will try again later...")
+
+    def _start_job(self, _slice: StreamSlice, previous_job_id: Optional[str] = None) -> AsyncJob:
+        if previous_job_id:
+            id_to_replace = previous_job_id
+            lazy_log(LOGGER, logging.DEBUG, lambda: f"Attempting to replace job {id_to_replace}...")
+        else:
+            id_to_replace = self._job_tracker.try_to_get_intent()
+
+        try:
+            job = self._job_repository.start(_slice)
+            self._job_tracker.add_job(id_to_replace, job.api_job_id())
+            return job
+        except Exception as exception:
+            LOGGER.warning(f"Exception has occurred during job creation: {exception}")
+            if self._is_breaking_exception(exception):
+                self._job_tracker.remove_job(id_to_replace)
+                raise exception
+            return self._keep_api_budget_with_failed_job(_slice, exception, id_to_replace)
+
+    def _keep_api_budget_with_failed_job(self, _slice: StreamSlice, exception: Exception, intent: str) -> AsyncJob:
+        """
+        We have a mechanism to retry jobs. It is used when a job status is FAILED or TIMED_OUT. The easiest way to retry is to have this job
+        created in a failed state and leverage the retry for failed/timed out jobs. This way, we don't have to have another process for
+        retrying jobs that couldn't be started.
+        """
+        LOGGER.warning(
+            f"Could not start job for slice {_slice}. Job will be flagged as failed and retried if max number of attempts not reached: {exception}"
+        )
+        traced_exception = exception if isinstance(exception, AirbyteTracedException) else AirbyteTracedException.from_exception(exception)
+        # Even though we're not sure this will break the stream, we will emit here for simplicity's sake. If we wanted to be more accurate,
+        # we would keep the exceptions in-memory until we know that we have reached the max attempt.
+ self._message_repository.emit_message(traced_exception.as_airbyte_message()) + job = self._create_failed_job(_slice) + self._job_tracker.add_job(intent, job.api_job_id()) + return job + + def _create_failed_job(self, stream_slice: StreamSlice) -> AsyncJob: + job = AsyncJob(f"{uuid.uuid4()} - Job that could not start", stream_slice, _NO_TIMEOUT) + job.update_status(AsyncJobStatus.FAILED) + return job + + def _get_running_jobs(self) -> Set[AsyncJob]: + """ + Returns a set of running AsyncJob objects. + + Returns: + Set[AsyncJob]: A set of AsyncJob objects that are currently running. + """ + return {job for partition in self._running_partitions for job in partition.jobs if job.status() == AsyncJobStatus.RUNNING} + + def _update_jobs_status(self) -> None: + """ + Update the status of all running jobs in the repository. + """ + running_jobs = self._get_running_jobs() + if running_jobs: + # update the status only if there are RUNNING jobs + self._job_repository.update_jobs_status(running_jobs) + + def _wait_on_status_update(self) -> None: + """ + Waits for a specified amount of time between status updates. + + + This method is used to introduce a delay between status updates in order to avoid excessive polling. + The duration of the delay is determined by the value of `_WAIT_TIME_BETWEEN_STATUS_UPDATE_IN_SECONDS`. + + Returns: + None + """ + lazy_log( + LOGGER, + logging.DEBUG, + lambda: f"Polling status in progress. There are currently {len(self._running_partitions)} running partitions.", + ) + + lazy_log( + LOGGER, + logging.DEBUG, + lambda: f"Waiting for {self._WAIT_TIME_BETWEEN_STATUS_UPDATE_IN_SECONDS} seconds before next poll...", + ) + time.sleep(self._WAIT_TIME_BETWEEN_STATUS_UPDATE_IN_SECONDS) + + def _process_completed_partition(self, partition: AsyncPartition) -> None: + """ + Process a completed partition. + Args: + partition (AsyncPartition): The completed partition to process. + """ + job_ids = list(map(lambda job: job.api_job_id(), {job for job in partition.jobs})) + LOGGER.info(f"The following jobs for stream slice {partition.stream_slice} have been completed: {job_ids}.") + + # It is important to remove the jobs from the job tracker before yielding the partition as the caller might try to schedule jobs + # but won't be able to as all jobs slots are taken even though job is done. + for job in partition.jobs: + self._job_tracker.remove_job(job.api_job_id()) + + def _process_running_partitions_and_yield_completed_ones(self) -> Generator[AsyncPartition, Any, None]: + """ + Process the running partitions. + + Yields: + AsyncPartition: The processed partition. + + Raises: + Any: Any exception raised during processing. 
+ """ + current_running_partitions: List[AsyncPartition] = [] + for partition in self._running_partitions: + match partition.status: + case AsyncJobStatus.COMPLETED: + self._process_completed_partition(partition) + yield partition + case AsyncJobStatus.RUNNING: + current_running_partitions.append(partition) + case _ if partition.has_reached_max_attempt(): + self._stop_partition(partition) + self._process_partitions_with_errors(partition) + case _: + self._stop_timed_out_jobs(partition) + + # job will be restarted in `_start_job` + current_running_partitions.insert(0, partition) + + for job in partition.jobs: + # We only remove completed jobs as we want failed/timed out jobs to be re-allocated in priority + if job.status() == AsyncJobStatus.COMPLETED: + self._job_tracker.remove_job(job.api_job_id()) + + # update the referenced list with running partitions + self._running_partitions = current_running_partitions + + def _stop_partition(self, partition: AsyncPartition) -> None: + for job in partition.jobs: + if job.status() in _API_SIDE_RUNNING_STATUS: + self._abort_job(job, free_job_allocation=True) + else: + self._job_tracker.remove_job(job.api_job_id()) + + def _stop_timed_out_jobs(self, partition: AsyncPartition) -> None: + for job in partition.jobs: + if job.status() == AsyncJobStatus.TIMED_OUT: + # we don't free allocation here because it is expected to retry the job + self._abort_job(job, free_job_allocation=False) + + def _abort_job(self, job: AsyncJob, free_job_allocation: bool = True) -> None: + try: + self._job_repository.abort(job) + if free_job_allocation: + self._job_tracker.remove_job(job.api_job_id()) + except Exception as exception: + LOGGER.warning(f"Could not free budget for job {job.api_job_id()}: {exception}") + + def _process_partitions_with_errors(self, partition: AsyncPartition) -> None: + """ + Process a partition with status errors (FAILED and TIMEOUT). + + Args: + partition (AsyncPartition): The partition to process. + Returns: + AirbyteTracedException: An exception indicating that at least one job could not be completed. + Raises: + AirbyteTracedException: If at least one job could not be completed. + """ + status_by_job_id = {job.api_job_id(): job.status() for job in partition.jobs} + self._non_breaking_exceptions.append( + AirbyteTracedException( + internal_message=f"At least one job could not be completed for slice {partition.stream_slice}. Job statuses were: {status_by_job_id}. See warning logs for more information.", + failure_type=FailureType.config_error, + ) + ) + + def create_and_get_completed_partitions(self) -> Iterable[AsyncPartition]: + """ + Creates and retrieves completed partitions. + This method continuously starts jobs, updates job status, processes running partitions, + logs polling partitions, and waits for status updates. It yields completed partitions + as they become available. + + Returns: + An iterable of completed partitions, represented as AsyncPartition objects. + Each partition is wrapped in an Optional, allowing for None values. 
+ """ + while True: + try: + lazy_log( + LOGGER, + logging.DEBUG, + lambda: f"JobOrchestrator loop - (Thread {threading.get_native_id()}, AsyncJobOrchestrator {self}) is starting the async job loop", + ) + self._start_jobs() + if not self._slice_iterator.has_next() and not self._running_partitions: + break + + self._update_jobs_status() + yield from self._process_running_partitions_and_yield_completed_ones() + self._wait_on_status_update() + except Exception as exception: + if self._is_breaking_exception(exception): + LOGGER.warning(f"Caught exception that stops the processing of the jobs: {exception}") + self._abort_all_running_jobs() + raise exception + + self._non_breaking_exceptions.append(exception) + + LOGGER.info( + f"JobOrchestrator loop - Thread (Thread {threading.get_native_id()}, AsyncJobOrchestrator {self}) completed! Errors during creation were {self._non_breaking_exceptions}" + ) + if self._non_breaking_exceptions: + # We emitted traced message but we didn't break on non_breaking_exception. We still need to raise an exception so that the + # call of `create_and_get_completed_partitions` knows that there was an issue with some partitions and the sync is incomplete. + raise AirbyteTracedException( + message="", + internal_message="\n".join([filter_secrets(exception.__repr__()) for exception in self._non_breaking_exceptions]), + failure_type=FailureType.config_error, + ) + + def _handle_non_breaking_error(self, exception: Exception) -> None: + LOGGER.error(f"Failed to start the Job: {exception}, traceback: {traceback.format_exc()}") + self._non_breaking_exceptions.append(exception) + + def _abort_all_running_jobs(self) -> None: + for partition in self._running_partitions: + for job in partition.jobs: + if job.status() in self._RUNNING_ON_API_SIDE_STATUS: + self._abort_job(job, free_job_allocation=True) + self._job_tracker.remove_job(job.api_job_id()) + + self._running_partitions = [] + + def _is_breaking_exception(self, exception: Exception) -> bool: + return isinstance(exception, self._exceptions_to_break_on) or ( + isinstance(exception, AirbyteTracedException) and exception.failure_type == FailureType.config_error + ) + + def fetch_records(self, partition: AsyncPartition) -> Iterable[Mapping[str, Any]]: + """ + Fetches records from the given partition's jobs. + + Args: + partition (AsyncPartition): The partition containing the jobs. + + Yields: + Iterable[Mapping[str, Any]]: The fetched records from the jobs. + """ + for job in partition.jobs: + yield from self._job_repository.fetch_records(job) + self._job_repository.delete(job) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/job_tracker.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/job_tracker.py new file mode 100644 index 000000000000..54fbd26d5924 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/job_tracker.py @@ -0,0 +1,57 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ +import logging +import threading +import uuid +from typing import Set + +from airbyte_cdk.logger import lazy_log + +LOGGER = logging.getLogger("airbyte") + + +class ConcurrentJobLimitReached(Exception): + pass + + +class JobTracker: + def __init__(self, limit: int): + self._jobs: Set[str] = set() + self._limit = limit + self._lock = threading.Lock() + + def try_to_get_intent(self) -> str: + lazy_log(LOGGER, logging.DEBUG, lambda: f"JobTracker - Trying to acquire lock by thread {threading.get_native_id()}...") + with self._lock: + if self._has_reached_limit(): + raise ConcurrentJobLimitReached("Can't allocate more jobs right now: limit already reached") + intent = f"intent_{str(uuid.uuid4())}" + lazy_log(LOGGER, logging.DEBUG, lambda: f"JobTracker - Thread {threading.get_native_id()} has acquired {intent}!") + self._jobs.add(intent) + return intent + + def add_job(self, intent_or_job_id: str, job_id: str) -> None: + if intent_or_job_id not in self._jobs: + raise ValueError(f"Can't add job: Unknown intent or job id, known values are {self._jobs}") + + if intent_or_job_id == job_id: + # Nothing to do here as the ID to replace is the same + return + + lazy_log( + LOGGER, logging.DEBUG, lambda: f"JobTracker - Thread {threading.get_native_id()} replacing job {intent_or_job_id} by {job_id}!" + ) + with self._lock: + self._jobs.add(job_id) + self._jobs.remove(intent_or_job_id) + + def remove_job(self, job_id: str) -> None: + """ + If the job is not allocated as a running job, this method does nothing and it won't raise. + """ + lazy_log(LOGGER, logging.DEBUG, lambda: f"JobTracker - Thread {threading.get_native_id()} removing job {job_id}") + with self._lock: + self._jobs.discard(job_id) + + def _has_reached_limit(self) -> bool: + return len(self._jobs) >= self._limit diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/repository.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/repository.py new file mode 100644 index 000000000000..b2de8659a393 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/repository.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from abc import abstractmethod +from typing import Any, Iterable, Mapping, Set + +from airbyte_cdk.sources.declarative.async_job.job import AsyncJob +from airbyte_cdk.sources.types import StreamSlice + + +class AsyncJobRepository: + @abstractmethod + def start(self, stream_slice: StreamSlice) -> AsyncJob: + pass + + @abstractmethod + def update_jobs_status(self, jobs: Set[AsyncJob]) -> None: + pass + + @abstractmethod + def fetch_records(self, job: AsyncJob) -> Iterable[Mapping[str, Any]]: + pass + + @abstractmethod + def abort(self, job: AsyncJob) -> None: + """ + Called when we need to stop on the API side. This method can raise NotImplementedError as not all the APIs will support aborting + jobs. + """ + raise NotImplementedError("Either the API or the AsyncJobRepository implementation do not support aborting jobs") + + @abstractmethod + def delete(self, job: AsyncJob) -> None: + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/status.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/status.py new file mode 100644 index 000000000000..586e79889ca1 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/status.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
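Illustrative sketch only: a minimal in-memory `AsyncJobRepository` showing how the abstract methods above are expected to behave. The class name is hypothetical and the `AsyncJob` constructor arguments mirror how the class is used elsewhere in this change; the remaining types are those imported by the module above.

    import uuid
    from datetime import timedelta

    class InMemoryJobRepository(AsyncJobRepository):  # hypothetical
        def start(self, stream_slice: StreamSlice) -> AsyncJob:
            return AsyncJob(str(uuid.uuid4()), stream_slice, timedelta(minutes=30))

        def update_jobs_status(self, jobs: Set[AsyncJob]) -> None:
            for job in jobs:
                job.update_status(AsyncJobStatus.COMPLETED)  # pretend every job completes instantly

        def fetch_records(self, job: AsyncJob) -> Iterable[Mapping[str, Any]]:
            yield {"job_id": job.api_job_id()}

        def abort(self, job: AsyncJob) -> None:
            pass  # nothing runs on an API side in this sketch

        def delete(self, job: AsyncJob) -> None:
            pass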
+ + +from enum import Enum + +_TERMINAL = True + + +class AsyncJobStatus(Enum): + RUNNING = ("RUNNING", not _TERMINAL) + COMPLETED = ("COMPLETED", _TERMINAL) + FAILED = ("FAILED", _TERMINAL) + TIMED_OUT = ("TIMED_OUT", _TERMINAL) + + def __init__(self, value: str, is_terminal: bool) -> None: + self._value = value + self._is_terminal = is_terminal + + def is_terminal(self) -> bool: + """ + A status is terminal when a job status can't be updated anymore. For example, if a job is completed, it will stay completed but a + running job might become completed, failed or timed out. + """ + return self._is_terminal diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/timer.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/timer.py new file mode 100644 index 000000000000..c4e5a9a1d85a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/async_job/timer.py @@ -0,0 +1,39 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +from datetime import datetime, timedelta, timezone +from typing import Optional + + +class Timer: + def __init__(self, timeout: timedelta) -> None: + self._start_datetime: Optional[datetime] = None + self._end_datetime: Optional[datetime] = None + self._timeout = timeout + + def start(self) -> None: + self._start_datetime = self._now() + self._end_datetime = None + + def stop(self) -> None: + if self._end_datetime is None: + self._end_datetime = self._now() + + def is_started(self) -> bool: + return self._start_datetime is not None + + @property + def elapsed_time(self) -> Optional[timedelta]: + if not self._start_datetime: + return None + + end_time = self._end_datetime or self._now() + elapsed_period = end_time - self._start_datetime + return elapsed_period + + def has_timed_out(self) -> bool: + if not self.is_started(): + return False + return self.elapsed_time > self._timeout # type: ignore # given the job timer is started, we assume there is an elapsed_period + + @staticmethod + def _now() -> datetime: + return datetime.now(tz=timezone.utc) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/__init__.py new file mode 100644 index 000000000000..a02f6f140ba6 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/__init__.py @@ -0,0 +1,11 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.auth.oauth import DeclarativeOauth2Authenticator +from airbyte_cdk.sources.declarative.auth.jwt import JwtAuthenticator + +__all__ = [ + "DeclarativeOauth2Authenticator", + "JwtAuthenticator" +] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/declarative_authenticator.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/declarative_authenticator.py new file mode 100644 index 000000000000..5517f546209a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/declarative_authenticator.py @@ -0,0 +1,40 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
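Illustrative sketch only: how the Timer above is typically exercised around a long-running async job (assumed usage, not part of this change).

    from datetime import timedelta

    timer = Timer(timeout=timedelta(minutes=30))
    timer.start()
    # ... poll the async job ...
    if timer.has_timed_out():
        pass  # e.g. treat the job as AsyncJobStatus.TIMED_OUT
    timer.stop()
    print(timer.elapsed_time)  # timedelta between start() and stop()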
+# + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Union + +from airbyte_cdk.sources.streams.http.requests_native_auth.abstract_token import AbstractHeaderAuthenticator + + +@dataclass +class DeclarativeAuthenticator(AbstractHeaderAuthenticator): + """ + Interface used to associate which authenticators can be used as part of the declarative framework + """ + + def get_request_params(self) -> Mapping[str, Any]: + """HTTP request parameter to add to the requests""" + return {} + + def get_request_body_data(self) -> Union[Mapping[str, Any], str]: + """Form-encoded body data to set on the requests""" + return {} + + def get_request_body_json(self) -> Mapping[str, Any]: + """JSON-encoded body data to set on the requests""" + return {} + + +@dataclass +class NoAuth(DeclarativeAuthenticator): + parameters: InitVar[Mapping[str, Any]] + + @property + def auth_header(self) -> str: + return "" + + @property + def token(self) -> str: + return "" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/jwt.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/jwt.py new file mode 100644 index 000000000000..e24ee793715a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/jwt.py @@ -0,0 +1,170 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import base64 +from dataclasses import InitVar, dataclass +from datetime import datetime +from typing import Any, Mapping, Optional, Union + +import jwt +from airbyte_cdk.sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString + + +class JwtAlgorithm(str): + """ + Enum for supported JWT algorithms + """ + + HS256 = "HS256" + HS384 = "HS384" + HS512 = "HS512" + ES256 = "ES256" + ES256K = "ES256K" + ES384 = "ES384" + ES512 = "ES512" + RS256 = "RS256" + RS384 = "RS384" + RS512 = "RS512" + PS256 = "PS256" + PS384 = "PS384" + PS512 = "PS512" + EdDSA = "EdDSA" + + +@dataclass +class JwtAuthenticator(DeclarativeAuthenticator): + """ + Generates a JSON Web Token (JWT) based on a declarative connector configuration file. The generated token is attached to each request via the Authorization header. + + Attributes: + config (Mapping[str, Any]): The user-provided configuration as specified by the source's spec + secret_key (Union[InterpolatedString, str]): The secret key used to sign the JWT + algorithm (Union[str, JwtAlgorithm]): The algorithm used to sign the JWT + token_duration (Optional[int]): The duration in seconds for which the token is valid + base64_encode_secret_key (Optional[Union[InterpolatedBoolean, str, bool]]): Whether to base64 encode the secret key + header_prefix (Optional[Union[InterpolatedString, str]]): The prefix to add to the Authorization header + kid (Optional[Union[InterpolatedString, str]]): The key identifier to be included in the JWT header + typ (Optional[Union[InterpolatedString, str]]): The type of the JWT. + cty (Optional[Union[InterpolatedString, str]]): The content type of the JWT. + iss (Optional[Union[InterpolatedString, str]]): The issuer of the JWT. + sub (Optional[Union[InterpolatedString, str]]): The subject of the JWT. + aud (Optional[Union[InterpolatedString, str]]): The audience of the JWT. 
+ additional_jwt_headers (Optional[Mapping[str, Any]]): Additional headers to include in the JWT. + additional_jwt_payload (Optional[Mapping[str, Any]]): Additional payload to include in the JWT. + """ + + config: Mapping[str, Any] + parameters: InitVar[Mapping[str, Any]] + secret_key: Union[InterpolatedString, str] + algorithm: Union[str, JwtAlgorithm] + token_duration: Optional[int] + base64_encode_secret_key: Optional[Union[InterpolatedBoolean, str, bool]] = False + header_prefix: Optional[Union[InterpolatedString, str]] = None + kid: Optional[Union[InterpolatedString, str]] = None + typ: Optional[Union[InterpolatedString, str]] = None + cty: Optional[Union[InterpolatedString, str]] = None + iss: Optional[Union[InterpolatedString, str]] = None + sub: Optional[Union[InterpolatedString, str]] = None + aud: Optional[Union[InterpolatedString, str]] = None + additional_jwt_headers: Optional[Mapping[str, Any]] = None + additional_jwt_payload: Optional[Mapping[str, Any]] = None + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._secret_key = InterpolatedString.create(self.secret_key, parameters=parameters) + self._algorithm = JwtAlgorithm(self.algorithm) if isinstance(self.algorithm, str) else self.algorithm + self._base64_encode_secret_key = ( + InterpolatedBoolean(self.base64_encode_secret_key, parameters=parameters) + if isinstance(self.base64_encode_secret_key, str) + else self.base64_encode_secret_key + ) + self._token_duration = self.token_duration + self._header_prefix = InterpolatedString.create(self.header_prefix, parameters=parameters) if self.header_prefix else None + self._kid = InterpolatedString.create(self.kid, parameters=parameters) if self.kid else None + self._typ = InterpolatedString.create(self.typ, parameters=parameters) if self.typ else None + self._cty = InterpolatedString.create(self.cty, parameters=parameters) if self.cty else None + self._iss = InterpolatedString.create(self.iss, parameters=parameters) if self.iss else None + self._sub = InterpolatedString.create(self.sub, parameters=parameters) if self.sub else None + self._aud = InterpolatedString.create(self.aud, parameters=parameters) if self.aud else None + self._additional_jwt_headers = InterpolatedMapping(self.additional_jwt_headers or {}, parameters=parameters) + self._additional_jwt_payload = InterpolatedMapping(self.additional_jwt_payload or {}, parameters=parameters) + + def _get_jwt_headers(self) -> dict[str, Any]: + """ " + Builds and returns the headers used when signing the JWT. + """ + headers = self._additional_jwt_headers.eval(self.config) + if any(prop in headers for prop in ["kid", "alg", "typ", "cty"]): + raise ValueError("'kid', 'alg', 'typ', 'cty' are reserved headers and should not be set as part of 'additional_jwt_headers'") + + if self._kid: + headers["kid"] = self._kid.eval(self.config) + if self._typ: + headers["typ"] = self._typ.eval(self.config) + if self._cty: + headers["cty"] = self._cty.eval(self.config) + headers["alg"] = self._algorithm + return headers + + def _get_jwt_payload(self) -> dict[str, Any]: + """ + Builds and returns the payload used when signing the JWT. 
+ """ + now = int(datetime.now().timestamp()) + exp = now + self._token_duration if isinstance(self._token_duration, int) else now + nbf = now + + payload = self._additional_jwt_payload.eval(self.config) + if any(prop in payload for prop in ["iss", "sub", "aud", "iat", "exp", "nbf"]): + raise ValueError( + "'iss', 'sub', 'aud', 'iat', 'exp', 'nbf' are reserved properties and should not be set as part of 'additional_jwt_payload'" + ) + + if self._iss: + payload["iss"] = self._iss.eval(self.config) + if self._sub: + payload["sub"] = self._sub.eval(self.config) + if self._aud: + payload["aud"] = self._aud.eval(self.config) + payload["iat"] = now + payload["exp"] = exp + payload["nbf"] = nbf + return payload + + def _get_secret_key(self) -> str: + """ + Returns the secret key used to sign the JWT. + """ + secret_key: str = self._secret_key.eval(self.config) + return base64.b64encode(secret_key.encode()).decode() if self._base64_encode_secret_key else secret_key + + def _get_signed_token(self) -> Union[str, Any]: + """ + Signed the JWT using the provided secret key and algorithm and the generated headers and payload. For additional information on PyJWT see: https://pyjwt.readthedocs.io/en/stable/ + """ + try: + return jwt.encode( + payload=self._get_jwt_payload(), + key=self._get_secret_key(), + algorithm=self._algorithm, + headers=self._get_jwt_headers(), + ) + except Exception as e: + raise ValueError(f"Failed to sign token: {e}") + + def _get_header_prefix(self) -> Union[str, None]: + """ + Returns the header prefix to be used when attaching the token to the request. + """ + return self._header_prefix.eval(self.config) if self._header_prefix else None + + @property + def auth_header(self) -> str: + return "Authorization" + + @property + def token(self) -> str: + return f"{self._get_header_prefix()} {self._get_signed_token()}" if self._get_header_prefix() else self._get_signed_token() diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/oauth.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/oauth.py new file mode 100644 index 000000000000..b68dbcf1583b --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/oauth.py @@ -0,0 +1,148 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass, field +from typing import Any, List, Mapping, Optional, Union + +import pendulum +from airbyte_cdk.sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator +from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.message import MessageRepository, NoopMessageRepository +from airbyte_cdk.sources.streams.http.requests_native_auth.abstract_oauth import AbstractOauth2Authenticator +from airbyte_cdk.sources.streams.http.requests_native_auth.oauth import SingleUseRefreshTokenOauth2Authenticator + + +@dataclass +class DeclarativeOauth2Authenticator(AbstractOauth2Authenticator, DeclarativeAuthenticator): + """ + Generates OAuth2.0 access tokens from an OAuth2.0 refresh token and client credentials based on + a declarative connector configuration file. Credentials can be defined explicitly or via interpolation + at runtime. The generated access token is attached to each request via the Authorization header. 
+ + Attributes: + token_refresh_endpoint (Union[InterpolatedString, str]): The endpoint to refresh the access token + client_id (Union[InterpolatedString, str]): The client id + client_secret (Union[InterpolatedString, str]): Client secret + refresh_token (Union[InterpolatedString, str]): The token used to refresh the access token + access_token_name (Union[InterpolatedString, str]): THe field to extract access token from in the response + expires_in_name (Union[InterpolatedString, str]): The field to extract expires_in from in the response + config (Mapping[str, Any]): The user-provided configuration as specified by the source's spec + scopes (Optional[List[str]]): The scopes to request + token_expiry_date (Optional[Union[InterpolatedString, str]]): The access token expiration date + token_expiry_date_format str: format of the datetime; provide it if expires_in is returned in datetime instead of seconds + token_expiry_is_time_of_expiration bool: set True it if expires_in is returned as time of expiration instead of the number seconds until expiration + refresh_request_body (Optional[Mapping[str, Any]]): The request body to send in the refresh request + grant_type: The grant_type to request for access_token. If set to refresh_token, the refresh_token parameter has to be provided + message_repository (MessageRepository): the message repository used to emit logs on HTTP requests + """ + + token_refresh_endpoint: Union[InterpolatedString, str] + client_id: Union[InterpolatedString, str] + client_secret: Union[InterpolatedString, str] + config: Mapping[str, Any] + parameters: InitVar[Mapping[str, Any]] + refresh_token: Optional[Union[InterpolatedString, str]] = None + scopes: Optional[List[str]] = None + token_expiry_date: Optional[Union[InterpolatedString, str]] = None + _token_expiry_date: Optional[pendulum.DateTime] = field(init=False, repr=False, default=None) + token_expiry_date_format: Optional[str] = None + token_expiry_is_time_of_expiration: bool = False + access_token_name: Union[InterpolatedString, str] = "access_token" + expires_in_name: Union[InterpolatedString, str] = "expires_in" + refresh_request_body: Optional[Mapping[str, Any]] = None + grant_type: Union[InterpolatedString, str] = "refresh_token" + message_repository: MessageRepository = NoopMessageRepository() + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + super().__init__() + self._token_refresh_endpoint = InterpolatedString.create(self.token_refresh_endpoint, parameters=parameters) + self._client_id = InterpolatedString.create(self.client_id, parameters=parameters) + self._client_secret = InterpolatedString.create(self.client_secret, parameters=parameters) + if self.refresh_token is not None: + self._refresh_token: Optional[InterpolatedString] = InterpolatedString.create(self.refresh_token, parameters=parameters) + else: + self._refresh_token = None + self.access_token_name = InterpolatedString.create(self.access_token_name, parameters=parameters) + self.expires_in_name = InterpolatedString.create(self.expires_in_name, parameters=parameters) + self.grant_type = InterpolatedString.create(self.grant_type, parameters=parameters) + self._refresh_request_body = InterpolatedMapping(self.refresh_request_body or {}, parameters=parameters) + self._token_expiry_date: pendulum.DateTime = ( + pendulum.parse(InterpolatedString.create(self.token_expiry_date, parameters=parameters).eval(self.config)) # type: ignore # pendulum.parse returns a datetime in this context + if self.token_expiry_date + else 
pendulum.now().subtract(days=1) # type: ignore # substract does not have type hints + ) + self._access_token: Optional[str] = None # access_token is initialized by a setter + + if self.get_grant_type() == "refresh_token" and self._refresh_token is None: + raise ValueError("OAuthAuthenticator needs a refresh_token parameter if grant_type is set to `refresh_token`") + + def get_token_refresh_endpoint(self) -> str: + refresh_token: str = self._token_refresh_endpoint.eval(self.config) + if not refresh_token: + raise ValueError("OAuthAuthenticator was unable to evaluate token_refresh_endpoint parameter") + return refresh_token + + def get_client_id(self) -> str: + client_id: str = self._client_id.eval(self.config) + if not client_id: + raise ValueError("OAuthAuthenticator was unable to evaluate client_id parameter") + return client_id + + def get_client_secret(self) -> str: + client_secret: str = self._client_secret.eval(self.config) + if not client_secret: + raise ValueError("OAuthAuthenticator was unable to evaluate client_secret parameter") + return client_secret + + def get_refresh_token(self) -> Optional[str]: + return None if self._refresh_token is None else str(self._refresh_token.eval(self.config)) + + def get_scopes(self) -> List[str]: + return self.scopes or [] + + def get_access_token_name(self) -> str: + return self.access_token_name.eval(self.config) # type: ignore # eval returns a string in this context + + def get_expires_in_name(self) -> str: + return self.expires_in_name.eval(self.config) # type: ignore # eval returns a string in this context + + def get_grant_type(self) -> str: + return self.grant_type.eval(self.config) # type: ignore # eval returns a string in this context + + def get_refresh_request_body(self) -> Mapping[str, Any]: + return self._refresh_request_body.eval(self.config) # type: ignore # eval should return a Mapping in this context + + def get_token_expiry_date(self) -> pendulum.DateTime: + return self._token_expiry_date # type: ignore # _token_expiry_date is a pendulum.DateTime. It is never None despite what mypy thinks + + def set_token_expiry_date(self, value: Union[str, int]) -> None: + self._token_expiry_date = self._parse_token_expiration_date(value) + + @property + def access_token(self) -> str: + if self._access_token is None: + raise ValueError("access_token is not set") + return self._access_token + + @access_token.setter + def access_token(self, value: str) -> None: + self._access_token = value + + @property + def _message_repository(self) -> MessageRepository: + """ + Overriding AbstractOauth2Authenticator._message_repository to allow for HTTP request logs + """ + return self.message_repository + + +@dataclass +class DeclarativeSingleUseRefreshTokenOauth2Authenticator(SingleUseRefreshTokenOauth2Authenticator, DeclarativeAuthenticator): + """ + Declarative version of SingleUseRefreshTokenOauth2Authenticator which can be used in declarative connectors. + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/selective_authenticator.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/selective_authenticator.py new file mode 100644 index 000000000000..e3f39a0a8ec1 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/selective_authenticator.py @@ -0,0 +1,37 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
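Illustrative sketch only: wiring the DeclarativeOauth2Authenticator above with interpolated config values. The endpoint and config keys are hypothetical, and `config` is assumed to hold the user-provided credentials.

    authenticator = DeclarativeOauth2Authenticator(
        token_refresh_endpoint="https://example.com/oauth/token",  # hypothetical endpoint
        client_id="{{ config['client_id'] }}",
        client_secret="{{ config['client_secret'] }}",
        refresh_token="{{ config['refresh_token'] }}",
        config=config,
        parameters={},
    )
    # authenticator.access_token is refreshed lazily using the endpoint above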
+# + +from dataclasses import dataclass +from typing import Any, List, Mapping + +import dpath +from airbyte_cdk.sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator + + +@dataclass +class SelectiveAuthenticator(DeclarativeAuthenticator): + """Authenticator that selects concrete implementation based on specific config value.""" + + config: Mapping[str, Any] + authenticators: Mapping[str, DeclarativeAuthenticator] + authenticator_selection_path: List[str] + + # returns "DeclarativeAuthenticator", but must return a subtype of "SelectiveAuthenticator" + def __new__( # type: ignore[misc] + cls, + config: Mapping[str, Any], + authenticators: Mapping[str, DeclarativeAuthenticator], + authenticator_selection_path: List[str], + *arg: Any, + **kwargs: Any, + ) -> DeclarativeAuthenticator: + try: + selected_key = str(dpath.get(config, authenticator_selection_path)) + except KeyError as err: + raise ValueError("The path from `authenticator_selection_path` is not found in the config.") from err + + try: + return authenticators[selected_key] + except KeyError as err: + raise ValueError(f"The authenticator `{selected_key}` is not found.") from err diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/token.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/token.py new file mode 100644 index 000000000000..a2b64fce04c0 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/token.py @@ -0,0 +1,254 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import base64 +import logging +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Union + +import requests +from airbyte_cdk.sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator +from airbyte_cdk.sources.declarative.auth.token_provider import TokenProvider +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.types import Config +from cachetools import TTLCache, cached + + +@dataclass +class ApiKeyAuthenticator(DeclarativeAuthenticator): + """ + ApiKeyAuth sets a request header on the HTTP requests sent. + + The header is of the form: + `"
": ""` + + For example, + `ApiKeyAuthenticator("Authorization", "Bearer hello")` + will result in the following header set on the HTTP request + `"Authorization": "Bearer hello"` + + Attributes: + request_option (RequestOption): request option how to inject the token into the request + token_provider (TokenProvider): Provider of the token + config (Config): The user-provided configuration as specified by the source's spec + parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation + """ + + request_option: RequestOption + token_provider: TokenProvider + config: Config + parameters: InitVar[Mapping[str, Any]] + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._field_name = InterpolatedString.create(self.request_option.field_name, parameters=parameters) + + @property + def auth_header(self) -> str: + options = self._get_request_options(RequestOptionType.header) + return next(iter(options.keys()), "") + + @property + def token(self) -> str: + return self.token_provider.get_token() + + def _get_request_options(self, option_type: RequestOptionType) -> Mapping[str, Any]: + options = {} + if self.request_option.inject_into == option_type: + options[self._field_name.eval(self.config)] = self.token + return options + + def get_request_params(self) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.request_parameter) + + def get_request_body_data(self) -> Union[Mapping[str, Any], str]: + return self._get_request_options(RequestOptionType.body_data) + + def get_request_body_json(self) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.body_json) + + +@dataclass +class BearerAuthenticator(DeclarativeAuthenticator): + """ + Authenticator that sets the Authorization header on the HTTP requests sent. 
+ + The header is of the form: + `"Authorization": "Bearer "` + + Attributes: + token_provider (TokenProvider): Provider of the token + config (Config): The user-provided configuration as specified by the source's spec + parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation + """ + + token_provider: TokenProvider + config: Config + parameters: InitVar[Mapping[str, Any]] + + @property + def auth_header(self) -> str: + return "Authorization" + + @property + def token(self) -> str: + return f"Bearer {self.token_provider.get_token()}" + + +@dataclass +class BasicHttpAuthenticator(DeclarativeAuthenticator): + """ + Builds auth based off the basic authentication scheme as defined by RFC 7617, which transmits credentials as USER ID/password pairs, encoded using base64 + https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication#basic_authentication_scheme + + The header is of the form + `"Authorization": "Basic "` + + Attributes: + username (Union[InterpolatedString, str]): The username + config (Config): The user-provided configuration as specified by the source's spec + password (Union[InterpolatedString, str]): The password + parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation + """ + + username: Union[InterpolatedString, str] + config: Config + parameters: InitVar[Mapping[str, Any]] + password: Union[InterpolatedString, str] = "" + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._username = InterpolatedString.create(self.username, parameters=parameters) + self._password = InterpolatedString.create(self.password, parameters=parameters) + + @property + def auth_header(self) -> str: + return "Authorization" + + @property + def token(self) -> str: + auth_string = f"{self._username.eval(self.config)}:{self._password.eval(self.config)}".encode("utf8") + b64_encoded = base64.b64encode(auth_string).decode("utf8") + return f"Basic {b64_encoded}" + + +""" + maxsize - The maximum size of the cache + ttl - time-to-live value in seconds + docs https://cachetools.readthedocs.io/en/latest/ + maxsize=1000 - when the cache is full, in this case more than 1000, + i.e. by adding another item the cache would exceed its maximum size, the cache must choose which item(s) to discard + ttl=86400 means that cached token will live for 86400 seconds (one day) +""" +cacheSessionTokenAuthenticator: TTLCache[str, str] = TTLCache(maxsize=1000, ttl=86400) + + +@cached(cacheSessionTokenAuthenticator) +def get_new_session_token(api_url: str, username: str, password: str, response_key: str) -> str: + """ + This method retrieves session token from api by username and password for SessionTokenAuthenticator. + It's cashed to avoid a multiple calling by sync and updating session token every stream sync. + Args: + api_url: api url for getting new session token + username: username for auth + password: password for auth + response_key: field name in response to retrieve a session token + + Returns: + session token + """ + response = requests.post( + f"{api_url}", + headers={"Content-Type": "application/json"}, + json={"username": username, "password": password}, + ) + response.raise_for_status() + if not response.ok: + raise ConnectionError(f"Failed to retrieve new session token, response code {response.status_code} because {response.reason}") + return str(response.json()[response_key]) + + +@dataclass +class LegacySessionTokenAuthenticator(DeclarativeAuthenticator): + """ + Builds auth based on session tokens. 
+ A session token is a random value generated by a server to identify + a specific user for the duration of one interaction session. + + The header is of the form + `"Specific Header": "Session Token Value"` + + Attributes: + api_url (Union[InterpolatedString, str]): Base api url of source + username (Union[InterpolatedString, str]): The username + config (Config): The user-provided configuration as specified by the source's spec + password (Union[InterpolatedString, str]): The password + header (Union[InterpolatedString, str]): Specific header of source for providing session token + parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation + session_token (Union[InterpolatedString, str]): Session token generated by user + session_token_response_key (Union[InterpolatedString, str]): Key for retrieving session token from api response + login_url (Union[InterpolatedString, str]): Url fot getting a specific session token + validate_session_url (Union[InterpolatedString, str]): Url to validate passed session token + """ + + api_url: Union[InterpolatedString, str] + header: Union[InterpolatedString, str] + session_token: Union[InterpolatedString, str] + session_token_response_key: Union[InterpolatedString, str] + username: Union[InterpolatedString, str] + config: Config + parameters: InitVar[Mapping[str, Any]] + login_url: Union[InterpolatedString, str] + validate_session_url: Union[InterpolatedString, str] + password: Union[InterpolatedString, str] = "" + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._username = InterpolatedString.create(self.username, parameters=parameters) + self._password = InterpolatedString.create(self.password, parameters=parameters) + self._api_url = InterpolatedString.create(self.api_url, parameters=parameters) + self._header = InterpolatedString.create(self.header, parameters=parameters) + self._session_token = InterpolatedString.create(self.session_token, parameters=parameters) + self._session_token_response_key = InterpolatedString.create(self.session_token_response_key, parameters=parameters) + self._login_url = InterpolatedString.create(self.login_url, parameters=parameters) + self._validate_session_url = InterpolatedString.create(self.validate_session_url, parameters=parameters) + + self.logger = logging.getLogger("airbyte") + + @property + def auth_header(self) -> str: + return str(self._header.eval(self.config)) + + @property + def token(self) -> str: + if self._session_token.eval(self.config): + if self.is_valid_session_token(): + return str(self._session_token.eval(self.config)) + if self._password.eval(self.config) and self._username.eval(self.config): + username = self._username.eval(self.config) + password = self._password.eval(self.config) + session_token_response_key = self._session_token_response_key.eval(self.config) + api_url = f"{self._api_url.eval(self.config)}{self._login_url.eval(self.config)}" + + self.logger.info("Using generated session token by username and password") + return get_new_session_token(api_url, username, password, session_token_response_key) + + raise ConnectionError("Invalid credentials: session token is not valid or provide username and password") + + def is_valid_session_token(self) -> bool: + try: + response = requests.get( + f"{self._api_url.eval(self.config)}{self._validate_session_url.eval(self.config)}", + headers={self.auth_header: self._session_token.eval(self.config)}, + ) + response.raise_for_status() + except requests.exceptions.HTTPError as e: + if 
e.response.status_code == requests.codes["unauthorized"]: + self.logger.info(f"Unable to connect by session token from config due to {str(e)}") + return False + else: + raise ConnectionError(f"Error while validating session token: {e}") + if response.ok: + self.logger.info("Connection check for source is successful.") + return True + else: + raise ConnectionError(f"Failed to retrieve new session token, response code {response.status_code} because {response.reason}") diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/token_provider.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/token_provider.py new file mode 100644 index 000000000000..c3c2a41f555e --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/token_provider.py @@ -0,0 +1,81 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +import datetime +from abc import abstractmethod +from dataclasses import InitVar, dataclass, field +from typing import Any, List, Mapping, Optional, Union + +import dpath +import pendulum +from airbyte_cdk.sources.declarative.decoders.decoder import Decoder +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder +from airbyte_cdk.sources.declarative.exceptions import ReadException +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.requester import Requester +from airbyte_cdk.sources.http_logger import format_http_message +from airbyte_cdk.sources.message import MessageRepository, NoopMessageRepository +from airbyte_cdk.sources.types import Config +from isodate import Duration +from pendulum import DateTime + + +class TokenProvider: + @abstractmethod + def get_token(self) -> str: + pass + + +@dataclass +class SessionTokenProvider(TokenProvider): + login_requester: Requester + session_token_path: List[str] + expiration_duration: Optional[Union[datetime.timedelta, Duration]] + parameters: InitVar[Mapping[str, Any]] + message_repository: MessageRepository = NoopMessageRepository() + decoder: Decoder = field(default_factory=lambda: JsonDecoder(parameters={})) + + _next_expiration_time: Optional[DateTime] = None + _token: Optional[str] = None + + def get_token(self) -> str: + self._refresh_if_necessary() + if self._token is None: + raise ReadException("Failed to get session token, token is None") + return self._token + + def _refresh_if_necessary(self) -> None: + if self._next_expiration_time is None or self._next_expiration_time < pendulum.now(): + self._refresh() + + def _refresh(self) -> None: + response = self.login_requester.send_request( + log_formatter=lambda response: format_http_message( + response, + "Login request", + "Obtains session token", + None, + is_auxiliary=True, + ), + ) + if response is None: + raise ReadException("Failed to get session token, response got ignored by requester") + session_token = dpath.get(next(self.decoder.decode(response)), self.session_token_path) + if self.expiration_duration is not None: + self._next_expiration_time = pendulum.now() + self.expiration_duration + self._token = session_token # type: ignore # Returned decoded response will be Mapping and therefore session_token will be str or None + + +@dataclass +class InterpolatedStringTokenProvider(TokenProvider): + config: Config + api_token: Union[InterpolatedString, str] + parameters: Mapping[str, Any] + + def __post_init__(self) -> None: + self._token = InterpolatedString.create(self.api_token, parameters=self.parameters) + + def 
get_token(self) -> str: + return str(self._token.eval(self.config)) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/__init__.py new file mode 100644 index 000000000000..16244cd1f6f3 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.checks.check_stream import CheckStream +from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker + +__all__ = ["CheckStream", "ConnectionChecker"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/check_stream.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/check_stream.py new file mode 100644 index 000000000000..baf056d3c799 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/check_stream.py @@ -0,0 +1,48 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +import traceback +from dataclasses import InitVar, dataclass +from typing import Any, List, Mapping, Tuple + +from airbyte_cdk import AbstractSource +from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker +from airbyte_cdk.sources.streams.http.availability_strategy import HttpAvailabilityStrategy + + +@dataclass +class CheckStream(ConnectionChecker): + """ + Checks the connections by checking availability of one or many streams selected by the developer + + Attributes: + stream_name (List[str]): names of streams to check + """ + + stream_names: List[str] + parameters: InitVar[Mapping[str, Any]] + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._parameters = parameters + + def check_connection(self, source: AbstractSource, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Any]: + streams = source.streams(config=config) + stream_name_to_stream = {s.name: s for s in streams} + if len(streams) == 0: + return False, f"No streams to connect to from source {source}" + for stream_name in self.stream_names: + if stream_name not in stream_name_to_stream.keys(): + raise ValueError(f"{stream_name} is not part of the catalog. Expected one of {stream_name_to_stream.keys()}.") + + stream = stream_name_to_stream[stream_name] + availability_strategy = HttpAvailabilityStrategy() + try: + stream_is_available, reason = availability_strategy.check_availability(stream, logger) + if not stream_is_available: + return False, reason + except Exception as error: + logger.error(f"Encountered an error trying to connect to stream {stream_name}. Error: \n {traceback.format_exc()}") + return False, f"Unable to connect to stream {stream_name} - {error}" + return True, None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/connection_checker.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/connection_checker.py new file mode 100644 index 000000000000..908e659b2a9d --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/connection_checker.py @@ -0,0 +1,33 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
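Illustrative sketch only: how CheckStream above is typically invoked by a declarative source's check operation. The stream name is hypothetical, and `source` and `config` are assumed to exist.

    import logging

    checker = CheckStream(stream_names=["users"], parameters={})
    ok, error = checker.check_connection(source, logging.getLogger("airbyte"), config)
    if not ok:
        print(f"Connection check failed: {error}")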
+# + +import logging +from abc import ABC, abstractmethod +from typing import Any, Mapping, Tuple + +from airbyte_cdk import AbstractSource + + +class ConnectionChecker(ABC): + """ + Abstract base class for checking a connection + """ + + @abstractmethod + def check_connection(self, source: AbstractSource, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Any]: + """ + Tests if the input configuration can be used to successfully connect to the integration e.g: if a provided Stripe API token can be used to connect + to the Stripe API. + + :param source: source + :param logger: source logger + :param config: The user-provided configuration as specified by the source's spec. + This usually contains information required to check connection e.g. tokens, secrets and keys etc. + :return: A tuple of (boolean, error). If boolean is true, then the connection check is successful + and we can connect to the underlying data source using the provided configuration. + Otherwise, the input config cannot be used to connect to the underlying data source, + and the "error" object should describe what went wrong. + The error object will be cast to string to display the problem to the user. + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/concurrency_level/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/concurrency_level/__init__.py new file mode 100644 index 000000000000..6c55c15c9d5e --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/concurrency_level/__init__.py @@ -0,0 +1,7 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.concurrency_level.concurrency_level import ConcurrencyLevel + +__all__ = ["ConcurrencyLevel"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py new file mode 100644 index 000000000000..a86c4f8ff8c0 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py @@ -0,0 +1,42 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
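Illustrative sketch only: the smallest possible ConnectionChecker respecting the (boolean, error) contract described above. The class is hypothetical and reuses the imports of the module above.

    class AlwaysSucceedingChecker(ConnectionChecker):  # hypothetical
        def check_connection(self, source: AbstractSource, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Any]:
            return True, None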
+# + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Optional, Union + +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.types import Config + + +@dataclass +class ConcurrencyLevel: + """ + Returns the number of worker threads that should be used when syncing concurrent streams in parallel + + Attributes: + default_concurrency (Union[int, str]): The hardcoded integer or interpolation of how many worker threads to use during a sync + max_concurrency (Optional[int]): The maximum number of worker threads to use when the default_concurrency is exceeded + """ + + default_concurrency: Union[int, str] + max_concurrency: Optional[int] + config: Config + parameters: InitVar[Mapping[str, Any]] + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + if isinstance(self.default_concurrency, int): + self._default_concurrency: Union[int, InterpolatedString] = self.default_concurrency + elif "config" in self.default_concurrency and not self.max_concurrency: + raise ValueError("ConcurrencyLevel requires that max_concurrency be defined if the default_concurrency can be user-specified") + else: + self._default_concurrency = InterpolatedString.create(self.default_concurrency, parameters=parameters) + + def get_concurrency_level(self) -> int: + if isinstance(self._default_concurrency, InterpolatedString): + evaluated_default_concurrency = self._default_concurrency.eval(config=self.config) + if not isinstance(evaluated_default_concurrency, int): + raise ValueError("default_concurrency did not evaluate to an integer") + return min(evaluated_default_concurrency, self.max_concurrency) if self.max_concurrency else evaluated_default_concurrency + else: + return self._default_concurrency diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/concurrent_declarative_source.py new file mode 100644 index 000000000000..d6bdf50507d5 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -0,0 +1,284 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
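Illustrative sketch only: resolving an interpolated concurrency level with the ConcurrencyLevel class above. The config key is hypothetical, and the interpolation is assumed to evaluate to an integer as get_concurrency_level requires.

    level = ConcurrencyLevel(
        default_concurrency="{{ config['num_workers'] }}",
        max_concurrency=20,
        config={"num_workers": 50},
        parameters={},
    )
    workers = level.get_concurrency_level()  # interpolated value capped at max_concurrency, i.e. 20 here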
+# + +import logging +from typing import Any, Generic, Iterator, List, Mapping, Optional, Tuple, Union + +from airbyte_cdk.models import AirbyteCatalog, AirbyteMessage, AirbyteStateMessage, ConfiguredAirbyteCatalog +from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel +from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream +from airbyte_cdk.sources.declarative.extractors import RecordSelector +from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ConcurrencyLevel as ConcurrencyLevelModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import DatetimeBasedCursor as DatetimeBasedCursorModel +from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ModelToComponentFactory +from airbyte_cdk.sources.declarative.requesters import HttpRequester +from airbyte_cdk.sources.declarative.retrievers import SimpleRetriever +from airbyte_cdk.sources.declarative.transformations.add_fields import AddFields +from airbyte_cdk.sources.declarative.types import ConnectionDefinition +from airbyte_cdk.sources.source import TState +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.adapters import CursorPartitionGenerator +from airbyte_cdk.sources.streams.concurrent.availability_strategy import AlwaysAvailableAvailabilityStrategy +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream +from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream + + +class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]): + + # By default, we defer to a value of 1 which represents running a connector using the Concurrent CDK engine on only one thread. + SINGLE_THREADED_CONCURRENCY_LEVEL = 1 + + def __init__( + self, + catalog: Optional[ConfiguredAirbyteCatalog], + config: Optional[Mapping[str, Any]], + state: TState, + source_config: ConnectionDefinition, + debug: bool = False, + emit_connector_builder_messages: bool = False, + component_factory: Optional[ModelToComponentFactory] = None, + **kwargs: Any, + ) -> None: + super().__init__( + source_config=source_config, + debug=debug, + emit_connector_builder_messages=emit_connector_builder_messages, + component_factory=component_factory, + ) + + self._state = state + + self._concurrent_streams: Optional[List[AbstractStream]] + self._synchronous_streams: Optional[List[Stream]] + + # If the connector command was SPEC, there is no incoming config, and we cannot instantiate streams because + # they might depend on it. Ideally we want to have a static method on this class to get the spec without + # any other arguments, but the existing entrypoint.py isn't designed to support this. Just noting this + # for our future improvements to the CDK. 
+ if config: + self._concurrent_streams, self._synchronous_streams = self._group_streams(config=config or {}) + else: + self._concurrent_streams = None + self._synchronous_streams = None + + concurrency_level_from_manifest = self._source_config.get("concurrency_level") + if concurrency_level_from_manifest: + concurrency_level_component = self._constructor.create_component( + model_type=ConcurrencyLevelModel, component_definition=concurrency_level_from_manifest, config=config or {} + ) + if not isinstance(concurrency_level_component, ConcurrencyLevel): + raise ValueError(f"Expected to generate a ConcurrencyLevel component, but received {concurrency_level_component.__class__}") + + concurrency_level = concurrency_level_component.get_concurrency_level() + initial_number_of_partitions_to_generate = max( + concurrency_level // 2, 1 + ) # Partition generation iterates using a range based on this value. If this is floored to zero, we end up in a deadlock during startup + else: + concurrency_level = self.SINGLE_THREADED_CONCURRENCY_LEVEL + initial_number_of_partitions_to_generate = self.SINGLE_THREADED_CONCURRENCY_LEVEL + + self._concurrent_source = ConcurrentSource.create( + num_workers=concurrency_level, + initial_number_of_partitions_to_generate=initial_number_of_partitions_to_generate, + logger=self.logger, + slice_logger=self._slice_logger, + message_repository=self.message_repository, # type: ignore # message_repository is always instantiated with a value by factory + ) + + def read( + self, + logger: logging.Logger, + config: Mapping[str, Any], + catalog: ConfiguredAirbyteCatalog, + state: Optional[Union[List[AirbyteStateMessage]]] = None, + ) -> Iterator[AirbyteMessage]: + + # ConcurrentReadProcessor pops streams that are finished being read, so before syncing, the names of the concurrent + # streams must be saved so that they can be removed from the catalog before starting synchronous streams + if self._concurrent_streams: + concurrent_stream_names = set([concurrent_stream.name for concurrent_stream in self._concurrent_streams]) + + selected_concurrent_streams = self._select_streams(streams=self._concurrent_streams, configured_catalog=catalog) + # It would appear that passing in an empty set of streams causes an infinite loop in ConcurrentReadProcessor. + # This is also evident in concurrent_source_adapter.py so I'll leave this out of scope to fix for now + if selected_concurrent_streams: + yield from self._concurrent_source.read(selected_concurrent_streams) + + # Sync all streams that are not concurrent compatible. We filter out concurrent streams because the + # existing AbstractSource.read() implementation iterates over the catalog when syncing streams, many
Many + # of which were already synced using the Concurrent CDK + filtered_catalog = self._remove_concurrent_streams_from_catalog( + catalog=catalog, concurrent_stream_names=concurrent_stream_names + ) + else: + filtered_catalog = catalog + + yield from super().read(logger, config, filtered_catalog, state) + + def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteCatalog: + concurrent_streams = self._concurrent_streams or [] + synchronous_streams = self._synchronous_streams or [] + return AirbyteCatalog(streams=[stream.as_airbyte_stream() for stream in concurrent_streams + synchronous_streams]) + + def streams(self, config: Mapping[str, Any]) -> List[Stream]: + """ + The `streams` method is used as part of the AbstractSource in the following cases: + * ConcurrentDeclarativeSource.check -> ManifestDeclarativeSource.check -> AbstractSource.check -> DeclarativeSource.check_connection -> CheckStream.check_connection -> streams + * ConcurrentDeclarativeSource.read -> AbstractSource.read -> streams (note that we filter for a specific catalog which excludes concurrent streams so not all streams actually read from all the streams returned by `streams`) + Note that `super.streams(config)` is also called when splitting the streams between concurrent or not in `_group_streams`. + + In both case, we will assume that calling the DeclarativeStream is perfectly fine as the result for these is the same regardless of if it is a DeclarativeStream or a DefaultStream (concurrent). This should simply be removed once we have moved away from the mentioned code paths above. + """ + return super().streams(config) + + def _group_streams(self, config: Mapping[str, Any]) -> Tuple[List[AbstractStream], List[Stream]]: + concurrent_streams: List[AbstractStream] = [] + synchronous_streams: List[Stream] = [] + + state_manager = ConnectorStateManager(state=self._state) # type: ignore # state is always in the form of List[AirbyteStateMessage]. The ConnectorStateManager should use generics, but this can be done later + + name_to_stream_mapping = {stream["name"]: stream for stream in self.resolved_manifest["streams"]} + + for declarative_stream in self.streams(config=config): + # Some low-code sources use a combination of DeclarativeStream and regular Python streams. 
We can't inspect + # these legacy Python streams the way we do low-code streams to determine if they are concurrent compatible, + # so we need to treat them as synchronous + if isinstance(declarative_stream, DeclarativeStream): + datetime_based_cursor_component_definition = name_to_stream_mapping[declarative_stream.name].get("incremental_sync") + + if ( + datetime_based_cursor_component_definition + and datetime_based_cursor_component_definition.get("type", "") == DatetimeBasedCursorModel.__name__ + and self._stream_supports_concurrent_partition_processing(declarative_stream=declarative_stream) + and hasattr(declarative_stream.retriever, "stream_slicer") + and isinstance(declarative_stream.retriever.stream_slicer, DatetimeBasedCursor) + ): + stream_state = state_manager.get_stream_state( + stream_name=declarative_stream.name, namespace=declarative_stream.namespace + ) + + cursor, connector_state_converter = self._constructor.create_concurrent_cursor_from_datetime_based_cursor( + state_manager=state_manager, + model_type=DatetimeBasedCursorModel, + component_definition=datetime_based_cursor_component_definition, + stream_name=declarative_stream.name, + stream_namespace=declarative_stream.namespace, + config=config or {}, + stream_state=stream_state, + ) + + # This is an optimization so that we don't invoke any cursor or state management flows within the + # low-code framework because state management is handled through the ConcurrentCursor. + if declarative_stream and declarative_stream.retriever and isinstance(declarative_stream.retriever, SimpleRetriever): + # Also a temporary hack. In the legacy Stream implementation, as part of the read, set_initial_state() is + # called to instantiate incoming state on the cursor. Although we no longer rely on the legacy low-code cursor + # for concurrent checkpointing, low-code components like StopConditionPaginationStrategyDecorator and + # ClientSideIncrementalRecordFilterDecorator still rely on a DatetimeBasedCursor that is properly initialized + # with state. + if declarative_stream.retriever.cursor: + declarative_stream.retriever.cursor.set_initial_state(stream_state=stream_state) + declarative_stream.retriever.cursor = None + + partition_generator = CursorPartitionGenerator( + stream=declarative_stream, + message_repository=self.message_repository, # type: ignore # message_repository is always instantiated with a value by factory + cursor=cursor, + connector_state_converter=connector_state_converter, + cursor_field=[cursor.cursor_field.cursor_field_key], + slice_boundary_fields=cursor.slice_boundary_fields, + ) + + concurrent_streams.append( + DefaultStream( + partition_generator=partition_generator, + name=declarative_stream.name, + json_schema=declarative_stream.get_json_schema(), + availability_strategy=AlwaysAvailableAvailabilityStrategy(), + primary_key=get_primary_key_from_stream(declarative_stream.primary_key), + cursor_field=cursor.cursor_field.cursor_field_key, + logger=self.logger, + cursor=cursor, + ) + ) + else: + synchronous_streams.append(declarative_stream) + else: + synchronous_streams.append(declarative_stream) + + return concurrent_streams, synchronous_streams + + def _stream_supports_concurrent_partition_processing(self, declarative_stream: DeclarativeStream) -> bool: + """ + Many connectors make use of stream_state during interpolation on a per-partition basis under the assumption that + state is updated sequentially. 
Because the concurrent CDK engine processes different partitions in parallel, + stream_state is no longer a thread-safe interpolation context. It would be a race condition because a cursor's + stream_state can be updated in any order depending on which stream partitions finish first. + + We should start to move away from depending on the value of stream_state for low-code components that operate + per-partition, but we need to gate this; otherwise some connectors will be blocked from publishing. See the + cdk-migrations.md for the full list of connectors. + """ + + if isinstance(declarative_stream.retriever, SimpleRetriever) and isinstance(declarative_stream.retriever.requester, HttpRequester): + http_requester = declarative_stream.retriever.requester + if "stream_state" in http_requester._path.string: + self.logger.warning( + f"Low-code stream '{declarative_stream.name}' uses interpolation of stream_state in the HttpRequester which is not thread-safe. Defaulting to synchronous processing" + ) + return False + + request_options_provider = http_requester._request_options_provider + if request_options_provider.request_options_contain_stream_state(): + self.logger.warning( + f"Low-code stream '{declarative_stream.name}' uses interpolation of stream_state in the HttpRequester which is not thread-safe. Defaulting to synchronous processing" + ) + return False + + record_selector = declarative_stream.retriever.record_selector + if isinstance(record_selector, RecordSelector): + if record_selector.record_filter and "stream_state" in record_selector.record_filter.condition: + self.logger.warning( + f"Low-code stream '{declarative_stream.name}' uses interpolation of stream_state in the RecordFilter which is not thread-safe. Defaulting to synchronous processing" + ) + return False + + for add_fields in [ + transformation for transformation in record_selector.transformations if isinstance(transformation, AddFields) + ]: + for field in add_fields.fields: + if isinstance(field.value, str) and "stream_state" in field.value: + self.logger.warning( + f"Low-code stream '{declarative_stream.name}' uses interpolation of stream_state in the AddFields which is not thread-safe. Defaulting to synchronous processing" + ) + return False + if isinstance(field.value, InterpolatedString) and "stream_state" in field.value.string: + self.logger.warning( + f"Low-code stream '{declarative_stream.name}' uses interpolation of stream_state in the AddFields which is not thread-safe.
Defaulting to synchronous processing" + ) + return False + return True + + @staticmethod + def _select_streams(streams: List[AbstractStream], configured_catalog: ConfiguredAirbyteCatalog) -> List[AbstractStream]: + stream_name_to_instance: Mapping[str, AbstractStream] = {s.name: s for s in streams} + abstract_streams: List[AbstractStream] = [] + for configured_stream in configured_catalog.streams: + stream_instance = stream_name_to_instance.get(configured_stream.stream.name) + if stream_instance: + abstract_streams.append(stream_instance) + + return abstract_streams + + @staticmethod + def _remove_concurrent_streams_from_catalog( + catalog: ConfiguredAirbyteCatalog, + concurrent_stream_names: set[str], + ) -> ConfiguredAirbyteCatalog: + return ConfiguredAirbyteCatalog(streams=[stream for stream in catalog.streams if stream.stream.name not in concurrent_stream_names]) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/__init__.py new file mode 100644 index 000000000000..bf1f13e1edb2 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/__init__.py @@ -0,0 +1,7 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime + +__all__ = ["MinMaxDatetime"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/datetime_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/datetime_parser.py new file mode 100644 index 000000000000..93122e29c591 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/datetime_parser.py @@ -0,0 +1,55 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import datetime +from typing import Union + + +class DatetimeParser: + """ + Parses and formats datetime objects according to a specified format. + + This class mainly acts as a wrapper to properly handle timestamp formatting through the "%s" directive. + + %s is part of the list of format codes required by the 1989 C standard, but it is unreliable because it always returns a datetime in the system's timezone. + Instead of using the directive directly, we can use datetime.fromtimestamp and dt.timestamp() + """ + + _UNIX_EPOCH = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc) + + def parse(self, date: Union[str, int], format: str) -> datetime.datetime: + # "%s" is a valid (but unreliable) directive for formatting, but not for parsing + # It is defined as + # The number of seconds since the Epoch, 1970-01-01 00:00:00+0000 (UTC).
https://man7.org/linux/man-pages/man3/strptime.3.html + # + # The recommended way to parse a date from its timestamp representation is to use datetime.fromtimestamp + # See https://stackoverflow.com/a/4974930 + if format == "%s": + return datetime.datetime.fromtimestamp(int(date), tz=datetime.timezone.utc) + elif format == "%s_as_float": + return datetime.datetime.fromtimestamp(float(date), tz=datetime.timezone.utc) + elif format == "%ms": + return self._UNIX_EPOCH + datetime.timedelta(milliseconds=int(date)) + + parsed_datetime = datetime.datetime.strptime(str(date), format) + if self._is_naive(parsed_datetime): + return parsed_datetime.replace(tzinfo=datetime.timezone.utc) + return parsed_datetime + + def format(self, dt: datetime.datetime, format: str) -> str: + # strftime("%s") is unreliable because it ignores the time zone information and assumes the time zone of the system it's running on + # It's safer to use the timestamp() method than the %s directive + # See https://stackoverflow.com/a/4974930 + if format == "%s": + return str(int(dt.timestamp())) + if format == "%s_as_float": + return str(float(dt.timestamp())) + if format == "%ms": + # timestamp() returns a float representing the number of seconds since the Unix epoch + return str(int(dt.timestamp() * 1000)) + else: + return dt.strftime(format) + + def _is_naive(self, dt: datetime.datetime) -> bool: + return dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/min_max_datetime.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/min_max_datetime.py new file mode 100644 index 000000000000..2694da2762ca --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/min_max_datetime.py @@ -0,0 +1,98 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import datetime as dt +from dataclasses import InitVar, dataclass, field +from typing import Any, Mapping, Optional, Union + +from airbyte_cdk.sources.declarative.datetime.datetime_parser import DatetimeParser +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString + + +@dataclass +class MinMaxDatetime: + """ + Compares the provided date against optional minimum or maximum times. If date is earlier than + min_date, then min_date is returned. If date is greater than max_date, then max_date is returned. + If neither, the input date is returned. + + The timestamp format accepts the same format codes as datetime.strftime, which are + all the format codes required by the 1989 C standard. + Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html + + Attributes: + datetime (Union[InterpolatedString, str]): InterpolatedString or string representing the datetime in the format specified by `datetime_format` + datetime_format (str): Format of the datetime passed as argument + min_datetime (Union[InterpolatedString, str]): Represents the minimum allowed datetime value. + max_datetime (Union[InterpolatedString, str]): Represents the maximum allowed datetime value.
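As a quick illustration (editorial sketch only, not part of the diff) of the epoch handling described in the DatetimeParser docstring above, the expected parse/format round-trip looks like this:

```python
# Illustrative sketch only -- not part of this changeset.
import datetime

from airbyte_cdk.sources.declarative.datetime.datetime_parser import DatetimeParser

parser = DatetimeParser()

# "%s" is handled via fromtimestamp()/timestamp() instead of the unreliable strftime directive.
parsed = parser.parse("1686218963", "%s")
assert parsed == datetime.datetime(2023, 6, 8, 10, 9, 23, tzinfo=datetime.timezone.utc)
assert parser.format(parsed, "%s") == "1686218963"
assert parser.format(parsed, "%ms") == "1686218963000"

# Naive datetimes parsed with regular format codes are assumed to be UTC.
assert parser.parse("2023-06-08", "%Y-%m-%d").tzinfo == datetime.timezone.utc
```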
+ """ + + datetime: Union[InterpolatedString, str] + parameters: InitVar[Mapping[str, Any]] + # datetime_format is a unique case where we inherit it from the parent if it is not specified before using the default value + # which is why we need dedicated getter/setter methods and private dataclass field + datetime_format: str = "" + _datetime_format: str = field(init=False, repr=False, default="") + min_datetime: Union[InterpolatedString, str] = "" + max_datetime: Union[InterpolatedString, str] = "" + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self.datetime = InterpolatedString.create(self.datetime, parameters=parameters or {}) + self._parser = DatetimeParser() + self.min_datetime = InterpolatedString.create(self.min_datetime, parameters=parameters) if self.min_datetime else None # type: ignore + self.max_datetime = InterpolatedString.create(self.max_datetime, parameters=parameters) if self.max_datetime else None # type: ignore + + def get_datetime(self, config: Mapping[str, Any], **additional_parameters: Mapping[str, Any]) -> dt.datetime: + """ + Evaluates and returns the datetime + :param config: The user-provided configuration as specified by the source's spec + :param additional_parameters: Additional arguments to be passed to the strings for interpolation + :return: The evaluated datetime + """ + # We apply a default datetime format here instead of at instantiation, so it can be set by the parent first + datetime_format = self._datetime_format + if not datetime_format: + datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z" + + time = self._parser.parse(str(self.datetime.eval(config, **additional_parameters)), datetime_format) # type: ignore # datetime is always cast to an interpolated string + + if self.min_datetime: + min_time = str(self.min_datetime.eval(config, **additional_parameters)) # type: ignore # min_datetime is always cast to an interpolated string + if min_time: + min_datetime = self._parser.parse(min_time, datetime_format) # type: ignore # min_datetime is always cast to an interpolated string + time = max(time, min_datetime) + if self.max_datetime: + max_time = str(self.max_datetime.eval(config, **additional_parameters)) # type: ignore # max_datetime is always cast to an interpolated string + if max_time: + max_datetime = self._parser.parse(max_time, datetime_format) + time = min(time, max_datetime) + return time + + @property # type: ignore # properties don't play well with dataclasses... 
+ def datetime_format(self) -> str: + """The format of the string representing the datetime""" + return self._datetime_format + + @datetime_format.setter + def datetime_format(self, value: str) -> None: + """Setter for the datetime format""" + # Covers the case where datetime_format is not provided in the constructor, which causes the property object + # to be set which we need to avoid doing + if not isinstance(value, property): + self._datetime_format = value + + @classmethod + def create( + cls, + interpolated_string_or_min_max_datetime: Union[InterpolatedString, str, "MinMaxDatetime"], + parameters: Optional[Mapping[str, Any]] = None, + ) -> "MinMaxDatetime": + if parameters is None: + parameters = {} + if isinstance(interpolated_string_or_min_max_datetime, InterpolatedString) or isinstance( + interpolated_string_or_min_max_datetime, str + ): + return MinMaxDatetime(datetime=interpolated_string_or_min_max_datetime, parameters=parameters) + else: + return interpolated_string_or_min_max_datetime diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_component_schema.yaml new file mode 100644 index 000000000000..3fcbbf34672d --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -0,0 +1,2849 @@ +"$schema": http://json-schema.org/draft-07/schema# +"$id": https://github.com/airbytehq/airbyte/blob/master/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +title: DeclarativeSource +type: object +description: An API source that extracts data according to its declarative components. +version: 1.0.0 +required: + - type + - check + - streams + - version +properties: + type: + type: string + enum: [DeclarativeSource] + check: + "$ref": "#/definitions/CheckStream" + streams: + type: array + items: + "$ref": "#/definitions/DeclarativeStream" + version: + type: string + description: The version of the Airbyte CDK used to build and test the source. + schemas: + "$ref": "#/definitions/Schemas" + definitions: + type: object + spec: + "$ref": "#/definitions/Spec" + concurrency_level: + "$ref": "#/definitions/ConcurrencyLevel" + metadata: + type: object + description: For internal Airbyte use only - DO NOT modify manually. Used by consumers of declarative manifests for storing related metadata. + additionalProperties: true + description: + type: string + description: A description of the connector. It will be presented on the Source documentation page. +additionalProperties: false +definitions: + AddedFieldDefinition: + title: Definition Of Field To Add + description: Defines the field to add on a record. + type: object + required: + - type + - path + - value + properties: + type: + type: string + enum: [AddedFieldDefinition] + path: + title: Path + description: List of strings defining the path where to add the value on the record. + type: array + items: + type: string + examples: + - ["segment_id"] + - ["metadata", "segment_id"] + value: + title: Value + description: Value of the new field. Use {{ record['existing_field'] }} syntax to refer to other fields in the record. + type: string + interpolation_context: + - config + - record + - stream_interval + - stream_partition + - stream_slice + - stream_state + examples: + - "{{ record['updates'] }}" + - "{{ record['MetaData']['LastUpdatedTime'] }}" + - "{{ stream_partition['segment_id'] }}" + value_type: + title: Value Type + description: Type of the value. 
If not specified, the type will be inferred from the value. + "$ref": "#/definitions/ValueType" + $parameters: + type: object + additionalProperties: true + AddFields: + title: Add Fields + description: Transformation which adds field to an output record. The path of the added field can be nested. + type: object + required: + - type + - fields + properties: + type: + type: string + enum: [AddFields] + fields: + title: Fields + description: List of transformations (path and corresponding value) that will be added to the record. + type: array + items: + - "$ref": "#/definitions/AddedFieldDefinition" + $parameters: + type: object + additionalProperties: true + ApiKeyAuthenticator: + title: API Key Authenticator + description: Authenticator for requests authenticated with an API token injected as an HTTP request header. + type: object + required: + - type + properties: + type: + type: string + enum: [ApiKeyAuthenticator] + api_token: + title: API Key + description: The API key to inject in the request. Fill it in the user inputs. + type: string + interpolation_context: + - config + examples: + - "{{ config['api_key'] }}" + - "Token token={{ config['api_key'] }}" + header: + title: Header Name + description: The name of the HTTP header that will be set to the API key. This setting is deprecated, use inject_into instead. Header and inject_into can not be defined at the same time. + type: string + interpolation_context: + - config + examples: + - Authorization + - Api-Token + - X-Auth-Token + inject_into: + title: Inject API Key Into Outgoing HTTP Request + description: Configure how the API Key will be sent in requests to the source API. Either inject_into or header has to be defined. + "$ref": "#/definitions/RequestOption" + examples: + - inject_into: header + field_name: Authorization + - inject_into: request_parameter + field_name: authKey + $parameters: + type: object + additionalProperties: true + AuthFlow: + title: "Auth flow" + description: |- + Additional and optional specification object to describe what an 'advanced' Auth flow would need to function. + - A connector should be able to fully function with the configuration as described by the ConnectorSpecification in a 'basic' mode. + - The 'advanced' mode provides easier UX for the user with UI improvements and automations. However, this requires further setup on the + server side by instance or workspace admins beforehand. The trade-off is that the user does not have to provide as many technical + inputs anymore and the auth process is faster and easier to complete. + type: object + properties: + auth_flow_type: + title: "Auth flow type" + description: "The type of auth to use" + type: string + enum: ["oauth2.0", "oauth1.0"] # Future auth types should be added here + predicate_key: + title: "Predicate key" + description: JSON path to a field in the connectorSpecification that should exist for the advanced auth to be applicable. + type: array + items: + type: string + examples: + - ["credentials", "auth_type"] + predicate_value: + title: "Predicate value" + description: Value of the predicate_key fields for the advanced auth to be applicable. + type: string + examples: + - "Oauth" + oauth_config_specification: + "$ref": "#/definitions/OAuthConfigSpecification" + BasicHttpAuthenticator: + title: Basic HTTP Authenticator + description: Authenticator for requests authenticated with the Basic HTTP authentication scheme, which encodes a username and an optional password in the Authorization request header. 
+ type: object + required: + - type + - username + properties: + type: + type: string + enum: [BasicHttpAuthenticator] + username: + title: Username + description: The username that will be combined with the password, base64 encoded and used to make requests. Fill it in the user inputs. + type: string + interpolation_context: + - config + examples: + - "{{ config['username'] }}" + - "{{ config['api_key'] }}" + password: + title: Password + description: The password that will be combined with the username, base64 encoded and used to make requests. Fill it in the user inputs. + type: string + default: "" + interpolation_context: + - config + examples: + - "{{ config['password'] }}" + - "" + $parameters: + type: object + additionalProperties: true + BearerAuthenticator: + title: Bearer Token Authenticator + description: "Authenticator for requests authenticated with a bearer token injected as a request header of the form `Authorization: Bearer `." + type: object + required: + - type + - api_token + properties: + type: + type: string + enum: [BearerAuthenticator] + api_token: + title: Bearer Token + description: Token to inject as request header for authenticating with the API. + type: string + interpolation_context: + - config + examples: + - "{{ config['api_key'] }}" + - "{{ config['token'] }}" + $parameters: + type: object + additionalProperties: true + SelectiveAuthenticator: + title: Selective Authenticator + description: Authenticator that selects concrete authenticator based on config property. + type: object + additionalProperties: true + required: + - type + - authenticators + - authenticator_selection_path + properties: + type: + type: string + enum: [SelectiveAuthenticator] + authenticator_selection_path: + title: Authenticator Selection Path + description: Path of the field in config with selected authenticator name + type: array + items: + type: string + examples: + - ["auth"] + - ["auth", "type"] + authenticators: + title: Authenticators + description: Authenticators to select from. + type: object + additionalProperties: + anyOf: + - "$ref": "#/definitions/ApiKeyAuthenticator" + - "$ref": "#/definitions/BasicHttpAuthenticator" + - "$ref": "#/definitions/BearerAuthenticator" + - "$ref": "#/definitions/CustomAuthenticator" + - "$ref": "#/definitions/OAuthAuthenticator" + - "$ref": "#/definitions/JwtAuthenticator" + - "$ref": "#/definitions/NoAuth" + - "$ref": "#/definitions/SessionTokenAuthenticator" + - "$ref": "#/definitions/LegacySessionTokenAuthenticator" + examples: + - authenticators: + token: "#/definitions/ApiKeyAuthenticator" + oauth: "#/definitions/OAuthAuthenticator" + jwt: "#/definitions/JwtAuthenticator" + $parameters: + type: object + additionalProperties: true + CheckStream: + title: Streams to Check + description: Defines the streams to try reading when running a check operation. + type: object + required: + - type + - stream_names + properties: + type: + type: string + enum: [CheckStream] + stream_names: + title: Stream Names + description: Names of the streams to try reading from when running a check operation. + type: array + items: + type: string + examples: + - ["users"] + - ["users", "contacts"] + CompositeErrorHandler: + title: Composite Error Handler + description: Error handler that sequentially iterates over a list of error handlers. 
+ type: object + required: + - type + - error_handlers + properties: + type: + type: string + enum: [CompositeErrorHandler] + error_handlers: + title: Error Handlers + description: List of error handlers to iterate on to determine how to handle a failed response. + type: array + items: + anyOf: + - "$ref": "#/definitions/CompositeErrorHandler" + - "$ref": "#/definitions/DefaultErrorHandler" + $parameters: + type: object + additionalProperties: true + ConcurrencyLevel: + title: Concurrency Level + description: Defines the amount of parallelization for the streams that are being synced. The factor of parallelization is how many partitions or streams are synced at the same time. For example, with a concurrency_level of 10, ten streams or partitions of data will be processed at the same time. + type: object + required: + - default_concurrency + properties: + type: + type: string + enum: [ConcurrencyLevel] + default_concurrency: + title: Default Concurrency + description: The amount of concurrency that will be applied during a sync. This value can be hardcoded or user-defined in the config if different users have varying volume thresholds in the target API. + anyOf: + - type: integer + - type: string + interpolation_context: + - config + examples: + - 10 + - "{{ config['num_workers'] or 10 }}" + max_concurrency: + title: Max Concurrency + description: The maximum level of concurrency that will be used during a sync. This becomes a required field when the default_concurrency derives from the config, because it serves as a safeguard against a user-defined threshold that is too high. + type: integer + examples: + - 20 + - 100 + $parameters: + type: object + additionalProperties: true + ConstantBackoffStrategy: + title: Constant Backoff + description: Backoff strategy with a constant backoff interval. + type: object + required: + - type + - backoff_time_in_seconds + properties: + type: + type: string + enum: [ConstantBackoffStrategy] + backoff_time_in_seconds: + title: Backoff Time + description: Backoff time in seconds. + anyOf: + - type: number + - type: string + interpolation_context: + - config + examples: + - 30 + - 30.5 + - "{{ config['backoff_time'] }}" + $parameters: + type: object + additionalProperties: true + CursorPagination: + title: Cursor Pagination + description: Pagination strategy that evaluates an interpolated string to define the next page to fetch. + type: object + required: + - type + - cursor_value + properties: + type: + type: string + enum: [CursorPagination] + cursor_value: + title: Cursor Value + description: Value of the cursor defining the next page to fetch. + type: string + interpolation_context: + - config + - headers + - last_page_size + - last_record + - response + examples: + - "{{ headers.link.next.cursor }}" + - "{{ last_record['key'] }}" + - "{{ response['nextPage'] }}" + page_size: + title: Page Size + description: The number of records to include in each page. + type: integer + examples: + - 100 + stop_condition: + title: Stop Condition + description: Template string evaluating when to stop paginating. + type: string + interpolation_context: + - config + - headers + - last_record + - response + examples: + - "{{ response.data.has_more is false }}" + - "{{ 'next' not in headers['link'] }}" + $parameters: + type: object + additionalProperties: true + CustomAuthenticator: + title: Custom Authenticator + description: Authenticator component whose behavior is derived from a custom code implementation of the connector.
+ type: object + additionalProperties: true + required: + - type + - class_name + properties: + type: + type: string + enum: [CustomAuthenticator] + class_name: + title: Class Name + description: Fully-qualified name of the class that will be implementing the custom authentication strategy. Has to be a sub class of DeclarativeAuthenticator. The format is `source_..`. + type: string + additionalProperties: true + examples: + - "source_railz.components.ShortLivedTokenAuthenticator" + $parameters: + type: object + additionalProperties: true + CustomBackoffStrategy: + title: Custom Backoff Strategy + description: Backoff strategy component whose behavior is derived from a custom code implementation of the connector. + type: object + additionalProperties: true + required: + - type + - class_name + properties: + type: + type: string + enum: [CustomBackoffStrategy] + class_name: + title: Class Name + description: Fully-qualified name of the class that will be implementing the custom backoff strategy. The format is `source_..`. + type: string + examples: + - "source_railz.components.MyCustomBackoffStrategy" + $parameters: + type: object + additionalProperties: true + CustomErrorHandler: + title: Custom Error Handler + description: Error handler component whose behavior is derived from a custom code implementation of the connector. + type: object + additionalProperties: true + required: + - type + - class_name + properties: + type: + type: string + enum: [CustomErrorHandler] + class_name: + title: Class Name + description: Fully-qualified name of the class that will be implementing the custom error handler. The format is `source_..`. + type: string + examples: + - "source_railz.components.MyCustomErrorHandler" + $parameters: + type: object + additionalProperties: true + CustomIncrementalSync: + title: Custom Incremental Sync + description: Incremental component whose behavior is derived from a custom code implementation of the connector. + type: object + additionalProperties: true + required: + - type + - class_name + - cursor_field + properties: + type: + type: string + enum: [CustomIncrementalSync] + class_name: + title: Class Name + description: Fully-qualified name of the class that will be implementing the custom incremental sync. The format is `source_..`. + type: string + additionalProperties: true + examples: + - "source_railz.components.MyCustomIncrementalSync" + cursor_field: + description: The location of the value on a record that will be used as a bookmark during sync. + type: string + $parameters: + type: object + additionalProperties: true + CustomPaginationStrategy: + title: Custom Pagination Strategy + description: Pagination strategy component whose behavior is derived from a custom code implementation of the connector. + type: object + additionalProperties: true + required: + - type + - class_name + properties: + type: + type: string + enum: [CustomPaginationStrategy] + class_name: + title: Class Name + description: Fully-qualified name of the class that will be implementing the custom pagination strategy. The format is `source_..`. + type: string + examples: + - "source_railz.components.MyCustomPaginationStrategy" + $parameters: + type: object + additionalProperties: true + CustomRecordExtractor: + title: Custom Record Extractor + description: Record extractor component whose behavior is derived from a custom code implementation of the connector. 
+ type: object + additionalProperties: true + required: + - type + - class_name + properties: + type: + type: string + enum: [CustomRecordExtractor] + class_name: + title: Class Name + description: Fully-qualified name of the class that will be implementing the custom record extraction strategy. The format is `source_..`. + type: string + examples: + - "source_railz.components.MyCustomRecordExtractor" + $parameters: + type: object + additionalProperties: true + CustomRecordFilter: + title: Custom Record Filter + description: Record filter component whose behavior is derived from a custom code implementation of the connector. + type: object + additionalProperties: true + required: + - type + - class_name + properties: + type: + type: string + enum: [CustomRecordFilter] + class_name: + title: Class Name + description: Fully-qualified name of the class that will be implementing the custom record filter strategy. The format is `source_..`. + type: string + examples: + - "source_railz.components.MyCustomCustomRecordFilter" + $parameters: + type: object + additionalProperties: true + CustomRequester: + title: Custom Requester + description: Requester component whose behavior is derived from a custom code implementation of the connector. + type: object + additionalProperties: true + required: + - type + - class_name + properties: + type: + type: string + enum: [CustomRequester] + class_name: + title: Class Name + description: Fully-qualified name of the class that will be implementing the custom requester strategy. The format is `source_..`. + type: string + additionalProperties: true + examples: + - "source_railz.components.MyCustomRecordExtractor" + $parameters: + type: object + additionalProperties: true + CustomRetriever: + title: Custom Retriever + description: Retriever component whose behavior is derived from a custom code implementation of the connector. + type: object + additionalProperties: true + required: + - type + - class_name + properties: + type: + type: string + enum: [CustomRetriever] + class_name: + title: Class Name + description: Fully-qualified name of the class that will be implementing the custom retriever strategy. The format is `source_..`. + type: string + additionalProperties: true + examples: + - "source_railz.components.MyCustomRetriever" + $parameters: + type: object + additionalProperties: true + CustomPartitionRouter: + title: Custom Partition Router + description: Partition router component whose behavior is derived from a custom code implementation of the connector. + type: object + additionalProperties: true + required: + - type + - class_name + properties: + type: + type: string + enum: [CustomPartitionRouter] + class_name: + title: Class Name + description: Fully-qualified name of the class that will be implementing the custom partition router. The format is `source_..`. + type: string + examples: + - "source_railz.components.MyCustomPartitionRouter" + $parameters: + type: object + additionalProperties: true + CustomSchemaLoader: + title: Custom Schema Loader + description: Schema Loader component whose behavior is derived from a custom code implementation of the connector. + type: object + additionalProperties: true + required: + - type + - class_name + properties: + type: + type: string + enum: [CustomSchemaLoader] + class_name: + title: Class Name + description: Fully-qualified name of the class that will be implementing the custom schema loader. The format is `source_..`. 
+ type: string + examples: + - "source_railz.components.MyCustomSchemaLoader" + $parameters: + type: object + additionalProperties: true + CustomStateMigration: + title: Custom State Migration + description: Apply a custom transformation on the input state. + type: object + additionalProperties: true + required: + - type + - class_name + properties: + type: + type: string + enum: [CustomStateMigration] + class_name: + title: Class Name + description: Fully-qualified name of the class that will be implementing the custom state migration. The format is `source_..`. + type: string + examples: + - "source_railz.components.MyCustomStateMigration" + $parameters: + type: object + additionalProperties: true + CustomTransformation: + title: Custom Transformation + description: Transformation component whose behavior is derived from a custom code implementation of the connector. + type: object + additionalProperties: true + required: + - type + - class_name + properties: + type: + type: string + enum: [CustomTransformation] + class_name: + title: Class Name + description: Fully-qualified name of the class that will be implementing the custom transformation. The format is `source_..`. + type: string + examples: + - "source_railz.components.MyCustomTransformation" + $parameters: + type: object + additionalProperties: true + LegacyToPerPartitionStateMigration: + title: Legacy To Per-partition-state Migration + description: + 'Transforms the input state for per-partitioned streams from the legacy format to the low-code format. + The cursor field and partition ID fields are automatically extracted from the stream''s DatetimebasedCursor and SubstreamPartitionRouter. + + Example input state: + { + "13506132": { + "last_changed": "2022-12-27T08:34:39+00:00" + } + Example output state: + { + "partition": {"id": "13506132"}, + "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"} + } + ' + type: object + additionalProperties: true + properties: + type: + type: string + enum: [LegacyToPerPartitionStateMigration] + DatetimeBasedCursor: + title: Datetime Based Cursor + description: Cursor to provide incremental capabilities over datetime. + type: object + required: + - type + - cursor_field + - datetime_format + - start_datetime + properties: + type: + type: string + enum: [DatetimeBasedCursor] + cursor_field: + title: Cursor Field + description: The location of the value on a record that will be used as a bookmark during sync. To ensure no data loss, the API must return records in ascending order based on the cursor field. Nested fields are not supported, so the field must be at the top level of the record. You can use a combination of Add Field and Remove Field transformations to move the nested field to the top. + type: string + interpolation_context: + - config + examples: + - "created_at" + - "{{ config['record_cursor'] }}" + datetime_format: + title: Outgoing Datetime Format + description: | + The datetime format used to format the datetime values that are sent in outgoing requests to the API. Use placeholders starting with "%" to describe the format the API is using. 
The following placeholders are available: + * **%s**: Epoch unix timestamp - `1686218963` + * **%s_as_float**: Epoch unix timestamp in seconds as float with microsecond precision - `1686218963.123456` + * **%ms**: Epoch unix timestamp (milliseconds) - `1686218963123` + * **%a**: Weekday (abbreviated) - `Sun` + * **%A**: Weekday (full) - `Sunday` + * **%w**: Weekday (decimal) - `0` (Sunday), `6` (Saturday) + * **%d**: Day of the month (zero-padded) - `01`, `02`, ..., `31` + * **%b**: Month (abbreviated) - `Jan` + * **%B**: Month (full) - `January` + * **%m**: Month (zero-padded) - `01`, `02`, ..., `12` + * **%y**: Year (without century, zero-padded) - `00`, `01`, ..., `99` + * **%Y**: Year (with century) - `0001`, `0002`, ..., `9999` + * **%H**: Hour (24-hour, zero-padded) - `00`, `01`, ..., `23` + * **%I**: Hour (12-hour, zero-padded) - `01`, `02`, ..., `12` + * **%p**: AM/PM indicator + * **%M**: Minute (zero-padded) - `00`, `01`, ..., `59` + * **%S**: Second (zero-padded) - `00`, `01`, ..., `59` + * **%f**: Microsecond (zero-padded to 6 digits) - `000000` + * **%z**: UTC offset - `(empty)`, `+0000`, `-04:00` + * **%Z**: Time zone name - `(empty)`, `UTC`, `GMT` + * **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366` + * **%U**: Week number of the year (starting Sunday) - `00`, ..., `53` + * **%W**: Week number of the year (starting Monday) - `00`, ..., `53` + * **%c**: Date and time - `Tue Aug 16 21:30:00 1988` + * **%x**: Date standard format - `08/16/1988` + * **%X**: Time standard format - `21:30:00` + * **%%**: Literal '%' character + + Some placeholders depend on the locale of the underlying system - in most cases this locale is configured as en/US. For more information see the [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes). + type: string + examples: + - "%Y-%m-%dT%H:%M:%S.%f%z" + - "%Y-%m-%d" + - "%s" + - "%ms" + - "%s_as_float" + start_datetime: + title: Start Datetime + description: The datetime that determines the earliest record that should be synced. + anyOf: + - type: string + - "$ref": "#/definitions/MinMaxDatetime" + interpolation_context: + - config + examples: + - "2020-01-1T00:00:00Z" + - "{{ config['start_time'] }}" + cursor_datetime_formats: + title: Cursor Datetime Formats + description: The possible formats for the cursor field, in order of preference. The first format that matches the cursor field value will be used to parse it. If not provided, the `datetime_format` will be used. + type: array + items: + type: string + examples: + - "%Y-%m-%dT%H:%M:%S.%f%z" + - "%Y-%m-%d" + - "%s" + cursor_granularity: + title: Cursor Granularity + description: + Smallest increment the datetime_format has (ISO 8601 duration) that is used to ensure the start of a slice does not overlap with the end of the previous one, e.g. for %Y-%m-%d the granularity should + be P1D, for %Y-%m-%dT%H:%M:%SZ the granularity should be PT1S. Given this field is provided, `step` needs to be provided as well. + type: string + examples: + - "PT1S" + end_datetime: + title: End Datetime + description: The datetime that determines the last record that should be synced. If not provided, `{{ now_utc() }}` will be used. 
+ anyOf: + - type: string + - "$ref": "#/definitions/MinMaxDatetime" + interpolation_context: + - config + examples: + - "2021-01-1T00:00:00Z" + - "{{ now_utc() }}" + - "{{ day_delta(-1) }}" + end_time_option: + title: Inject End Time Into Outgoing HTTP Request + description: Optionally configures how the end datetime will be sent in requests to the source API. + "$ref": "#/definitions/RequestOption" + is_data_feed: + title: Whether the target API is formatted as a data feed + description: A data feed API is an API that does not allow filtering and paginates the content from the most recent to the least recent. Given this, the CDK needs to know when to stop paginating and this field will generate a stop condition for pagination. + type: boolean + is_client_side_incremental: + title: Whether the target API does not support filtering and returns all data (the cursor filters records in the client instead of the API side) + description: If the target API endpoint does not take cursor values to filter records and returns all records anyway, the connector with this cursor will filter out records locally, and only emit new records from the last sync, hence incremental. This means that all records would be read from the API, but only new records will be emitted to the destination. + type: boolean + is_compare_strictly: + title: Whether to skip requests if the start time equals the end time + description: Set to True if the target API does not accept queries where the start time equal the end time. + type: boolean + default: False + global_substream_cursor: + title: Whether to store cursor as one value instead of per partition + description: This setting optimizes performance when the parent stream has thousands of partitions by storing the cursor as a single value rather than per partition. Notably, the substream state is updated only at the end of the sync, which helps prevent data loss in case of a sync failure. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/incremental-syncs). + type: boolean + default: false + lookback_window: + title: Lookback Window + description: Time interval before the start_datetime to read data for, e.g. P1M for looking back one month. + type: string + interpolation_context: + - config + examples: + - "P1D" + - "P{{ config['lookback_days'] }}D" + partition_field_end: + title: Partition Field End + description: Name of the partition start time field. + type: string + examples: + - "ending_time" + partition_field_start: + title: Partition Field Start + description: Name of the partition end time field. + type: string + examples: + - "starting_time" + start_time_option: + title: Inject Start Time Into Outgoing HTTP Request + description: Optionally configures how the start datetime will be sent in requests to the source API. + "$ref": "#/definitions/RequestOption" + step: + title: Step + description: The size of the time window (ISO8601 duration). Given this field is provided, `cursor_granularity` needs to be provided as well. + type: string + examples: + - "P1W" + - "{{ config['step_increment'] }}" + $parameters: + type: object + additionalProperties: true + JwtAuthenticator: + title: JWT Authenticator + description: Authenticator for requests using JWT authentication flow. + type: object + required: + - type + - secret_key + - algorithm + properties: + type: + type: string + enum: [JwtAuthenticator] + secret_key: + type: string + description: Secret used to sign the JSON web token. 
+ examples: + - "{{ config['secret_key'] }}" + base64_encode_secret_key: + type: boolean + description: When set to true, the secret key will be base64 encoded prior to being encoded as part of the JWT. Only set to "true" when required by the API. + default: False + algorithm: + type: string + description: Algorithm used to sign the JSON web token. + enum: + [ + "HS256", + "HS384", + "HS512", + "ES256", + "ES256K", + "ES384", + "ES512", + "RS256", + "RS384", + "RS512", + "PS256", + "PS384", + "PS512", + "EdDSA", + ] + examples: + - ES256 + - HS256 + - RS256 + - "{{ config['algorithm'] }}" + token_duration: + type: integer + title: Token Duration + description: The amount of time in seconds a JWT token can be valid after being issued. + default: 1200 + examples: + - 1200 + - 3600 + header_prefix: + type: string + title: Header Prefix + description: The prefix to be used within the Authentication header. + examples: + - "Bearer" + - "Basic" + jwt_headers: + type: object + title: JWT Headers + description: JWT headers used when signing JSON web token. + additionalProperties: false + properties: + kid: + type: string + title: Key Identifier + description: Private key ID for user account. + examples: + - "{{ config['kid'] }}" + typ: + type: string + title: Type + description: The media type of the complete JWT. + default: JWT + examples: + - JWT + cty: + type: string + title: Content Type + description: Content type of JWT header. + examples: + - JWT + additional_jwt_headers: + type: object + title: Additional JWT Headers + description: Additional headers to be included with the JWT headers object. + additionalProperties: true + jwt_payload: + type: object + title: JWT Payload + description: JWT Payload used when signing JSON web token. + additionalProperties: false + properties: + iss: + type: string + title: Issuer + description: The user/principal that issued the JWT. Commonly a value unique to the user. + examples: + - "{{ config['iss'] }}" + sub: + type: string + title: Subject + description: The subject of the JWT. Commonly defined by the API. + aud: + type: string + title: Audience + description: The recipient that the JWT is intended for. Commonly defined by the API. + examples: + - "appstoreconnect-v1" + additional_jwt_payload: + type: object + title: Additional JWT Payload Properties + description: Additional properties to be added to the JWT payload. + additionalProperties: true + $parameters: + type: object + additionalProperties: true + OAuthAuthenticator: + title: OAuth2 + description: Authenticator for requests using OAuth 2.0 authorization flow. + type: object + required: + - type + - client_id + - client_secret + - token_refresh_endpoint + properties: + type: + type: string + enum: [OAuthAuthenticator] + client_id: + title: Client ID + description: The OAuth client ID. Fill it in the user inputs. + type: string + examples: + - "{{ config['client_id }}" + - "{{ config['credentials']['client_id }}" + client_secret: + title: Client Secret + description: The OAuth client secret. Fill it in the user inputs. + type: string + examples: + - "{{ config['client_secret }}" + - "{{ config['credentials']['client_secret }}" + refresh_token: + title: Refresh Token + description: Credential artifact used to get a new access token. + type: string + examples: + - "{{ config['refresh_token'] }}" + - "{{ config['credentials]['refresh_token'] }}" + token_refresh_endpoint: + title: Token Refresh Endpoint + description: The full URL to call to obtain a new access token. 
+ type: string + examples: + - https://connect.squareup.com/oauth2/token + access_token_name: + title: Access Token Property Name + description: The name of the property which contains the access token in the response from the token refresh endpoint. + type: string + default: "access_token" + examples: + - access_token + expires_in_name: + title: Token Expiry Property Name + description: The name of the property which contains the expiry date in the response from the token refresh endpoint. + type: string + default: "expires_in" + examples: + - expires_in + grant_type: + title: Grant Type + description: Specifies the OAuth2 grant type. If set to refresh_token, the refresh_token needs to be provided as well. For client_credentials, only client id and secret are required. Other grant types are not officially supported. + type: string + default: "refresh_token" + examples: + - refresh_token + - client_credentials + refresh_request_body: + title: Refresh Request Body + description: Body of the request sent to get a new access token. + type: object + additionalProperties: true + examples: + - applicationId: "{{ config['application_id'] }}" + applicationSecret: "{{ config['application_secret'] }}" + token: "{{ config['token'] }}" + scopes: + title: Scopes + description: List of scopes that should be granted to the access token. + type: array + items: + type: string + examples: + - [ + "crm.list.read", + "crm.objects.contacts.read", + "crm.schema.contacts.read", + ] + token_expiry_date: + title: Token Expiry Date + description: The access token expiry date. + type: string + examples: + - 2023-04-06T07:12:10.421833+00:00 + - 1680842386 + token_expiry_date_format: + title: Token Expiry Date Format + description: The format of the time to expiration datetime. Provide it if the time is returned as a date-time string instead of seconds. + type: string + examples: + - "%Y-%m-%d %H:%M:%S.%f+00:00" + refresh_token_updater: + title: Token Updater + description: When the token updater is defined, new refresh tokens, access tokens and the access token expiry date are written back from the authentication response to the config object. This is important if the refresh token can only be used once. + properties: + refresh_token_name: + title: Refresh Token Property Name + description: The name of the property which contains the updated refresh token in the response from the token refresh endpoint. + type: string + default: "refresh_token" + examples: + - "refresh_token" + access_token_config_path: + title: Config Path To Access Token + description: Config path to the access token. Make sure the field actually exists in the config. + type: array + items: + type: string + default: ["credentials", "access_token"] + examples: + - ["credentials", "access_token"] + - ["access_token"] + refresh_token_config_path: + title: Config Path To Refresh Token + description: Config path to the refresh token. Make sure the field actually exists in the config. + type: array + items: + type: string + default: ["credentials", "refresh_token"] + examples: + - ["credentials", "refresh_token"] + - ["refresh_token"] + token_expiry_date_config_path: + title: Config Path To Expiry Date + description: Config path to the expiry date. Make sure it actually exists in the config.
+ type: array + items: + type: string + default: ["credentials", "token_expiry_date"] + examples: + - ["credentials", "token_expiry_date"] + refresh_token_error_status_codes: + title: Refresh Token Error Status Codes + description: Status Codes to Identify refresh token error in response (Refresh Token Error Key and Refresh Token Error Values should be also specified). Responses with one of the error status code and containing an error value will be flagged as a config error + type: array + items: + type: integer + default: [] + examples: + - [400, 500] + refresh_token_error_key: + title: Refresh Token Error Key + description: Key to Identify refresh token error in response (Refresh Token Error Status Codes and Refresh Token Error Values should be also specified). + type: string + default: "" + examples: + - "error" + refresh_token_error_values: + title: Refresh Token Error Values + description: 'List of values to check for exception during token refresh process. Used to check if the error found in the response matches the key from the Refresh Token Error Key field (e.g. response={"error": "invalid_grant"}). Only responses with one of the error status code and containing an error value will be flagged as a config error' + type: array + items: + type: string + default: [] + examples: + - ["invalid_grant", "invalid_permissions"] + $parameters: + type: object + additionalProperties: true + DeclarativeStream: + title: Declarative Stream + description: A stream whose behavior is described by a set of declarative low code components. + type: object + additionalProperties: true + required: + - type + - retriever + properties: + type: + type: string + enum: [DeclarativeStream] + retriever: + title: Retriever + description: Component used to coordinate how records are extracted across stream slices and request pages. + anyOf: + - "$ref": "#/definitions/AsyncRetriever" + - "$ref": "#/definitions/CustomRetriever" + - "$ref": "#/definitions/SimpleRetriever" + incremental_sync: + title: Incremental Sync + description: Component used to fetch data incrementally based on a time field in the data. + anyOf: + - "$ref": "#/definitions/CustomIncrementalSync" + - "$ref": "#/definitions/DatetimeBasedCursor" + name: + title: Name + description: The stream name. + type: string + default: "" + example: + - "Users" + primary_key: + title: Primary Key + description: The primary key of the stream. + "$ref": "#/definitions/PrimaryKey" + default: "" + schema_loader: + title: Schema Loader + description: Component used to retrieve the schema for the current stream. + anyOf: + - "$ref": "#/definitions/InlineSchemaLoader" + - "$ref": "#/definitions/JsonFileSchemaLoader" + - "$ref": "#/definitions/CustomSchemaLoader" + # TODO we have move the transformation to the RecordSelector level in the code but kept this here for + # compatibility reason. We should eventually move this to align with the code. + transformations: + title: Transformations + description: A list of transformations to be applied to each output record. 
+ type: array + items: + anyOf: + - "$ref": "#/definitions/AddFields" + - "$ref": "#/definitions/CustomTransformation" + - "$ref": "#/definitions/RemoveFields" + - "$ref": "#/definitions/KeysToLower" + state_migrations: + title: State Migrations + description: Array of state migrations to be applied to the input state + type: array + items: + anyOf: + - "$ref": "#/definitions/LegacyToPerPartitionStateMigration" + - "$ref": "#/definitions/CustomStateMigration" + default: [] + $parameters: + type: object + additionalProperties: true + DefaultErrorHandler: + title: Default Error Handler + description: Component defining how to handle errors. Default behavior includes only retrying server errors (HTTP 5XX) and too many requests (HTTP 429) with an exponential backoff. + type: object + required: + - type + properties: + type: + type: string + enum: [DefaultErrorHandler] + backoff_strategies: + title: Backoff Strategies + description: List of backoff strategies to use to determine how long to wait before retrying a retryable request. + type: array + items: + anyOf: + - "$ref": "#/definitions/ConstantBackoffStrategy" + - "$ref": "#/definitions/CustomBackoffStrategy" + - "$ref": "#/definitions/ExponentialBackoffStrategy" + - "$ref": "#/definitions/WaitTimeFromHeader" + - "$ref": "#/definitions/WaitUntilTimeFromHeader" + max_retries: + title: Max Retry Count + description: The maximum number of times to retry a retryable request before giving up and failing. + type: integer + default: 5 + examples: + - 5 + - 0 + - 10 + response_filters: + title: Response Filters + description: List of response filters to iterate on when deciding how to handle an error. When using an array of multiple filters, the filters will be applied sequentially and the response will be selected if it matches any of the filters' predicates. + type: array + items: + "$ref": "#/definitions/HttpResponseFilter" + $parameters: + type: object + additionalProperties: true + DefaultPaginator: + title: Default Paginator + description: Default pagination implementation to request pages of results with a fixed size until the pagination strategy no longer returns a next_page_token. + type: object + required: + - type + - pagination_strategy + properties: + type: + type: string + enum: [DefaultPaginator] + pagination_strategy: + title: Pagination Strategy + description: Strategy defining how records are paginated. + anyOf: + - "$ref": "#/definitions/CursorPagination" + - "$ref": "#/definitions/CustomPaginationStrategy" + - "$ref": "#/definitions/OffsetIncrement" + - "$ref": "#/definitions/PageIncrement" + page_size_option: + "$ref": "#/definitions/RequestOption" + page_token_option: + anyOf: + - "$ref": "#/definitions/RequestOption" + - "$ref": "#/definitions/RequestPath" + $parameters: + type: object + additionalProperties: true + DpathExtractor: + title: Dpath Extractor + description: Record extractor that searches a decoded response over a path defined as an array of fields. + type: object + required: + - type + - field_path + properties: + type: + type: string + enum: [DpathExtractor] + field_path: + title: Field Path + description: List of potentially nested fields describing the full path of the field to extract. Use "*" to extract all values from an array. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/record-selector).
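As an illustration of how the DefaultErrorHandler above composes with the backoff strategies and response filters defined in this schema, a hypothetical handler that retries rate-limit responses and fails on a known error message could look like this (the status codes and message are placeholders):

```yaml
error_handler:
  type: DefaultErrorHandler
  max_retries: 5
  backoff_strategies:
    - type: ExponentialBackoffStrategy
      factor: 5
  response_filters:
    - type: HttpResponseFilter
      action: RETRY
      http_codes: [420, 429]
    - type: HttpResponseFilter
      action: FAIL
      error_message_contains: "This API operation is not enabled for this site"
```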
+ type: array + items: + - type: string + interpolation_content: + - config + examples: + - ["data"] + - ["data", "records"] + - ["data", "{{ parameters.name }}"] + - ["data", "*", "record"] + $parameters: + type: object + additionalProperties: true + ExponentialBackoffStrategy: + title: Exponential Backoff + description: Backoff strategy with an exponential backoff interval. The interval is defined as factor * 2^attempt_count. + type: object + required: + - type + properties: + type: + type: string + enum: [ExponentialBackoffStrategy] + factor: + title: Factor + description: Multiplicative constant applied on each retry. + anyOf: + - type: number + - type: string + default: 5 + interpolation_context: + - config + examples: + - 5 + - 5.5 + - "10" + $parameters: + type: object + additionalProperties: true + SessionTokenAuthenticator: + type: object + required: + - type + - login_requester + - session_token_path + - request_authentication + properties: + type: + type: string + enum: [SessionTokenAuthenticator] + login_requester: + title: Login Requester + description: Description of the request to perform to obtain a session token to perform data requests. The response body is expected to be a JSON object with a session token property. + "$ref": "#/definitions/HttpRequester" + examples: + - type: HttpRequester + url_base: "https://my_api.com" + path: "/login" + authenticator: + type: BasicHttpAuthenticator + username: "{{ config.username }}" + password: "{{ config.password }}" + session_token_path: + title: Session Token Path + description: The path in the response body returned from the login requester to the session token. + examples: + - ["access_token"] + - ["result", "token"] + type: array + items: + type: string + expiration_duration: + title: Expiration Duration + description: The duration in ISO 8601 duration notation after which the session token expires, starting from the time it was obtained. Omitting it will result in the session token being refreshed for every request. + type: string + examples: + - "PT1H" + - "P1D" + request_authentication: + title: Data Request Authentication + description: Authentication method to use for requests sent to the API, specifying how to inject the session token. + anyOf: + - "$ref": "#/definitions/SessionTokenRequestApiKeyAuthenticator" + - "$ref": "#/definitions/SessionTokenRequestBearerAuthenticator" + decoder: + title: Decoder + description: Component used to decode the response. + anyOf: + - "$ref": "#/definitions/JsonDecoder" + - "$ref": "#/definitions/XmlDecoder" + $parameters: + type: object + additionalProperties: true + SessionTokenRequestApiKeyAuthenticator: + type: object + title: API Key Authenticator + description: Authenticator for requests using the session token as an API key that's injected into the request. + required: + - type + - inject_into + properties: + type: + enum: [ApiKey] + inject_into: + title: Inject API Key Into Outgoing HTTP Request + description: Configure how the API Key will be sent in requests to the source API. + "$ref": "#/definitions/RequestOption" + examples: + - inject_into: header + field_name: Authorization + - inject_into: request_parameter + field_name: authKey + SessionTokenRequestBearerAuthenticator: + title: Bearer Authenticator + description: Authenticator for requests using the session token as a standard bearer token. 
+ required: + - type + properties: + type: + enum: [Bearer] + HttpRequester: + title: HTTP Requester + description: Requester submitting HTTP requests and extracting records from the response. + type: object + required: + - type + - path + - url_base + properties: + type: + type: string + enum: [HttpRequester] + url_base: + title: API Base URL + description: Base URL of the API source. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this. + type: string + interpolation_context: + - config + examples: + - "https://connect.squareup.com/v2" + - "{{ config['base_url'] or 'https://app.posthog.com'}}/api/" + path: + title: URL Path + description: Path the specific API endpoint that this stream represents. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this. + type: string + interpolation_context: + - config + - next_page_token + - stream_interval + - stream_partition + - stream_slice + - stream_state + examples: + - "/products" + - "/quotes/{{ stream_partition['id'] }}/quote_line_groups" + - "/trades/{{ config['symbol_id'] }}/history" + authenticator: + title: Authenticator + description: Authentication method to use for requests sent to the API. + anyOf: + - "$ref": "#/definitions/ApiKeyAuthenticator" + - "$ref": "#/definitions/BasicHttpAuthenticator" + - "$ref": "#/definitions/BearerAuthenticator" + - "$ref": "#/definitions/CustomAuthenticator" + - "$ref": "#/definitions/OAuthAuthenticator" + - "$ref": "#/definitions/JwtAuthenticator" + - "$ref": "#/definitions/NoAuth" + - "$ref": "#/definitions/SessionTokenAuthenticator" + - "$ref": "#/definitions/LegacySessionTokenAuthenticator" + - "$ref": "#/definitions/SelectiveAuthenticator" + error_handler: + title: Error Handler + description: Error handler component that defines how to handle errors. + anyOf: + - "$ref": "#/definitions/DefaultErrorHandler" + - "$ref": "#/definitions/CustomErrorHandler" + - "$ref": "#/definitions/CompositeErrorHandler" + http_method: + title: HTTP Method + description: The HTTP method used to fetch data from the source (can be GET or POST). + type: string + enum: + - GET + - POST + default: GET + examples: + - GET + - POST + request_body_data: + title: Request Body Payload (Non-JSON) + description: Specifies how to populate the body of the request with a non-JSON payload. Plain text will be sent as is, whereas objects will be converted to a urlencoded form. + anyOf: + - type: string + - type: object + additionalProperties: + type: string + interpolation_context: + - next_page_token + - stream_interval + - stream_partition + - stream_slice + - stream_state + examples: + - | + [{"clause": {"type": "timestamp", "operator": 10, "parameters": + [{"value": {{ stream_interval['start_time'] | int * 1000 }} }] + }, "orderBy": 1, "columnName": "Timestamp"}]/ + request_body_json: + title: Request Body JSON Payload + description: Specifies how to populate the body of the request with a JSON payload. Can contain nested objects. + anyOf: + - type: string + - type: object + additionalProperties: true + interpolation_context: + - next_page_token + - stream_interval + - stream_partition + - stream_slice + - stream_state + examples: + - sort_order: "ASC" + sort_field: "CREATED_AT" + - key: "{{ config['value'] }}" + - sort: + field: "updated_at" + order: "ascending" + request_headers: + title: Request Headers + description: Return any non-auth headers. 
Authentication headers will overwrite any overlapping headers returned from this method. + anyOf: + - type: string + - type: object + additionalProperties: + type: string + interpolation_context: + - next_page_token + - stream_interval + - stream_partition + - stream_slice + - stream_state + examples: + - Output-Format: JSON + - Version: "{{ config['version'] }}" + request_parameters: + title: Query Parameters + description: Specifies the query parameters that should be set on an outgoing HTTP request given the inputs. + anyOf: + - type: string + - type: object + additionalProperties: + type: string + interpolation_context: + - next_page_token + - stream_interval + - stream_partition + - stream_slice + - stream_state + examples: + - unit: "day" + - query: 'last_event_time BETWEEN TIMESTAMP "{{ stream_interval.start_time }}" AND TIMESTAMP "{{ stream_interval.end_time }}"' + - searchIn: "{{ ','.join(config.get('search_in', [])) }}" + - sort_by[asc]: updated_at + use_cache: + title: Use Cache + description: Enables stream requests caching. This field is automatically set by the CDK. + type: boolean + default: false + $parameters: + type: object + additionalProperties: true + HttpResponseFilter: + description: A filter that is used to select on properties of the HTTP response received. When used with additional filters, a response will be selected if it matches any of the filter's criteria. + type: object + required: + - type + properties: + type: + type: string + enum: [HttpResponseFilter] + action: + title: Action + description: Action to execute if a response matches the filter. + type: string + enum: + - SUCCESS + - FAIL + - RETRY + - IGNORE + - RATE_LIMITED + examples: + - SUCCESS + - FAIL + - RETRY + - IGNORE + - RATE_LIMITED + failure_type: + title: Failure Type + description: Failure type of traced exception if a response matches the filter. + type: string + enum: + - system_error + - config_error + - transient_error + examples: + - system_error + - config_error + - transient_error + error_message: + title: Error Message + description: Error Message to display if the response matches the filter. + type: string + interpolation_context: + - config + - response + - headers + error_message_contains: + title: Error Message Substring + description: Match the response if its error message contains the substring. + type: string + example: + - This API operation is not enabled for this site + http_codes: + title: HTTP Codes + description: Match the response if its HTTP code is included in this list. + type: array + items: + type: integer + uniqueItems: true + examples: + - [420, 429] + - [500] + predicate: + title: Predicate + description: Match the response if the predicate evaluates to true. + type: string + interpolation_context: + - response + - headers + examples: + - "{{ 'Too much requests' in response }}" + - "{{ 'error_code' in response and response['error_code'] == 'ComplexityException' }}" + $parameters: + type: object + additionalProperties: true + InlineSchemaLoader: + title: Inline Schema Loader + description: Loads a schema that is defined directly in the manifest file. + type: object + required: + - type + properties: + type: + type: string + enum: [InlineSchemaLoader] + schema: + title: Schema + description: Describes a streams' schema. Refer to the Data Types documentation for more details on which types are valid. + type: object + JsonFileSchemaLoader: + title: Json File Schema Loader + description: Loads the schema from a json file. 
+ type: object + required: + - type + properties: + type: + type: string + enum: [JsonFileSchemaLoader] + file_path: + title: File Path + description: Path to the JSON file defining the schema. The path is relative to the connector module's root. + type: string + interpolation_context: + - config + example: + - "./schemas/users.json" + $parameters: + type: object + additionalProperties: true + JsonDecoder: + title: Json Decoder + type: object + required: + - type + properties: + type: + type: string + enum: [JsonDecoder] + JsonlDecoder: + title: JSONL Decoder + description: Use this if the response consists of JSON objects separated by new lines (`\n`) in JSONL format. + type: object + required: + - type + properties: + type: + type: string + enum: [JsonlDecoder] + KeysToLower: + title: Keys to Lower Case + description: A transformation that renames all keys to lower case. + type: object + required: + - type + properties: + type: + type: string + enum: [KeysToLower] + $parameters: + type: object + additionalProperties: true + IterableDecoder: + title: Iterable Decoder + description: Use this if the response consists of strings separated by new lines (`\n`). The Decoder will wrap each row into a JSON object with the `record` key. + type: object + required: + - type + properties: + type: + type: string + enum: [IterableDecoder] + XmlDecoder: + title: XML Decoder + description: Use this if the response is XML. + type: object + required: + - type + properties: + type: + type: string + enum: [XmlDecoder] + ListPartitionRouter: + title: List Partition Router + description: A Partition router that specifies a list of attributes where each attribute describes a portion of the complete data set for a stream. During a sync, each value is iterated over and can be used as input to outbound API requests. + type: object + required: + - type + - cursor_field + - values + properties: + type: + type: string + enum: [ListPartitionRouter] + cursor_field: + title: Current Partition Value Identifier + description: While iterating over list values, the name of field used to reference a list value. The partition value can be accessed with string interpolation. e.g. "{{ stream_partition['my_key'] }}" where "my_key" is the value of the cursor_field. + type: string + interpolation_context: + - config + examples: + - "section" + - "{{ config['section_key'] }}" + values: + title: Partition Values + description: The list of attributes being iterated over and used as input for the requests made to the source API. + anyOf: + - type: string + - type: array + items: + type: string + interpolation_context: + - config + examples: + - ["section_a", "section_b", "section_c"] + - "{{ config['sections'] }}" + request_option: + title: Inject Partition Value Into Outgoing HTTP Request + description: A request option describing where the list value should be injected into and under what field name if applicable. + "$ref": "#/definitions/RequestOption" + $parameters: + type: object + additionalProperties: true + MinMaxDatetime: + title: Min-Max Datetime + description: Compares the provided date against optional minimum or maximum times. The max_datetime serves as the ceiling and will be returned when datetime exceeds it. The min_datetime serves as the floor. + type: object + required: + - type + - datetime + properties: + type: + type: string + enum: [MinMaxDatetime] + datetime: + title: Datetime + description: Datetime value. 
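For the ListPartitionRouter defined above, a sketch of a router that iterates over a config-provided list and injects each value as a query parameter might look as follows (the field names are illustrative only):

```yaml
partition_router:
  type: ListPartitionRouter
  cursor_field: "section"
  values: "{{ config['sections'] }}"
  request_option:
    type: RequestOption
    inject_into: request_parameter
    field_name: "section"
```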
+ type: string + interpolation_context: + - config + examples: + - 2021-01-01 + - 2021-01-01T00:00:00Z + - "{{ config['start_time'] }}" + datetime_format: + title: Datetime Format + description: | + Format of the datetime value. Defaults to "%Y-%m-%dT%H:%M:%S.%f%z" if left empty. Use placeholders starting with "%" to describe the format the API is using. The following placeholders are available: + * **%s**: Epoch unix timestamp - `1686218963` + * **%s_as_float**: Epoch unix timestamp in seconds as float with microsecond precision - `1686218963.123456` + * **%ms**: Epoch unix timestamp - `1686218963123` + * **%a**: Weekday (abbreviated) - `Sun` + * **%A**: Weekday (full) - `Sunday` + * **%w**: Weekday (decimal) - `0` (Sunday), `6` (Saturday) + * **%d**: Day of the month (zero-padded) - `01`, `02`, ..., `31` + * **%b**: Month (abbreviated) - `Jan` + * **%B**: Month (full) - `January` + * **%m**: Month (zero-padded) - `01`, `02`, ..., `12` + * **%y**: Year (without century, zero-padded) - `00`, `01`, ..., `99` + * **%Y**: Year (with century) - `0001`, `0002`, ..., `9999` + * **%H**: Hour (24-hour, zero-padded) - `00`, `01`, ..., `23` + * **%I**: Hour (12-hour, zero-padded) - `01`, `02`, ..., `12` + * **%p**: AM/PM indicator + * **%M**: Minute (zero-padded) - `00`, `01`, ..., `59` + * **%S**: Second (zero-padded) - `00`, `01`, ..., `59` + * **%f**: Microsecond (zero-padded to 6 digits) - `000000`, `000001`, ..., `999999` + * **%z**: UTC offset - `(empty)`, `+0000`, `-04:00` + * **%Z**: Time zone name - `(empty)`, `UTC`, `GMT` + * **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366` + * **%U**: Week number of the year (Sunday as first day) - `00`, `01`, ..., `53` + * **%W**: Week number of the year (Monday as first day) - `00`, `01`, ..., `53` + * **%c**: Date and time representation - `Tue Aug 16 21:30:00 1988` + * **%x**: Date representation - `08/16/1988` + * **%X**: Time representation - `21:30:00` + * **%%**: Literal '%' character + + Some placeholders depend on the locale of the underlying system - in most cases this locale is configured as en/US. For more information see the [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes). + type: string + default: "" + examples: + - "%Y-%m-%dT%H:%M:%S.%f%z" + - "%Y-%m-%d" + - "%s" + max_datetime: + title: Max Datetime + description: Ceiling applied on the datetime value. Must be formatted with the datetime_format field. + type: string + interpolation_context: + - config + examples: + - "2021-01-01T00:00:00Z" + - "2021-01-01" + min_datetime: + title: Min Datetime + description: Floor applied on the datetime value. Must be formatted with the datetime_format field. + type: string + interpolation_context: + - config + examples: + - "2010-01-01T00:00:00Z" + - "2010-01-01" + $parameters: + type: object + additionalProperties: true + NoAuth: + title: No Authentication + description: Authenticator for requests requiring no authentication. + type: object + required: + - type + properties: + type: + type: string + enum: [NoAuth] + $parameters: + type: object + additionalProperties: true + NoPagination: + title: No Pagination + description: Pagination implementation that never returns a next page. + type: object + required: + - type + properties: + type: + type: string + enum: [NoPagination] + OAuthConfigSpecification: + title: OAuth Config Specification + description: Specification describing how an 'advanced' Auth flow would need to function. 
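The MinMaxDatetime component above is typically used to clamp a cursor boundary. A minimal sketch, assuming it is used as the start_datetime of a DatetimeBasedCursor (defined elsewhere in this schema):

```yaml
start_datetime:
  type: MinMaxDatetime
  datetime: "{{ config['start_time'] }}"
  datetime_format: "%Y-%m-%dT%H:%M:%SZ"
  min_datetime: "2010-01-01T00:00:00Z"
  max_datetime: "{{ now_utc().strftime('%Y-%m-%dT%H:%M:%SZ') }}"
```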
+ type: object + additionalProperties: true + properties: + oauth_user_input_from_connector_config_specification: + title: "OAuth user input" + description: |- + OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth. + Must be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification + using special annotation 'path_in_connector_config'. + These are input values the user is entering through the UI to authenticate to the connector, that might also shared + as inputs for syncing data via the connector. + Examples: + if no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[] + if connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow, + oauth_user_input_from_connector_config_specification={ + app_id: { + type: string + path_in_connector_config: ['app_id'] + } + } + if connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow, + oauth_user_input_from_connector_config_specification={ + app_id: { + type: string + path_in_connector_config: ['info', 'app_id'] + } + } + type: object + examples: + - app_id: + type: string + path_in_connector_config: ["app_id"] + - app_id: + type: string + path_in_connector_config: ["info", "app_id"] + complete_oauth_output_specification: + title: "OAuth output specification" + description: |- + OAuth specific blob. This is a Json Schema used to validate Json configurations produced by the OAuth flows as they are + returned by the distant OAuth APIs. + Must be a valid JSON describing the fields to merge back to `ConnectorSpecification.connectionSpecification`. + For each field, a special annotation `path_in_connector_config` can be specified to determine where to merge it, + Examples: + complete_oauth_output_specification={ + refresh_token: { + type: string, + path_in_connector_config: ['credentials', 'refresh_token'] + } + } + type: object + additionalProperties: true + examples: + - refresh_token: + type: string, + path_in_connector_config: ["credentials", "refresh_token"] + complete_oauth_server_input_specification: + title: "OAuth input specification" + description: |- + OAuth specific blob. This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations. + Must be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the + server when completing an OAuth flow (typically exchanging an auth code for refresh token). + Examples: + complete_oauth_server_input_specification={ + client_id: { + type: string + }, + client_secret: { + type: string + } + } + type: object + additionalProperties: true + examples: + - client_id: + type: string + client_secret: + type: string + complete_oauth_server_output_specification: + title: "OAuth server output specification" + description: |- + OAuth specific blob. This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations that + also need to be merged back into the connector configuration at runtime. + This is a subset configuration of `complete_oauth_server_input_specification` that filters fields out to retain only the ones that + are necessary for the connector to function with OAuth. 
(some fields could be used during oauth flows but not needed afterwards, therefore + they would be listed in the `complete_oauth_server_input_specification` but not `complete_oauth_server_output_specification`) + Must be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the + connector when using OAuth flow APIs. + These fields are to be merged back to `ConnectorSpecification.connectionSpecification`. + For each field, a special annotation `path_in_connector_config` can be specified to determine where to merge it, + Examples: + complete_oauth_server_output_specification={ + client_id: { + type: string, + path_in_connector_config: ['credentials', 'client_id'] + }, + client_secret: { + type: string, + path_in_connector_config: ['credentials', 'client_secret'] + } + } + type: object + additionalProperties: true + examples: + - client_id: + type: string, + path_in_connector_config: ["credentials", "client_id"] + client_secret: + type: string, + path_in_connector_config: ["credentials", "client_secret"] + OffsetIncrement: + title: Offset Increment + description: Pagination strategy that returns the number of records reads so far and returns it as the next page token. + type: object + required: + - type + properties: + type: + type: string + enum: [OffsetIncrement] + page_size: + title: Limit + description: The number of records to include in each pages. + anyOf: + - type: integer + - type: string + interpolation_context: + - config + - response + examples: + - 100 + - "{{ config['page_size'] }}" + inject_on_first_request: + title: Inject Offset + description: Using the `offset` with value `0` during the first request + type: boolean + default: false + $parameters: + type: object + additionalProperties: true + PageIncrement: + title: Page Increment + description: Pagination strategy that returns the number of pages reads so far and returns it as the next page token. + type: object + required: + - type + properties: + type: + type: string + enum: [PageIncrement] + page_size: + title: Page Size + description: The number of records to include in each pages. + interpolation_context: + - config + anyOf: + - type: integer + - type: string + examples: + - 100 + - "100" + - "{{ config['page_size'] }}" + start_from_page: + title: Start From Page + description: Index of the first page to request. + type: integer + default: 0 + examples: + - 0 + - 1 + inject_on_first_request: + title: Inject Page Number + description: Using the `page number` with value defined by `start_from_page` during the first request + type: boolean + default: false + $parameters: + type: object + additionalProperties: true + ParentStreamConfig: + title: Parent Stream Config + description: Describes how to construct partitions from the records retrieved from the parent stream.. + type: object + required: + - type + - parent_key + - partition_field + - stream + properties: + type: + type: string + enum: [ParentStreamConfig] + parent_key: + title: Parent Key + description: The primary key of records from the parent stream that will be used during the retrieval of records for the current substream. This parent identifier field is typically a characteristic of the child records being extracted from the source API. + type: string + examples: + - "id" + - "{{ config['parent_record_id'] }}" + stream: + title: Parent Stream + description: Reference to the parent stream. 
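Tying the PageIncrement strategy above back to the DefaultPaginator component, a hypothetical page-number paginator that sends the page number and page size as query parameters could be configured roughly like this (the parameter names are placeholders):

```yaml
paginator:
  type: DefaultPaginator
  pagination_strategy:
    type: PageIncrement
    page_size: 100
    start_from_page: 1
    inject_on_first_request: true
  page_token_option:
    type: RequestOption
    inject_into: request_parameter
    field_name: "page"
  page_size_option:
    type: RequestOption
    inject_into: request_parameter
    field_name: "per_page"
```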
+ "$ref": "#/definitions/DeclarativeStream" + partition_field: + title: Current Parent Key Value Identifier + description: While iterating over parent records during a sync, the parent_key value can be referenced by using this field. + type: string + examples: + - "parent_id" + - "{{ config['parent_partition_field'] }}" + request_option: + title: Request Option + description: A request option describing where the parent key value should be injected into and under what field name if applicable. + "$ref": "#/definitions/RequestOption" + incremental_dependency: + title: Incremental Dependency + description: Indicates whether the parent stream should be read incrementally based on updates in the child stream. + type: boolean + default: false + extra_fields: + title: Extra Fields + description: Array of field paths to include as additional fields in the stream slice. Each path is an array of strings representing keys to access fields in the respective parent record. Accessible via `stream_slice.extra_fields`. Missing fields are set to `None`. + interpolation_context: + - config + type: array + items: + type: array + items: + type: string + description: Defines a field path as an array of strings. + examples: + - ["field1"] + - ["nested", "field2"] + $parameters: + type: object + additionalProperties: true + PrimaryKey: + title: Primary Key + description: The stream field to be used to distinguish unique records. Can either be a single field, an array of fields representing a composite key, or an array of arrays representing a composite key where the fields are nested fields. + anyOf: + - type: string + - type: array + items: + type: string + - type: array + items: + type: array + items: + type: string + default: "" + examples: + - id + - ["code", "type"] + RecordFilter: + title: Record Filter + description: Filter applied on a list of records. + type: object + required: + - type + properties: + type: + type: string + enum: [RecordFilter] + condition: + description: The predicate to filter a record. Records will be removed if evaluated to False. + type: string + default: "" + interpolation_context: + - config + - next_page_token + - record + - stream_interval + - stream_partition + - stream_slice + - stream_state + examples: + - "{{ record['created_at'] >= stream_interval['start_time'] }}" + - "{{ record.status in ['active', 'expired'] }}" + $parameters: + type: object + additionalProperties: true + RecordSelector: + title: Record Selector + description: Responsible for translating an HTTP response into a list of records by extracting records from the response and optionally filtering records based on a heuristic. + type: object + required: + - type + - extractor + properties: + type: + type: string + enum: [RecordSelector] + extractor: + anyOf: + - "$ref": "#/definitions/CustomRecordExtractor" + - "$ref": "#/definitions/DpathExtractor" + record_filter: + title: Record Filter + description: Responsible for filtering records to be emitted by the Source. + anyOf: + - "$ref": "#/definitions/CustomRecordFilter" + - "$ref": "#/definitions/RecordFilter" + schema_normalization: + "$ref": "#/definitions/SchemaNormalization" + default: None + $parameters: + type: object + additionalProperties: true + SchemaNormalization: + title: Schema Normalization + description: Responsible for normalization according to the schema. + type: string + enum: + - None + - Default + examples: + - None + - Default + RemoveFields: + title: Remove Fields + description: A transformation which removes fields from a record. 
The fields removed are designated using FieldPointers. During transformation, if a field or any of its parents does not exist in the record, no error is thrown. + type: object + required: + - type + - field_pointers + properties: + type: + type: string + enum: [RemoveFields] + condition: + description: The predicate to filter a property by a property value. Property will be removed if it is empty OR expression is evaluated to True., + type: string + default: "" + interpolation_context: + - config + - property + - parameters + examples: + - "{{ property|string == '' }}" + - "{{ property is integer }}" + - "{{ property|length > 5 }}" + - "{{ property == 'some_string_to_match' }}" + field_pointers: + title: Field Paths + description: Array of paths defining the field to remove. Each item is an array whose field describe the path of a field to remove. + type: array + items: + items: + type: string + examples: + - ["tags"] + - [["content", "html"], ["content", "plain_text"]] + RequestPath: + title: Request Path + description: Specifies where in the request path a component's value should be inserted. + type: object + required: + - type + properties: + type: + type: string + enum: [RequestPath] + RequestOption: + title: Request Option + description: Specifies the key field and where in the request a component's value should be injected. + type: object + required: + - type + - field_name + - inject_into + properties: + type: + type: string + enum: [RequestOption] + field_name: + title: Request Option + description: Configures which key should be used in the location that the descriptor is being injected into + type: string + examples: + - segment_id + interpolation_context: + - config + - parameters + inject_into: + title: Inject Into + description: Configures where the descriptor should be set on the HTTP requests. Note that request parameters that are already encoded in the URL path will not be duplicated. + enum: + - request_parameter + - header + - body_data + - body_json + examples: + - request_parameter + - header + - body_data + - body_json + Schemas: + title: Schemas + description: The stream schemas representing the shape of the data emitted by the stream. + type: object + additionalProperties: true + LegacySessionTokenAuthenticator: + title: Session Token Authenticator + description: Deprecated - use SessionTokenAuthenticator instead. Authenticator for requests authenticated using session tokens. A session token is a random value generated by a server to identify a specific user for the duration of one interaction session. + type: object + required: + - type + - header + - login_url + - session_token_response_key + - validate_session_url + properties: + type: + type: string + enum: [LegacySessionTokenAuthenticator] + header: + title: Session Request Header + description: The name of the session token header that will be injected in the request + type: string + examples: + - "X-Session" + login_url: + title: Login Path + description: Path of the login URL (do not include the base URL) + type: string + examples: + - session + session_token: + title: Session Token + description: Session token to use if using a pre-defined token. 
Not needed if authenticating with username + password pair + type: string + example: + - "{{ config['session_token'] }}" + session_token_response_key: + title: Response Token Response Key + description: Name of the key of the session token to be extracted from the response + type: string + examples: + - id + username: + title: Username + description: Username used to authenticate and obtain a session token + type: string + examples: + - " {{ config['username'] }}" + password: + title: Password + description: Password used to authenticate and obtain a session token + type: string + default: "" + examples: + - "{{ config['password'] }}" + - "" + validate_session_url: + title: Validate Session Path + description: Path of the URL to use to validate that the session token is valid (do not include the base URL) + type: string + examples: + - "user/current" + $parameters: + type: object + additionalProperties: true + SimpleRetriever: + description: Retrieves records by synchronously sending requests to fetch records. The retriever acts as an orchestrator between the requester, the record selector, the paginator, and the partition router. + type: object + required: + - type + - record_selector + - requester + properties: + type: + type: string + enum: [SimpleRetriever] + record_selector: + description: Component that describes how to extract records from a HTTP response. + "$ref": "#/definitions/RecordSelector" + requester: + description: Requester component that describes how to prepare HTTP requests to send to the source API. + anyOf: + - "$ref": "#/definitions/CustomRequester" + - "$ref": "#/definitions/HttpRequester" + paginator: + description: Paginator component that describes how to navigate through the API's pages. + anyOf: + - "$ref": "#/definitions/DefaultPaginator" + - "$ref": "#/definitions/NoPagination" + ignore_stream_slicer_parameters_on_paginated_requests: + description: If true, the partition router and incremental request options will be ignored when paginating requests. Request options set directly on the requester will not be ignored. + type: boolean + default: false + partition_router: + title: Partition Router + description: PartitionRouter component that describes how to partition the stream, enabling incremental syncs and checkpointing. + default: [] + anyOf: + - "$ref": "#/definitions/CustomPartitionRouter" + - "$ref": "#/definitions/ListPartitionRouter" + - "$ref": "#/definitions/SubstreamPartitionRouter" + - type: array + items: + anyOf: + - "$ref": "#/definitions/CustomPartitionRouter" + - "$ref": "#/definitions/ListPartitionRouter" + - "$ref": "#/definitions/SubstreamPartitionRouter" + decoder: + title: Decoder + description: Component decoding the response so records can be extracted. + anyOf: + - "$ref": "#/definitions/JsonDecoder" + - "$ref": "#/definitions/JsonlDecoder" + - "$ref": "#/definitions/IterableDecoder" + - "$ref": "#/definitions/XmlDecoder" + $parameters: + type: object + additionalProperties: true + AsyncJobStatusMap: + description: Matches the api job status to Async Job Status. 
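Putting the SimpleRetriever defined above together with the HttpRequester, RecordSelector and DpathExtractor components from this schema, a complete stream entry could look roughly like the sketch below; the top-level streams key comes from the manifest structure defined elsewhere in this schema, and the endpoint, field path and empty schema are placeholders.

```yaml
streams:
  - type: DeclarativeStream
    name: "users"
    primary_key: "id"
    schema_loader:
      type: InlineSchemaLoader
      schema: {}
    retriever:
      type: SimpleRetriever
      requester:
        type: HttpRequester
        url_base: "https://connect.squareup.com/v2"
        path: "/users"
        http_method: GET
        authenticator:
          type: NoAuth
      record_selector:
        type: RecordSelector
        extractor:
          type: DpathExtractor
          field_path: ["data"]
      paginator:
        type: NoPagination
```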
+ type: object + required: + - running + - completed + - failed + - timeout + properties: + type: + type: string + enum: [AsyncJobStatusMap] + running: + type: array + items: + type: string + completed: + type: array + items: + type: string + failed: + type: array + items: + type: string + timeout: + type: array + items: + type: string + AsyncRetriever: + description: "[Experimental - We expect the interface to change shortly and we reserve the right to not consider this a breaking change] Retrieves records by Asynchronously sending requests to fetch records. The retriever acts as an orchestrator between the requester, the record selector, the paginator, and the partition router." + type: object + required: + - type + - record_selector + - status_mapping + - creation_requester + - polling_requester + - download_requester + - status_extractor + - urls_extractor + properties: + type: + type: string + enum: [AsyncRetriever] + record_selector: + description: Component that describes how to extract records from a HTTP response. + "$ref": "#/definitions/RecordSelector" + status_mapping: + description: Async Job Status to Airbyte CDK Async Job Status mapping. + anyOf: + - "$ref": "#/definitions/AsyncJobStatusMap" + status_extractor: + description: Responsible for fetching the actual status of the async job. + anyOf: + - "$ref": "#/definitions/CustomRecordExtractor" + - "$ref": "#/definitions/DpathExtractor" + urls_extractor: + description: Responsible for fetching the final result `urls` provided by the completed / finished / ready async job. + anyOf: + - "$ref": "#/definitions/CustomRecordExtractor" + - "$ref": "#/definitions/DpathExtractor" + creation_requester: + description: Requester component that describes how to prepare HTTP requests to send to the source API to create the async server-side job. + anyOf: + - "$ref": "#/definitions/CustomRequester" + - "$ref": "#/definitions/HttpRequester" + polling_requester: + description: Requester component that describes how to prepare HTTP requests to send to the source API to fetch the status of the running async job. + anyOf: + - "$ref": "#/definitions/CustomRequester" + - "$ref": "#/definitions/HttpRequester" + download_requester: + description: Requester component that describes how to prepare HTTP requests to send to the source API to download the data provided by the completed async job. + anyOf: + - "$ref": "#/definitions/CustomRequester" + - "$ref": "#/definitions/HttpRequester" + download_paginator: + description: Paginator component that describes how to navigate through the API's pages during download. + anyOf: + - "$ref": "#/definitions/DefaultPaginator" + - "$ref": "#/definitions/NoPagination" + abort_requester: + description: Requester component that describes how to prepare HTTP requests to send to the source API to abort a job once it is timed out from the source's perspective. + anyOf: + - "$ref": "#/definitions/CustomRequester" + - "$ref": "#/definitions/HttpRequester" + delete_requester: + description: Requester component that describes how to prepare HTTP requests to send to the source API to delete a job once the records are extracted. + anyOf: + - "$ref": "#/definitions/CustomRequester" + - "$ref": "#/definitions/HttpRequester" + partition_router: + title: Partition Router + description: PartitionRouter component that describes how to partition the stream, enabling incremental syncs and checkpointing. 
+ default: [] + anyOf: + - "$ref": "#/definitions/CustomPartitionRouter" + - "$ref": "#/definitions/ListPartitionRouter" + - "$ref": "#/definitions/SubstreamPartitionRouter" + - type: array + items: + anyOf: + - "$ref": "#/definitions/CustomPartitionRouter" + - "$ref": "#/definitions/ListPartitionRouter" + - "$ref": "#/definitions/SubstreamPartitionRouter" + decoder: + title: Decoder + description: Component decoding the response so records can be extracted. + anyOf: + - "$ref": "#/definitions/JsonDecoder" + - "$ref": "#/definitions/JsonlDecoder" + - "$ref": "#/definitions/IterableDecoder" + - "$ref": "#/definitions/XmlDecoder" + $parameters: + type: object + additionalProperties: true + Spec: + title: Spec + description: A source specification made up of connector metadata and how it can be configured. + type: object + required: + - type + - connection_specification + properties: + type: + type: string + enum: [Spec] + connection_specification: + title: Connection Specification + description: A connection specification describing how the connector can be configured. + type: object + additionalProperties: true + documentation_url: + title: Documentation URL + description: URL of the connector's documentation page. + type: string + examples: + - "https://docs.airbyte.com/integrations/sources/dremio" + advanced_auth: + title: Advanced Auth + description: Advanced specification for configuring the authentication flow. + "$ref": "#/definitions/AuthFlow" + SubstreamPartitionRouter: + title: Substream Partition Router + description: Partition router that is used to retrieve records that have been partitioned according to records from the specified parent streams. An example of a parent stream is automobile brands, and the substream would be the various car models associated with each brand. + type: object + required: + - type + - parent_stream_configs + properties: + type: + type: string + enum: [SubstreamPartitionRouter] + parent_stream_configs: + title: Parent Stream Configs + description: Specifies which parent streams are being iterated over and how parent records should be used to partition the child stream data set. + type: array + items: + "$ref": "#/definitions/ParentStreamConfig" + $parameters: + type: object + additionalProperties: true + ValueType: + title: Value Type + description: A schema type. + type: string + enum: + - string + - number + - integer + - boolean + WaitTimeFromHeader: + title: Wait Time Extracted From Response Header + description: Extract wait time from an HTTP header in the response. + type: object + required: + - type + - header + properties: + type: + type: string + enum: [WaitTimeFromHeader] + header: + title: Response Header Name + description: The name of the response header defining how long to wait before retrying. + type: string + interpolation_context: + - config + examples: + - "Retry-After" + regex: + title: Extraction Regex + description: Optional regex to apply on the header to extract its value. The regex should define a capture group defining the wait time. + type: string + examples: + - "([-+]?\\d+)" + max_waiting_time_in_seconds: + title: Max Waiting Time in Seconds + description: If the value extracted from the header is greater than this value, stop the stream.
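Since the AsyncRetriever above is experimental, the following is only a sketch of how its required pieces fit together; every URL, path and status value is hypothetical, and the job-id interpolation a real connector would need in the polling and download paths is omitted.

```yaml
retriever:
  type: AsyncRetriever
  creation_requester:
    type: HttpRequester
    url_base: "https://api.example.com"
    path: "/jobs"
    http_method: POST
  polling_requester:
    type: HttpRequester
    url_base: "https://api.example.com"
    path: "/jobs/status"   # a real connector would interpolate the created job's id here
    http_method: GET
  download_requester:
    type: HttpRequester
    url_base: "https://api.example.com"
    path: "/jobs/download" # a real connector would interpolate the result url here
    http_method: GET
  status_mapping:
    type: AsyncJobStatusMap
    running: ["pending", "running"]
    completed: ["done"]
    failed: ["error"]
    timeout: ["timeout"]
  status_extractor:
    type: DpathExtractor
    field_path: ["status"]
  urls_extractor:
    type: DpathExtractor
    field_path: ["result_urls"]
  record_selector:
    type: RecordSelector
    extractor:
      type: DpathExtractor
      field_path: ["data"]
```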
+ type: number + examples: + - 3600 + $parameters: + type: object + additionalProperties: true + WaitUntilTimeFromHeader: + title: Wait Until Time Defined In Response Header + description: Extract time at which we can retry the request from response header and wait for the difference between now and that time. + type: object + required: + - type + - header + properties: + type: + type: string + enum: [WaitUntilTimeFromHeader] + header: + title: Response Header + description: The name of the response header defining how long to wait before retrying. + type: string + interpolation_context: + - config + examples: + - wait_time + min_wait: + title: Minimum Wait Time + description: Minimum time to wait before retrying. + anyOf: + - type: number + - type: string + interpolation_context: + - config + examples: + - 10 + - "60" + regex: + title: Extraction Regex + description: Optional regex to apply on the header to extract its value. The regex should define a capture group defining the wait time. + type: string + interpolation_context: + - config + examples: + - "([-+]?\\d+)" + $parameters: + type: object + additionalProperties: true +interpolation: + variables: + - title: config + description: The connector configuration. The object's keys are the same as the the keys defined in the connection specification. + type: object + examples: + - start_date: 2010-01-01 + api_key: "*****" + - title: parameters + description: Additional runtime parameters, to be used for string interpolation. Parameters can be passed down from a parent component to its subcomponents using the $parameters key. This can be used to avoid repetitions. + type: object + examples: + - path: "automations" + data_export_path: "automations" + cursor_field: "updated_at" + - title: headers + description: The HTTP headers from the last response received from the API. The object's keys are the header names from the response. + type: object + examples: + - Server: nginx + Date: Mon, 24 Apr 2023 20:17:21 GMT + Content-Type: application/json + Content-Length: "420" + Connection: keep-alive + referrer-policy: strict-origin-when-cross-origin + x-content-type-options: nosniff + x-ratelimit-limit: "600" + x-ratelimit-remaining: "598" + x-ratelimit-reset: "39" + - title: last_record + description: Last record extracted from the response received from the API. + type: object + examples: + - name: "Test List: 19" + id: 0236d6d2 + contact_count: 20 + _metadata: + self: https://api.sendgrid.com/v3/marketing/lists/0236d6d2 + - title: last_page_size + description: Number of records extracted from the last response received from the API. + type: object + examples: + - 2 + - title: next_page_token + description: Object describing the token to fetch the next page of records. The object has a single key "next_page_token". + type: object + examples: + - next_page_token: 3 + - next_page_token: https://api.sendgrid.com/v3/marketing/lists/0236d6d2-75d2-42c5-962d-603e0deaf8d1 + - title: record + description: The record being processed. The object's keys are the same keys as the records produced by the RecordSelector. + type: object + - title: response + description: The body of the last response received from the API. The object's keys are the same keys as the response body's. 
+ type: object + examples: + - result: + - name: "Test List: 19" + id: 0236d6d2-75d2-42c5-962d-603e0deaf8d1 + contact_count: 20 + _metadata: + self: https://api.sendgrid.com/v3/marketing/lists/0236d6d2 + _metadata: + self: https://api.sendgrid.com/v3/marketing/lists?page_size=1&page_token= + next: https://api.sendgrid.com/v3/marketing/lists?page_size=1&page_token=0236d6d2 + count: 82 + - title: stream_interval + description: The current stream interval being processed. The keys are defined by the incremental sync component. Default keys are `start_time` and `end_time`. + type: object + examples: + - start_time: "2020-01-01 00:00:00.000+00:00" + end_time: "2020-01-02 00:00:00.000+00:00" + - title: stream_partition + description: The current stream partition being processed. The keys are defined by the partition router component. + type: object + examples: + - survey_id: 1234 + - strategy: DESKTOP + - survey_id: 1234 + strategy: MOBILE + - title: stream_slice + description: This variable is deprecated. Use stream_interval or stream_partition instead. + type: object + - title: stream_state + description: The current state of the stream. The object's keys are defined by the incremental sync's cursor_field and the partition router's values. + type: object + examples: + - created_at: "2020-01-01 00:00:00.000+00:00" + - updated_at: "2020-01-02 00:00:00.000+00:00" + macros: + - title: Now (UTC) + description: Returns the current date and time in the UTC timezone. + arguments: {} + return_type: Datetime + examples: + - "'{{ now_utc() }}' -> '2021-09-01 00:00:00+00:00'" + - "'{{ now_utc().strftime('%Y-%m-%d') }}' -> '2021-09-01'" + - title: Today (UTC) + description: Returns the current date in UTC timezone. The output is a date object. + arguments: {} + return_type: Date + examples: + - "'{{ today_utc() }}' -> '2021-09-01'" + - "'{{ today_utc().strftime('%Y/%m/%d')}}' -> '2021/09/01'" + - title: Timestamp + description: Converts a number or a string representing a datetime (formatted as ISO8601) to a timestamp. If the input is a number, it is converted to an int. If no timezone is specified, the string is interpreted as UTC. + arguments: + datetime: A string formatted as ISO8601 or an integer representing a unix timestamp + return_type: int + examples: + - "'{{ timestamp(1646006400) }}' -> 1646006400" + - "'{{ timestamp('2022-02-28') }}' -> 1646006400" + - "'{{ timestamp('2022-02-28T00:00:00Z') }}' -> 1646006400" + - "'{{ timestamp('2022-02-28 00:00:00Z') }}' -> 1646006400" + - "'{{ timestamp('2022-02-28T00:00:00-08:00') }}' -> 1646035200" + - title: Max + description: Returns the largest object of an iterable, or of two or more arguments. + arguments: + args: an iterable or a sequence of two or more arguments + return_type: Any + examples: + - "'{{ max(2, 3) }}' -> 3" + - "'{{ max([2, 3]) }}' -> 3" + - title: Day Delta + description: Returns the datetime of now() + num_days. + arguments: + num_days: The number of days to add to now + format: How to format the output string + return_type: str + examples: + - "'{{ day_delta(1) }}' -> '2021-09-02T00:00:00.000000+0000'" + - "'{{ day_delta(-1) }}' -> '2021-08-31T00:00:00.000000+0000'" + - "'{{ day_delta(25, format='%Y-%m-%d') }}' -> '2021-09-26'" + - title: Duration + description: Converts an ISO8601 duration to datetime.timedelta. + arguments: + duration_string: "A string representing an ISO8601 duration. See https://www.digi.com/resources/documentation/digidocs//90001488-13/reference/r_iso_8601_duration_format.htm for more details."
+ return_type: datetime.timedelta + examples: + - "'{{ duration('P1D') }}' -> '1 day, 0:00:00'" + - "'{{ duration('P6DT23H') }}' -> '6 days, 23:00:00'" + - "'{{ (now_utc() - duration('P1D')).strftime('%Y-%m-%dT%H:%M:%SZ') }}' -> '2021-08-31T00:00:00Z'" + - title: Format Datetime + description: Converts a datetime or a datetime-string to the specified format. + arguments: + datetime: The datetime object or a string to convert. If datetime is a string, it must be formatted as ISO8601. + format: The datetime format. + input_format: (optional) The format of the datetime in the case it is a string. + return_type: str + examples: + - "{{ format_datetime(config['start_time'], '%Y-%m-%d') }}" + - "{{ format_datetime(config['start_date'], '%Y-%m-%dT%H:%M:%S.%fZ') }}" + - "{{ format_datetime(config['start_date'], '%Y-%m-%dT%H:%M:%S.%fZ', '%a, %d %b %Y %H:%M:%S %z') }}" + filters: + - title: Hash + description: Convert the specified value to a hashed string. + arguments: + hash_type: Valid hash type to use for the conversion ('md5' by default). + salt: An additional value to further protect sensitive data. + return_type: str + examples: + - "{{ 'Test client_secret' | hash() }} -> '3032d57a12f76b61a820e47b9a5a0cbb'" + - "{{ 'Test client_secret' | hash('md5') }} -> '3032d57a12f76b61a820e47b9a5a0cbb'" + - "{{ 'Test client_secret' | hash('md5', salt='salt') }} -> '5011a0168579c2d94cbbe1c6ad14327c'" + - title: Base64 encoder + description: Convert the specified value to a base64-encoded string. + arguments: {} + return_type: str + examples: + - "{{ 'Test client_secret' | base64encode }} -> 'VGVzdCBjbGllbnRfc2VjcmV0'" + - title: Base64 decoder + description: Decodes the specified base64-encoded value into a plain string. + arguments: {} + return_type: str + examples: + - "{{ 'ZmFrZSByZWZyZXNoX3Rva2VuIHZhbHVl' | base64decode }} -> 'fake refresh_token value'" + - title: String + description: Converts the specified value to a string. + arguments: {} + return_type: str + examples: + - '{{ 1 | string }} -> "1"' + - '{{ ["hello", "world"] | string }} -> "["hello", "world"]"' + - title: Regex Search + description: Match the input string against a regular expression and return the first match. + arguments: + regex: The regular expression to search for. It must include a capture group. + return_type: str + examples: + - '{{ "goodbye, cruel world" | regex_search("goodbye,\s(.*)$") }} -> "cruel world"' diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_source.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_source.py new file mode 100644 index 000000000000..9135f2a99b5f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_source.py @@ -0,0 +1,34 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from abc import abstractmethod +from typing import Any, Mapping, Tuple + +from airbyte_cdk.sources.abstract_source import AbstractSource +from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker + + +class DeclarativeSource(AbstractSource): + """ + Base class for declarative Source.
Concrete sources need to define the connection_checker to use + """ + + @property + @abstractmethod + def connection_checker(self) -> ConnectionChecker: + """Returns the ConnectionChecker to use for the `check` operation""" + + def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Any]: + """ + :param logger: The source logger + :param config: The user-provided configuration as specified by the source's spec. + This usually contains information required to check connection e.g. tokens, secrets and keys etc. + :return: A tuple of (boolean, error). If boolean is true, then the connection check is successful + and we can connect to the underlying data source using the provided configuration. + Otherwise, the input config cannot be used to connect to the underlying data source, + and the "error" object should describe what went wrong. + The error object will be cast to string to display the problem to the user. + """ + return self.connection_checker.check_connection(self, logger, config) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_stream.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_stream.py new file mode 100644 index 000000000000..09ce080c8ae4 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_stream.py @@ -0,0 +1,211 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import logging +from dataclasses import InitVar, dataclass, field +from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union + +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.incremental import GlobalSubstreamCursor, PerPartitionCursor, PerPartitionWithGlobalCursor +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.declarative.migrations.state_migration import StateMigration +from airbyte_cdk.sources.declarative.retrievers import SimpleRetriever +from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever +from airbyte_cdk.sources.declarative.schema import DefaultSchemaLoader +from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader +from airbyte_cdk.sources.streams.checkpoint import CheckpointMode, CheckpointReader, Cursor, CursorBasedCheckpointReader +from airbyte_cdk.sources.streams.core import Stream +from airbyte_cdk.sources.types import Config, StreamSlice + + +@dataclass +class DeclarativeStream(Stream): + """ + DeclarativeStream is a Stream that delegates most of its logic to its schema_load and retriever + + Attributes: + name (str): stream name + primary_key (Optional[Union[str, List[str], List[List[str]]]]): the primary key of the stream + schema_loader (SchemaLoader): The schema loader + retriever (Retriever): The retriever + config (Config): The user-provided configuration as specified by the source's spec + stream_cursor_field (Optional[Union[InterpolatedString, str]]): The cursor field + stream. Transformations are applied in the order in which they are defined. 
+ """ + + retriever: Retriever + config: Config + parameters: InitVar[Mapping[str, Any]] + name: str + primary_key: Optional[Union[str, List[str], List[List[str]]]] + state_migrations: List[StateMigration] = field(repr=True, default_factory=list) + schema_loader: Optional[SchemaLoader] = None + _name: str = field(init=False, repr=False, default="") + _primary_key: str = field(init=False, repr=False, default="") + stream_cursor_field: Optional[Union[InterpolatedString, str]] = None + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._stream_cursor_field = ( + InterpolatedString.create(self.stream_cursor_field, parameters=parameters) + if isinstance(self.stream_cursor_field, str) + else self.stream_cursor_field + ) + self._schema_loader = self.schema_loader if self.schema_loader else DefaultSchemaLoader(config=self.config, parameters=parameters) + + @property # type: ignore + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + return self._primary_key + + @primary_key.setter + def primary_key(self, value: str) -> None: + if not isinstance(value, property): + self._primary_key = value + + @property + def exit_on_rate_limit(self) -> bool: + return self.retriever.requester.exit_on_rate_limit # type: ignore # abstract Retriever class has not requester attribute + + @exit_on_rate_limit.setter + def exit_on_rate_limit(self, value: bool) -> None: + self.retriever.requester.exit_on_rate_limit = value # type: ignore[attr-defined] + + @property # type: ignore + def name(self) -> str: + """ + :return: Stream name. By default this is the implementing class name, but it can be overridden as needed. + """ + return self._name + + @name.setter + def name(self, value: str) -> None: + if not isinstance(value, property): + self._name = value + + @property + def state(self) -> MutableMapping[str, Any]: + return self.retriever.state # type: ignore + + @state.setter + def state(self, value: MutableMapping[str, Any]) -> None: + """State setter, accept state serialized by state getter.""" + state: Mapping[str, Any] = value + if self.state_migrations: + for migration in self.state_migrations: + if migration.should_migrate(state): + state = migration.migrate(state) + self.retriever.state = state + + def get_updated_state( + self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any] + ) -> MutableMapping[str, Any]: + return self.state + + @property + def cursor_field(self) -> Union[str, List[str]]: + """ + Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field. + :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor. + """ + cursor = self._stream_cursor_field.eval(self.config) # type: ignore # _stream_cursor_field is always cast to interpolated string + return cursor if cursor else [] + + @property + def is_resumable(self) -> bool: + # Declarative sources always implement state getter/setter, but whether it supports checkpointing is based on + # if the retriever has a cursor defined. 
+ return self.retriever.cursor is not None if hasattr(self.retriever, "cursor") else False + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any]]: + """ + :param: stream_state We knowingly avoid using stream_state as we want cursors to manage their own state. + """ + if stream_slice is None or stream_slice == {}: + # As the parameter is Optional, many would just call `read_records(sync_mode)` during testing without specifying the field + # As part of the declarative model without custom components, this should never happen as the CDK would wire up a + # SinglePartitionRouter that would create this StreamSlice properly + # As part of the declarative model with custom components, a user that would return a `None` slice would now have the default + # empty slice which seems to make sense. + stream_slice = StreamSlice(partition={}, cursor_slice={}) + if not isinstance(stream_slice, StreamSlice): + raise ValueError(f"DeclarativeStream does not support stream_slices that are not StreamSlice. Got {stream_slice}") + yield from self.retriever.read_records(self.get_json_schema(), stream_slice) # type: ignore # records are of the correct type + + def get_json_schema(self) -> Mapping[str, Any]: # type: ignore + """ + :return: A dict of the JSON schema representing this stream. + + The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property. + Override as needed. + """ + return self._schema_loader.get_json_schema() + + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None + ) -> Iterable[Optional[StreamSlice]]: + """ + Override to define the slices for this stream. See the stream slicing section of the docs for more information. + + :param sync_mode: + :param cursor_field: + :param stream_state: we knowingly avoid using stream_state as we want cursors to manage their own state + :return: + """ + return self.retriever.stream_slices() + + @property + def state_checkpoint_interval(self) -> Optional[int]: + """ + We explicitly disable checkpointing here. There are a couple reasons for that and not all are documented here but: + * In the case where records are not ordered, the granularity of what is ordered is the slice. Therefore, we will only update the + cursor value once at the end of every slice. + * Updating the state once every record would generate issues for data feed stop conditions or semi-incremental syncs where the + important state is the one at the beginning of the slice + """ + return None + + def get_cursor(self) -> Optional[Cursor]: + if self.retriever and isinstance(self.retriever, SimpleRetriever): + return self.retriever.cursor + return None + + def _get_checkpoint_reader( + self, + logger: logging.Logger, + cursor_field: Optional[List[str]], + sync_mode: SyncMode, + stream_state: MutableMapping[str, Any], + ) -> CheckpointReader: + """ + This method is overridden to prevent issues with stream slice classification for incremental streams that have parent streams. + + The classification logic, when used with `itertools.tee`, creates a copy of the stream slices. When `stream_slices` is called + the second time, the parent records generated during the classification phase are lost. 
This occurs because `itertools.tee` + only buffers the results, meaning the logic in `simple_retriever` that observes and updates the cursor isn't executed again. + + By overriding this method, we ensure that the stream slices are processed correctly and parent records are not lost, + allowing the cursor to function as expected. + """ + mappings_or_slices = self.stream_slices( + cursor_field=cursor_field, + sync_mode=sync_mode, # todo: change this interface to no longer rely on sync_mode for behavior + stream_state=stream_state, + ) + + cursor = self.get_cursor() + checkpoint_mode = self._checkpoint_mode + + if isinstance(cursor, (GlobalSubstreamCursor, PerPartitionCursor, PerPartitionWithGlobalCursor)): + self.has_multiple_slices = True + return CursorBasedCheckpointReader( + stream_slices=mappings_or_slices, + cursor=cursor, + read_state_from_cursor=checkpoint_mode == CheckpointMode.RESUMABLE_FULL_REFRESH, + ) + + return super()._get_checkpoint_reader(logger, cursor_field, sync_mode, stream_state) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/__init__.py new file mode 100644 index 000000000000..b67561e989ce --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/__init__.py @@ -0,0 +1,11 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.decoders.decoder import Decoder +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder, JsonlDecoder, IterableDecoder +from airbyte_cdk.sources.declarative.decoders.noop_decoder import NoopDecoder +from airbyte_cdk.sources.declarative.decoders.pagination_decoder_decorator import PaginationDecoderDecorator +from airbyte_cdk.sources.declarative.decoders.xml_decoder import XmlDecoder + +__all__ = ["Decoder", "JsonDecoder", "JsonlDecoder", "IterableDecoder", "NoopDecoder", "PaginationDecoderDecorator", "XmlDecoder"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/decoder.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/decoder.py new file mode 100644 index 000000000000..4e8fdd64444f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/decoder.py @@ -0,0 +1,30 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from dataclasses import dataclass +from typing import Any, Generator, MutableMapping + +import requests + + +@dataclass +class Decoder: + """ + Decoder strategy to transform a requests.Response into a Mapping[str, Any] + """ + + @abstractmethod + def is_stream_response(self) -> bool: + """ + Set to True if you'd like to use stream=True option in http requester + """ + + @abstractmethod + def decode(self, response: requests.Response) -> Generator[MutableMapping[str, Any], None, None]: + """ + Decodes a requests.Response into a Mapping[str, Any] or an array + :param response: the response to decode + :return: Generator of Mapping describing the response + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/json_decoder.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/json_decoder.py new file mode 100644 index 000000000000..b2c25c3370fe --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/json_decoder.py @@ -0,0 +1,75 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import logging +from dataclasses import InitVar, dataclass +from typing import Any, Generator, Mapping + +import requests +from airbyte_cdk.sources.declarative.decoders.decoder import Decoder +from orjson import orjson + +logger = logging.getLogger("airbyte") + + +@dataclass +class JsonDecoder(Decoder): + """ + Decoder strategy that returns the json-encoded content of a response, if any. + """ + + parameters: InitVar[Mapping[str, Any]] + + def is_stream_response(self) -> bool: + return False + + def decode(self, response: requests.Response) -> Generator[Mapping[str, Any], None, None]: + """ + Given the response is an empty string or an emtpy list, the function will return a generator with an empty mapping. + """ + try: + body_json = response.json() + if not isinstance(body_json, list): + body_json = [body_json] + if len(body_json) == 0: + yield {} + else: + yield from body_json + except requests.exceptions.JSONDecodeError: + logger.warning(f"Response cannot be parsed into json: {response.status_code=}, {response.text=}") + yield {} + + +@dataclass +class IterableDecoder(Decoder): + """ + Decoder strategy that returns the string content of the response, if any. + """ + + parameters: InitVar[Mapping[str, Any]] + + def is_stream_response(self) -> bool: + return True + + def decode(self, response: requests.Response) -> Generator[Mapping[str, Any], None, None]: + for line in response.iter_lines(): + yield {"record": line.decode()} + + +@dataclass +class JsonlDecoder(Decoder): + """ + Decoder strategy that returns the json-encoded content of the response, if any. + """ + + parameters: InitVar[Mapping[str, Any]] + + def is_stream_response(self) -> bool: + return True + + def decode(self, response: requests.Response) -> Generator[Mapping[str, Any], None, None]: + # TODO???: set delimiter? usually it is `\n` but maybe it would be useful to set optional? + # https://github.com/airbytehq/airbyte-internal-issues/issues/8436 + for record in response.iter_lines(): + yield orjson.loads(record) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/noop_decoder.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/noop_decoder.py new file mode 100644 index 000000000000..eb977712a1ea --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/noop_decoder.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +import logging +from typing import Any, Generator, Mapping + +import requests +from airbyte_cdk.sources.declarative.decoders.decoder import Decoder + +logger = logging.getLogger("airbyte") + + +class NoopDecoder(Decoder): + def is_stream_response(self) -> bool: + return False + + def decode(self, response: requests.Response) -> Generator[Mapping[str, Any], None, None]: + yield from [{}] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py new file mode 100644 index 000000000000..dadb717a1723 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
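A minimal usage sketch of the `JsonDecoder` and `JsonlDecoder` defined above, run against hand-built `requests.Response` objects; setting `_content` directly and the sample payloads are assumptions made for illustration only.

```python
import requests

from airbyte_cdk.sources.declarative.decoders import JsonDecoder, JsonlDecoder

json_response = requests.Response()
json_response.status_code = 200
json_response._content = b'{"data": [{"id": 1}, {"id": 2}]}'
print(list(JsonDecoder(parameters={}).decode(json_response)))
# [{'data': [{'id': 1}, {'id': 2}]}]  -> the whole body is yielded as a single mapping

jsonl_response = requests.Response()
jsonl_response.status_code = 200
jsonl_response._content = b'{"id": 1}\n{"id": 2}'
print(list(JsonlDecoder(parameters={}).decode(jsonl_response)))
# [{'id': 1}, {'id': 2}]  -> one mapping per line
```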
+# + +import logging +from dataclasses import dataclass +from typing import Any, Generator, MutableMapping + +import requests +from airbyte_cdk.sources.declarative.decoders import Decoder + +logger = logging.getLogger("airbyte") + + +@dataclass +class PaginationDecoderDecorator(Decoder): + """ + Decoder to wrap other decoders when instantiating a DefaultPaginator in order to bypass decoding if the response is streamed. + """ + + def __init__(self, decoder: Decoder): + self._decoder = decoder + + @property + def decoder(self) -> Decoder: + return self._decoder + + def is_stream_response(self) -> bool: + return self._decoder.is_stream_response() + + def decode(self, response: requests.Response) -> Generator[MutableMapping[str, Any], None, None]: + if self._decoder.is_stream_response(): + logger.warning("Response is streamed and therefore will not be decoded for pagination.") + yield {} + else: + yield from self._decoder.decode(response) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/xml_decoder.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/xml_decoder.py new file mode 100644 index 000000000000..7b598ba851f9 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/xml_decoder.py @@ -0,0 +1,93 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +import logging +from dataclasses import InitVar, dataclass +from typing import Any, Generator, Mapping, MutableMapping +from xml.parsers.expat import ExpatError + +import requests +import xmltodict +from airbyte_cdk.sources.declarative.decoders.decoder import Decoder + +logger = logging.getLogger("airbyte") + + +@dataclass +class XmlDecoder(Decoder): + """ + XmlDecoder is a decoder strategy that parses the XML content of the resopnse, and converts it to a dict. + + This class handles XML attributes by prefixing them with an '@' symbol and represents XML text content by using the '#text' key if the element has attributes or the element name/tag. It does not currently support XML namespace declarations. + + Example XML Input: + + + San Francisco + + + Book Title 1 + 10.99 + + + Gadget + 299.99 + A useful gadget + + + + Converted Output: + { + "root": { + "location: { + "@id": "123, + "#text": "San Francisco" + }, + "item": [ + { + "@id": "1", + "@category": "books", + "name": "Book Title 1", + "price": "10.99" + }, + { + "@id": "2", + "@category": "electronics", + "name": "Gadget", + "price": "299.99", + "description": "A useful gadget" + } + ] + } + } + + Notes: + - Attributes of an XML element are prefixed with an '@' symbol in the dictionary output. + - Text content of an XML element is handled in two different ways, depending on whether + the element has attributes. + - If the element has attributes, the text content will be + represented by the "#text" key. + - If the element does not have any attributes, the text content will be + represented by element name. + - Namespace declarations are not supported in the current implementation. 
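A minimal sketch of the attribute ('@' prefix) and text ('#text') handling described above, against a hand-built response; the XML payload is an assumption for the example, and the printed structure is approximate.

```python
import requests

from airbyte_cdk.sources.declarative.decoders import XmlDecoder

response = requests.Response()
response.status_code = 200
response._content = b'<root><location id="123">San Francisco</location></root>'

print(list(XmlDecoder(parameters={}).decode(response)))
# roughly: [{'root': {'location': {'@id': '123', '#text': 'San Francisco'}}}]
```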
+ """ + + parameters: InitVar[Mapping[str, Any]] + + def is_stream_response(self) -> bool: + return False + + def decode(self, response: requests.Response) -> Generator[MutableMapping[str, Any], None, None]: + body_xml = response.text + try: + body_json = xmltodict.parse(body_xml) + if not isinstance(body_json, list): + body_json = [body_json] + if len(body_json) == 0: + yield {} + else: + yield from body_json + except ExpatError as exc: + logger.warning(f"Response cannot be parsed from XML: {response.status_code=}, {response.text=}, {exc=}") + yield {} diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/exceptions.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/exceptions.py new file mode 100644 index 000000000000..ca67c6a55a34 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/exceptions.py @@ -0,0 +1,9 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +class ReadException(Exception): + """ + Raise when there is an error reading data from an API Source + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/__init__.py new file mode 100644 index 000000000000..76304b467f43 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/__init__.py @@ -0,0 +1,11 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.extractors.dpath_extractor import DpathExtractor +from airbyte_cdk.sources.declarative.extractors.http_selector import HttpSelector +from airbyte_cdk.sources.declarative.extractors.record_filter import RecordFilter +from airbyte_cdk.sources.declarative.extractors.record_selector import RecordSelector +from airbyte_cdk.sources.declarative.extractors.response_to_file_extractor import ResponseToFileExtractor + +__all__ = ["HttpSelector", "DpathExtractor", "RecordFilter", "RecordSelector", "ResponseToFileExtractor"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/dpath_extractor.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/dpath_extractor.py new file mode 100644 index 000000000000..512d6919d07f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/dpath_extractor.py @@ -0,0 +1,81 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass, field +from typing import Any, Iterable, List, Mapping, MutableMapping, Union + +import dpath +import requests +from airbyte_cdk.sources.declarative.decoders import Decoder, JsonDecoder +from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.types import Config + + +@dataclass +class DpathExtractor(RecordExtractor): + """ + Record extractor that searches a decoded response over a path defined as an array of fields. + + If the field path points to an array, that array is returned. + If the field path points to an object, that object is returned wrapped as an array. + If the field path points to an empty object, an empty array is returned. + If the field path points to a non-existing path, an empty array is returned. 
+ + Examples of instantiating this transform: + ``` + extractor: + type: DpathExtractor + field_path: + - "root" + - "data" + ``` + + ``` + extractor: + type: DpathExtractor + field_path: + - "root" + - "{{ parameters['field'] }}" + ``` + + ``` + extractor: + type: DpathExtractor + field_path: [] + ``` + + Attributes: + field_path (Union[InterpolatedString, str]): Path to the field that should be extracted + config (Config): The user-provided configuration as specified by the source's spec + decoder (Decoder): The decoder responsible to transfom the response in a Mapping + """ + + field_path: List[Union[InterpolatedString, str]] + config: Config + parameters: InitVar[Mapping[str, Any]] + decoder: Decoder = field(default_factory=lambda: JsonDecoder(parameters={})) + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._field_path = [InterpolatedString.create(path, parameters=parameters) for path in self.field_path] + for path_index in range(len(self.field_path)): + if isinstance(self.field_path[path_index], str): + self._field_path[path_index] = InterpolatedString.create(self.field_path[path_index], parameters=parameters) + + def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]: + for body in self.decoder.decode(response): + if len(self._field_path) == 0: + extracted = body + else: + path = [path.eval(self.config) for path in self._field_path] + if "*" in path: + extracted = dpath.values(body, path) + else: + extracted = dpath.get(body, path, default=[]) # type: ignore # extracted will be a MutableMapping, given input data structure + if isinstance(extracted, list): + yield from extracted + elif extracted: + yield extracted + else: + yield from [] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/http_selector.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/http_selector.py new file mode 100644 index 000000000000..905477a6c6d9 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/http_selector.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from typing import Any, Iterable, Mapping, Optional + +import requests +from airbyte_cdk.sources.types import Record, StreamSlice, StreamState + + +class HttpSelector: + """ + Responsible for translating an HTTP response into a list of records by extracting records from the response and optionally filtering + records based on a heuristic. 
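A minimal usage sketch of the `DpathExtractor` defined above against a hand-built response; the payload and field names are assumptions for the example.

```python
import requests

from airbyte_cdk.sources.declarative.extractors import DpathExtractor

response = requests.Response()
response.status_code = 200
response._content = b'{"root": {"data": [{"id": 1}, {"id": 2}]}}'

extractor = DpathExtractor(field_path=["root", "data"], config={}, parameters={})
print(list(extractor.extract_records(response)))
# [{'id': 1}, {'id': 2}]
```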
+ """ + + @abstractmethod + def select_records( + self, + response: requests.Response, + stream_state: StreamState, + records_schema: Mapping[str, Any], + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Record]: + """ + Selects records from the response + :param response: The response to select the records from + :param stream_state: The stream state + :param records_schema: json schema of records to return + :param stream_slice: The stream slice + :param next_page_token: The paginator token + :return: List of Records selected from the response + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_extractor.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_extractor.py new file mode 100644 index 000000000000..5de6a84a7db7 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_extractor.py @@ -0,0 +1,27 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +from abc import abstractmethod +from dataclasses import dataclass +from typing import Any, Iterable, Mapping + +import requests + + +@dataclass +class RecordExtractor: + """ + Responsible for translating an HTTP response into a list of records by extracting records from the response. + """ + + @abstractmethod + def extract_records( + self, + response: requests.Response, + ) -> Iterable[Mapping[str, Any]]: + """ + Selects records from the response + :param response: The response to extract the records from + :return: List of Records extracted from the response + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_filter.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_filter.py new file mode 100644 index 000000000000..f396224c12e8 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_filter.py @@ -0,0 +1,118 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import datetime +from dataclasses import InitVar, dataclass +from typing import Any, Iterable, Mapping, Optional, Union + +from airbyte_cdk.sources.declarative.incremental import DatetimeBasedCursor, GlobalSubstreamCursor, PerPartitionWithGlobalCursor +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.types import Config, StreamSlice, StreamState + + +@dataclass +class RecordFilter: + """ + Filter applied on a list of Records + + config (Config): The user-provided configuration as specified by the source's spec + condition (str): The string representing the predicate to filter a record. 
Records will be removed if evaluated to False + """ + + parameters: InitVar[Mapping[str, Any]] + config: Config + condition: str = "" + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._filter_interpolator = InterpolatedBoolean(condition=self.condition, parameters=parameters) + + def filter_records( + self, + records: Iterable[Mapping[str, Any]], + stream_state: StreamState, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any]]: + kwargs = { + "stream_state": stream_state, + "stream_slice": stream_slice, + "next_page_token": next_page_token, + "stream_slice.extra_fields": stream_slice.extra_fields if stream_slice else {}, + } + for record in records: + if self._filter_interpolator.eval(self.config, record=record, **kwargs): + yield record + + +class ClientSideIncrementalRecordFilterDecorator(RecordFilter): + """ + Applies a filter to a list of records to exclude those that are older than the stream_state/start_date. + + :param DatetimeBasedCursor date_time_based_cursor: Cursor used to extract datetime values + :param PerPartitionCursor per_partition_cursor: Optional Cursor used for mapping cursor value in nested stream_state + """ + + def __init__( + self, + date_time_based_cursor: DatetimeBasedCursor, + substream_cursor: Optional[Union[PerPartitionWithGlobalCursor, GlobalSubstreamCursor]], + **kwargs: Any, + ): + super().__init__(**kwargs) + self._date_time_based_cursor = date_time_based_cursor + self._substream_cursor = substream_cursor + + @property + def _cursor_field(self) -> str: + return self._date_time_based_cursor.cursor_field.eval(self._date_time_based_cursor.config) # type: ignore # eval returns a string in this context + + @property + def _start_date_from_config(self) -> datetime.datetime: + return self._date_time_based_cursor._start_datetime.get_datetime(self._date_time_based_cursor.config) + + @property + def _end_datetime(self) -> datetime.datetime: + return self._date_time_based_cursor.select_best_end_datetime() + + def filter_records( + self, + records: Iterable[Mapping[str, Any]], + stream_state: StreamState, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any]]: + state_value = self._get_state_value(stream_state, stream_slice or StreamSlice(partition={}, cursor_slice={})) + filter_date: datetime.datetime = self._get_filter_date(state_value) + records = ( + record + for record in records + if self._end_datetime >= self._date_time_based_cursor.parse_date(record[self._cursor_field]) >= filter_date + ) + if self.condition: + records = super().filter_records( + records=records, stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ) + yield from records + + def _get_state_value(self, stream_state: StreamState, stream_slice: StreamSlice) -> Optional[str]: + """ + Return cursor_value or None in case it was not found. + Cursor_value may be empty if: + 1. It is an initial sync => no stream_state exist at all. + 2. In Parent-child stream, and we already make initial sync, so stream_state is present. + During the second read, we receive one extra record from parent and therefore no stream_state for this record will be found. + + :param StreamState stream_state: State + :param StreamSlice stream_slice: Current Stream slice + :return Optional[str]: cursor_value in case it was found, otherwise None. 
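A minimal usage sketch of the base `RecordFilter` shown above, where the condition is a Jinja expression evaluated against each record; the field names and sample records are assumptions for the example.

```python
from airbyte_cdk.sources.declarative.extractors import RecordFilter

record_filter = RecordFilter(
    config={},
    parameters={},
    condition="{{ record['status'] == 'active' }}",
)
records = [{"id": 1, "status": "active"}, {"id": 2, "status": "archived"}]
print(list(record_filter.filter_records(records, stream_state={})))
# [{'id': 1, 'status': 'active'}]
```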
+ """ + state = (self._substream_cursor or self._date_time_based_cursor).select_state(stream_slice) + + return state.get(self._cursor_field) if state else None + + def _get_filter_date(self, state_value: Optional[str]) -> datetime.datetime: + start_date_parsed = self._start_date_from_config + if state_value: + return max(start_date_parsed, self._date_time_based_cursor.parse_date(state_value)) + else: + return start_date_parsed diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_selector.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_selector.py new file mode 100644 index 000000000000..eed33d858228 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_selector.py @@ -0,0 +1,123 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass, field +from typing import Any, Iterable, List, Mapping, Optional + +import requests +from airbyte_cdk.sources.declarative.extractors.http_selector import HttpSelector +from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor +from airbyte_cdk.sources.declarative.extractors.record_filter import RecordFilter +from airbyte_cdk.sources.declarative.models import SchemaNormalization +from airbyte_cdk.sources.declarative.transformations import RecordTransformation +from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState +from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer + +SCHEMA_TRANSFORMER_TYPE_MAPPING = { + SchemaNormalization.None_: TransformConfig.NoTransform, + SchemaNormalization.Default: TransformConfig.DefaultSchemaNormalization, +} + + +@dataclass +class RecordSelector(HttpSelector): + """ + Responsible for translating an HTTP response into a list of records by extracting records from the response and optionally filtering + records based on a heuristic. 
+ + Attributes: + extractor (RecordExtractor): The record extractor responsible for extracting records from a response + schema_normalization (TypeTransformer): The record normalizer responsible for casting record values to stream schema types + record_filter (RecordFilter): The record filter responsible for filtering extracted records + transformations (List[RecordTransformation]): The transformations to be done on the records + """ + + extractor: RecordExtractor + config: Config + parameters: InitVar[Mapping[str, Any]] + schema_normalization: TypeTransformer + record_filter: Optional[RecordFilter] = None + transformations: List[RecordTransformation] = field(default_factory=lambda: []) + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._parameters = parameters + + def select_records( + self, + response: requests.Response, + stream_state: StreamState, + records_schema: Mapping[str, Any], + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Record]: + """ + Selects records from the response + :param response: The response to select the records from + :param stream_state: The stream state + :param records_schema: json schema of records to return + :param stream_slice: The stream slice + :param next_page_token: The paginator token + :return: List of Records selected from the response + """ + all_data: Iterable[Mapping[str, Any]] = self.extractor.extract_records(response) + yield from self.filter_and_transform(all_data, stream_state, records_schema, stream_slice, next_page_token) + + def filter_and_transform( + self, + all_data: Iterable[Mapping[str, Any]], + stream_state: StreamState, + records_schema: Mapping[str, Any], + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Record]: + """ + There is an issue with the selector as of 2024-08-30: it does technology-agnostic processing like filtering, transformation and + normalization with an API that is technology-specific (as requests.Response is only for HTTP communication using the requests + library). + + Until we decide to move this logic away from the selector, we made this method public so that users like AsyncJobRetriever could + share the logic of doing transformations on a set of records. 
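A minimal sketch wiring the pieces above together and showing the extract, filter, transform, schema-normalization order implemented by `filter_and_transform`; the schema, payload, and the choice of no filter or transformations are assumptions for the example.

```python
import requests

from airbyte_cdk.sources.declarative.extractors import DpathExtractor, RecordSelector
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer

response = requests.Response()
response.status_code = 200
response._content = b'{"data": [{"id": "1"}, {"id": "2"}]}'

selector = RecordSelector(
    extractor=DpathExtractor(field_path=["data"], config={}, parameters={}),
    config={},
    parameters={},
    schema_normalization=TypeTransformer(TransformConfig.DefaultSchemaNormalization),
    record_filter=None,
    transformations=[],
)
records = selector.select_records(
    response,
    stream_state={},
    records_schema={"type": "object", "properties": {"id": {"type": "integer"}}},
)
print([dict(record) for record in records])
# [{'id': 1}, {'id': 2}]  -> "id" cast to int by the schema normalization step
```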
+ """ + filtered_data = self._filter(all_data, stream_state, stream_slice, next_page_token) + transformed_data = self._transform(filtered_data, stream_state, stream_slice) + normalized_data = self._normalize_by_schema(transformed_data, schema=records_schema) + for data in normalized_data: + yield Record(data, stream_slice) + + def _normalize_by_schema( + self, records: Iterable[Mapping[str, Any]], schema: Optional[Mapping[str, Any]] + ) -> Iterable[Mapping[str, Any]]: + if schema: + # record has type Mapping[str, Any], but dict[str, Any] expected + for record in records: + normalized_record = dict(record) + self.schema_normalization.transform(normalized_record, schema) + yield normalized_record + else: + yield from records + + def _filter( + self, + records: Iterable[Mapping[str, Any]], + stream_state: StreamState, + stream_slice: Optional[StreamSlice], + next_page_token: Optional[Mapping[str, Any]], + ) -> Iterable[Mapping[str, Any]]: + if self.record_filter: + yield from self.record_filter.filter_records( + records, stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ) + else: + yield from records + + def _transform( + self, + records: Iterable[Mapping[str, Any]], + stream_state: StreamState, + stream_slice: Optional[StreamSlice] = None, + ) -> Iterable[Mapping[str, Any]]: + for record in records: + for transformation in self.transformations: + transformation.transform(record, config=self.config, stream_state=stream_state, stream_slice=stream_slice) # type: ignore # record has type Mapping[str, Any], but Dict[str, Any] expected + yield record diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py new file mode 100644 index 000000000000..48ef69e12a02 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py @@ -0,0 +1,162 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import logging +import os +import uuid +import zlib +from contextlib import closing +from typing import Any, Dict, Iterable, Mapping, Optional, Tuple + +import pandas as pd +import requests +from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor +from numpy import nan + +EMPTY_STR: str = "" +DEFAULT_ENCODING: str = "utf-8" +DOWNLOAD_CHUNK_SIZE: int = 1024 * 10 + + +class ResponseToFileExtractor(RecordExtractor): + """ + This class is used when having very big HTTP responses (usually streamed) which would require too much memory so we use disk space as + a tradeoff. + + Eventually, we want to support multiple file type by re-using the file based CDK parsers if possible. However, the lift is too high for + a first iteration so we will only support CSV parsing using pandas as salesforce and sendgrid were doing. + """ + + def __init__(self) -> None: + self.logger = logging.getLogger("airbyte") + + def _get_response_encoding(self, headers: Dict[str, Any]) -> str: + """ + Get the encoding of the response based on the provided headers. This method is heavily inspired by the requests library + implementation. + + Args: + headers (Dict[str, Any]): The headers of the response. + Returns: + str: The encoding of the response. 
+ """ + + content_type = headers.get("content-type") + + if not content_type: + return DEFAULT_ENCODING + + content_type, params = requests.utils.parse_header_links(content_type) + + if "charset" in params: + return params["charset"].strip("'\"") # type: ignore # we assume headers are returned as str + + return DEFAULT_ENCODING + + def _filter_null_bytes(self, b: bytes) -> bytes: + """ + Filter out null bytes from a bytes object. + + Args: + b (bytes): The input bytes object. + Returns: + bytes: The filtered bytes object with null bytes removed. + + Referenced Issue: + https://github.com/airbytehq/airbyte/issues/8300 + """ + + res = b.replace(b"\x00", b"") + if len(res) < len(b): + self.logger.warning("Filter 'null' bytes from string, size reduced %d -> %d chars", len(b), len(res)) + return res + + def _save_to_file(self, response: requests.Response) -> Tuple[str, str]: + """ + Saves the binary data from the given response to a temporary file and returns the filepath and response encoding. + + Args: + response (Optional[requests.Response]): The response object containing the binary data. Defaults to None. + + Returns: + Tuple[str, str]: A tuple containing the filepath of the temporary file and the response encoding. + + Raises: + ValueError: If the temporary file does not exist after saving the binary data. + """ + # set filepath for binary data from response + decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32) + needs_decompression = True # we will assume at first that the response is compressed and change the flag if not + + tmp_file = str(uuid.uuid4()) + with closing(response) as response, open(tmp_file, "wb") as data_file: + response_encoding = self._get_response_encoding(dict(response.headers or {})) + for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE): + try: + if needs_decompression: + data_file.write(decompressor.decompress(chunk)) + needs_decompression = True + else: + data_file.write(self._filter_null_bytes(chunk)) + except zlib.error: + data_file.write(self._filter_null_bytes(chunk)) + needs_decompression = False + + # check the file exists + if os.path.isfile(tmp_file): + return tmp_file, response_encoding + else: + raise ValueError(f"The IO/Error occured while verifying binary data. Tmp file {tmp_file} doesn't exist.") + + def _read_with_chunks(self, path: str, file_encoding: str, chunk_size: int = 100) -> Iterable[Mapping[str, Any]]: + """ + Reads data from a file in chunks and yields each row as a dictionary. + + Args: + path (str): The path to the file to be read. + file_encoding (str): The encoding of the file. + chunk_size (int, optional): The size of each chunk to be read. Defaults to 100. + + Yields: + Mapping[str, Any]: A dictionary representing each row of data. + + Raises: + ValueError: If an IO/Error occurs while reading the temporary data. + """ + + try: + with open(path, "r", encoding=file_encoding) as data: + chunks = pd.read_csv(data, chunksize=chunk_size, iterator=True, dialect="unix", dtype=object) + for chunk in chunks: + chunk = chunk.replace({nan: None}).to_dict(orient="records") + for row in chunk: + yield row + except pd.errors.EmptyDataError as e: + self.logger.info(f"Empty data received. {e}") + yield from [] + except IOError as ioe: + raise ValueError(f"The IO/Error occured while reading tmp data. 
Called: {path}", ioe) + finally: + # remove binary tmp file, after data is read + os.remove(path) + + def extract_records(self, response: Optional[requests.Response] = None) -> Iterable[Mapping[str, Any]]: + """ + Extracts records from the given response by: + 1) Saving the result to a tmp file + 2) Reading from saved file by chunks to avoid OOM + + Args: + response (Optional[requests.Response]): The response object containing the data. Defaults to None. + + Yields: + Iterable[Mapping[str, Any]]: An iterable of mappings representing the extracted records. + + Returns: + None + """ + if response: + file_path, encoding = self._save_to_file(response) + yield from self._read_with_chunks(file_path, encoding) + else: + yield from [] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/__init__.py new file mode 100644 index 000000000000..11c1cba9913f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/__init__.py @@ -0,0 +1,24 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor +from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor +from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import GlobalSubstreamCursor +from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import CursorFactory, PerPartitionCursor +from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import PerPartitionWithGlobalCursor +from airbyte_cdk.sources.declarative.incremental.resumable_full_refresh_cursor import ( + ChildPartitionResumableFullRefreshCursor, + ResumableFullRefreshCursor, +) + +__all__ = [ + "CursorFactory", + "DatetimeBasedCursor", + "DeclarativeCursor", + "GlobalSubstreamCursor", + "PerPartitionCursor", + "PerPartitionWithGlobalCursor", + "ResumableFullRefreshCursor", + "ChildPartitionResumableFullRefreshCursor", +] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py new file mode 100644 index 000000000000..6505260c72df --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py @@ -0,0 +1,385 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import datetime +from dataclasses import InitVar, dataclass, field +from datetime import timedelta +from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional, Union + +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type +from airbyte_cdk.sources.declarative.datetime.datetime_parser import DatetimeParser +from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime +from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState +from isodate import Duration, duration_isoformat, parse_duration + + +@dataclass +class DatetimeBasedCursor(DeclarativeCursor): + """ + Slices the stream over a datetime range and create a state with format {: } + + Given a start time, end time, a step function, and an optional lookback window, + the stream slicer will partition the date range from start time - lookback window to end time. + + The step function is defined as a string of the form ISO8601 duration + + The timestamp format accepts the same format codes as datetime.strfptime, which are + all the format codes required by the 1989 C standard. + Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html + + Attributes: + start_datetime (Union[MinMaxDatetime, str]): the datetime that determines the earliest record that should be synced + end_datetime (Optional[Union[MinMaxDatetime, str]]): the datetime that determines the last record that should be synced + cursor_field (Union[InterpolatedString, str]): record's cursor field + datetime_format (str): format of the datetime + step (Optional[str]): size of the timewindow (ISO8601 duration) + cursor_granularity (Optional[str]): smallest increment the datetime_format has (ISO 8601 duration) that will be used to ensure that the start of a slice does not overlap with the end of the previous one + config (Config): connection config + start_time_option (Optional[RequestOption]): request option for start time + end_time_option (Optional[RequestOption]): request option for end time + partition_field_start (Optional[str]): partition start time field + partition_field_end (Optional[str]): stream slice end time field + lookback_window (Optional[InterpolatedString]): how many days before start_datetime to read data for (ISO8601 duration) + """ + + start_datetime: Union[MinMaxDatetime, str] + cursor_field: Union[InterpolatedString, str] + datetime_format: str + config: Config + parameters: InitVar[Mapping[str, Any]] + _highest_observed_cursor_field_value: Optional[str] = field( + repr=False, default=None + ) # tracks the latest observed datetime, which may not be safe to emit in the case of out-of-order records + _cursor: Optional[str] = field( + repr=False, default=None + ) # tracks the latest observed datetime that is appropriate to emit as stream state + end_datetime: Optional[Union[MinMaxDatetime, str]] = None + step: Optional[Union[InterpolatedString, str]] = None + cursor_granularity: Optional[str] = None + start_time_option: Optional[RequestOption] = None + end_time_option: Optional[RequestOption] = None + 
partition_field_start: Optional[str] = None + partition_field_end: Optional[str] = None + lookback_window: Optional[Union[InterpolatedString, str]] = None + message_repository: Optional[MessageRepository] = None + is_compare_strictly: Optional[bool] = False + cursor_datetime_formats: List[str] = field(default_factory=lambda: []) + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + if (self.step and not self.cursor_granularity) or (not self.step and self.cursor_granularity): + raise ValueError( + f"If step is defined, cursor_granularity should be as well and vice-versa. " + f"Right now, step is `{self.step}` and cursor_granularity is `{self.cursor_granularity}`" + ) + self._start_datetime = MinMaxDatetime.create(self.start_datetime, parameters) + self._end_datetime = None if not self.end_datetime else MinMaxDatetime.create(self.end_datetime, parameters) + + self._timezone = datetime.timezone.utc + self._interpolation = JinjaInterpolation() + + self._step = ( + self._parse_timedelta(InterpolatedString.create(self.step, parameters=parameters).eval(self.config)) + if self.step + else datetime.timedelta.max + ) + self._cursor_granularity = self._parse_timedelta(self.cursor_granularity) + self.cursor_field = InterpolatedString.create(self.cursor_field, parameters=parameters) + self._lookback_window = InterpolatedString.create(self.lookback_window, parameters=parameters) if self.lookback_window else None + self._partition_field_start = InterpolatedString.create(self.partition_field_start or "start_time", parameters=parameters) + self._partition_field_end = InterpolatedString.create(self.partition_field_end or "end_time", parameters=parameters) + self._parser = DatetimeParser() + + # If datetime format is not specified then start/end datetime should inherit it from the stream slicer + if not self._start_datetime.datetime_format: + self._start_datetime.datetime_format = self.datetime_format + if self._end_datetime and not self._end_datetime.datetime_format: + self._end_datetime.datetime_format = self.datetime_format + + if not self.cursor_datetime_formats: + self.cursor_datetime_formats = [self.datetime_format] + + def get_stream_state(self) -> StreamState: + return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {} # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ + + def set_initial_state(self, stream_state: StreamState) -> None: + """ + Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called + before calling anything else + + :param stream_state: The state of the stream as returned by get_stream_state + """ + self._cursor = stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ + + def observe(self, stream_slice: StreamSlice, record: Record) -> None: + """ + Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read. + + :param stream_slice: The current slice, which may or may not contain the most recently observed record + :param record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the + stream state may need to be deferred depending on whether the source reliably orders records by the cursor field. 
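A minimal sketch of the date-range slicing described in the class docstring above, assuming a one-day step; the cursor field name and date values are assumptions for the example, and `start_time`/`end_time` are the default partition field names.

```python
from airbyte_cdk.sources.declarative.incremental import DatetimeBasedCursor

cursor = DatetimeBasedCursor(
    start_datetime="2024-01-01T00:00:00Z",
    end_datetime="2024-01-03T00:00:00Z",
    cursor_field="updated_at",
    datetime_format="%Y-%m-%dT%H:%M:%SZ",
    step="P1D",                # one-day windows
    cursor_granularity="PT1S", # keeps windows from overlapping by one second
    config={},
    parameters={},
)
for stream_slice in cursor.stream_slices():
    print(stream_slice.cursor_slice)
# {'start_time': '2024-01-01T00:00:00Z', 'end_time': '2024-01-01T23:59:59Z'}
# {'start_time': '2024-01-02T00:00:00Z', 'end_time': '2024-01-02T23:59:59Z'}
# {'start_time': '2024-01-03T00:00:00Z', 'end_time': '2024-01-03T00:00:00Z'}
```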
+ """ + record_cursor_value = record.get(self.cursor_field.eval(self.config)) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ + # if the current record has no cursor value, we cannot meaningfully update the state based on it, so there is nothing more to do + if not record_cursor_value: + return + + start_field = self._partition_field_start.eval(self.config) + end_field = self._partition_field_end.eval(self.config) + is_highest_observed_cursor_value = not self._highest_observed_cursor_field_value or self.parse_date( + record_cursor_value + ) > self.parse_date(self._highest_observed_cursor_field_value) + if ( + self._is_within_daterange_boundaries(record, stream_slice.get(start_field), stream_slice.get(end_field)) # type: ignore # we know that stream_slices for these cursors will use a string representing an unparsed date + and is_highest_observed_cursor_value + ): + self._highest_observed_cursor_field_value = record_cursor_value + + def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: + if stream_slice.partition: + raise ValueError(f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}.") + cursor_value_str_by_cursor_value_datetime = dict( + map( + # we need to ensure the cursor value is preserved as is in the state else the CATs might complain of something like + # 2023-01-04T17:30:19.000Z' <= '2023-01-04T17:30:19.000000Z' + lambda datetime_str: (self.parse_date(datetime_str), datetime_str), # type: ignore # because of the filter on the next line, this will only be called with a str + filter(lambda item: item, [self._cursor, self._highest_observed_cursor_field_value]), + ) + ) + self._cursor = ( + cursor_value_str_by_cursor_value_datetime[max(cursor_value_str_by_cursor_value_datetime.keys())] + if cursor_value_str_by_cursor_value_datetime + else None + ) + + def stream_slices(self) -> Iterable[StreamSlice]: + """ + Partition the daterange into slices of size = step. + + The start of the window is the minimum datetime between start_datetime - lookback_window and the stream_state's datetime + The end of the window is the minimum datetime between the start of the window and end_datetime. + + :return: + """ + end_datetime = self.select_best_end_datetime() + start_datetime = self._calculate_earliest_possible_value(self.select_best_end_datetime()) + return self._partition_daterange(start_datetime, end_datetime, self._step) + + def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: + # Datetime based cursors operate over slices made up of datetime ranges. Stream state is based on the progress + # through each slice and does not belong to a specific slice. We just return stream state as it is. + return self.get_stream_state() + + def _calculate_earliest_possible_value(self, end_datetime: datetime.datetime) -> datetime.datetime: + lookback_delta = self._parse_timedelta(self._lookback_window.eval(self.config) if self._lookback_window else "P0D") + earliest_possible_start_datetime = min(self._start_datetime.get_datetime(self.config), end_datetime) + try: + cursor_datetime = self._calculate_cursor_datetime_from_state(self.get_stream_state()) - lookback_delta + except OverflowError: + # cursor_datetime defers to the minimum date if it does not exist in the state. 
Trying to subtract + # a timedelta from the minimum datetime results in an OverflowError + cursor_datetime = self._calculate_cursor_datetime_from_state(self.get_stream_state()) + return max(earliest_possible_start_datetime, cursor_datetime) + + def select_best_end_datetime(self) -> datetime.datetime: + """ + Returns the optimal end datetime. + This method compares the current datetime with a pre-configured end datetime + and returns the earlier of the two. If no pre-configured end datetime is set, + the current datetime is returned. + + :return datetime.datetime: The best end datetime, which is either the current datetime or the pre-configured end datetime, whichever is earlier. + """ + now = datetime.datetime.now(tz=self._timezone) + if not self._end_datetime: + return now + return min(self._end_datetime.get_datetime(self.config), now) + + def _calculate_cursor_datetime_from_state(self, stream_state: Mapping[str, Any]) -> datetime.datetime: + if self.cursor_field.eval(self.config, stream_state=stream_state) in stream_state: # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ + return self.parse_date(stream_state[self.cursor_field.eval(self.config)]) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ + return datetime.datetime.min.replace(tzinfo=datetime.timezone.utc) + + def _format_datetime(self, dt: datetime.datetime) -> str: + return self._parser.format(dt, self.datetime_format) + + def _partition_daterange( + self, start: datetime.datetime, end: datetime.datetime, step: Union[datetime.timedelta, Duration] + ) -> List[StreamSlice]: + start_field = self._partition_field_start.eval(self.config) + end_field = self._partition_field_end.eval(self.config) + dates = [] + + while self._is_within_date_range(start, end): + next_start = self._evaluate_next_start_date_safely(start, step) + end_date = self._get_date(next_start - self._cursor_granularity, end, min) + dates.append( + StreamSlice( + partition={}, cursor_slice={start_field: self._format_datetime(start), end_field: self._format_datetime(end_date)} + ) + ) + start = next_start + return dates + + def _is_within_date_range(self, start: datetime.datetime, end: datetime.datetime) -> bool: + if self.is_compare_strictly: + return start < end + return start <= end + + def _evaluate_next_start_date_safely(self, start: datetime.datetime, step: datetime.timedelta) -> datetime.datetime: + """ + Given that we set the default step at datetime.timedelta.max, we will generate an OverflowError when evaluating the next start_date + This method assumes that users would never enter a step that would generate an overflow. Given that would be the case, the code + would have broken anyway. 
+ """ + try: + return start + step + except OverflowError: + return datetime.datetime.max.replace(tzinfo=datetime.timezone.utc) + + def _get_date( + self, + cursor_value: datetime.datetime, + default_date: datetime.datetime, + comparator: Callable[[datetime.datetime, datetime.datetime], datetime.datetime], + ) -> datetime.datetime: + cursor_date = cursor_value or default_date + return comparator(cursor_date, default_date) + + def parse_date(self, date: str) -> datetime.datetime: + for datetime_format in self.cursor_datetime_formats + [self.datetime_format]: + try: + return self._parser.parse(date, datetime_format) + except ValueError: + pass + raise ValueError(f"No format in {self.cursor_datetime_formats} matching {date}") + + @classmethod + def _parse_timedelta(cls, time_str: Optional[str]) -> Union[datetime.timedelta, Duration]: + """ + :return Parses an ISO 8601 durations into datetime.timedelta or Duration objects. + """ + if not time_str: + return datetime.timedelta(0) + return parse_duration(time_str) + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.request_parameter, stream_slice) + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.header, stream_slice) + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.body_data, stream_slice) + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.body_json, stream_slice) + + def request_kwargs(self) -> Mapping[str, Any]: + # Never update kwargs + return {} + + def _get_request_options(self, option_type: RequestOptionType, stream_slice: Optional[StreamSlice]) -> Mapping[str, Any]: + options: MutableMapping[str, Any] = {} + if not stream_slice: + return options + if self.start_time_option and self.start_time_option.inject_into == option_type: + options[self.start_time_option.field_name.eval(config=self.config)] = stream_slice.get( # type: ignore # field_name is always casted to an interpolated string + self._partition_field_start.eval(self.config) + ) + if self.end_time_option and self.end_time_option.inject_into == option_type: + options[self.end_time_option.field_name.eval(config=self.config)] = stream_slice.get(self._partition_field_end.eval(self.config)) # type: ignore # field_name is always casted to an interpolated string + return options + + def should_be_synced(self, record: Record) -> bool: + cursor_field = self.cursor_field.eval(self.config) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ + record_cursor_value = record.get(cursor_field) + if not record_cursor_value: + self._send_log( + Level.WARN, + f"Could not find cursor field `{cursor_field}` in record. 
The incremental sync will assume it needs to be synced", + ) + return True + latest_possible_cursor_value = self.select_best_end_datetime() + earliest_possible_cursor_value = self._calculate_earliest_possible_value(latest_possible_cursor_value) + return self._is_within_daterange_boundaries(record, earliest_possible_cursor_value, latest_possible_cursor_value) + + def _is_within_daterange_boundaries( + self, record: Record, start_datetime_boundary: Union[datetime.datetime, str], end_datetime_boundary: Union[datetime.datetime, str] + ) -> bool: + cursor_field = self.cursor_field.eval(self.config) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ + record_cursor_value = record.get(cursor_field) + if not record_cursor_value: + self._send_log( + Level.WARN, + f"Could not find cursor field `{cursor_field}` in record. The record will not be considered when emitting sync state", + ) + return False + if isinstance(start_datetime_boundary, str): + start_datetime_boundary = self.parse_date(start_datetime_boundary) + if isinstance(end_datetime_boundary, str): + end_datetime_boundary = self.parse_date(end_datetime_boundary) + return start_datetime_boundary <= self.parse_date(record_cursor_value) <= end_datetime_boundary + + def _send_log(self, level: Level, message: str) -> None: + if self.message_repository: + self.message_repository.emit_message( + AirbyteMessage( + type=Type.LOG, + log=AirbyteLogMessage(level=level, message=message), + ) + ) + + def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: + cursor_field = self.cursor_field.eval(self.config) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ + first_cursor_value = first.get(cursor_field) + second_cursor_value = second.get(cursor_field) + if first_cursor_value and second_cursor_value: + return self.parse_date(first_cursor_value) >= self.parse_date(second_cursor_value) + elif first_cursor_value: + return True + else: + return False + + def set_runtime_lookback_window(self, lookback_window_in_seconds: int) -> None: + """ + Updates the lookback window based on a given number of seconds if the new duration + is greater than the currently configured lookback window. + + :param lookback_window_in_seconds: The lookback duration in seconds to potentially update to. + """ + runtime_lookback_window = duration_isoformat(timedelta(seconds=lookback_window_in_seconds)) + config_lookback = parse_duration(self._lookback_window.eval(self.config) if self._lookback_window else "P0D") + + # Check if the new runtime lookback window is greater than the current config lookback + if parse_duration(runtime_lookback_window) > config_lookback: + self._lookback_window = InterpolatedString.create(runtime_lookback_window, parameters={}) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/declarative_cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/declarative_cursor.py new file mode 100644 index 000000000000..adb64d119039 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/declarative_cursor.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from abc import ABC + +from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer +from airbyte_cdk.sources.streams.checkpoint.cursor import Cursor + + +class DeclarativeCursor(Cursor, StreamSlicer, ABC): + """ + DeclarativeCursors are components that allow for checkpointing syncs. 
In addition to managing the fetching and updating of + state, declarative cursors also manage stream slicing and injecting slice values into outbound requests. + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py new file mode 100644 index 000000000000..f7454ef083a5 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py @@ -0,0 +1,333 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import threading +import time +from typing import Any, Callable, Iterable, Mapping, Optional, TypeVar, Union + +from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor +from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor +from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter +from airbyte_cdk.sources.types import Record, StreamSlice, StreamState + +T = TypeVar("T") + + +def iterate_with_last_flag_and_state( + generator: Iterable[T], get_stream_state_func: Callable[[], Optional[Mapping[str, StreamState]]] +) -> Iterable[tuple[T, bool, Any]]: + """ + Iterates over the given generator, yielding tuples containing the element, a flag + indicating whether it's the last element in the generator, and the result of + `get_stream_state_func` applied to the element. + + Args: + generator: The iterable to iterate over. + get_stream_state_func: A function that takes an element from the generator and + returns its state. + + Returns: + An iterator that yields tuples of the form (element, is_last, state). + """ + + iterator = iter(generator) + + try: + current = next(iterator) + state = get_stream_state_func() + except StopIteration: + return # Return an empty iterator + + for next_item in iterator: + yield current, False, state + current = next_item + state = get_stream_state_func() + + yield current, True, state + + +class Timer: + """ + A simple timer class that measures elapsed time in seconds using a high-resolution performance counter. + """ + + def __init__(self) -> None: + self._start: Optional[int] = None + + def start(self) -> None: + self._start = time.perf_counter_ns() + + def finish(self) -> int: + if self._start: + return ((time.perf_counter_ns() - self._start) / 1e9).__ceil__() + else: + raise RuntimeError("Global substream cursor timer not started") + + +class GlobalSubstreamCursor(DeclarativeCursor): + """ + The GlobalSubstreamCursor is designed to track the state of substreams using a single global cursor. + This class is beneficial for streams with many partitions, as it allows the state to be managed globally + instead of per partition, simplifying state management and reducing the size of state messages. + + This cursor is activated by setting the `global_substream_cursor` parameter for incremental sync. + + Warnings: + - This class enforces a minimal lookback window for substream based on the duration of the previous sync to avoid losing records. This lookback ensures that any records added or updated during the sync are captured in subsequent syncs. + - The global cursor is updated only at the end of the sync. If the sync ends prematurely (e.g., due to an exception), the state will not be updated. + - When using the `incremental_dependency` option, the sync will progress through parent records, preventing the sync from getting infinitely stuck. 
However, it is crucial to understand the requirements for both the `global_substream_cursor` and `incremental_dependency` options to avoid data loss. + """ + + def __init__(self, stream_cursor: DatetimeBasedCursor, partition_router: PartitionRouter): + self._stream_cursor = stream_cursor + self._partition_router = partition_router + self._timer = Timer() + self._lock = threading.Lock() + self._slice_semaphore = threading.Semaphore(0) # Start with 0, indicating no slices being tracked + self._all_slices_yielded = False + self._lookback_window: Optional[int] = None + self._current_partition: Optional[Mapping[str, Any]] = None + self._last_slice: bool = False + self._parent_state: Optional[Mapping[str, Any]] = None + + def start_slices_generation(self) -> None: + self._timer.start() + + def stream_slices(self) -> Iterable[StreamSlice]: + """ + Generates stream slices, ensuring the last slice is properly flagged and processed. + + This method creates a sequence of stream slices by iterating over partitions and cursor slices. + It holds onto one slice in memory to set `_all_slices_yielded` to `True` before yielding the + final slice. A semaphore is used to track the processing of slices, ensuring that `close_slice` + is called only after all slices have been processed. + + We expect the following events: + * Yields all the slices except the last one. At this point, `close_slice` won't actually close the global slice as `self._all_slices_yielded == False` + * Release the semaphore one last time before setting `self._all_slices_yielded = True`. This will cause `close_slice` to know about all the slices before we indicate that all slices have been yielded so the left side of `if self._all_slices_yielded and self._slice_semaphore._value == 0` will be false if not everything is closed + * Setting `self._all_slices_yielded = True`. We do that before actually yielding the last slice as the caller of `stream_slices` might stop iterating at any point and hence the code after `yield` might not be executed + * Yield the last slice. At that point, once there are as many slices yielded as closes, the global slice will be closed too + """ + slice_generator = ( + StreamSlice(partition=partition, cursor_slice=cursor_slice) + for partition in self._partition_router.stream_slices() + for cursor_slice in self._stream_cursor.stream_slices() + ) + + self.start_slices_generation() + for slice, last, state in iterate_with_last_flag_and_state(slice_generator, self._partition_router.get_stream_state): + self._parent_state = state + self.register_slice(last) + yield slice + self._parent_state = self._partition_router.get_stream_state() + + def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]: + slice_generator = ( + StreamSlice(partition=partition, cursor_slice=cursor_slice) for cursor_slice in self._stream_cursor.stream_slices() + ) + + yield from slice_generator + + def register_slice(self, last: bool) -> None: + """ + Tracks the processing of a stream slice. + + Releases the semaphore for each slice. If it's the last slice (`last=True`), + sets `_all_slices_yielded` to `True` to indicate no more slices will be processed. + + Args: + last (bool): True if the current slice is the last in the sequence. + """ + self._slice_semaphore.release() + if last: + self._all_slices_yielded = True + + def set_initial_state(self, stream_state: StreamState) -> None: + """ + Set the initial state for the cursors. 
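The slice bookkeeping above relies on the `iterate_with_last_flag_and_state` helper defined earlier in this file. A minimal sketch of its behaviour, using a stand-in state function (the real caller passes `partition_router.get_stream_state`):

```python
calls = {"count": 0}

def fake_get_stream_state():
    # Stand-in for PartitionRouter.get_stream_state: takes no arguments, returns the current state
    calls["count"] += 1
    return {"snapshot": calls["count"]}

for item, is_last, state in iterate_with_last_flag_and_state(["a", "b", "c"], fake_get_stream_state):
    print(item, is_last, state)

# a False {'snapshot': 1}
# b False {'snapshot': 2}
# c True  {'snapshot': 3}
```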
+ + This method initializes the state for the global cursor using the provided stream state. + + Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router + does not have parent streams, this step will be skipped due to the default PartitionRouter implementation. + + Args: + stream_state (StreamState): The state of the streams to be set. The format of the stream state should be: + { + "state": { + "last_updated": "2023-05-27T00:00:00Z" + }, + "parent_state": { + "parent_stream_name": { + "last_updated": "2023-05-27T00:00:00Z" + } + }, + "lookback_window": 132 + } + """ + if not stream_state: + return + + if "lookback_window" in stream_state: + self._lookback_window = stream_state["lookback_window"] + self._inject_lookback_into_stream_cursor(stream_state["lookback_window"]) + + if "state" in stream_state: + self._stream_cursor.set_initial_state(stream_state["state"]) + elif "states" not in stream_state: + # We assume that `stream_state` is in the old global format + # Example: {"global_state_format_key": "global_state_format_value"} + self._stream_cursor.set_initial_state(stream_state) + + # Set parent state for partition routers based on parent streams + self._partition_router.set_initial_state(stream_state) + + def _inject_lookback_into_stream_cursor(self, lookback_window: int) -> None: + """ + Modifies the stream cursor's lookback window based on the duration of the previous sync. + This adjustment ensures the cursor is set to the minimal lookback window necessary for + avoiding missing data. + + Parameters: + lookback_window (int): The lookback duration in seconds to be set, derived from + the previous sync. + + Raises: + ValueError: If the cursor does not support dynamic lookback window adjustments. + """ + if hasattr(self._stream_cursor, "set_runtime_lookback_window"): + self._stream_cursor.set_runtime_lookback_window(lookback_window) + else: + raise ValueError("The cursor class for Global Substream Cursor does not have a set_runtime_lookback_window method") + + def observe(self, stream_slice: StreamSlice, record: Record) -> None: + self._stream_cursor.observe(StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), record) + + def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: + """ + Close the current stream slice. + + This method is called when a stream slice is completed. For the global parent cursor, we close the child cursor + only after reading all slices. This ensures that we do not miss any child records from a later parent record + if the child cursor is earlier than a record from the first parent record. + + Args: + stream_slice (StreamSlice): The stream slice to be closed. + *args (Any): Additional arguments. 
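The `register_slice`/`close_slice` accounting can be pictured with a stand-alone sketch: every yielded slice releases the semaphore once, every closed slice acquires it once, and the global cursor is only advanced when the last slice has been yielded and the count is back to zero. The names below are illustrative, not part of the class:

```python
import threading

slice_semaphore = threading.Semaphore(0)
all_slices_yielded = False

def on_slice_yielded(last: bool) -> None:
    global all_slices_yielded
    slice_semaphore.release()        # one release per yielded slice
    if last:
        all_slices_yielded = True

def on_slice_closed() -> None:
    slice_semaphore.acquire()        # one acquire per processed slice
    if all_slices_yielded and slice_semaphore._value == 0:
        print("every yielded slice has been closed -> update the global cursor")

for last in (False, False, True):
    on_slice_yielded(last)
    on_slice_closed()                # prints only on the third iteration
```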
+ """ + with self._lock: + self._slice_semaphore.acquire() + if self._all_slices_yielded and self._slice_semaphore._value == 0: + self._lookback_window = self._timer.finish() + self._stream_cursor.close_slice(StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), *args) + + def get_stream_state(self) -> StreamState: + state: dict[str, Any] = {"state": self._stream_cursor.get_stream_state()} + + if self._parent_state: + state["parent_state"] = self._parent_state + + if self._lookback_window is not None: + state["lookback_window"] = self._lookback_window + + return state + + def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: + # stream_slice is ignored as cursor is global + return self._stream_cursor.get_stream_state() + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + if stream_slice: + return self._partition_router.get_request_params( # type: ignore # this always returns a mapping + stream_state=stream_state, + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + next_page_token=next_page_token, + ) | self._stream_cursor.get_request_params( + stream_state=stream_state, + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + next_page_token=next_page_token, + ) + else: + raise ValueError("A partition needs to be provided in order to get request params") + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + if stream_slice: + return self._partition_router.get_request_headers( # type: ignore # this always returns a mapping + stream_state=stream_state, + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + next_page_token=next_page_token, + ) | self._stream_cursor.get_request_headers( + stream_state=stream_state, + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + next_page_token=next_page_token, + ) + else: + raise ValueError("A partition needs to be provided in order to get request headers") + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Union[Mapping[str, Any], str]: + if stream_slice: + return self._partition_router.get_request_body_data( # type: ignore # this always returns a mapping + stream_state=stream_state, + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + next_page_token=next_page_token, + ) | self._stream_cursor.get_request_body_data( + stream_state=stream_state, + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + next_page_token=next_page_token, + ) + else: + raise ValueError("A partition needs to be provided in order to get request body data") + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + if stream_slice: + return self._partition_router.get_request_body_json( # type: ignore # this always returns a mapping + stream_state=stream_state, + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + 
next_page_token=next_page_token, + ) | self._stream_cursor.get_request_body_json( + stream_state=stream_state, + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + next_page_token=next_page_token, + ) + else: + raise ValueError("A partition needs to be provided in order to get request body json") + + def should_be_synced(self, record: Record) -> bool: + return self._stream_cursor.should_be_synced(self._convert_record_to_cursor_record(record)) + + def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: + return self._stream_cursor.is_greater_than_or_equal( + self._convert_record_to_cursor_record(first), self._convert_record_to_cursor_record(second) + ) + + @staticmethod + def _convert_record_to_cursor_record(record: Record) -> Record: + return Record( + record.data, + StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice) if record.associated_slice else None, + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py new file mode 100644 index 000000000000..86236ec92230 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py @@ -0,0 +1,314 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from collections import OrderedDict +from typing import Any, Callable, Iterable, Mapping, Optional, Union + +from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor +from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter +from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import PerPartitionKeySerializer +from airbyte_cdk.sources.types import Record, StreamSlice, StreamState + +logger = logging.getLogger("airbyte") + + +class CursorFactory: + def __init__(self, create_function: Callable[[], DeclarativeCursor]): + self._create_function = create_function + + def create(self) -> DeclarativeCursor: + return self._create_function() + + +class PerPartitionCursor(DeclarativeCursor): + """ + Manages state per partition when a stream has many partitions, to prevent data loss or duplication. + + **Partition Limitation and Limit Reached Logic** + + - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000). + - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition. + - **_over_limit**: A counter that increments each time an oldest partition is removed when the limit is exceeded. + + The class ensures that the number of partitions tracked does not exceed the `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage. + + - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly. + - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors. + + This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed. 
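The partition-limit behaviour described above boils down to an `OrderedDict` used as a bounded FIFO. A stand-alone sketch with a limit of 3 instead of 10,000, purely for illustration:

```python
from collections import OrderedDict

MAX_PARTITIONS = 3
cursor_per_partition: "OrderedDict[str, dict]" = OrderedDict()
over_limit = 0

def track_partition(partition_key: str) -> None:
    global over_limit
    # Drop the oldest partitions before inserting, mirroring _ensure_partition_limit
    while len(cursor_per_partition) > MAX_PARTITIONS - 1:
        over_limit += 1
        oldest, _ = cursor_per_partition.popitem(last=False)
        print(f"dropping oldest partition {oldest} (over limit: {over_limit})")
    cursor_per_partition[partition_key] = {"state": {}}

for key in ["p1", "p2", "p3", "p4", "p5"]:
    track_partition(key)

# p1 and p2 are evicted; in the real class, limit_reached() only returns True once
# over_limit itself exceeds the configured maximum.
```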
+ """ + + DEFAULT_MAX_PARTITIONS_NUMBER = 10000 + _NO_STATE: Mapping[str, Any] = {} + _NO_CURSOR_STATE: Mapping[str, Any] = {} + _KEY = 0 + _VALUE = 1 + _state_to_migrate_from: Mapping[str, Any] = {} + + def __init__(self, cursor_factory: CursorFactory, partition_router: PartitionRouter): + self._cursor_factory = cursor_factory + self._partition_router = partition_router + # The dict is ordered to ensure that once the maximum number of partitions is reached, + # the oldest partitions can be efficiently removed, maintaining the most recent partitions. + self._cursor_per_partition: OrderedDict[str, DeclarativeCursor] = OrderedDict() + self._over_limit = 0 + self._partition_serializer = PerPartitionKeySerializer() + + def stream_slices(self) -> Iterable[StreamSlice]: + slices = self._partition_router.stream_slices() + for partition in slices: + yield from self.generate_slices_from_partition(partition) + + def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]: + # Ensure the maximum number of partitions is not exceeded + self._ensure_partition_limit() + + cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition)) + if not cursor: + partition_state = self._state_to_migrate_from if self._state_to_migrate_from else self._NO_CURSOR_STATE + cursor = self._create_cursor(partition_state) + self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor + + for cursor_slice in cursor.stream_slices(): + yield StreamSlice(partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields) + + def _ensure_partition_limit(self) -> None: + """ + Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped. + """ + while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1: + self._over_limit += 1 + oldest_partition = self._cursor_per_partition.popitem(last=False)[0] # Remove the oldest partition + logger.warning( + f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}." + ) + + def limit_reached(self) -> bool: + return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER + + def set_initial_state(self, stream_state: StreamState) -> None: + """ + Set the initial state for the cursors. + + This method initializes the state for each partition cursor using the provided stream state. + If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state. + + Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router + does not have parent streams, this step will be skipped due to the default PartitionRouter implementation. + + Args: + stream_state (StreamState): The state of the streams to be set. The format of the stream state should be: + { + "states": [ + { + "partition": { + "partition_key": "value" + }, + "cursor": { + "last_updated": "2023-05-27T00:00:00Z" + } + } + ], + "parent_state": { + "parent_stream_name": { + "last_updated": "2023-05-27T00:00:00Z" + } + } + } + """ + if not stream_state: + return + + if "states" not in stream_state: + # We assume that `stream_state` is in a global format that can be applied to all partitions. 
+ # Example: {"global_state_format_key": "global_state_format_value"} + self._state_to_migrate_from = stream_state + + else: + for state in stream_state["states"]: + self._cursor_per_partition[self._to_partition_key(state["partition"])] = self._create_cursor(state["cursor"]) + + # set default state for missing partitions if it is per partition with fallback to global + if "state" in stream_state: + self._state_to_migrate_from = stream_state["state"] + + # Set parent state for partition routers based on parent streams + self._partition_router.set_initial_state(stream_state) + + def observe(self, stream_slice: StreamSlice, record: Record) -> None: + self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].observe( + StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), record + ) + + def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: + try: + self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].close_slice( + StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), *args + ) + except KeyError as exception: + raise ValueError( + f"Partition {str(exception)} could not be found in current state based on the record. This is unexpected because " + f"we should only update state for partitions that were emitted during `stream_slices`" + ) + + def get_stream_state(self) -> StreamState: + states = [] + for partition_tuple, cursor in self._cursor_per_partition.items(): + cursor_state = cursor.get_stream_state() + if cursor_state: + states.append( + { + "partition": self._to_dict(partition_tuple), + "cursor": cursor_state, + } + ) + state: dict[str, Any] = {"states": states} + + parent_state = self._partition_router.get_stream_state() + if parent_state: + state["parent_state"] = parent_state + return state + + def _get_state_for_partition(self, partition: Mapping[str, Any]) -> Optional[StreamState]: + cursor = self._cursor_per_partition.get(self._to_partition_key(partition)) + if cursor: + return cursor.get_stream_state() + + return None + + @staticmethod + def _is_new_state(stream_state: Mapping[str, Any]) -> bool: + return not bool(stream_state) + + def _to_partition_key(self, partition: Mapping[str, Any]) -> str: + return self._partition_serializer.to_partition_key(partition) + + def _to_dict(self, partition_key: str) -> Mapping[str, Any]: + return self._partition_serializer.to_partition(partition_key) + + def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: + if not stream_slice: + raise ValueError("A partition needs to be provided in order to extract a state") + + if not stream_slice: + return None + + return self._get_state_for_partition(stream_slice.partition) + + def _create_cursor(self, cursor_state: Any) -> DeclarativeCursor: + cursor = self._cursor_factory.create() + cursor.set_initial_state(cursor_state) + return cursor + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + if stream_slice: + return self._partition_router.get_request_params( # type: ignore # this always returns a mapping + stream_state=stream_state, + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + next_page_token=next_page_token, + ) | self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].get_request_params( + stream_state=stream_state, + stream_slice=StreamSlice(partition={}, 
cursor_slice=stream_slice.cursor_slice), + next_page_token=next_page_token, + ) + else: + raise ValueError("A partition needs to be provided in order to get request params") + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + if stream_slice: + return self._partition_router.get_request_headers( # type: ignore # this always returns a mapping + stream_state=stream_state, + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + next_page_token=next_page_token, + ) | self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].get_request_headers( + stream_state=stream_state, + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + next_page_token=next_page_token, + ) + else: + raise ValueError("A partition needs to be provided in order to get request headers") + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Union[Mapping[str, Any], str]: + if stream_slice: + return self._partition_router.get_request_body_data( # type: ignore # this always returns a mapping + stream_state=stream_state, + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + next_page_token=next_page_token, + ) | self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].get_request_body_data( + stream_state=stream_state, + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + next_page_token=next_page_token, + ) + else: + raise ValueError("A partition needs to be provided in order to get request body data") + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + if stream_slice: + return self._partition_router.get_request_body_json( # type: ignore # this always returns a mapping + stream_state=stream_state, + stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), + next_page_token=next_page_token, + ) | self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].get_request_body_json( + stream_state=stream_state, + stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), + next_page_token=next_page_token, + ) + else: + raise ValueError("A partition needs to be provided in order to get request body json") + + def should_be_synced(self, record: Record) -> bool: + return self._get_cursor(record).should_be_synced(self._convert_record_to_cursor_record(record)) + + def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: + if not first.associated_slice or not second.associated_slice: + raise ValueError(f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}") + if first.associated_slice.partition != second.associated_slice.partition: + raise ValueError( + f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}" + ) + + return self._get_cursor(first).is_greater_than_or_equal( + self._convert_record_to_cursor_record(first), self._convert_record_to_cursor_record(second) + ) + + @staticmethod + def 
_convert_record_to_cursor_record(record: Record) -> Record: + return Record( + record.data, + StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice) if record.associated_slice else None, + ) + + def _get_cursor(self, record: Record) -> DeclarativeCursor: + if not record.associated_slice: + raise ValueError("Invalid state as stream slices that are emitted should refer to an existing cursor") + partition_key = self._to_partition_key(record.associated_slice.partition) + if partition_key not in self._cursor_per_partition: + raise ValueError("Invalid state as stream slices that are emitted should refer to an existing cursor") + cursor = self._cursor_per_partition[partition_key] + return cursor diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py new file mode 100644 index 000000000000..d5ad6b40d1b8 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py @@ -0,0 +1,188 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +from typing import Any, Iterable, Mapping, MutableMapping, Optional, Union + +from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor +from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor +from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import GlobalSubstreamCursor, iterate_with_last_flag_and_state +from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import CursorFactory, PerPartitionCursor +from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter +from airbyte_cdk.sources.types import Record, StreamSlice, StreamState + + +class PerPartitionWithGlobalCursor(DeclarativeCursor): + """ + Manages state for streams with multiple partitions, with an optional fallback to a global cursor when specific conditions are met. + + This cursor handles partitioned streams by maintaining individual state per partition using `PerPartitionCursor`. If the number of partitions exceeds a defined limit, it switches to a global cursor (`GlobalSubstreamCursor`) to manage state more efficiently. + + **Overview** + + - **Partition-Based State**: Initially manages state per partition to ensure accurate processing of each partition's data. + - **Global Fallback**: Switches to a global cursor when the partition limit is exceeded to handle state management more effectively. + + **Switching Logic** + + - Monitors the number of partitions. + - If `PerPartitionCursor.limit_reached()` returns `True`, sets `_use_global_cursor` to `True`, activating the global cursor. + + **Active Cursor Selection** + + - Uses the `_get_active_cursor()` helper method to select the active cursor based on the `_use_global_cursor` flag. + - This simplifies the logic and ensures consistent cursor usage across methods. + + **State Structure Example** + + ```json + { + "states": [ + { + "partition": {"partition_key": "partition_1"}, + "cursor": {"cursor_field": "2021-01-15"} + }, + { + "partition": {"partition_key": "partition_2"}, + "cursor": {"cursor_field": "2021-02-14"} + } + ], + "state": { + "cursor_field": "2021-02-15" + }, + "use_global_cursor": false + } + ``` + + In this example, the cursor is using partition-based state management (`"use_global_cursor": false`), maintaining separate cursor states for each partition. 
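Combining the structure above with the parent-state and lookback fields documented for `GlobalSubstreamCursor.set_initial_state`, a fully populated state message for this cursor might look like the following sketch (field names and values are illustrative):

```python
stream_state = {
    "use_global_cursor": False,
    "states": [
        {"partition": {"partition_key": "partition_1"}, "cursor": {"cursor_field": "2021-01-15"}},
        {"partition": {"partition_key": "partition_2"}, "cursor": {"cursor_field": "2021-02-14"}},
    ],
    "state": {"cursor_field": "2021-02-15"},   # global fallback cursor
    "lookback_window": 132,                    # seconds, derived from the previous sync's duration
    "parent_state": {"parent_stream_name": {"cursor_field": "2021-02-15"}},
}
```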
+ + **Usage Scenario** + + Suitable for streams where the number of partitions may vary significantly, requiring dynamic switching between per-partition and global state management to ensure data consistency and efficient synchronization. + """ + + def __init__(self, cursor_factory: CursorFactory, partition_router: PartitionRouter, stream_cursor: DatetimeBasedCursor): + self._partition_router = partition_router + self._per_partition_cursor = PerPartitionCursor(cursor_factory, partition_router) + self._global_cursor = GlobalSubstreamCursor(stream_cursor, partition_router) + self._use_global_cursor = False + self._current_partition: Optional[Mapping[str, Any]] = None + self._last_slice: bool = False + self._parent_state: Optional[Mapping[str, Any]] = None + + def _get_active_cursor(self) -> Union[PerPartitionCursor, GlobalSubstreamCursor]: + return self._global_cursor if self._use_global_cursor else self._per_partition_cursor + + def stream_slices(self) -> Iterable[StreamSlice]: + self._global_cursor.start_slices_generation() + + # Iterate through partitions and process slices + for partition, is_last_partition, parent_state in iterate_with_last_flag_and_state( + self._partition_router.stream_slices(), self._partition_router.get_stream_state + ): + # Generate slices for the current cursor and handle the last slice using the flag + self._parent_state = parent_state + for slice, is_last_slice, _ in iterate_with_last_flag_and_state( + self._get_active_cursor().generate_slices_from_partition(partition=partition), lambda: None + ): + self._global_cursor.register_slice(is_last_slice and is_last_partition) + yield slice + self._parent_state = self._partition_router.get_stream_state() + + def set_initial_state(self, stream_state: StreamState) -> None: + """ + Set the initial state for the cursors. 
+ """ + self._use_global_cursor = stream_state.get("use_global_cursor", False) + + self._parent_state = stream_state.get("parent_state", {}) + + self._global_cursor.set_initial_state(stream_state) + if not self._use_global_cursor: + self._per_partition_cursor.set_initial_state(stream_state) + + def observe(self, stream_slice: StreamSlice, record: Record) -> None: + if not self._use_global_cursor and self._per_partition_cursor.limit_reached(): + self._use_global_cursor = True + + if not self._use_global_cursor: + self._per_partition_cursor.observe(stream_slice, record) + self._global_cursor.observe(stream_slice, record) + + def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: + if not self._use_global_cursor: + self._per_partition_cursor.close_slice(stream_slice, *args) + self._global_cursor.close_slice(stream_slice, *args) + + def get_stream_state(self) -> StreamState: + final_state: MutableMapping[str, Any] = {"use_global_cursor": self._use_global_cursor} + + final_state.update(self._global_cursor.get_stream_state()) + if not self._use_global_cursor: + final_state.update(self._per_partition_cursor.get_stream_state()) + + final_state["parent_state"] = self._parent_state + if not final_state.get("parent_state"): + del final_state["parent_state"] + + return final_state + + def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: + return self._get_active_cursor().select_state(stream_slice) + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_active_cursor().get_request_params( + stream_state=stream_state, + stream_slice=stream_slice, + next_page_token=next_page_token, + ) + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_active_cursor().get_request_headers( + stream_state=stream_state, + stream_slice=stream_slice, + next_page_token=next_page_token, + ) + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Union[Mapping[str, Any], str]: + return self._get_active_cursor().get_request_body_data( + stream_state=stream_state, + stream_slice=stream_slice, + next_page_token=next_page_token, + ) + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_active_cursor().get_request_body_json( + stream_state=stream_state, + stream_slice=stream_slice, + next_page_token=next_page_token, + ) + + def should_be_synced(self, record: Record) -> bool: + return self._global_cursor.should_be_synced(record) or self._per_partition_cursor.should_be_synced(record) + + def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: + return self._global_cursor.is_greater_than_or_equal(first, second) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py new file mode 100644 index 000000000000..499220a4c4fd --- /dev/null +++ 
b/airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py @@ -0,0 +1,120 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from dataclasses import InitVar, dataclass +from typing import Any, Iterable, Mapping, Optional + +from airbyte_cdk.sources.declarative.incremental import DeclarativeCursor +from airbyte_cdk.sources.declarative.types import Record, StreamSlice, StreamState +from airbyte_cdk.sources.streams.checkpoint.checkpoint_reader import FULL_REFRESH_COMPLETE_STATE + + +@dataclass +class ResumableFullRefreshCursor(DeclarativeCursor): + parameters: InitVar[Mapping[str, Any]] + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._cursor: StreamState = {} + + def get_stream_state(self) -> StreamState: + return self._cursor + + def set_initial_state(self, stream_state: StreamState) -> None: + self._cursor = stream_state + + def observe(self, stream_slice: StreamSlice, record: Record) -> None: + """ + Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records. + """ + pass + + def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: + # The ResumableFullRefreshCursor doesn't support nested streams yet so receiving a partition is unexpected + if stream_slice.partition: + raise ValueError(f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}.") + self._cursor = stream_slice.cursor_slice + + def should_be_synced(self, record: Record) -> bool: + """ + Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages + that don't have filterable bounds. We should always return them. + """ + return True + + def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: + """ + RFR record don't have ordering to be compared between one another. + """ + return False + + def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: + # A top-level RFR cursor only manages the state of a single partition + return self._cursor + + def stream_slices(self) -> Iterable[StreamSlice]: + """ + Resumable full refresh cursors only return a single slice and can't perform partitioning because iteration is done per-page + along an unbounded set. + """ + yield from [StreamSlice(cursor_slice=self._cursor, partition={})] + + # This is an interesting pattern that might not seem obvious at first glance. 
This cursor itself has no functional need to + # inject any request values into the outbound response because the up-to-date pagination state is already loaded and + # maintained by the paginator component + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + +@dataclass +class ChildPartitionResumableFullRefreshCursor(ResumableFullRefreshCursor): + """ + The Sub-stream Resumable Cursor for Full-Refresh substreams. + Follows the parent type `ResumableFullRefreshCursor` with a small override, + to provide the ability to close the substream's slice once it has finished processing. + + Check the `close_slice` method overide for more info about the actual behaviour of this cursor. + """ + + def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: + """ + Once the current slice has finished syncing: + - paginator returns None + - no more slices to process + + we assume that the records are processed and emitted already, + thus we have to set the cursor to ` __ab_full_refresh_sync_complete: true `, + otherwise there is a risk of Inf. Loop processing the same slice. + """ + self._cursor = FULL_REFRESH_COMPLETE_STATE diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/__init__.py new file mode 100644 index 000000000000..d721b99f1e29 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/__init__.py @@ -0,0 +1,9 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString + +__all__ = ["InterpolatedBoolean", "InterpolatedMapping", "InterpolatedString"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/filters.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/filters.py new file mode 100644 index 000000000000..52d76cab6423 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/filters.py @@ -0,0 +1,120 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import base64 +import hashlib +import json +import re +from typing import Any, Optional + + +def hash(value: Any, hash_type: str = "md5", salt: Optional[str] = None) -> str: + """ + Implementation of a custom Jinja2 hash filter + Hash type defaults to 'md5' if one is not specified. 
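As a point of reference, this filter and the base64 filters defined later in this file reduce to standard-library calls. A sketch of what `{{ value | hash('md5', 'mysalt') }}`, `{{ value | base64encode }}` and `{{ encoded | base64decode }}` compute (the sample value and salt are illustrative):

```python
import base64
import hashlib

value, salt = "user@example.com", "mysalt"

# hash filter: hash the value, then the salt, and return the hex digest
md5 = hashlib.md5()
md5.update(str(value).encode("utf-8"))
md5.update(str(salt).encode("utf-8"))
hashed = md5.hexdigest()

# base64encode / base64decode filters
encoded = base64.b64encode(value.encode("utf-8")).decode()
decoded = base64.b64decode(encoded.encode("utf-8")).decode()
assert decoded == value
```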
+ + If you are using this has function for GDPR compliance, then + you should probably also pass in a salt as discussed in: + https://security.stackexchange.com/questions/202022/hashing-email-addresses-for-gdpr-compliance + + This can be used in a low code connector definition under the AddFields transformation. + For example: + + rates_stream: + $ref: "#/definitions/base_stream" + $parameters: + name: "rates" + primary_key: "date" + path: "/exchangerates_data/latest" + transformations: + - type: AddFields + fields: + - path: ["some_new_path"] + value: "{{ record['rates']['CAD'] | hash('md5', 'mysalt') }}" + + + + :param value: value to be hashed + :param hash_type: valid hash type + :param salt: a salt that will be combined with the value to ensure that the hash created for a given value on this system + is different from the hash created for that value on other systems. + :return: computed hash as a hexadecimal string + """ + hash_func = getattr(hashlib, hash_type, None) + + if hash_func: + hash_obj = hash_func() + hash_obj.update(str(value).encode("utf-8")) + if salt: + hash_obj.update(str(salt).encode("utf-8")) + computed_hash: str = hash_obj.hexdigest() + else: + raise AttributeError("No hashing function named {hname}".format(hname=hash_type)) + + return computed_hash + + +def base64encode(value: str) -> str: + """ + Implementation of a custom Jinja2 base64encode filter + + For example: + + OAuthAuthenticator: + $ref: "#/definitions/OAuthAuthenticator" + $parameters: + name: "client_id" + value: "{{ config['client_id'] | base64encode }}" + + :param value: value to be encoded in base64 + :return: base64 encoded string + """ + + return base64.b64encode(value.encode("utf-8")).decode() + + +def base64decode(value: str) -> str: + """ + Implementation of a custom Jinja2 base64decode filter + + For example: + + OAuthAuthenticator: + $ref: "#/definitions/OAuthAuthenticator" + $parameters: + name: "client_id" + value: "{{ config['client_id'] | base64decode }}" + + :param value: value to be decoded from base64 + :return: base64 decoded string + """ + + return base64.b64decode(value.encode("utf-8")).decode() + + +def string(value: Any) -> str: + """ + Converts the input value to a string. + If the value is already a string, it is returned as is. + Otherwise, the value is interpreted as a json object and wrapped in triple-quotes so it's evalued as a string by the JinjaInterpolation + :param value: the value to convert to a string + :return: string representation of the input value + """ + if isinstance(value, str): + return value + ret = f'"""{json.dumps(value)}"""' + return ret + + +def regex_search(value: str, regex: str) -> str: + """ + Match a regular expression against a string and return the first match group if it exists. + """ + match = re.search(regex, value) + if match and len(match.groups()) > 0: + return match.group(1) + return "" + + +_filters_list = [hash, base64encode, base64decode, string, regex_search] +filters = {f.__name__: f for f in _filters_list} diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py new file mode 100644 index 000000000000..e3c3d6a68290 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py @@ -0,0 +1,48 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from dataclasses import InitVar, dataclass +from typing import Any, Final, List, Mapping + +from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation +from airbyte_cdk.sources.types import Config + +FALSE_VALUES: Final[List[Any]] = ["False", "false", "{}", "[]", "()", "", "0", "0.0", {}, False, [], (), set()] + + +@dataclass +class InterpolatedBoolean: + f""" + Wrapper around a string to be evaluated to a boolean value. + The string will be evaluated as False if it interpolates to a value in {FALSE_VALUES} + + Attributes: + condition (str): The string representing the condition to evaluate to a boolean + """ + condition: str + parameters: InitVar[Mapping[str, Any]] + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._default = "False" + self._interpolation = JinjaInterpolation() + self._parameters = parameters + + def eval(self, config: Config, **additional_parameters: Any) -> bool: + """ + Interpolates the predicate condition string using the config and other optional arguments passed as parameter. + + :param config: The user-provided configuration as specified by the source's spec + :param additional_parameters: Optional parameters used for interpolation + :return: The interpolated string + """ + if isinstance(self.condition, bool): + return self.condition + else: + evaluated = self._interpolation.eval( + self.condition, config, self._default, parameters=self._parameters, **additional_parameters + ) + if evaluated in FALSE_VALUES: + return False + # The presence of a value is generally regarded as truthy, so we treat it as such + return True diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py new file mode 100644 index 000000000000..b0f26e0d9b77 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py @@ -0,0 +1,52 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +from dataclasses import InitVar, dataclass +from typing import Any, Dict, Mapping, Optional + +from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation +from airbyte_cdk.sources.types import Config + + +@dataclass +class InterpolatedMapping: + """ + Wrapper around a Mapping[str, str] where both the keys and values are to be interpolated. + + Attributes: + mapping (Mapping[str, str]): to be evaluated + """ + + mapping: Mapping[str, str] + parameters: InitVar[Mapping[str, Any]] + + def __post_init__(self, parameters: Optional[Mapping[str, Any]]) -> None: + self._interpolation = JinjaInterpolation() + self._parameters = parameters + + def eval(self, config: Config, **additional_parameters: Any) -> Dict[str, Any]: + """ + Wrapper around a Mapping[str, str] that allows for both keys and values to be interpolated. 
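Small usage sketches for the two wrappers in this hunk, `InterpolatedBoolean` above and `InterpolatedMapping` here; the config keys and values are illustrative:

```python
feature_enabled = InterpolatedBoolean(condition="{{ config['enable_feature'] }}", parameters={})
feature_enabled.eval(config={"enable_feature": True})    # True
feature_enabled.eval(config={"enable_feature": False})   # False, as is anything in FALSE_VALUES

query_params = InterpolatedMapping(
    mapping={"{{ config['cursor_param'] }}": "{{ config['start_date'] }}"},
    parameters={},
)
query_params.eval(config={"cursor_param": "updated_after", "start_date": "2024-01-01"})
# {"updated_after": "2024-01-01"}
```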
+ + :param config: The user-provided configuration as specified by the source's spec + :param additional_parameters: Optional parameters used for interpolation + :return: The interpolated mapping + """ + valid_key_types = additional_parameters.pop("valid_key_types", (str,)) + valid_value_types = additional_parameters.pop("valid_value_types", None) + return { + self._interpolation.eval( + name, config, valid_types=valid_key_types, parameters=self._parameters, **additional_parameters + ): self._eval(value, config, valid_types=valid_value_types, **additional_parameters) + for name, value in self.mapping.items() + } + + def _eval(self, value: str, config: Config, **kwargs: Any) -> Any: + # The values in self._mapping can be of Any type + # We only want to interpolate them if they are strings + if isinstance(value, str): + return self._interpolation.eval(value, config, parameters=self._parameters, **kwargs) + else: + return value diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py new file mode 100644 index 000000000000..6c0afde2e44b --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py @@ -0,0 +1,45 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Optional, Union + +from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation +from airbyte_cdk.sources.types import Config + +NestedMappingEntry = Union[dict[str, "NestedMapping"], list["NestedMapping"], str, int, float, bool, None] +NestedMapping = Union[dict[str, NestedMappingEntry], str] + + +@dataclass +class InterpolatedNestedMapping: + """ + Wrapper around a nested dict which can contain lists and primitive values where both the keys and values are interpolated recursively. + + Attributes: + mapping (NestedMapping): to be evaluated + """ + + mapping: NestedMapping + parameters: InitVar[Mapping[str, Any]] + + def __post_init__(self, parameters: Optional[Mapping[str, Any]]) -> None: + self._interpolation = JinjaInterpolation() + self._parameters = parameters + + def eval(self, config: Config, **additional_parameters: Any) -> Any: + return self._eval(self.mapping, config, **additional_parameters) + + def _eval(self, value: Union[NestedMapping, NestedMappingEntry], config: Config, **kwargs: Any) -> Any: + # Recursively interpolate dictionaries and lists + if isinstance(value, str): + return self._interpolation.eval(value, config, parameters=self._parameters, **kwargs) + elif isinstance(value, dict): + interpolated_dict = {self._eval(k, config, **kwargs): self._eval(v, config, **kwargs) for k, v in value.items()} + return {k: v for k, v in interpolated_dict.items() if v is not None} + elif isinstance(value, list): + return [self._eval(v, config, **kwargs) for v in value] + else: + return value diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_string.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_string.py new file mode 100644 index 000000000000..393abc9483b8 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_string.py @@ -0,0 +1,75 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Optional, Union + +from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation +from airbyte_cdk.sources.types import Config + + +@dataclass +class InterpolatedString: + """ + Wrapper around a raw string to be interpolated with the Jinja2 templating engine + + Attributes: + string (str): The string to evalute + default (Optional[str]): The default value to return if the evaluation returns an empty string + parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation + """ + + string: str + parameters: InitVar[Mapping[str, Any]] + default: Optional[str] = None + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self.default = self.default or self.string + self._interpolation = JinjaInterpolation() + self._parameters = parameters + # indicates whether passed string is just a plain string, not Jinja template + # This allows for optimization, but we do not know it yet at this stage + self._is_plain_string = None + + def eval(self, config: Config, **kwargs: Any) -> Any: + """ + Interpolates the input string using the config and other optional arguments passed as parameter. + + :param config: The user-provided configuration as specified by the source's spec + :param kwargs: Optional parameters used for interpolation + :return: The interpolated string + """ + if self._is_plain_string: + return self.string + if self._is_plain_string is None: + # Let's check whether output from evaluation is the same as input. + # This indicates occurrence of a plain string, not a template and we can skip Jinja in subsequent runs. + evaluated = self._interpolation.eval(self.string, config, self.default, parameters=self._parameters, **kwargs) + self._is_plain_string = self.string == evaluated + return evaluated + return self._interpolation.eval(self.string, config, self.default, parameters=self._parameters, **kwargs) + + def __eq__(self, other: Any) -> bool: + if not isinstance(other, InterpolatedString): + return False + return self.string == other.string and self.default == other.default + + @classmethod + def create( + cls, + string_or_interpolated: Union["InterpolatedString", str], + *, + parameters: Mapping[str, Any], + ) -> "InterpolatedString": + """ + Helper function to obtain an InterpolatedString from either a raw string or an InterpolatedString. + + :param string_or_interpolated: either a raw string or an InterpolatedString. + :param parameters: parameters propagated from parent component + :return: InterpolatedString representing the input string. + """ + if isinstance(string_or_interpolated, str): + return InterpolatedString(string=string_or_interpolated, parameters=parameters) + else: + return string_or_interpolated diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolation.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolation.py new file mode 100644 index 000000000000..8a8f05a5ba71 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolation.py @@ -0,0 +1,27 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from typing import Any, Optional + +from airbyte_cdk.sources.types import Config + + +class Interpolation(ABC): + """ + Strategy for evaluating the interpolated value of a string at runtime using Jinja. 
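A small usage sketch for the `InterpolatedString` class shown earlier in this hunk; the path and config key are illustrative:

```python
path = InterpolatedString.create("/v1/{{ config['account_id'] }}/records", parameters={})
path.eval(config={"account_id": "12345"})   # "/v1/12345/records"

# Plain strings short-circuit: after the first evaluation the Jinja engine is skipped entirely
name = InterpolatedString.create("created_at", parameters={})
name.eval(config={})                        # "created_at"
```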
+ """ + + @abstractmethod + def eval(self, input_str: str, config: Config, default: Optional[str] = None, **additional_options: Any) -> Any: + """ + Interpolates the input string using the config, and additional options passed as parameter. + + :param input_str: The string to interpolate + :param config: The user-provided configuration as specified by the source's spec + :param default: Default value to return if the evaluation returns an empty string + :param additional_options: Optional parameters used for interpolation + :return: The interpolated string + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/jinja.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/jinja.py new file mode 100644 index 000000000000..45a93e58252f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/jinja.py @@ -0,0 +1,142 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import ast +from functools import cache +from typing import Any, Mapping, Optional, Tuple, Type + +from airbyte_cdk.sources.declarative.interpolation.filters import filters +from airbyte_cdk.sources.declarative.interpolation.interpolation import Interpolation +from airbyte_cdk.sources.declarative.interpolation.macros import macros +from airbyte_cdk.sources.types import Config +from jinja2 import meta +from jinja2.environment import Template +from jinja2.exceptions import UndefinedError +from jinja2.sandbox import SandboxedEnvironment + + +class StreamPartitionAccessEnvironment(SandboxedEnvironment): + """ + Currently, source-jira is setting an attribute to StreamSlice specific to its use case which because of the PerPartitionCursor is set to + StreamSlice._partition but not exposed through StreamSlice.partition. This is a patch to still allow source-jira to have access to this + parameter + """ + + def is_safe_attribute(self, obj: Any, attr: str, value: Any) -> bool: + if attr in ["_partition"]: + return True + return super().is_safe_attribute(obj, attr, value) # type: ignore # for some reason, mypy says 'Returning Any from function declared to return "bool"' + + +class JinjaInterpolation(Interpolation): + """ + Interpolation strategy using the Jinja2 template engine. + + If the input string is a raw string, the interpolated string will be the same. + `eval("hello world") -> "hello world"` + + The engine will evaluate the content passed within {{}}, interpolating the keys from the config and context-specific arguments. + `eval("hello {{ name }}", name="airbyte") -> "hello airbyte")` + `eval("hello {{ config.name }}", config={"name": "airbyte"}) -> "hello airbyte")` + + In additional to passing additional values through the kwargs argument, macros can be called from within the string interpolation. + For example, + "{{ max(2, 3) }}" will return 3 + + Additional information on jinja templating can be found at https://jinja.palletsprojects.com/en/3.1.x/templates/# + """ + + # These aliases are used to deprecate existing keywords without breaking all existing connectors. + ALIASES = { + "stream_interval": "stream_slice", # Use stream_interval to access incremental_sync values + "stream_partition": "stream_slice", # Use stream_partition to access partition router's values + } + + # These extensions are not installed so they're not currently a problem, + # but we're still explicitely removing them from the jinja context. 
+ # At worst, this is documentation that we do NOT want to include these extensions because of the potential security risks + RESTRICTED_EXTENSIONS = ["jinja2.ext.loopcontrols"] # Adds support for break continue in loops + + # By default, these Python builtin functions are available in the Jinja context. + # We explicitely remove them because of the potential security risk. + # Please add a unit test to test_jinja.py when adding a restriction. + RESTRICTED_BUILTIN_FUNCTIONS = ["range"] # The range function can cause very expensive computations + + def __init__(self) -> None: + self._environment = StreamPartitionAccessEnvironment() + self._environment.filters.update(**filters) + self._environment.globals.update(**macros) + + for extension in self.RESTRICTED_EXTENSIONS: + self._environment.extensions.pop(extension, None) + for builtin in self.RESTRICTED_BUILTIN_FUNCTIONS: + self._environment.globals.pop(builtin, None) + + def eval( + self, + input_str: str, + config: Config, + default: Optional[str] = None, + valid_types: Optional[Tuple[Type[Any]]] = None, + **additional_parameters: Any, + ) -> Any: + context = {"config": config, **additional_parameters} + + for alias, equivalent in self.ALIASES.items(): + if alias in context: + # This is unexpected. We could ignore or log a warning, but failing loudly should result in fewer surprises + raise ValueError( + f"Found reserved keyword {alias} in interpolation context. This is unexpected and indicative of a bug in the CDK." + ) + elif equivalent in context: + context[alias] = context[equivalent] + + try: + if isinstance(input_str, str): + result = self._eval(input_str, context) + if result: + return self._literal_eval(result, valid_types) + else: + # If input is not a string, return it as is + raise Exception(f"Expected a string, got {input_str}") + except UndefinedError: + pass + # If result is empty or resulted in an undefined error, evaluate and return the default string + return self._literal_eval(self._eval(default, context), valid_types) + + def _literal_eval(self, result: Optional[str], valid_types: Optional[Tuple[Type[Any]]]) -> Any: + try: + evaluated = ast.literal_eval(result) # type: ignore # literal_eval is able to handle None + except (ValueError, SyntaxError): + return result + if not valid_types or (valid_types and isinstance(evaluated, valid_types)): + return evaluated + return result + + def _eval(self, s: Optional[str], context: Mapping[str, Any]) -> Optional[str]: + try: + undeclared = self._find_undeclared_variables(s) + undeclared_not_in_context = {var for var in undeclared if var not in context} + if undeclared_not_in_context: + raise ValueError(f"Jinja macro has undeclared variables: {undeclared_not_in_context}. 
Context: {context}") + return self._compile(s).render(context) # type: ignore # from_string is able to handle None + except TypeError: + # The string is a static value, not a jinja template + # It can be returned as is + return s + + @cache + def _find_undeclared_variables(self, s: Optional[str]) -> Template: + """ + Find undeclared variables and cache them + """ + ast = self._environment.parse(s) # type: ignore # parse is able to handle None + return meta.find_undeclared_variables(ast) + + @cache + def _compile(self, s: Optional[str]) -> Template: + """ + We must cache the Jinja Template ourselves because we're using `from_string` instead of a template loader + """ + return self._environment.from_string(s) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/macros.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/macros.py new file mode 100644 index 000000000000..f50444347cd3 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/macros.py @@ -0,0 +1,140 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import builtins +import datetime +import typing +from typing import Optional, Union + +import isodate +import pytz +from dateutil import parser +from isodate import parse_duration + +""" +This file contains macros that can be evaluated by a `JinjaInterpolation` object +""" + + +def now_utc() -> datetime.datetime: + """ + Current local date and time in UTC timezone + + Usage: + `"{{ now_utc() }}"` + """ + return datetime.datetime.now(datetime.timezone.utc) + + +def today_utc() -> datetime.date: + """ + Current date in UTC timezone + + Usage: + `"{{ today_utc() }}"` + """ + return datetime.datetime.now(datetime.timezone.utc).date() + + +def today_with_timezone(timezone: str) -> datetime.date: + """ + Current date in custom timezone + + :param timezone: timezone expressed as IANA keys format. Example: "Pacific/Tarawa" + :return: + """ + return datetime.datetime.now(tz=pytz.timezone(timezone)).date() + + +def timestamp(dt: Union[float, str]) -> Union[int, float]: + """ + Converts a number or a string to a timestamp + + If dt is a number, then convert to an int + If dt is a string, then parse it using dateutil.parser + + Usage: + `"{{ timestamp(1658505815.223235) }}" + + :param dt: datetime to convert to timestamp + :return: unix timestamp + """ + if isinstance(dt, (int, float)): + return int(dt) + else: + return _str_to_datetime(dt).astimezone(pytz.utc).timestamp() + + +def _str_to_datetime(s: str) -> datetime.datetime: + parsed_date = parser.isoparse(s) + if not parsed_date.tzinfo: + # Assume UTC if the input does not contain a timezone + parsed_date = parsed_date.replace(tzinfo=pytz.utc) + return parsed_date.astimezone(pytz.utc) + + +def max(*args: typing.Any) -> typing.Any: + """ + Returns biggest object of an iterable, or two or more arguments. + + max(iterable, *[, default=obj, key=func]) -> value + max(arg1, arg2, *args, *[, key=func]) -> value + + Usage: + `"{{ max(2,3) }}" + + With a single iterable argument, return its biggest item. The + default keyword-only argument specifies an object to return if + the provided iterable is empty. + With two or more arguments, return the largest argument. 
+ :param args: args to compare + :return: largest argument + """ + return builtins.max(*args) + + +def day_delta(num_days: int, format: str = "%Y-%m-%dT%H:%M:%S.%f%z") -> str: + """ + Returns datetime of now() + num_days + + Usage: + `"{{ day_delta(25) }}"` + + :param num_days: number of days to add to current date time + :return: datetime formatted as RFC3339 + """ + return (datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(days=num_days)).strftime(format) + + +def duration(datestring: str) -> Union[datetime.timedelta, isodate.Duration]: + """ + Converts ISO8601 duration to datetime.timedelta + + Usage: + `"{{ now_utc() - duration('P1D') }}"` + """ + return parse_duration(datestring) # type: ignore # mypy thinks this returns Any for some reason + + +def format_datetime(dt: Union[str, datetime.datetime], format: str, input_format: Optional[str] = None) -> str: + """ + Converts datetime to another format + + Usage: + `"{{ format_datetime(config.start_date, '%Y-%m-%d') }}"` + + CPython Datetime package has known bug with `stfrtime` method: '%s' formatting uses locale timezone + https://github.com/python/cpython/issues/77169 + https://github.com/python/cpython/issues/56959 + """ + if isinstance(dt, datetime.datetime): + return dt.strftime(format) + dt_datetime = datetime.datetime.strptime(dt, input_format) if input_format else _str_to_datetime(dt) + if format == "%s": + return str(int(dt_datetime.timestamp())) + return dt_datetime.strftime(format) + + +_macros_list = [now_utc, today_utc, timestamp, max, day_delta, duration, format_datetime, today_with_timezone] +macros = {f.__name__: f for f in _macros_list} diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/manifest_declarative_source.py new file mode 100644 index 000000000000..842d4e944454 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -0,0 +1,236 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
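# --- Illustrative usage sketch (not part of the diff above) ---------------------------
# The macros defined above are registered as Jinja globals, so they can be called inside
# any interpolated expression. The config values below are hypothetical.
from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation

interpolation = JinjaInterpolation()

interpolation.eval(
    "{{ format_datetime(config['start_date'], '%Y-%m-%d') }}",
    config={"start_date": "2021-01-01T00:00:00Z"},
)
# -> "2021-01-01"

# Rendered output is passed through ast.literal_eval, so numeric strings come back as
# numbers; valid_types can additionally restrict which evaluated types are accepted.
interpolation.eval("{{ config['page_size'] }}", config={"page_size": "50"}, valid_types=(int,))
# -> 50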
+# + +import json +import logging +import pkgutil +import re +from copy import deepcopy +from importlib import metadata +from typing import Any, Dict, Iterator, List, Mapping, Optional, Tuple, Union + +import yaml +from airbyte_cdk.models import ( + AirbyteConnectionStatus, + AirbyteMessage, + AirbyteStateMessage, + ConfiguredAirbyteCatalog, + ConnectorSpecification, +) +from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker +from airbyte_cdk.sources.declarative.declarative_source import DeclarativeSource +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CheckStream as CheckStreamModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import DeclarativeStream as DeclarativeStreamModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import Spec as SpecModel +from airbyte_cdk.sources.declarative.parsers.manifest_component_transformer import ManifestComponentTransformer +from airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver import ManifestReferenceResolver +from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ModelToComponentFactory +from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.streams.core import Stream +from airbyte_cdk.sources.types import ConnectionDefinition +from airbyte_cdk.sources.utils.slice_logger import AlwaysLogSliceLogger, DebugSliceLogger, SliceLogger +from jsonschema.exceptions import ValidationError +from jsonschema.validators import validate + + +class ManifestDeclarativeSource(DeclarativeSource): + """Declarative source defined by a manifest of low-code components that define source connector behavior""" + + def __init__( + self, + source_config: ConnectionDefinition, + debug: bool = False, + emit_connector_builder_messages: bool = False, + component_factory: Optional[ModelToComponentFactory] = None, + ): + """ + :param source_config(Mapping[str, Any]): The manifest of low-code components that describe the source connector + :param debug(bool): True if debug mode is enabled + :param component_factory(ModelToComponentFactory): optional factory if ModelToComponentFactory's default behaviour needs to be tweaked + """ + self.logger = logging.getLogger(f"airbyte.{self.name}") + + # For ease of use we don't require the type to be specified at the top level manifest, but it should be included during processing + manifest = dict(source_config) + if "type" not in manifest: + manifest["type"] = "DeclarativeSource" + + resolved_source_config = ManifestReferenceResolver().preprocess_manifest(manifest) + propagated_source_config = ManifestComponentTransformer().propagate_types_and_parameters("", resolved_source_config, {}) + self._source_config = propagated_source_config + self._debug = debug + self._emit_connector_builder_messages = emit_connector_builder_messages + self._constructor = component_factory if component_factory else ModelToComponentFactory(emit_connector_builder_messages) + self._message_repository = self._constructor.get_message_repository() + self._slice_logger: SliceLogger = AlwaysLogSliceLogger() if emit_connector_builder_messages else DebugSliceLogger() + + self._validate_source() + + @property + def resolved_manifest(self) -> Mapping[str, Any]: + return self._source_config + + @property + def message_repository(self) -> Union[None, MessageRepository]: + return self._message_repository + + @property + def connection_checker(self) -> ConnectionChecker: + check = 
self._source_config["check"] + if "type" not in check: + check["type"] = "CheckStream" + check_stream = self._constructor.create_component( + CheckStreamModel, check, dict(), emit_connector_builder_messages=self._emit_connector_builder_messages + ) + if isinstance(check_stream, ConnectionChecker): + return check_stream + else: + raise ValueError(f"Expected to generate a ConnectionChecker component, but received {check_stream.__class__}") + + def streams(self, config: Mapping[str, Any]) -> List[Stream]: + self._emit_manifest_debug_message(extra_args={"source_name": self.name, "parsed_config": json.dumps(self._source_config)}) + stream_configs = self._stream_configs(self._source_config) + + source_streams = [ + self._constructor.create_component( + DeclarativeStreamModel, stream_config, config, emit_connector_builder_messages=self._emit_connector_builder_messages + ) + for stream_config in self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) + ] + + return source_streams + + @staticmethod + def _initialize_cache_for_parent_streams(stream_configs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + parent_streams = set() + + def update_with_cache_parent_configs(parent_configs: list[dict[str, Any]]) -> None: + for parent_config in parent_configs: + parent_streams.add(parent_config["stream"]["name"]) + parent_config["stream"]["retriever"]["requester"]["use_cache"] = True + + for stream_config in stream_configs: + if stream_config.get("incremental_sync", {}).get("parent_stream"): + parent_streams.add(stream_config["incremental_sync"]["parent_stream"]["name"]) + stream_config["incremental_sync"]["parent_stream"]["retriever"]["requester"]["use_cache"] = True + + elif stream_config.get("retriever", {}).get("partition_router", {}): + partition_router = stream_config["retriever"]["partition_router"] + + if isinstance(partition_router, dict) and partition_router.get("parent_stream_configs"): + update_with_cache_parent_configs(partition_router["parent_stream_configs"]) + elif isinstance(partition_router, list): + for router in partition_router: + if router.get("parent_stream_configs"): + update_with_cache_parent_configs(router["parent_stream_configs"]) + + for stream_config in stream_configs: + if stream_config["name"] in parent_streams: + stream_config["retriever"]["requester"]["use_cache"] = True + + return stream_configs + + def spec(self, logger: logging.Logger) -> ConnectorSpecification: + """ + Returns the connector specification (spec) as defined in the Airbyte Protocol. The spec is an object describing the possible + configurations (e.g: username and password) which can be configured when running this connector. For low-code connectors, this + will first attempt to load the spec from the manifest's spec block, otherwise it will load it from "spec.yaml" or "spec.json" + in the project root. 
+ """ + self._configure_logger_level(logger) + self._emit_manifest_debug_message(extra_args={"source_name": self.name, "parsed_config": json.dumps(self._source_config)}) + + spec = self._source_config.get("spec") + if spec: + if "type" not in spec: + spec["type"] = "Spec" + spec_component = self._constructor.create_component(SpecModel, spec, dict()) + return spec_component.generate_spec() + else: + return super().spec(logger) + + def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus: + self._configure_logger_level(logger) + return super().check(logger, config) + + def read( + self, + logger: logging.Logger, + config: Mapping[str, Any], + catalog: ConfiguredAirbyteCatalog, + state: Optional[List[AirbyteStateMessage]] = None, + ) -> Iterator[AirbyteMessage]: + self._configure_logger_level(logger) + yield from super().read(logger, config, catalog, state) + + def _configure_logger_level(self, logger: logging.Logger) -> None: + """ + Set the log level to logging.DEBUG if debug mode is enabled + """ + if self._debug: + logger.setLevel(logging.DEBUG) + + def _validate_source(self) -> None: + """ + Validates the connector manifest against the declarative component schema + """ + try: + raw_component_schema = pkgutil.get_data("airbyte_cdk", "sources/declarative/declarative_component_schema.yaml") + if raw_component_schema is not None: + declarative_component_schema = yaml.load(raw_component_schema, Loader=yaml.SafeLoader) + else: + raise RuntimeError("Failed to read manifest component json schema required for validation") + except FileNotFoundError as e: + raise FileNotFoundError(f"Failed to read manifest component json schema required for validation: {e}") + + streams = self._source_config.get("streams") + if not streams: + raise ValidationError(f"A valid manifest should have at least one stream defined. Got {streams}") + + try: + validate(self._source_config, declarative_component_schema) + except ValidationError as e: + raise ValidationError("Validation against json schema defined in declarative_component_schema.yaml schema failed") from e + + cdk_version = metadata.version("airbyte_cdk") + cdk_major, cdk_minor, cdk_patch = self._get_version_parts(cdk_version, "airbyte-cdk") + manifest_version = self._source_config.get("version") + if manifest_version is None: + raise RuntimeError( + "Manifest version is not defined in the manifest. This is unexpected since it should be a required field. Please contact support." + ) + manifest_major, manifest_minor, manifest_patch = self._get_version_parts(manifest_version, "manifest") + + if cdk_major < manifest_major or (cdk_major == manifest_major and cdk_minor < manifest_minor): + raise ValidationError( + f"The manifest version {manifest_version} is greater than the airbyte-cdk package version ({cdk_version}). Your " + f"manifest may contain features that are not in the current CDK version." + ) + elif manifest_major == 0 and manifest_minor < 29: + raise ValidationError( + f"The low-code framework was promoted to Beta in airbyte-cdk version 0.29.0 and contains many breaking changes to the " + f"language. The manifest version {manifest_version} is incompatible with the airbyte-cdk package version " + f"{cdk_version} which contains these breaking changes." + ) + + @staticmethod + def _get_version_parts(version: str, version_type: str) -> Tuple[int, int, int]: + """ + Takes a semantic version represented as a string and splits it into a tuple of its major, minor, and patch versions. 
+ """ + version_parts = re.split(r"\.", version) + if len(version_parts) != 3 or not all([part.isdigit() for part in version_parts]): + raise ValidationError(f"The {version_type} version {version} specified is not a valid version format (ex. 1.2.3)") + return tuple(int(part) for part in version_parts) # type: ignore # We already verified there were 3 parts and they are all digits + + def _stream_configs(self, manifest: Mapping[str, Any]) -> List[Dict[str, Any]]: + # This has a warning flag for static, but after we finish part 4 we'll replace manifest with self._source_config + stream_configs: List[Dict[str, Any]] = manifest.get("streams", []) + for s in stream_configs: + if "type" not in s: + s["type"] = "DeclarativeStream" + return stream_configs + + def _emit_manifest_debug_message(self, extra_args: dict[str, Any]) -> None: + self.logger.debug("declarative source created from manifest", extra=extra_args) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/migrations/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/migrations/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py new file mode 100644 index 000000000000..361f81bf8b21 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py @@ -0,0 +1,89 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from typing import Any, Mapping + +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.migrations.state_migration import StateMigration +from airbyte_cdk.sources.declarative.models import DatetimeBasedCursor, SubstreamPartitionRouter +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ParentStreamConfig + + +def _is_already_migrated(stream_state: Mapping[str, Any]) -> bool: + return "states" in stream_state + + +class LegacyToPerPartitionStateMigration(StateMigration): + """ + Transforms the input state for per-partitioned streams from the legacy format to the low-code format. + The cursor field and partition ID fields are automatically extracted from the stream's DatetimebasedCursor and SubstreamPartitionRouter. + + Example input state: + { + "13506132": { + "last_changed": "2022-12-27T08:34:39+00:00" + } + Example output state: + { + "partition": {"id": "13506132"}, + "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"} + } + """ + + def __init__( + self, + partition_router: SubstreamPartitionRouter, + cursor: DatetimeBasedCursor, + config: Mapping[str, Any], + parameters: Mapping[str, Any], + ): + self._partition_router = partition_router + self._cursor = cursor + self._config = config + self._parameters = parameters + self._partition_key_field = InterpolatedString.create( + self._get_partition_field(self._partition_router), parameters=self._parameters + ).eval(self._config) + self._cursor_field = InterpolatedString.create(self._cursor.cursor_field, parameters=self._parameters).eval(self._config) + + def _get_partition_field(self, partition_router: SubstreamPartitionRouter) -> str: + parent_stream_config = partition_router.parent_stream_configs[0] + + # Retrieve the partition field with a condition, as properties are returned as a dictionary for custom components. 
+ partition_field = ( + parent_stream_config.partition_field + if isinstance(parent_stream_config, ParentStreamConfig) + else parent_stream_config.get("partition_field") # type: ignore # See above comment on why parent_stream_config might be a dict + ) + + return partition_field + + def should_migrate(self, stream_state: Mapping[str, Any]) -> bool: + if _is_already_migrated(stream_state): + return False + + # There is exactly one parent stream + number_of_parent_streams = len(self._partition_router.parent_stream_configs) + if number_of_parent_streams != 1: + # There should be exactly one parent stream + return False + """ + The expected state format is + "" : { + "" : "" + } + """ + if stream_state: + for key, value in stream_state.items(): + if isinstance(value, dict): + keys = list(value.keys()) + if len(keys) != 1: + # The input partitioned state should only have one key + return False + if keys[0] != self._cursor_field: + # Unexpected key. Found {keys[0]}. Expected {self._cursor.cursor_field} + return False + return True + + def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]: + states = [{"partition": {self._partition_key_field: key}, "cursor": value} for key, value in stream_state.items()] + return {"states": states} diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/migrations/state_migration.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/migrations/state_migration.py new file mode 100644 index 000000000000..9cf7f3cfe08c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/migrations/state_migration.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from abc import abstractmethod +from typing import Any, Mapping + + +class StateMigration: + @abstractmethod + def should_migrate(self, stream_state: Mapping[str, Any]) -> bool: + """ + Check if the stream_state should be migrated + + :param stream_state: The stream_state to potentially migrate + :return: true if the state is of the expected format and should be migrated. False otherwise. + """ + + @abstractmethod + def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]: + """ + Migrate the stream_state. Assumes should_migrate(stream_state) returned True. 
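# --- Illustrative sketch (not part of the diff above) ---------------------------------
# A minimal custom StateMigration implementation, assuming a hypothetical legacy state that
# stored the cursor under "updated" and now needs it under "updated_at".
from typing import Any, Mapping

from airbyte_cdk.sources.declarative.migrations.state_migration import StateMigration


class RenameCursorFieldMigration(StateMigration):
    def should_migrate(self, stream_state: Mapping[str, Any]) -> bool:
        # Only migrate states that still use the legacy key
        return "updated" in stream_state

    def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]:
        # Assumes should_migrate() returned True
        return {"updated_at": stream_state["updated"]}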
+ + :param stream_state: The stream_state to migrate + :return: The migrated stream_state + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/models/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/models/__init__.py new file mode 100644 index 000000000000..81f2e2f3351e --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/models/__init__.py @@ -0,0 +1,2 @@ +# generated by bin/generate_component_manifest_files.py +from .declarative_component_schema import * diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/models/declarative_component_schema.py new file mode 100644 index 000000000000..eeb773b2522f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -0,0 +1,1729 @@ +# generated by datamodel-codegen: +# filename: declarative_component_schema.yaml + +from __future__ import annotations + +from enum import Enum +from typing import Any, Dict, List, Optional, Union + +from pydantic.v1 import BaseModel, Extra, Field +from typing_extensions import Literal + + +class AuthFlowType(Enum): + oauth2_0 = 'oauth2.0' + oauth1_0 = 'oauth1.0' + + +class BasicHttpAuthenticator(BaseModel): + type: Literal['BasicHttpAuthenticator'] + username: str = Field( + ..., + description='The username that will be combined with the password, base64 encoded and used to make requests. Fill it in the user inputs.', + examples=["{{ config['username'] }}", "{{ config['api_key'] }}"], + title='Username', + ) + password: Optional[str] = Field( + '', + description='The password that will be combined with the username, base64 encoded and used to make requests. Fill it in the user inputs.', + examples=["{{ config['password'] }}", ''], + title='Password', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class BearerAuthenticator(BaseModel): + type: Literal['BearerAuthenticator'] + api_token: str = Field( + ..., + description='Token to inject as request header for authenticating with the API.', + examples=["{{ config['api_key'] }}", "{{ config['token'] }}"], + title='Bearer Token', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CheckStream(BaseModel): + type: Literal['CheckStream'] + stream_names: List[str] = Field( + ..., + description='Names of the streams to try reading from when running a check operation.', + examples=[['users'], ['users', 'contacts']], + title='Stream Names', + ) + + +class ConcurrencyLevel(BaseModel): + type: Optional[Literal['ConcurrencyLevel']] = None + default_concurrency: Union[int, str] = Field( + ..., + description='The amount of concurrency that will applied during a sync. This value can be hardcoded or user-defined in the config if different users have varying volume thresholds in the target API.', + examples=[10, "{{ config['num_workers'] or 10 }}"], + title='Default Concurrency', + ) + max_concurrency: Optional[int] = Field( + None, + description='The maximum level of concurrency that will be used during a sync. 
This becomes a required field when the default_concurrency derives from the config, because it serves as a safeguard against a user-defined threshold that is too high.', + examples=[20, 100], + title='Max Concurrency', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class ConstantBackoffStrategy(BaseModel): + type: Literal['ConstantBackoffStrategy'] + backoff_time_in_seconds: Union[float, str] = Field( + ..., + description='Backoff time in seconds.', + examples=[30, 30.5, "{{ config['backoff_time'] }}"], + title='Backoff Time', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CursorPagination(BaseModel): + type: Literal['CursorPagination'] + cursor_value: str = Field( + ..., + description='Value of the cursor defining the next page to fetch.', + examples=[ + '{{ headers.link.next.cursor }}', + "{{ last_record['key'] }}", + "{{ response['nextPage'] }}", + ], + title='Cursor Value', + ) + page_size: Optional[int] = Field( + None, + description='The number of records to include in each pages.', + examples=[100], + title='Page Size', + ) + stop_condition: Optional[str] = Field( + None, + description='Template string evaluating when to stop paginating.', + examples=[ + '{{ response.data.has_more is false }}', + "{{ 'next' not in headers['link'] }}", + ], + title='Stop Condition', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CustomAuthenticator(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['CustomAuthenticator'] + class_name: str = Field( + ..., + description='Fully-qualified name of the class that will be implementing the custom authentication strategy. Has to be a sub class of DeclarativeAuthenticator. The format is `source_..`.', + examples=['source_railz.components.ShortLivedTokenAuthenticator'], + title='Class Name', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CustomBackoffStrategy(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['CustomBackoffStrategy'] + class_name: str = Field( + ..., + description='Fully-qualified name of the class that will be implementing the custom backoff strategy. The format is `source_..`.', + examples=['source_railz.components.MyCustomBackoffStrategy'], + title='Class Name', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CustomErrorHandler(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['CustomErrorHandler'] + class_name: str = Field( + ..., + description='Fully-qualified name of the class that will be implementing the custom error handler. The format is `source_..`.', + examples=['source_railz.components.MyCustomErrorHandler'], + title='Class Name', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CustomIncrementalSync(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['CustomIncrementalSync'] + class_name: str = Field( + ..., + description='Fully-qualified name of the class that will be implementing the custom incremental sync. 
The format is `source_..`.', + examples=['source_railz.components.MyCustomIncrementalSync'], + title='Class Name', + ) + cursor_field: str = Field( + ..., + description='The location of the value on a record that will be used as a bookmark during sync.', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CustomPaginationStrategy(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['CustomPaginationStrategy'] + class_name: str = Field( + ..., + description='Fully-qualified name of the class that will be implementing the custom pagination strategy. The format is `source_..`.', + examples=['source_railz.components.MyCustomPaginationStrategy'], + title='Class Name', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CustomRecordExtractor(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['CustomRecordExtractor'] + class_name: str = Field( + ..., + description='Fully-qualified name of the class that will be implementing the custom record extraction strategy. The format is `source_..`.', + examples=['source_railz.components.MyCustomRecordExtractor'], + title='Class Name', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CustomRecordFilter(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['CustomRecordFilter'] + class_name: str = Field( + ..., + description='Fully-qualified name of the class that will be implementing the custom record filter strategy. The format is `source_..`.', + examples=['source_railz.components.MyCustomCustomRecordFilter'], + title='Class Name', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CustomRequester(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['CustomRequester'] + class_name: str = Field( + ..., + description='Fully-qualified name of the class that will be implementing the custom requester strategy. The format is `source_..`.', + examples=['source_railz.components.MyCustomRecordExtractor'], + title='Class Name', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CustomRetriever(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['CustomRetriever'] + class_name: str = Field( + ..., + description='Fully-qualified name of the class that will be implementing the custom retriever strategy. The format is `source_..`.', + examples=['source_railz.components.MyCustomRetriever'], + title='Class Name', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CustomPartitionRouter(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['CustomPartitionRouter'] + class_name: str = Field( + ..., + description='Fully-qualified name of the class that will be implementing the custom partition router. The format is `source_..`.', + examples=['source_railz.components.MyCustomPartitionRouter'], + title='Class Name', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CustomSchemaLoader(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['CustomSchemaLoader'] + class_name: str = Field( + ..., + description='Fully-qualified name of the class that will be implementing the custom schema loader. 
The format is `source_..`.', + examples=['source_railz.components.MyCustomSchemaLoader'], + title='Class Name', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CustomStateMigration(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['CustomStateMigration'] + class_name: str = Field( + ..., + description='Fully-qualified name of the class that will be implementing the custom state migration. The format is `source_..`.', + examples=['source_railz.components.MyCustomStateMigration'], + title='Class Name', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class CustomTransformation(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['CustomTransformation'] + class_name: str = Field( + ..., + description='Fully-qualified name of the class that will be implementing the custom transformation. The format is `source_..`.', + examples=['source_railz.components.MyCustomTransformation'], + title='Class Name', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class LegacyToPerPartitionStateMigration(BaseModel): + class Config: + extra = Extra.allow + + type: Optional[Literal['LegacyToPerPartitionStateMigration']] = None + + +class Algorithm(Enum): + HS256 = 'HS256' + HS384 = 'HS384' + HS512 = 'HS512' + ES256 = 'ES256' + ES256K = 'ES256K' + ES384 = 'ES384' + ES512 = 'ES512' + RS256 = 'RS256' + RS384 = 'RS384' + RS512 = 'RS512' + PS256 = 'PS256' + PS384 = 'PS384' + PS512 = 'PS512' + EdDSA = 'EdDSA' + + +class JwtHeaders(BaseModel): + class Config: + extra = Extra.forbid + + kid: Optional[str] = Field( + None, + description='Private key ID for user account.', + examples=["{{ config['kid'] }}"], + title='Key Identifier', + ) + typ: Optional[str] = Field( + 'JWT', + description='The media type of the complete JWT.', + examples=['JWT'], + title='Type', + ) + cty: Optional[str] = Field( + None, + description='Content type of JWT header.', + examples=['JWT'], + title='Content Type', + ) + + +class JwtPayload(BaseModel): + class Config: + extra = Extra.forbid + + iss: Optional[str] = Field( + None, + description='The user/principal that issued the JWT. Commonly a value unique to the user.', + examples=["{{ config['iss'] }}"], + title='Issuer', + ) + sub: Optional[str] = Field( + None, + description='The subject of the JWT. Commonly defined by the API.', + title='Subject', + ) + aud: Optional[str] = Field( + None, + description='The recipient that the JWT is intended for. Commonly defined by the API.', + examples=['appstoreconnect-v1'], + title='Audience', + ) + + +class JwtAuthenticator(BaseModel): + type: Literal['JwtAuthenticator'] + secret_key: str = Field( + ..., + description='Secret used to sign the JSON web token.', + examples=["{{ config['secret_key'] }}"], + ) + base64_encode_secret_key: Optional[bool] = Field( + False, + description='When set to true, the secret key will be base64 encoded prior to being encoded as part of the JWT. 
Only set to "true" when required by the API.', + ) + algorithm: Algorithm = Field( + ..., + description='Algorithm used to sign the JSON web token.', + examples=['ES256', 'HS256', 'RS256', "{{ config['algorithm'] }}"], + ) + token_duration: Optional[int] = Field( + 1200, + description='The amount of time in seconds a JWT token can be valid after being issued.', + examples=[1200, 3600], + title='Token Duration', + ) + header_prefix: Optional[str] = Field( + None, + description='The prefix to be used within the Authentication header.', + examples=['Bearer', 'Basic'], + title='Header Prefix', + ) + jwt_headers: Optional[JwtHeaders] = Field( + None, + description='JWT headers used when signing JSON web token.', + title='JWT Headers', + ) + additional_jwt_headers: Optional[Dict[str, Any]] = Field( + None, + description='Additional headers to be included with the JWT headers object.', + title='Additional JWT Headers', + ) + jwt_payload: Optional[JwtPayload] = Field( + None, + description='JWT Payload used when signing JSON web token.', + title='JWT Payload', + ) + additional_jwt_payload: Optional[Dict[str, Any]] = Field( + None, + description='Additional properties to be added to the JWT payload.', + title='Additional JWT Payload Properties', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class RefreshTokenUpdater(BaseModel): + refresh_token_name: Optional[str] = Field( + 'refresh_token', + description='The name of the property which contains the updated refresh token in the response from the token refresh endpoint.', + examples=['refresh_token'], + title='Refresh Token Property Name', + ) + access_token_config_path: Optional[List[str]] = Field( + ['credentials', 'access_token'], + description='Config path to the access token. Make sure the field actually exists in the config.', + examples=[['credentials', 'access_token'], ['access_token']], + title='Config Path To Access Token', + ) + refresh_token_config_path: Optional[List[str]] = Field( + ['credentials', 'refresh_token'], + description='Config path to the access token. Make sure the field actually exists in the config.', + examples=[['credentials', 'refresh_token'], ['refresh_token']], + title='Config Path To Refresh Token', + ) + token_expiry_date_config_path: Optional[List[str]] = Field( + ['credentials', 'token_expiry_date'], + description='Config path to the expiry date. Make sure actually exists in the config.', + examples=[['credentials', 'token_expiry_date']], + title='Config Path To Expiry Date', + ) + refresh_token_error_status_codes: Optional[List[int]] = Field( + [], + description='Status Codes to Identify refresh token error in response (Refresh Token Error Key and Refresh Token Error Values should be also specified). Responses with one of the error status code and containing an error value will be flagged as a config error', + examples=[[400, 500]], + title='Refresh Token Error Status Codes', + ) + refresh_token_error_key: Optional[str] = Field( + '', + description='Key to Identify refresh token error in response (Refresh Token Error Status Codes and Refresh Token Error Values should be also specified).', + examples=['error'], + title='Refresh Token Error Key', + ) + refresh_token_error_values: Optional[List[str]] = Field( + [], + description='List of values to check for exception during token refresh process. Used to check if the error found in the response matches the key from the Refresh Token Error Key field (e.g. response={"error": "invalid_grant"}). 
Only responses with one of the error status code and containing an error value will be flagged as a config error', + examples=[['invalid_grant', 'invalid_permissions']], + title='Refresh Token Error Values', + ) + + +class OAuthAuthenticator(BaseModel): + type: Literal['OAuthAuthenticator'] + client_id: str = Field( + ..., + description='The OAuth client ID. Fill it in the user inputs.', + examples=["{{ config['client_id }}", "{{ config['credentials']['client_id }}"], + title='Client ID', + ) + client_secret: str = Field( + ..., + description='The OAuth client secret. Fill it in the user inputs.', + examples=[ + "{{ config['client_secret }}", + "{{ config['credentials']['client_secret }}", + ], + title='Client Secret', + ) + refresh_token: Optional[str] = Field( + None, + description='Credential artifact used to get a new access token.', + examples=[ + "{{ config['refresh_token'] }}", + "{{ config['credentials]['refresh_token'] }}", + ], + title='Refresh Token', + ) + token_refresh_endpoint: str = Field( + ..., + description='The full URL to call to obtain a new access token.', + examples=['https://connect.squareup.com/oauth2/token'], + title='Token Refresh Endpoint', + ) + access_token_name: Optional[str] = Field( + 'access_token', + description='The name of the property which contains the access token in the response from the token refresh endpoint.', + examples=['access_token'], + title='Access Token Property Name', + ) + expires_in_name: Optional[str] = Field( + 'expires_in', + description='The name of the property which contains the expiry date in the response from the token refresh endpoint.', + examples=['expires_in'], + title='Token Expiry Property Name', + ) + grant_type: Optional[str] = Field( + 'refresh_token', + description='Specifies the OAuth2 grant type. If set to refresh_token, the refresh_token needs to be provided as well. For client_credentials, only client id and secret are required. Other grant types are not officially supported.', + examples=['refresh_token', 'client_credentials'], + title='Grant Type', + ) + refresh_request_body: Optional[Dict[str, Any]] = Field( + None, + description='Body of the request sent to get a new access token.', + examples=[ + { + 'applicationId': "{{ config['application_id'] }}", + 'applicationSecret': "{{ config['application_secret'] }}", + 'token': "{{ config['token'] }}", + } + ], + title='Refresh Request Body', + ) + scopes: Optional[List[str]] = Field( + None, + description='List of scopes that should be granted to the access token.', + examples=[ + ['crm.list.read', 'crm.objects.contacts.read', 'crm.schema.contacts.read'] + ], + title='Scopes', + ) + token_expiry_date: Optional[str] = Field( + None, + description='The access token expiry date.', + examples=['2023-04-06T07:12:10.421833+00:00', 1680842386], + title='Token Expiry Date', + ) + token_expiry_date_format: Optional[str] = Field( + None, + description='The format of the time to expiration datetime. Provide it if the time is returned as a date-time string instead of seconds.', + examples=['%Y-%m-%d %H:%M:%S.%f+00:00'], + title='Token Expiry Date Format', + ) + refresh_token_updater: Optional[RefreshTokenUpdater] = Field( + None, + description='When the token updater is defined, new refresh tokens, access tokens and the access token expiry date are written back from the authentication response to the config object. 
This is important if the refresh token can only used once.', + title='Token Updater', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class DpathExtractor(BaseModel): + type: Literal['DpathExtractor'] + field_path: List[str] = Field( + ..., + description='List of potentially nested fields describing the full path of the field to extract. Use "*" to extract all values from an array. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/record-selector).', + examples=[ + ['data'], + ['data', 'records'], + ['data', '{{ parameters.name }}'], + ['data', '*', 'record'], + ], + title='Field Path', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class ExponentialBackoffStrategy(BaseModel): + type: Literal['ExponentialBackoffStrategy'] + factor: Optional[Union[float, str]] = Field( + 5, + description='Multiplicative constant applied on each retry.', + examples=[5, 5.5, '10'], + title='Factor', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class SessionTokenRequestBearerAuthenticator(BaseModel): + type: Literal['Bearer'] + + +class HttpMethod(Enum): + GET = 'GET' + POST = 'POST' + + +class Action(Enum): + SUCCESS = 'SUCCESS' + FAIL = 'FAIL' + RETRY = 'RETRY' + IGNORE = 'IGNORE' + RATE_LIMITED = 'RATE_LIMITED' + + +class FailureType(Enum): + system_error = 'system_error' + config_error = 'config_error' + transient_error = 'transient_error' + + +class HttpResponseFilter(BaseModel): + type: Literal['HttpResponseFilter'] + action: Optional[Action] = Field( + None, + description='Action to execute if a response matches the filter.', + examples=['SUCCESS', 'FAIL', 'RETRY', 'IGNORE', 'RATE_LIMITED'], + title='Action', + ) + failure_type: Optional[FailureType] = Field( + None, + description='Failure type of traced exception if a response matches the filter.', + examples=['system_error', 'config_error', 'transient_error'], + title='Failure Type', + ) + error_message: Optional[str] = Field( + None, + description='Error Message to display if the response matches the filter.', + title='Error Message', + ) + error_message_contains: Optional[str] = Field( + None, + description='Match the response if its error message contains the substring.', + example=['This API operation is not enabled for this site'], + title='Error Message Substring', + ) + http_codes: Optional[List[int]] = Field( + None, + description='Match the response if its HTTP code is included in this list.', + examples=[[420, 429], [500]], + title='HTTP Codes', + ) + predicate: Optional[str] = Field( + None, + description='Match the response if the predicate evaluates to true.', + examples=[ + "{{ 'Too much requests' in response }}", + "{{ 'error_code' in response and response['error_code'] == 'ComplexityException' }}", + ], + title='Predicate', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class InlineSchemaLoader(BaseModel): + type: Literal['InlineSchemaLoader'] + schema_: Optional[Dict[str, Any]] = Field( + None, + alias='schema', + description='Describes a streams\' schema. Refer to the Data Types documentation for more details on which types are valid.', + title='Schema', + ) + + +class JsonFileSchemaLoader(BaseModel): + type: Literal['JsonFileSchemaLoader'] + file_path: Optional[str] = Field( + None, + description="Path to the JSON file defining the schema. 
The path is relative to the connector module's root.", + example=['./schemas/users.json'], + title='File Path', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class JsonDecoder(BaseModel): + type: Literal['JsonDecoder'] + + +class JsonlDecoder(BaseModel): + type: Literal['JsonlDecoder'] + + +class KeysToLower(BaseModel): + type: Literal['KeysToLower'] + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class IterableDecoder(BaseModel): + type: Literal['IterableDecoder'] + + +class XmlDecoder(BaseModel): + type: Literal['XmlDecoder'] + + +class MinMaxDatetime(BaseModel): + type: Literal['MinMaxDatetime'] + datetime: str = Field( + ..., + description='Datetime value.', + examples=['2021-01-01', '2021-01-01T00:00:00Z', "{{ config['start_time'] }}"], + title='Datetime', + ) + datetime_format: Optional[str] = Field( + '', + description='Format of the datetime value. Defaults to "%Y-%m-%dT%H:%M:%S.%f%z" if left empty. Use placeholders starting with "%" to describe the format the API is using. The following placeholders are available:\n * **%s**: Epoch unix timestamp - `1686218963`\n * **%s_as_float**: Epoch unix timestamp in seconds as float with microsecond precision - `1686218963.123456`\n * **%ms**: Epoch unix timestamp - `1686218963123`\n * **%a**: Weekday (abbreviated) - `Sun`\n * **%A**: Weekday (full) - `Sunday`\n * **%w**: Weekday (decimal) - `0` (Sunday), `6` (Saturday)\n * **%d**: Day of the month (zero-padded) - `01`, `02`, ..., `31`\n * **%b**: Month (abbreviated) - `Jan`\n * **%B**: Month (full) - `January`\n * **%m**: Month (zero-padded) - `01`, `02`, ..., `12`\n * **%y**: Year (without century, zero-padded) - `00`, `01`, ..., `99`\n * **%Y**: Year (with century) - `0001`, `0002`, ..., `9999`\n * **%H**: Hour (24-hour, zero-padded) - `00`, `01`, ..., `23`\n * **%I**: Hour (12-hour, zero-padded) - `01`, `02`, ..., `12`\n * **%p**: AM/PM indicator\n * **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`\n * **%S**: Second (zero-padded) - `00`, `01`, ..., `59`\n * **%f**: Microsecond (zero-padded to 6 digits) - `000000`, `000001`, ..., `999999`\n * **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`\n * **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`\n * **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`\n * **%U**: Week number of the year (Sunday as first day) - `00`, `01`, ..., `53`\n * **%W**: Week number of the year (Monday as first day) - `00`, `01`, ..., `53`\n * **%c**: Date and time representation - `Tue Aug 16 21:30:00 1988`\n * **%x**: Date representation - `08/16/1988`\n * **%X**: Time representation - `21:30:00`\n * **%%**: Literal \'%\' character\n\n Some placeholders depend on the locale of the underlying system - in most cases this locale is configured as en/US. For more information see the [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).\n', + examples=['%Y-%m-%dT%H:%M:%S.%f%z', '%Y-%m-%d', '%s'], + title='Datetime Format', + ) + max_datetime: Optional[str] = Field( + None, + description='Ceiling applied on the datetime value. Must be formatted with the datetime_format field.', + examples=['2021-01-01T00:00:00Z', '2021-01-01'], + title='Max Datetime', + ) + min_datetime: Optional[str] = Field( + None, + description='Floor applied on the datetime value. 
Must be formatted with the datetime_format field.', + examples=['2010-01-01T00:00:00Z', '2010-01-01'], + title='Min Datetime', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class NoAuth(BaseModel): + type: Literal['NoAuth'] + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class NoPagination(BaseModel): + type: Literal['NoPagination'] + + +class OAuthConfigSpecification(BaseModel): + class Config: + extra = Extra.allow + + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( + Field( + None, + description="OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {'app_id': {'type': 'string', 'path_in_connector_config': ['app_id']}}, + { + 'app_id': { + 'type': 'string', + 'path_in_connector_config': ['info', 'app_id'], + } + }, + ], + title='OAuth user input', + ) + ) + complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( + None, + description="OAuth specific blob. This is a Json Schema used to validate Json configurations produced by the OAuth flows as they are\nreturned by the distant OAuth APIs.\nMust be a valid JSON describing the fields to merge back to `ConnectorSpecification.connectionSpecification`.\nFor each field, a special annotation `path_in_connector_config` can be specified to determine where to merge it,\nExamples:\n complete_oauth_output_specification={\n refresh_token: {\n type: string,\n path_in_connector_config: ['credentials', 'refresh_token']\n }\n }", + examples=[ + { + 'refresh_token': { + 'type': 'string,', + 'path_in_connector_config': ['credentials', 'refresh_token'], + } + } + ], + title='OAuth output specification', + ) + complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( + None, + description='OAuth specific blob. This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }', + examples=[ + {'client_id': {'type': 'string'}, 'client_secret': {'type': 'string'}} + ], + title='OAuth input specification', + ) + complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( + None, + description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations that\nalso need to be merged back into the connector configuration at runtime.\nThis is a subset configuration of `complete_oauth_server_input_specification` that filters fields out to retain only the ones that\nare necessary for the connector to function with OAuth. (some fields could be used during oauth flows but not needed afterwards, therefore\nthey would be listed in the `complete_oauth_server_input_specification` but not `complete_oauth_server_output_specification`)\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nconnector when using OAuth flow APIs.\nThese fields are to be merged back to `ConnectorSpecification.connectionSpecification`.\nFor each field, a special annotation `path_in_connector_config` can be specified to determine where to merge it,\nExamples:\n complete_oauth_server_output_specification={\n client_id: {\n type: string,\n path_in_connector_config: ['credentials', 'client_id']\n },\n client_secret: {\n type: string,\n path_in_connector_config: ['credentials', 'client_secret']\n }\n }", + examples=[ + { + 'client_id': { + 'type': 'string,', + 'path_in_connector_config': ['credentials', 'client_id'], + }, + 'client_secret': { + 'type': 'string,', + 'path_in_connector_config': ['credentials', 'client_secret'], + }, + } + ], + title='OAuth server output specification', + ) + + +class OffsetIncrement(BaseModel): + type: Literal['OffsetIncrement'] + page_size: Optional[Union[int, str]] = Field( + None, + description='The number of records to include in each pages.', + examples=[100, "{{ config['page_size'] }}"], + title='Limit', + ) + inject_on_first_request: Optional[bool] = Field( + False, + description='Using the `offset` with value `0` during the first request', + title='Inject Offset', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class PageIncrement(BaseModel): + type: Literal['PageIncrement'] + page_size: Optional[Union[int, str]] = Field( + None, + description='The number of records to include in each pages.', + examples=[100, '100', "{{ config['page_size'] }}"], + title='Page Size', + ) + start_from_page: Optional[int] = Field( + 0, + description='Index of the first page to request.', + examples=[0, 1], + title='Start From Page', + ) + inject_on_first_request: Optional[bool] = Field( + False, + description='Using the `page number` with value defined by `start_from_page` during the first request', + title='Inject Page Number', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class PrimaryKey(BaseModel): + __root__: Union[str, List[str], List[List[str]]] = Field( + ..., + description='The stream field to be used to distinguish unique records. Can either be a single field, an array of fields representing a composite key, or an array of arrays representing a composite key where the fields are nested fields.', + examples=['id', ['code', 'type']], + title='Primary Key', + ) + + +class RecordFilter(BaseModel): + type: Literal['RecordFilter'] + condition: Optional[str] = Field( + '', + description='The predicate to filter a record. 
Records will be removed if evaluated to False.', + examples=[ + "{{ record['created_at'] >= stream_interval['start_time'] }}", + "{{ record.status in ['active', 'expired'] }}", + ], + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class SchemaNormalization(Enum): + None_ = 'None' + Default = 'Default' + + +class RemoveFields(BaseModel): + type: Literal['RemoveFields'] + condition: Optional[str] = Field( + '', + description='The predicate to filter a property by a property value. Property will be removed if it is empty OR expression is evaluated to True.,', + examples=[ + "{{ property|string == '' }}", + '{{ property is integer }}', + '{{ property|length > 5 }}', + "{{ property == 'some_string_to_match' }}", + ], + ) + field_pointers: List[List[str]] = Field( + ..., + description='Array of paths defining the field to remove. Each item is an array whose field describe the path of a field to remove.', + examples=[['tags'], [['content', 'html'], ['content', 'plain_text']]], + title='Field Paths', + ) + + +class RequestPath(BaseModel): + type: Literal['RequestPath'] + + +class InjectInto(Enum): + request_parameter = 'request_parameter' + header = 'header' + body_data = 'body_data' + body_json = 'body_json' + + +class RequestOption(BaseModel): + type: Literal['RequestOption'] + field_name: str = Field( + ..., + description='Configures which key should be used in the location that the descriptor is being injected into', + examples=['segment_id'], + title='Request Option', + ) + inject_into: InjectInto = Field( + ..., + description='Configures where the descriptor should be set on the HTTP requests. Note that request parameters that are already encoded in the URL path will not be duplicated.', + examples=['request_parameter', 'header', 'body_data', 'body_json'], + title='Inject Into', + ) + + +class Schemas(BaseModel): + pass + + class Config: + extra = Extra.allow + + +class LegacySessionTokenAuthenticator(BaseModel): + type: Literal['LegacySessionTokenAuthenticator'] + header: str = Field( + ..., + description='The name of the session token header that will be injected in the request', + examples=['X-Session'], + title='Session Request Header', + ) + login_url: str = Field( + ..., + description='Path of the login URL (do not include the base URL)', + examples=['session'], + title='Login Path', + ) + session_token: Optional[str] = Field( + None, + description='Session token to use if using a pre-defined token. 
Not needed if authenticating with username + password pair', + example=["{{ config['session_token'] }}"], + title='Session Token', + ) + session_token_response_key: str = Field( + ..., + description='Name of the key of the session token to be extracted from the response', + examples=['id'], + title='Response Token Response Key', + ) + username: Optional[str] = Field( + None, + description='Username used to authenticate and obtain a session token', + examples=[" {{ config['username'] }}"], + title='Username', + ) + password: Optional[str] = Field( + '', + description='Password used to authenticate and obtain a session token', + examples=["{{ config['password'] }}", ''], + title='Password', + ) + validate_session_url: str = Field( + ..., + description='Path of the URL to use to validate that the session token is valid (do not include the base URL)', + examples=['user/current'], + title='Validate Session Path', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class AsyncJobStatusMap(BaseModel): + type: Optional[Literal['AsyncJobStatusMap']] = None + running: List[str] + completed: List[str] + failed: List[str] + timeout: List[str] + + +class ValueType(Enum): + string = 'string' + number = 'number' + integer = 'integer' + boolean = 'boolean' + + +class WaitTimeFromHeader(BaseModel): + type: Literal['WaitTimeFromHeader'] + header: str = Field( + ..., + description='The name of the response header defining how long to wait before retrying.', + examples=['Retry-After'], + title='Response Header Name', + ) + regex: Optional[str] = Field( + None, + description='Optional regex to apply on the header to extract its value. The regex should define a capture group defining the wait time.', + examples=['([-+]?\\d+)'], + title='Extraction Regex', + ) + max_waiting_time_in_seconds: Optional[float] = Field( + None, + description='Given the value extracted from the header is greater than this value, stop the stream.', + examples=[3600], + title='Max Waiting Time in Seconds', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class WaitUntilTimeFromHeader(BaseModel): + type: Literal['WaitUntilTimeFromHeader'] + header: str = Field( + ..., + description='The name of the response header defining how long to wait before retrying.', + examples=['wait_time'], + title='Response Header', + ) + min_wait: Optional[Union[float, str]] = Field( + None, + description='Minimum time to wait before retrying.', + examples=[10, '60'], + title='Minimum Wait Time', + ) + regex: Optional[str] = Field( + None, + description='Optional regex to apply on the header to extract its value. The regex should define a capture group defining the wait time.', + examples=['([-+]?\\d+)'], + title='Extraction Regex', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class AddedFieldDefinition(BaseModel): + type: Literal['AddedFieldDefinition'] + path: List[str] = Field( + ..., + description='List of strings defining the path where to add the value on the record.', + examples=[['segment_id'], ['metadata', 'segment_id']], + title='Path', + ) + value: str = Field( + ..., + description="Value of the new field. Use {{ record['existing_field'] }} syntax to refer to other fields in the record.", + examples=[ + "{{ record['updates'] }}", + "{{ record['MetaData']['LastUpdatedTime'] }}", + "{{ stream_partition['segment_id'] }}", + ], + title='Value', + ) + value_type: Optional[ValueType] = Field( + None, + description='Type of the value. 
If not specified, the type will be inferred from the value.', + title='Value Type', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class AddFields(BaseModel): + type: Literal['AddFields'] + fields: List[AddedFieldDefinition] = Field( + ..., + description='List of transformations (path and corresponding value) that will be added to the record.', + title='Fields', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class ApiKeyAuthenticator(BaseModel): + type: Literal['ApiKeyAuthenticator'] + api_token: Optional[str] = Field( + None, + description='The API key to inject in the request. Fill it in the user inputs.', + examples=["{{ config['api_key'] }}", "Token token={{ config['api_key'] }}"], + title='API Key', + ) + header: Optional[str] = Field( + None, + description='The name of the HTTP header that will be set to the API key. This setting is deprecated, use inject_into instead. Header and inject_into can not be defined at the same time.', + examples=['Authorization', 'Api-Token', 'X-Auth-Token'], + title='Header Name', + ) + inject_into: Optional[RequestOption] = Field( + None, + description='Configure how the API Key will be sent in requests to the source API. Either inject_into or header has to be defined.', + examples=[ + {'inject_into': 'header', 'field_name': 'Authorization'}, + {'inject_into': 'request_parameter', 'field_name': 'authKey'}, + ], + title='Inject API Key Into Outgoing HTTP Request', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class AuthFlow(BaseModel): + auth_flow_type: Optional[AuthFlowType] = Field( + None, description='The type of auth to use', title='Auth flow type' + ) + predicate_key: Optional[List[str]] = Field( + None, + description='JSON path to a field in the connectorSpecification that should exist for the advanced auth to be applicable.', + examples=[['credentials', 'auth_type']], + title='Predicate key', + ) + predicate_value: Optional[str] = Field( + None, + description='Value of the predicate_key fields for the advanced auth to be applicable.', + examples=['Oauth'], + title='Predicate value', + ) + oauth_config_specification: Optional[OAuthConfigSpecification] = None + + +class DatetimeBasedCursor(BaseModel): + type: Literal['DatetimeBasedCursor'] + cursor_field: str = Field( + ..., + description='The location of the value on a record that will be used as a bookmark during sync. To ensure no data loss, the API must return records in ascending order based on the cursor field. Nested fields are not supported, so the field must be at the top level of the record. You can use a combination of Add Field and Remove Field transformations to move the nested field to the top.', + examples=['created_at', "{{ config['record_cursor'] }}"], + title='Cursor Field', + ) + datetime_format: str = Field( + ..., + description='The datetime format used to format the datetime values that are sent in outgoing requests to the API. Use placeholders starting with "%" to describe the format the API is using. 
The following placeholders are available:\n * **%s**: Epoch unix timestamp - `1686218963`\n * **%s_as_float**: Epoch unix timestamp in seconds as float with microsecond precision - `1686218963.123456`\n * **%ms**: Epoch unix timestamp (milliseconds) - `1686218963123`\n * **%a**: Weekday (abbreviated) - `Sun`\n * **%A**: Weekday (full) - `Sunday`\n * **%w**: Weekday (decimal) - `0` (Sunday), `6` (Saturday)\n * **%d**: Day of the month (zero-padded) - `01`, `02`, ..., `31`\n * **%b**: Month (abbreviated) - `Jan`\n * **%B**: Month (full) - `January`\n * **%m**: Month (zero-padded) - `01`, `02`, ..., `12`\n * **%y**: Year (without century, zero-padded) - `00`, `01`, ..., `99`\n * **%Y**: Year (with century) - `0001`, `0002`, ..., `9999`\n * **%H**: Hour (24-hour, zero-padded) - `00`, `01`, ..., `23`\n * **%I**: Hour (12-hour, zero-padded) - `01`, `02`, ..., `12`\n * **%p**: AM/PM indicator\n * **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`\n * **%S**: Second (zero-padded) - `00`, `01`, ..., `59`\n * **%f**: Microsecond (zero-padded to 6 digits) - `000000`\n * **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`\n * **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`\n * **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`\n * **%U**: Week number of the year (starting Sunday) - `00`, ..., `53`\n * **%W**: Week number of the year (starting Monday) - `00`, ..., `53`\n * **%c**: Date and time - `Tue Aug 16 21:30:00 1988`\n * **%x**: Date standard format - `08/16/1988`\n * **%X**: Time standard format - `21:30:00`\n * **%%**: Literal \'%\' character\n\n Some placeholders depend on the locale of the underlying system - in most cases this locale is configured as en/US. For more information see the [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).\n', + examples=['%Y-%m-%dT%H:%M:%S.%f%z', '%Y-%m-%d', '%s', '%ms', '%s_as_float'], + title='Outgoing Datetime Format', + ) + start_datetime: Union[str, MinMaxDatetime] = Field( + ..., + description='The datetime that determines the earliest record that should be synced.', + examples=['2020-01-1T00:00:00Z', "{{ config['start_time'] }}"], + title='Start Datetime', + ) + cursor_datetime_formats: Optional[List[str]] = Field( + None, + description='The possible formats for the cursor field, in order of preference. The first format that matches the cursor field value will be used to parse it. If not provided, the `datetime_format` will be used.', + title='Cursor Datetime Formats', + ) + cursor_granularity: Optional[str] = Field( + None, + description='Smallest increment the datetime_format has (ISO 8601 duration) that is used to ensure the start of a slice does not overlap with the end of the previous one, e.g. for %Y-%m-%d the granularity should be P1D, for %Y-%m-%dT%H:%M:%SZ the granularity should be PT1S. Given this field is provided, `step` needs to be provided as well.', + examples=['PT1S'], + title='Cursor Granularity', + ) + end_datetime: Optional[Union[str, MinMaxDatetime]] = Field( + None, + description='The datetime that determines the last record that should be synced. 
If not provided, `{{ now_utc() }}` will be used.', + examples=['2021-01-01T00:00:00Z', '{{ now_utc() }}', '{{ day_delta(-1) }}'], + title='End Datetime', + ) + end_time_option: Optional[RequestOption] = Field( + None, + description='Optionally configures how the end datetime will be sent in requests to the source API.', + title='Inject End Time Into Outgoing HTTP Request', + ) + is_data_feed: Optional[bool] = Field( + None, + description='A data feed API is an API that does not allow filtering and paginates the content from the most recent to the least recent. Given this, the CDK needs to know when to stop paginating and this field will generate a stop condition for pagination.', + title='Whether the target API is formatted as a data feed', + ) + is_client_side_incremental: Optional[bool] = Field( + None, + description='If the target API endpoint does not take cursor values to filter records and returns all records anyway, the connector with this cursor will filter out records locally, and only emit new records from the last sync, hence incremental. This means that all records would be read from the API, but only new records will be emitted to the destination.', + title='Whether the target API does not support filtering and returns all data (the cursor filters records in the client instead of the API side)', + ) + is_compare_strictly: Optional[bool] = Field( + False, + description='Set to True if the target API does not accept queries where the start time equals the end time.', + title='Whether to skip requests if the start time equals the end time', + ) + global_substream_cursor: Optional[bool] = Field( + False, + description='This setting optimizes performance when the parent stream has thousands of partitions by storing the cursor as a single value rather than per partition. Notably, the substream state is updated only at the end of the sync, which helps prevent data loss in case of a sync failure. See more info in the [docs](https://docs.airbyte.com/connector-development/config-based/understanding-the-yaml-file/incremental-syncs).', + title='Whether to store cursor as one value instead of per partition', + ) + lookback_window: Optional[str] = Field( + None, + description='Time interval before the start_datetime to read data for, e.g. P1M for looking back one month.', + examples=['P1D', "P{{ config['lookback_days'] }}D"], + title='Lookback Window', + ) + partition_field_end: Optional[str] = Field( + None, + description='Name of the partition end time field.', + examples=['ending_time'], + title='Partition Field End', + ) + partition_field_start: Optional[str] = Field( + None, + description='Name of the partition start time field.', + examples=['starting_time'], + title='Partition Field Start', + ) + start_time_option: Optional[RequestOption] = Field( + None, + description='Optionally configures how the start datetime will be sent in requests to the source API.', + title='Inject Start Time Into Outgoing HTTP Request', + ) + step: Optional[str] = Field( + None, + description='The size of the time window (ISO8601 duration).
Given this field is provided, `cursor_granularity` needs to be provided as well.', + examples=['P1W', "{{ config['step_increment'] }}"], + title='Step', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class DefaultErrorHandler(BaseModel): + type: Literal['DefaultErrorHandler'] + backoff_strategies: Optional[ + List[ + Union[ + ConstantBackoffStrategy, + CustomBackoffStrategy, + ExponentialBackoffStrategy, + WaitTimeFromHeader, + WaitUntilTimeFromHeader, + ] + ] + ] = Field( + None, + description='List of backoff strategies to use to determine how long to wait before retrying a retryable request.', + title='Backoff Strategies', + ) + max_retries: Optional[int] = Field( + 5, + description='The maximum number of time to retry a retryable request before giving up and failing.', + examples=[5, 0, 10], + title='Max Retry Count', + ) + response_filters: Optional[List[HttpResponseFilter]] = Field( + None, + description="List of response filters to iterate on when deciding how to handle an error. When using an array of multiple filters, the filters will be applied sequentially and the response will be selected if it matches any of the filter's predicate.", + title='Response Filters', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class DefaultPaginator(BaseModel): + type: Literal['DefaultPaginator'] + pagination_strategy: Union[ + CursorPagination, CustomPaginationStrategy, OffsetIncrement, PageIncrement + ] = Field( + ..., + description='Strategy defining how records are paginated.', + title='Pagination Strategy', + ) + page_size_option: Optional[RequestOption] = None + page_token_option: Optional[Union[RequestOption, RequestPath]] = None + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class SessionTokenRequestApiKeyAuthenticator(BaseModel): + type: Literal['ApiKey'] + inject_into: RequestOption = Field( + ..., + description='Configure how the API Key will be sent in requests to the source API.', + examples=[ + {'inject_into': 'header', 'field_name': 'Authorization'}, + {'inject_into': 'request_parameter', 'field_name': 'authKey'}, + ], + title='Inject API Key Into Outgoing HTTP Request', + ) + + +class ListPartitionRouter(BaseModel): + type: Literal['ListPartitionRouter'] + cursor_field: str = Field( + ..., + description='While iterating over list values, the name of field used to reference a list value. The partition value can be accessed with string interpolation. e.g. 
"{{ stream_partition[\'my_key\'] }}" where "my_key" is the value of the cursor_field.', + examples=['section', "{{ config['section_key'] }}"], + title='Current Partition Value Identifier', + ) + values: Union[str, List[str]] = Field( + ..., + description='The list of attributes being iterated over and used as input for the requests made to the source API.', + examples=[['section_a', 'section_b', 'section_c'], "{{ config['sections'] }}"], + title='Partition Values', + ) + request_option: Optional[RequestOption] = Field( + None, + description='A request option describing where the list value should be injected into and under what field name if applicable.', + title='Inject Partition Value Into Outgoing HTTP Request', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class RecordSelector(BaseModel): + type: Literal['RecordSelector'] + extractor: Union[CustomRecordExtractor, DpathExtractor] + record_filter: Optional[Union[CustomRecordFilter, RecordFilter]] = Field( + None, + description='Responsible for filtering records to be emitted by the Source.', + title='Record Filter', + ) + schema_normalization: Optional[SchemaNormalization] = SchemaNormalization.None_ + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class Spec(BaseModel): + type: Literal['Spec'] + connection_specification: Dict[str, Any] = Field( + ..., + description='A connection specification describing how a the connector can be configured.', + title='Connection Specification', + ) + documentation_url: Optional[str] = Field( + None, + description="URL of the connector's documentation page.", + examples=['https://docs.airbyte.com/integrations/sources/dremio'], + title='Documentation URL', + ) + advanced_auth: Optional[AuthFlow] = Field( + None, + description='Advanced specification for configuring the authentication flow.', + title='Advanced Auth', + ) + + +class CompositeErrorHandler(BaseModel): + type: Literal['CompositeErrorHandler'] + error_handlers: List[Union[CompositeErrorHandler, DefaultErrorHandler]] = Field( + ..., + description='List of error handlers to iterate on to determine how to handle a failed response.', + title='Error Handlers', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class DeclarativeSource(BaseModel): + class Config: + extra = Extra.forbid + + type: Literal['DeclarativeSource'] + check: CheckStream + streams: List[DeclarativeStream] + version: str = Field( + ..., + description='The version of the Airbyte CDK used to build and test the source.', + ) + schemas: Optional[Schemas] = None + definitions: Optional[Dict[str, Any]] = None + spec: Optional[Spec] = None + concurrency_level: Optional[ConcurrencyLevel] = None + metadata: Optional[Dict[str, Any]] = Field( + None, + description='For internal Airbyte use only - DO NOT modify manually. Used by consumers of declarative manifests for storing related metadata.', + ) + description: Optional[str] = Field( + None, + description='A description of the connector. 
It will be presented on the Source documentation page.', + ) + + +class SelectiveAuthenticator(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['SelectiveAuthenticator'] + authenticator_selection_path: List[str] = Field( + ..., + description='Path of the field in config with selected authenticator name', + examples=[['auth'], ['auth', 'type']], + title='Authenticator Selection Path', + ) + authenticators: Dict[ + str, + Union[ + ApiKeyAuthenticator, + BasicHttpAuthenticator, + BearerAuthenticator, + CustomAuthenticator, + OAuthAuthenticator, + JwtAuthenticator, + NoAuth, + SessionTokenAuthenticator, + LegacySessionTokenAuthenticator, + ], + ] = Field( + ..., + description='Authenticators to select from.', + examples=[ + { + 'authenticators': { + 'token': '#/definitions/ApiKeyAuthenticator', + 'oauth': '#/definitions/OAuthAuthenticator', + 'jwt': '#/definitions/JwtAuthenticator', + } + } + ], + title='Authenticators', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class DeclarativeStream(BaseModel): + class Config: + extra = Extra.allow + + type: Literal['DeclarativeStream'] + retriever: Union[AsyncRetriever, CustomRetriever, SimpleRetriever] = Field( + ..., + description='Component used to coordinate how records are extracted across stream slices and request pages.', + title='Retriever', + ) + incremental_sync: Optional[Union[CustomIncrementalSync, DatetimeBasedCursor]] = ( + Field( + None, + description='Component used to fetch data incrementally based on a time field in the data.', + title='Incremental Sync', + ) + ) + name: Optional[str] = Field( + '', description='The stream name.', example=['Users'], title='Name' + ) + primary_key: Optional[PrimaryKey] = Field( + '', description='The primary key of the stream.', title='Primary Key' + ) + schema_loader: Optional[ + Union[InlineSchemaLoader, JsonFileSchemaLoader, CustomSchemaLoader] + ] = Field( + None, + description='Component used to retrieve the schema for the current stream.', + title='Schema Loader', + ) + transformations: Optional[ + List[Union[AddFields, CustomTransformation, RemoveFields, KeysToLower]] + ] = Field( + None, + description='A list of transformations to be applied to each output record.', + title='Transformations', + ) + state_migrations: Optional[ + List[Union[LegacyToPerPartitionStateMigration, CustomStateMigration]] + ] = Field( + [], + description='Array of state migrations to be applied on the input state', + title='State Migrations', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class SessionTokenAuthenticator(BaseModel): + type: Literal['SessionTokenAuthenticator'] + login_requester: HttpRequester = Field( + ..., + description='Description of the request to perform to obtain a session token to perform data requests. 
The response body is expected to be a JSON object with a session token property.', + examples=[ + { + 'type': 'HttpRequester', + 'url_base': 'https://my_api.com', + 'path': '/login', + 'authenticator': { + 'type': 'BasicHttpAuthenticator', + 'username': '{{ config.username }}', + 'password': '{{ config.password }}', + }, + } + ], + title='Login Requester', + ) + session_token_path: List[str] = Field( + ..., + description='The path in the response body returned from the login requester to the session token.', + examples=[['access_token'], ['result', 'token']], + title='Session Token Path', + ) + expiration_duration: Optional[str] = Field( + None, + description='The duration in ISO 8601 duration notation after which the session token expires, starting from the time it was obtained. Omitting it will result in the session token being refreshed for every request.', + examples=['PT1H', 'P1D'], + title='Expiration Duration', + ) + request_authentication: Union[ + SessionTokenRequestApiKeyAuthenticator, SessionTokenRequestBearerAuthenticator + ] = Field( + ..., + description='Authentication method to use for requests sent to the API, specifying how to inject the session token.', + title='Data Request Authentication', + ) + decoder: Optional[Union[JsonDecoder, XmlDecoder]] = Field( + None, description='Component used to decode the response.', title='Decoder' + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class HttpRequester(BaseModel): + type: Literal['HttpRequester'] + url_base: str = Field( + ..., + description='Base URL of the API source. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this.', + examples=[ + 'https://connect.squareup.com/v2', + "{{ config['base_url'] or 'https://app.posthog.com'}}/api/", + ], + title='API Base URL', + ) + path: str = Field( + ..., + description='Path the specific API endpoint that this stream represents. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this.', + examples=[ + '/products', + "/quotes/{{ stream_partition['id'] }}/quote_line_groups", + "/trades/{{ config['symbol_id'] }}/history", + ], + title='URL Path', + ) + authenticator: Optional[ + Union[ + ApiKeyAuthenticator, + BasicHttpAuthenticator, + BearerAuthenticator, + CustomAuthenticator, + OAuthAuthenticator, + JwtAuthenticator, + NoAuth, + SessionTokenAuthenticator, + LegacySessionTokenAuthenticator, + SelectiveAuthenticator, + ] + ] = Field( + None, + description='Authentication method to use for requests sent to the API.', + title='Authenticator', + ) + error_handler: Optional[ + Union[DefaultErrorHandler, CustomErrorHandler, CompositeErrorHandler] + ] = Field( + None, + description='Error handler component that defines how to handle errors.', + title='Error Handler', + ) + http_method: Optional[HttpMethod] = Field( + HttpMethod.GET, + description='The HTTP method used to fetch data from the source (can be GET or POST).', + examples=['GET', 'POST'], + title='HTTP Method', + ) + request_body_data: Optional[Union[str, Dict[str, str]]] = Field( + None, + description='Specifies how to populate the body of the request with a non-JSON payload. 
Plain text will be sent as is, whereas objects will be converted to a urlencoded form.', + examples=[ + '[{"clause": {"type": "timestamp", "operator": 10, "parameters":\n [{"value": {{ stream_interval[\'start_time\'] | int * 1000 }} }]\n }, "orderBy": 1, "columnName": "Timestamp"}]/\n' + ], + title='Request Body Payload (Non-JSON)', + ) + request_body_json: Optional[Union[str, Dict[str, Any]]] = Field( + None, + description='Specifies how to populate the body of the request with a JSON payload. Can contain nested objects.', + examples=[ + {'sort_order': 'ASC', 'sort_field': 'CREATED_AT'}, + {'key': "{{ config['value'] }}"}, + {'sort': {'field': 'updated_at', 'order': 'ascending'}}, + ], + title='Request Body JSON Payload', + ) + request_headers: Optional[Union[str, Dict[str, str]]] = Field( + None, + description='Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.', + examples=[{'Output-Format': 'JSON'}, {'Version': "{{ config['version'] }}"}], + title='Request Headers', + ) + request_parameters: Optional[Union[str, Dict[str, str]]] = Field( + None, + description='Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.', + examples=[ + {'unit': 'day'}, + { + 'query': 'last_event_time BETWEEN TIMESTAMP "{{ stream_interval.start_time }}" AND TIMESTAMP "{{ stream_interval.end_time }}"' + }, + {'searchIn': "{{ ','.join(config.get('search_in', [])) }}"}, + {'sort_by[asc]': 'updated_at'}, + ], + title='Query Parameters', + ) + use_cache: Optional[bool] = Field( + False, + description='Enables stream requests caching. This field is automatically set by the CDK.', + title='Use Cache', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class ParentStreamConfig(BaseModel): + type: Literal['ParentStreamConfig'] + parent_key: str = Field( + ..., + description='The primary key of records from the parent stream that will be used during the retrieval of records for the current substream. This parent identifier field is typically a characteristic of the child records being extracted from the source API.', + examples=['id', "{{ config['parent_record_id'] }}"], + title='Parent Key', + ) + stream: DeclarativeStream = Field( + ..., description='Reference to the parent stream.', title='Parent Stream' + ) + partition_field: str = Field( + ..., + description='While iterating over parent records during a sync, the parent_key value can be referenced by using this field.', + examples=['parent_id', "{{ config['parent_partition_field'] }}"], + title='Current Parent Key Value Identifier', + ) + request_option: Optional[RequestOption] = Field( + None, + description='A request option describing where the parent key value should be injected into and under what field name if applicable.', + title='Request Option', + ) + incremental_dependency: Optional[bool] = Field( + False, + description='Indicates whether the parent stream should be read incrementally based on updates in the child stream.', + title='Incremental Dependency', + ) + extra_fields: Optional[List[List[str]]] = Field( + None, + description='Array of field paths to include as additional fields in the stream slice. Each path is an array of strings representing keys to access fields in the respective parent record. Accessible via `stream_slice.extra_fields`. 
Missing fields are set to `None`.', + title='Extra Fields', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class SimpleRetriever(BaseModel): + type: Literal['SimpleRetriever'] + record_selector: RecordSelector = Field( + ..., + description='Component that describes how to extract records from a HTTP response.', + ) + requester: Union[CustomRequester, HttpRequester] = Field( + ..., + description='Requester component that describes how to prepare HTTP requests to send to the source API.', + ) + paginator: Optional[Union[DefaultPaginator, NoPagination]] = Field( + None, + description="Paginator component that describes how to navigate through the API's pages.", + ) + ignore_stream_slicer_parameters_on_paginated_requests: Optional[bool] = Field( + False, + description='If true, the partition router and incremental request options will be ignored when paginating requests. Request options set directly on the requester will not be ignored.', + ) + partition_router: Optional[ + Union[ + CustomPartitionRouter, + ListPartitionRouter, + SubstreamPartitionRouter, + List[ + Union[ + CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter + ] + ], + ] + ] = Field( + [], + description='PartitionRouter component that describes how to partition the stream, enabling incremental syncs and checkpointing.', + title='Partition Router', + ) + decoder: Optional[Union[JsonDecoder, JsonlDecoder, IterableDecoder, XmlDecoder]] = ( + Field( + None, + description='Component decoding the response so records can be extracted.', + title='Decoder', + ) + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class AsyncRetriever(BaseModel): + type: Literal['AsyncRetriever'] + record_selector: RecordSelector = Field( + ..., + description='Component that describes how to extract records from a HTTP response.', + ) + status_mapping: AsyncJobStatusMap = Field( + ..., description='Async Job Status to Airbyte CDK Async Job Status mapping.' + ) + status_extractor: Union[CustomRecordExtractor, DpathExtractor] = Field( + ..., description='Responsible for fetching the actual status of the async job.' 
+ ) + urls_extractor: Union[CustomRecordExtractor, DpathExtractor] = Field( + ..., + description='Responsible for fetching the final result `urls` provided by the completed / finished / ready async job.', + ) + creation_requester: Union[CustomRequester, HttpRequester] = Field( + ..., + description='Requester component that describes how to prepare HTTP requests to send to the source API to create the async server-side job.', + ) + polling_requester: Union[CustomRequester, HttpRequester] = Field( + ..., + description='Requester component that describes how to prepare HTTP requests to send to the source API to fetch the status of the running async job.', + ) + download_requester: Union[CustomRequester, HttpRequester] = Field( + ..., + description='Requester component that describes how to prepare HTTP requests to send to the source API to download the data provided by the completed async job.', + ) + download_paginator: Optional[Union[DefaultPaginator, NoPagination]] = Field( + None, + description="Paginator component that describes how to navigate through the API's pages during download.", + ) + abort_requester: Optional[Union[CustomRequester, HttpRequester]] = Field( + None, + description="Requester component that describes how to prepare HTTP requests to send to the source API to abort a job once it is timed out from the source's perspective.", + ) + delete_requester: Optional[Union[CustomRequester, HttpRequester]] = Field( + None, + description='Requester component that describes how to prepare HTTP requests to send to the source API to delete a job once the records are extracted.', + ) + partition_router: Optional[ + Union[ + CustomPartitionRouter, + ListPartitionRouter, + SubstreamPartitionRouter, + List[ + Union[ + CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter + ] + ], + ] + ] = Field( + [], + description='PartitionRouter component that describes how to partition the stream, enabling incremental syncs and checkpointing.', + title='Partition Router', + ) + decoder: Optional[Union[JsonDecoder, JsonlDecoder, IterableDecoder, XmlDecoder]] = ( + Field( + None, + description='Component decoding the response so records can be extracted.', + title='Decoder', + ) + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +class SubstreamPartitionRouter(BaseModel): + type: Literal['SubstreamPartitionRouter'] + parent_stream_configs: List[ParentStreamConfig] = Field( + ..., + description='Specifies which parent streams are being iterated over and how parent records should be used to partition the child stream data set.', + title='Parent Stream Configs', + ) + parameters: Optional[Dict[str, Any]] = Field(None, alias='$parameters') + + +CompositeErrorHandler.update_forward_refs() +DeclarativeSource.update_forward_refs() +SelectiveAuthenticator.update_forward_refs() +DeclarativeStream.update_forward_refs() +SessionTokenAuthenticator.update_forward_refs() +SimpleRetriever.update_forward_refs() +AsyncRetriever.update_forward_refs() diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/__init__.py new file mode 100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
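As an aside on the generated models above: they are standard pydantic (v1-style) classes, so a single manifest component can be validated by parsing a plain dict, with `$parameters` exposed through the aliased `parameters` field. The sketch below is illustrative only (the field values are invented) and assumes the generated module path used elsewhere in this change.

```python
# Minimal sketch: validating one manifest component with the generated pydantic models.
# The dict values are invented for illustration.
from airbyte_cdk.sources.declarative.models.declarative_component_schema import DatetimeBasedCursor

cursor = DatetimeBasedCursor.parse_obj(
    {
        "type": "DatetimeBasedCursor",
        "cursor_field": "updated_at",
        "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
        "start_datetime": "{{ config['start_date'] }}",
        "$parameters": {"name": "users"},  # populated via the '$parameters' alias
    }
)

assert cursor.cursor_field == "updated_at"
assert cursor.parameters == {"name": "users"}
```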
+# diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py new file mode 100644 index 000000000000..d6fdee69562f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py @@ -0,0 +1,21 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +class CircularReferenceException(Exception): + """ + Raised when a circular reference is detected in a manifest. + """ + + def __init__(self, reference: str) -> None: + super().__init__(f"Circular reference found: {reference}") + + +class UndefinedReferenceException(Exception): + """ + Raised when refering to an undefined reference. + """ + + def __init__(self, path: str, reference: str) -> None: + super().__init__(f"Undefined reference {reference} from {path}") diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py new file mode 100644 index 000000000000..7b8b221c68df --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py @@ -0,0 +1,150 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import copy +import typing +from typing import Any, Mapping + +PARAMETERS_STR = "$parameters" + + +DEFAULT_MODEL_TYPES: Mapping[str, str] = { + # CompositeErrorHandler + "CompositeErrorHandler.error_handlers": "DefaultErrorHandler", + # CursorPagination + "CursorPagination.decoder": "JsonDecoder", + # DatetimeBasedCursor + "DatetimeBasedCursor.end_datetime": "MinMaxDatetime", + "DatetimeBasedCursor.end_time_option": "RequestOption", + "DatetimeBasedCursor.start_datetime": "MinMaxDatetime", + "DatetimeBasedCursor.start_time_option": "RequestOption", + # CustomIncrementalSync + "CustomIncrementalSync.end_datetime": "MinMaxDatetime", + "CustomIncrementalSync.end_time_option": "RequestOption", + "CustomIncrementalSync.start_datetime": "MinMaxDatetime", + "CustomIncrementalSync.start_time_option": "RequestOption", + # DeclarativeSource + "DeclarativeSource.check": "CheckStream", + "DeclarativeSource.spec": "Spec", + "DeclarativeSource.streams": "DeclarativeStream", + # DeclarativeStream + "DeclarativeStream.retriever": "SimpleRetriever", + "DeclarativeStream.schema_loader": "JsonFileSchemaLoader", + # DefaultErrorHandler + "DefaultErrorHandler.response_filters": "HttpResponseFilter", + # DefaultPaginator + "DefaultPaginator.decoder": "JsonDecoder", + "DefaultPaginator.page_size_option": "RequestOption", + # DpathExtractor + "DpathExtractor.decoder": "JsonDecoder", + # HttpRequester + "HttpRequester.error_handler": "DefaultErrorHandler", + # ListPartitionRouter + "ListPartitionRouter.request_option": "RequestOption", + # ParentStreamConfig + "ParentStreamConfig.request_option": "RequestOption", + "ParentStreamConfig.stream": "DeclarativeStream", + # RecordSelector + "RecordSelector.extractor": "DpathExtractor", + "RecordSelector.record_filter": "RecordFilter", + # SimpleRetriever + "SimpleRetriever.paginator": "NoPagination", + "SimpleRetriever.record_selector": "RecordSelector", + "SimpleRetriever.requester": "HttpRequester", + # SubstreamPartitionRouter + "SubstreamPartitionRouter.parent_stream_configs": "ParentStreamConfig", + # AddFields + "AddFields.fields": "AddedFieldDefinition", + # CustomPartitionRouter + "CustomPartitionRouter.parent_stream_configs": "ParentStreamConfig", 
+} + +# We retain a separate registry for custom components to automatically insert the type if it is missing. This is intended to +# be a short term fix because once we have migrated, then type and class_name should be requirements for all custom components. +CUSTOM_COMPONENTS_MAPPING: Mapping[str, str] = { + "CompositeErrorHandler.backoff_strategies": "CustomBackoffStrategy", + "DeclarativeStream.retriever": "CustomRetriever", + "DeclarativeStream.transformations": "CustomTransformation", + "DefaultErrorHandler.backoff_strategies": "CustomBackoffStrategy", + "DefaultPaginator.pagination_strategy": "CustomPaginationStrategy", + "HttpRequester.authenticator": "CustomAuthenticator", + "HttpRequester.error_handler": "CustomErrorHandler", + "RecordSelector.extractor": "CustomRecordExtractor", + "SimpleRetriever.partition_router": "CustomPartitionRouter", +} + + +class ManifestComponentTransformer: + def propagate_types_and_parameters( + self, parent_field_identifier: str, declarative_component: Mapping[str, Any], parent_parameters: Mapping[str, Any] + ) -> Mapping[str, Any]: + """ + Recursively transforms the specified declarative component and subcomponents to propagate parameters and insert the + default component type if it was not already present. The resulting transformed components are a deep copy of the input + components, not an in-place transformation. + + :param declarative_component: The current component that is having type and parameters added + :param parent_field_identifier: The name of the field of the current component coming from the parent component + :param parent_parameters: The parameters set on parent components defined before the current component + :return: A deep copy of the transformed component with types and parameters persisted to it + """ + propagated_component = dict(copy.deepcopy(declarative_component)) + if "type" not in propagated_component: + # If the component has class_name we assume that this is a reference to a custom component. This is a slight change to + # existing behavior because we originally allowed for either class or type to be specified. After the pydantic migration, + # class_name will only be a valid field on custom components and this change reflects that. I checked, and we currently + # have no low-code connectors that use class_name except for custom components. + if "class_name" in propagated_component: + found_type = CUSTOM_COMPONENTS_MAPPING.get(parent_field_identifier) + else: + found_type = DEFAULT_MODEL_TYPES.get(parent_field_identifier) + if found_type: + propagated_component["type"] = found_type + + # When there is no resolved type, we're not processing a component (likely a regular object) and don't need to propagate parameters + # When the type refers to a json schema, we're not processing a component as well. This check is currently imperfect as there could + # be json_schema are not objects but we believe this is not likely in our case because: + # * records are Mapping so objects hence SchemaLoader root should be an object + # * connection_specification is a Mapping + if "type" not in propagated_component or self._is_json_schema_object(propagated_component): + return propagated_component + + # Combines parameters defined at the current level with parameters from parent components. 
Parameters at the current + # level take precedence + current_parameters = dict(copy.deepcopy(parent_parameters)) + component_parameters = propagated_component.pop(PARAMETERS_STR, {}) + current_parameters = {**current_parameters, **component_parameters} + + # Parameters should be applied to the current component fields with the existing field taking precedence over parameters if + # both exist + for parameter_key, parameter_value in current_parameters.items(): + propagated_component[parameter_key] = propagated_component.get(parameter_key) or parameter_value + + for field_name, field_value in propagated_component.items(): + if isinstance(field_value, dict): + # We exclude propagating a parameter that matches the current field name because that would result in an infinite cycle + excluded_parameter = current_parameters.pop(field_name, None) + parent_type_field_identifier = f"{propagated_component.get('type')}.{field_name}" + propagated_component[field_name] = self.propagate_types_and_parameters( + parent_type_field_identifier, field_value, current_parameters + ) + if excluded_parameter: + current_parameters[field_name] = excluded_parameter + elif isinstance(field_value, typing.List): + # We exclude propagating a parameter that matches the current field name because that would result in an infinite cycle + excluded_parameter = current_parameters.pop(field_name, None) + for i, element in enumerate(field_value): + if isinstance(element, dict): + parent_type_field_identifier = f"{propagated_component.get('type')}.{field_name}" + field_value[i] = self.propagate_types_and_parameters(parent_type_field_identifier, element, current_parameters) + if excluded_parameter: + current_parameters[field_name] = excluded_parameter + + if current_parameters: + propagated_component[PARAMETERS_STR] = current_parameters + return propagated_component + + @staticmethod + def _is_json_schema_object(propagated_component: Mapping[str, Any]) -> bool: + return propagated_component.get("type") == "object" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py new file mode 100644 index 000000000000..66bf3d5ef856 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py @@ -0,0 +1,206 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import re +from typing import Any, Mapping, Set, Tuple, Union + +from airbyte_cdk.sources.declarative.parsers.custom_exceptions import CircularReferenceException, UndefinedReferenceException + +REF_TAG = "$ref" + + +class ManifestReferenceResolver: + """ + An incoming manifest can contain references to values previously defined. + This parser will dereference these values to produce a complete ConnectionDefinition. + + References can be defined using a #/ string. 
+ ``` + key: 1234 + reference: "#/key" + ``` + will produce the following definition: + ``` + key: 1234 + reference: 1234 + ``` + This also works with objects: + ``` + key_value_pairs: + k1: v1 + k2: v2 + same_key_value_pairs: "#/key_value_pairs" + ``` + will produce the following definition: + ``` + key_value_pairs: + k1: v1 + k2: v2 + same_key_value_pairs: + k1: v1 + k2: v2 + ``` + + The $ref keyword can be used to refer to an object and enhance it with additional key-value pairs + ``` + key_value_pairs: + k1: v1 + k2: v2 + same_key_value_pairs: + $ref: "#/key_value_pairs" + k3: v3 + ``` + will produce the following definition: + ``` + key_value_pairs: + k1: v1 + k2: v2 + same_key_value_pairs: + k1: v1 + k2: v2 + k3: v3 + ``` + + References can also point to nested values. + Nested references are ambiguous because one could define a key containing a `/`. + In this example, we want to refer to the `limit` key in the `dict` object: + ``` + dict: + limit: 50 + limit_ref: "#/dict/limit" + ``` + will produce the following definition: + ``` + dict: + limit: 50 + limit_ref: 50 + ``` + + Whereas here we want to access the `nested/path` value. + ``` + nested: + path: "first one" + nested/path: "uh oh" + value: "#/nested/path" + ``` + will produce the following definition: + ``` + nested: + path: "first one" + nested/path: "uh oh" + value: "uh oh" + ``` + + To resolve the ambiguity, we try looking for the reference key at the top level, and then traverse the structs downward + until we find a key with the given path, or until there is nothing to traverse. + """ + + def preprocess_manifest(self, manifest: Mapping[str, Any]) -> Mapping[str, Any]: + """ + :param manifest: incoming manifest that could have references to previously defined components + :return: the manifest with all references resolved + """ + return self._evaluate_node(manifest, manifest, set()) # type: ignore[no-any-return] + + def _evaluate_node(self, node: Any, manifest: Mapping[str, Any], visited: Set[Any]) -> Any: + if isinstance(node, dict): + evaluated_dict = {k: self._evaluate_node(v, manifest, visited) for k, v in node.items() if not self._is_ref_key(k)} + if REF_TAG in node: + # The node includes a $ref key, so we splat the referenced value(s) into the evaluated dict + evaluated_ref = self._evaluate_node(node[REF_TAG], manifest, visited) + if not isinstance(evaluated_ref, dict): + return evaluated_ref + else: + # The values defined on the component take precedence over the reference values + return evaluated_ref | evaluated_dict + else: + return evaluated_dict + elif isinstance(node, list): + return [self._evaluate_node(v, manifest, visited) for v in node] + elif self._is_ref(node): + if node in visited: + raise CircularReferenceException(node) + visited.add(node) + ret = self._evaluate_node(self._lookup_ref_value(node, manifest), manifest, visited) + visited.remove(node) + return ret + else: + return node + + def _lookup_ref_value(self, ref: str, manifest: Mapping[str, Any]) -> Any: + ref_match = re.match(r"#/(.*)", ref) + if not ref_match: + raise ValueError(f"Invalid reference format {ref}") + try: + path = ref_match.groups()[0] + return self._read_ref_value(path, manifest) + except (AttributeError, KeyError, IndexError): + raise UndefinedReferenceException(path, ref) + + @staticmethod + def _is_ref(node: Any) -> bool: + return isinstance(node, str) and node.startswith("#/") + + @staticmethod + def _is_ref_key(key: str) -> bool: + return bool(key == REF_TAG) + + @staticmethod + def _read_ref_value(ref: str, manifest_node: Mapping[str, Any]) -> Any: + """ + Read the
value at the referenced location of the manifest. + + References are ambiguous because one could define a key containing `/` + In this example, we want to refer to the `limit` key in the `dict` object: + dict: + limit: 50 + limit_ref: "#/dict/limit" + + Whereas here we want to access the `nested/path` value. + nested: + path: "first one" + nested/path: "uh oh" + value: "#/nested/path" + + To resolve the ambiguity, we try looking for the reference key at the top level, and then traverse the structs downward + until we find a key with the given path, or until there is nothing to traverse. + + Consider the path foo/bar/baz. To resolve the ambiguity, we first try 'foo/bar/baz' in its entirety as a top-level key. If this + fails, we try 'foo' as the top-level key, and if this succeeds, pass 'bar/baz' on as the key to be tried at the next level. + """ + while ref: + try: + return manifest_node[ref] + except (KeyError, TypeError): + head, ref = _parse_path(ref) + manifest_node = manifest_node[head] # type: ignore # Couldn't figure out how to fix this since manifest_node can get reassigned into other types like lists + return manifest_node + + +def _parse_path(ref: str) -> Tuple[Union[str, int], str]: + """ + Return the next path component, together with the rest of the path. + + A path component may be a string key, or an int index. + + >>> _parse_path("foo/bar") + "foo", "bar" + >>> _parse_path("foo/7/8/bar") + "foo", "7/8/bar" + >>> _parse_path("7/8/bar") + 7, "8/bar" + >>> _parse_path("8/bar") + 8, "bar" + >>> _parse_path("8foo/bar") + "8foo", "bar" + """ + match = re.match(r"([^/]*)/?(.*)", ref) + if match: + first, rest = match.groups() + try: + return int(first), rest + except ValueError: + return first, rest + else: + raise ValueError(f"Invalid path {ref} specified") diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py new file mode 100644 index 000000000000..f0eb4f388854 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -0,0 +1,1614 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
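To make the two parser utilities above concrete, the sketch below resolves `#/` references in a hypothetical manifest fragment and then propagates default types and `$parameters` onto the resulting component. The manifest content is invented, and chaining the calls this way is only an illustration of how a manifest loader might use them, not a prescribed API.

```python
# Minimal sketch: dereference "#/" references, then propagate types and $parameters.
# The manifest fragment is hypothetical.
from airbyte_cdk.sources.declarative.parsers.manifest_component_transformer import ManifestComponentTransformer
from airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver import ManifestReferenceResolver

manifest = {
    "definitions": {
        "requester_base": {"type": "HttpRequester", "url_base": "https://api.example.com"},
    },
    "users_requester": {
        "$ref": "#/definitions/requester_base",
        "path": "/users",
        "$parameters": {"name": "users"},
    },
}

resolved = ManifestReferenceResolver().preprocess_manifest(manifest)
# The referenced object is splatted into the component; keys set on the component win on conflicts.
assert resolved["users_requester"]["url_base"] == "https://api.example.com"

component = ManifestComponentTransformer().propagate_types_and_parameters(
    "", resolved["users_requester"], {}
)
# $parameters values are copied onto the component unless the field is already set explicitly.
assert component["type"] == "HttpRequester"
assert component["name"] == "users"
```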
+# + +from __future__ import annotations + +import datetime +import importlib +import inspect +import re +from functools import partial +from typing import Any, Callable, Dict, List, Mapping, MutableMapping, Optional, Tuple, Type, Union, get_args, get_origin, get_type_hints + +from airbyte_cdk.models import FailureType, Level +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator +from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker +from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository +from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus +from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator +from airbyte_cdk.sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator, NoAuth +from airbyte_cdk.sources.declarative.auth.jwt import JwtAlgorithm +from airbyte_cdk.sources.declarative.auth.oauth import DeclarativeSingleUseRefreshTokenOauth2Authenticator +from airbyte_cdk.sources.declarative.auth.selective_authenticator import SelectiveAuthenticator +from airbyte_cdk.sources.declarative.auth.token import ( + ApiKeyAuthenticator, + BasicHttpAuthenticator, + BearerAuthenticator, + LegacySessionTokenAuthenticator, +) +from airbyte_cdk.sources.declarative.auth.token_provider import InterpolatedStringTokenProvider, SessionTokenProvider, TokenProvider +from airbyte_cdk.sources.declarative.checks import CheckStream +from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel +from airbyte_cdk.sources.declarative.datetime import MinMaxDatetime +from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream +from airbyte_cdk.sources.declarative.decoders import ( + Decoder, + IterableDecoder, + JsonDecoder, + JsonlDecoder, + PaginationDecoderDecorator, + XmlDecoder, +) +from airbyte_cdk.sources.declarative.extractors import DpathExtractor, RecordFilter, RecordSelector, ResponseToFileExtractor +from airbyte_cdk.sources.declarative.extractors.record_filter import ClientSideIncrementalRecordFilterDecorator +from airbyte_cdk.sources.declarative.extractors.record_selector import SCHEMA_TRANSFORMER_TYPE_MAPPING +from airbyte_cdk.sources.declarative.incremental import ( + ChildPartitionResumableFullRefreshCursor, + CursorFactory, + DatetimeBasedCursor, + DeclarativeCursor, + GlobalSubstreamCursor, + PerPartitionCursor, + PerPartitionWithGlobalCursor, + ResumableFullRefreshCursor, +) +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping +from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import LegacyToPerPartitionStateMigration +from airbyte_cdk.sources.declarative.models import CustomStateMigration +from airbyte_cdk.sources.declarative.models.declarative_component_schema import AddedFieldDefinition as AddedFieldDefinitionModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import AddFields as AddFieldsModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ApiKeyAuthenticator as ApiKeyAuthenticatorModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import AsyncJobStatusMap as AsyncJobStatusMapModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema 
import AsyncRetriever as AsyncRetrieverModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import BasicHttpAuthenticator as BasicHttpAuthenticatorModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import BearerAuthenticator as BearerAuthenticatorModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CheckStream as CheckStreamModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CompositeErrorHandler as CompositeErrorHandlerModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ConcurrencyLevel as ConcurrencyLevelModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ConstantBackoffStrategy as ConstantBackoffStrategyModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CursorPagination as CursorPaginationModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CustomAuthenticator as CustomAuthenticatorModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CustomBackoffStrategy as CustomBackoffStrategyModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CustomErrorHandler as CustomErrorHandlerModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CustomIncrementalSync as CustomIncrementalSyncModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CustomPaginationStrategy as CustomPaginationStrategyModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CustomPartitionRouter as CustomPartitionRouterModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CustomRecordExtractor as CustomRecordExtractorModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CustomRecordFilter as CustomRecordFilterModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CustomRequester as CustomRequesterModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CustomRetriever as CustomRetrieverModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CustomSchemaLoader as CustomSchemaLoader +from airbyte_cdk.sources.declarative.models.declarative_component_schema import CustomTransformation as CustomTransformationModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import DatetimeBasedCursor as DatetimeBasedCursorModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import DeclarativeStream as DeclarativeStreamModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import DefaultErrorHandler as DefaultErrorHandlerModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import DefaultPaginator as DefaultPaginatorModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import DpathExtractor as DpathExtractorModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + ExponentialBackoffStrategy as ExponentialBackoffStrategyModel, +) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import HttpRequester as HttpRequesterModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import HttpResponseFilter as HttpResponseFilterModel +from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import InlineSchemaLoader as InlineSchemaLoaderModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import IterableDecoder as IterableDecoderModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import JsonDecoder as JsonDecoderModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import JsonFileSchemaLoader as JsonFileSchemaLoaderModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import JsonlDecoder as JsonlDecoderModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import JwtAuthenticator as JwtAuthenticatorModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import JwtHeaders as JwtHeadersModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import JwtPayload as JwtPayloadModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import KeysToLower as KeysToLowerModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + LegacySessionTokenAuthenticator as LegacySessionTokenAuthenticatorModel, +) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + LegacyToPerPartitionStateMigration as LegacyToPerPartitionStateMigrationModel, +) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ListPartitionRouter as ListPartitionRouterModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import MinMaxDatetime as MinMaxDatetimeModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import NoAuth as NoAuthModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import NoPagination as NoPaginationModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import OAuthAuthenticator as OAuthAuthenticatorModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import OffsetIncrement as OffsetIncrementModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import PageIncrement as PageIncrementModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ParentStreamConfig as ParentStreamConfigModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import RecordFilter as RecordFilterModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import RecordSelector as RecordSelectorModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import RemoveFields as RemoveFieldsModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import RequestOption as RequestOptionModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import RequestPath as RequestPathModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import SelectiveAuthenticator as SelectiveAuthenticatorModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import SessionTokenAuthenticator as SessionTokenAuthenticatorModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import SimpleRetriever as SimpleRetrieverModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import Spec as SpecModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import SubstreamPartitionRouter as 
SubstreamPartitionRouterModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ValueType +from airbyte_cdk.sources.declarative.models.declarative_component_schema import WaitTimeFromHeader as WaitTimeFromHeaderModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import WaitUntilTimeFromHeader as WaitUntilTimeFromHeaderModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import XmlDecoder as XmlDecoderModel +from airbyte_cdk.sources.declarative.partition_routers import ( + CartesianProductStreamSlicer, + ListPartitionRouter, + SinglePartitionRouter, + SubstreamPartitionRouter, +) +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ParentStreamConfig +from airbyte_cdk.sources.declarative.requesters import HttpRequester, RequestOption +from airbyte_cdk.sources.declarative.requesters.error_handlers import CompositeErrorHandler, DefaultErrorHandler, HttpResponseFilter +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import ( + ConstantBackoffStrategy, + ExponentialBackoffStrategy, + WaitTimeFromHeaderBackoffStrategy, + WaitUntilTimeFromHeaderBackoffStrategy, +) +from airbyte_cdk.sources.declarative.requesters.http_job_repository import AsyncHttpJobRepository +from airbyte_cdk.sources.declarative.requesters.paginators import DefaultPaginator, NoPagination, PaginatorTestReadDecorator +from airbyte_cdk.sources.declarative.requesters.paginators.strategies import ( + CursorPaginationStrategy, + CursorStopCondition, + OffsetIncrement, + PageIncrement, + StopConditionPaginationStrategyDecorator, +) +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType +from airbyte_cdk.sources.declarative.requesters.request_options import ( + DatetimeBasedRequestOptionsProvider, + DefaultRequestOptionsProvider, + InterpolatedRequestOptionsProvider, + RequestOptionsProvider, +) +from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath +from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod +from airbyte_cdk.sources.declarative.retrievers import AsyncRetriever, SimpleRetriever, SimpleRetrieverTestReadDecorator +from airbyte_cdk.sources.declarative.schema import DefaultSchemaLoader, InlineSchemaLoader, JsonFileSchemaLoader +from airbyte_cdk.sources.declarative.spec import Spec +from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicer +from airbyte_cdk.sources.declarative.transformations import AddFields, RecordTransformation, RemoveFields +from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition +from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import KeysToLowerTransformation +from airbyte_cdk.sources.message import InMemoryMessageRepository, LogAppenderMessageRepositoryDecorator, MessageRepository +from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField +from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( + CustomFormatConcurrentStreamStateConverter, + DateTimeStreamStateConverter, +) +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction +from airbyte_cdk.sources.types import Config +from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer +from isodate import parse_duration +from pydantic.v1 import BaseModel + +ComponentDefinition = Mapping[str, Any] + + +class 
ModelToComponentFactory: + + EPOCH_DATETIME_FORMAT = "%s" + + def __init__( + self, + limit_pages_fetched_per_slice: Optional[int] = None, + limit_slices_fetched: Optional[int] = None, + emit_connector_builder_messages: bool = False, + disable_retries: bool = False, + disable_cache: bool = False, + message_repository: Optional[MessageRepository] = None, + ): + self._init_mappings() + self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice + self._limit_slices_fetched = limit_slices_fetched + self._emit_connector_builder_messages = emit_connector_builder_messages + self._disable_retries = disable_retries + self._disable_cache = disable_cache + self._message_repository = message_repository or InMemoryMessageRepository( # type: ignore + self._evaluate_log_level(emit_connector_builder_messages) + ) + + def _init_mappings(self) -> None: + self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { + AddedFieldDefinitionModel: self.create_added_field_definition, + AddFieldsModel: self.create_add_fields, + ApiKeyAuthenticatorModel: self.create_api_key_authenticator, + BasicHttpAuthenticatorModel: self.create_basic_http_authenticator, + BearerAuthenticatorModel: self.create_bearer_authenticator, + CheckStreamModel: self.create_check_stream, + CompositeErrorHandlerModel: self.create_composite_error_handler, + ConcurrencyLevelModel: self.create_concurrency_level, + ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, + CursorPaginationModel: self.create_cursor_pagination, + CustomAuthenticatorModel: self.create_custom_component, + CustomBackoffStrategyModel: self.create_custom_component, + CustomErrorHandlerModel: self.create_custom_component, + CustomIncrementalSyncModel: self.create_custom_component, + CustomRecordExtractorModel: self.create_custom_component, + CustomRecordFilterModel: self.create_custom_component, + CustomRequesterModel: self.create_custom_component, + CustomRetrieverModel: self.create_custom_component, + CustomSchemaLoader: self.create_custom_component, + CustomStateMigration: self.create_custom_component, + CustomPaginationStrategyModel: self.create_custom_component, + CustomPartitionRouterModel: self.create_custom_component, + CustomTransformationModel: self.create_custom_component, + DatetimeBasedCursorModel: self.create_datetime_based_cursor, + DeclarativeStreamModel: self.create_declarative_stream, + DefaultErrorHandlerModel: self.create_default_error_handler, + DefaultPaginatorModel: self.create_default_paginator, + DpathExtractorModel: self.create_dpath_extractor, + ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, + SessionTokenAuthenticatorModel: self.create_session_token_authenticator, + HttpRequesterModel: self.create_http_requester, + HttpResponseFilterModel: self.create_http_response_filter, + InlineSchemaLoaderModel: self.create_inline_schema_loader, + JsonDecoderModel: self.create_json_decoder, + JsonlDecoderModel: self.create_jsonl_decoder, + KeysToLowerModel: self.create_keys_to_lower_transformation, + IterableDecoderModel: self.create_iterable_decoder, + XmlDecoderModel: self.create_xml_decoder, + JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, + JwtAuthenticatorModel: self.create_jwt_authenticator, + LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, + ListPartitionRouterModel: self.create_list_partition_router, + MinMaxDatetimeModel: self.create_min_max_datetime, + NoAuthModel: self.create_no_auth, + NoPaginationModel: 
self.create_no_pagination, + OAuthAuthenticatorModel: self.create_oauth_authenticator, + OffsetIncrementModel: self.create_offset_increment, + PageIncrementModel: self.create_page_increment, + ParentStreamConfigModel: self.create_parent_stream_config, + RecordFilterModel: self.create_record_filter, + RecordSelectorModel: self.create_record_selector, + RemoveFieldsModel: self.create_remove_fields, + RequestPathModel: self.create_request_path, + RequestOptionModel: self.create_request_option, + LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, + SelectiveAuthenticatorModel: self.create_selective_authenticator, + SimpleRetrieverModel: self.create_simple_retriever, + SpecModel: self.create_spec, + SubstreamPartitionRouterModel: self.create_substream_partition_router, + WaitTimeFromHeaderModel: self.create_wait_time_from_header, + WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header, + AsyncRetrieverModel: self.create_async_retriever, + } + + # Needed for the case where we need to perform a second parse on the fields of a custom component + self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR} + + def create_component( + self, model_type: Type[BaseModel], component_definition: ComponentDefinition, config: Config, **kwargs: Any + ) -> Any: + """ + Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and + subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating + creating declarative components from that model. + + :param model_type: The type of declarative component that is being initialized + :param component_definition: The mapping that represents a declarative component + :param config: The connector config that is provided by the customer + :return: The declarative component to be used at runtime + """ + + component_type = component_definition.get("type") + if component_definition.get("type") != model_type.__name__: + raise ValueError(f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead") + + declarative_component_model = model_type.parse_obj(component_definition) + + if not isinstance(declarative_component_model, model_type): + raise ValueError(f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}") + + return self._create_component_from_model(model=declarative_component_model, config=config, **kwargs) + + def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any: + if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR: + raise ValueError(f"{model.__class__} with attributes {model} is not a valid component type") + component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__) + if not component_constructor: + raise ValueError(f"Could not find constructor for {model.__class__}") + return component_constructor(model=model, config=config, **kwargs) + + @staticmethod + def create_added_field_definition(model: AddedFieldDefinitionModel, config: Config, **kwargs: Any) -> AddedFieldDefinition: + interpolated_value = InterpolatedString.create(model.value, parameters=model.parameters or {}) + return AddedFieldDefinition( + path=model.path, + value=interpolated_value, + value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), + parameters=model.parameters or {}, + ) + + def 
create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: + added_field_definitions = [ + self._create_component_from_model( + model=added_field_definition_model, + value_type=ModelToComponentFactory._json_schema_type_name_to_type(added_field_definition_model.value_type), + config=config, + ) + for added_field_definition_model in model.fields + ] + return AddFields(fields=added_field_definitions, parameters=model.parameters or {}) + + def create_keys_to_lower_transformation(self, model: KeysToLowerModel, config: Config, **kwargs: Any) -> KeysToLowerTransformation: + return KeysToLowerTransformation() + + @staticmethod + def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]: + if not value_type: + return None + names_to_types = { + ValueType.string: str, + ValueType.number: float, + ValueType.integer: int, + ValueType.boolean: bool, + } + return names_to_types[value_type] + + @staticmethod + def create_api_key_authenticator( + model: ApiKeyAuthenticatorModel, config: Config, token_provider: Optional[TokenProvider] = None, **kwargs: Any + ) -> ApiKeyAuthenticator: + if model.inject_into is None and model.header is None: + raise ValueError("Expected either inject_into or header to be set for ApiKeyAuthenticator") + + if model.inject_into is not None and model.header is not None: + raise ValueError("inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option") + + if token_provider is not None and model.api_token != "": + raise ValueError("If token_provider is set, api_token is ignored and has to be set to empty string.") + + request_option = ( + RequestOption( + inject_into=RequestOptionType(model.inject_into.inject_into.value), + field_name=model.inject_into.field_name, + parameters=model.parameters or {}, + ) + if model.inject_into + else RequestOption( + inject_into=RequestOptionType.header, + field_name=model.header or "", + parameters=model.parameters or {}, + ) + ) + return ApiKeyAuthenticator( + token_provider=( + token_provider + if token_provider is not None + else InterpolatedStringTokenProvider(api_token=model.api_token or "", config=config, parameters=model.parameters or {}) + ), + request_option=request_option, + config=config, + parameters=model.parameters or {}, + ) + + def create_legacy_to_per_partition_state_migration( + self, + model: LegacyToPerPartitionStateMigrationModel, + config: Mapping[str, Any], + declarative_stream: DeclarativeStreamModel, + ) -> LegacyToPerPartitionStateMigration: + retriever = declarative_stream.retriever + if not isinstance(retriever, SimpleRetrieverModel): + raise ValueError( + f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever. Got {type(retriever)}" + ) + partition_router = retriever.partition_router + if not isinstance(partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)): + raise ValueError( + f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. 
Got {type(partition_router)}" + ) + if not hasattr(partition_router, "parent_stream_configs"): + raise ValueError("LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration.") + + return LegacyToPerPartitionStateMigration(declarative_stream.retriever.partition_router, declarative_stream.incremental_sync, config, declarative_stream.parameters) # type: ignore # The retriever type was already checked + + def create_session_token_authenticator( + self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any + ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: + decoder = self._create_component_from_model(model=model.decoder, config=config) if model.decoder else JsonDecoder(parameters={}) + login_requester = self._create_component_from_model( + model=model.login_requester, config=config, name=f"{name}_login_requester", decoder=decoder + ) + token_provider = SessionTokenProvider( + login_requester=login_requester, + session_token_path=model.session_token_path, + expiration_duration=parse_duration(model.expiration_duration) if model.expiration_duration else None, + parameters=model.parameters or {}, + message_repository=self._message_repository, + decoder=decoder, + ) + if model.request_authentication.type == "Bearer": + return ModelToComponentFactory.create_bearer_authenticator( + BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value + config, + token_provider=token_provider, # type: ignore # $parameters defaults to None + ) + else: + return ModelToComponentFactory.create_api_key_authenticator( + ApiKeyAuthenticatorModel(type="ApiKeyAuthenticator", api_token="", inject_into=model.request_authentication.inject_into), # type: ignore # $parameters and headers default to None + config=config, + token_provider=token_provider, + ) + + @staticmethod + def create_basic_http_authenticator(model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any) -> BasicHttpAuthenticator: + return BasicHttpAuthenticator( + password=model.password or "", username=model.username, config=config, parameters=model.parameters or {} + ) + + @staticmethod + def create_bearer_authenticator( + model: BearerAuthenticatorModel, config: Config, token_provider: Optional[TokenProvider] = None, **kwargs: Any + ) -> BearerAuthenticator: + if token_provider is not None and model.api_token != "": + raise ValueError("If token_provider is set, api_token is ignored and has to be set to empty string.") + return BearerAuthenticator( + token_provider=( + token_provider + if token_provider is not None + else InterpolatedStringTokenProvider(api_token=model.api_token or "", config=config, parameters=model.parameters or {}) + ), + config=config, + parameters=model.parameters or {}, + ) + + @staticmethod + def create_check_stream(model: CheckStreamModel, config: Config, **kwargs: Any) -> CheckStream: + return CheckStream(stream_names=model.stream_names, parameters={}) + + def create_composite_error_handler(self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any) -> CompositeErrorHandler: + error_handlers = [ + self._create_component_from_model(model=error_handler_model, config=config) for error_handler_model in model.error_handlers + ] + return CompositeErrorHandler(error_handlers=error_handlers, parameters=model.parameters or {}) + + @staticmethod + def create_concurrency_level(model: ConcurrencyLevelModel, config: Config, **kwargs: Any) -> ConcurrencyLevel: + return ConcurrencyLevel( + 
default_concurrency=model.default_concurrency, + max_concurrency=model.max_concurrency, + config=config, + parameters={}, + ) + + def create_concurrent_cursor_from_datetime_based_cursor( + self, + state_manager: ConnectorStateManager, + model_type: Type[BaseModel], + component_definition: ComponentDefinition, + stream_name: str, + stream_namespace: Optional[str], + config: Config, + stream_state: MutableMapping[str, Any], + **kwargs: Any, + ) -> Tuple[ConcurrentCursor, DateTimeStreamStateConverter]: + + component_type = component_definition.get("type") + if component_definition.get("type") != model_type.__name__: + raise ValueError(f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead") + + datetime_based_cursor_model = model_type.parse_obj(component_definition) + + if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): + raise ValueError(f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}") + + interpolated_cursor_field = InterpolatedString.create( + datetime_based_cursor_model.cursor_field, parameters=datetime_based_cursor_model.parameters or {} + ) + cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) + + interpolated_partition_field_start = InterpolatedString.create( + datetime_based_cursor_model.partition_field_start or "start_time", parameters=datetime_based_cursor_model.parameters or {} + ) + interpolated_partition_field_end = InterpolatedString.create( + datetime_based_cursor_model.partition_field_end or "end_time", parameters=datetime_based_cursor_model.parameters or {} + ) + + slice_boundary_fields = ( + interpolated_partition_field_start.eval(config=config), + interpolated_partition_field_end.eval(config=config), + ) + + datetime_format = datetime_based_cursor_model.datetime_format + + cursor_granularity = ( + parse_duration(datetime_based_cursor_model.cursor_granularity) if datetime_based_cursor_model.cursor_granularity else None + ) + + lookback_window = None + interpolated_lookback_window = ( + InterpolatedString.create(datetime_based_cursor_model.lookback_window, parameters=datetime_based_cursor_model.parameters or {}) + if datetime_based_cursor_model.lookback_window + else None + ) + if interpolated_lookback_window: + evaluated_lookback_window = interpolated_lookback_window.eval(config=config) + if evaluated_lookback_window: + lookback_window = parse_duration(evaluated_lookback_window) + + connector_state_converter: DateTimeStreamStateConverter + connector_state_converter = CustomFormatConcurrentStreamStateConverter( + datetime_format=datetime_format, + input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, + is_sequential_state=True, + cursor_granularity=cursor_granularity, + # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice + ) + + start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] + if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): + start_date_runtime_value = self.create_min_max_datetime(model=datetime_based_cursor_model.start_datetime, config=config) + else: + start_date_runtime_value = datetime_based_cursor_model.start_datetime + + end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] + if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): + end_date_runtime_value = self.create_min_max_datetime(model=datetime_based_cursor_model.end_datetime, config=config) + else: + end_date_runtime_value = datetime_based_cursor_model.end_datetime + + interpolated_start_date = MinMaxDatetime.create( + interpolated_string_or_min_max_datetime=start_date_runtime_value, parameters=datetime_based_cursor_model.parameters + ) + interpolated_end_date = ( + None if not end_date_runtime_value else MinMaxDatetime.create(end_date_runtime_value, datetime_based_cursor_model.parameters) + ) + + # If datetime format is not specified then start/end datetime should inherit it from the stream slicer + if not interpolated_start_date.datetime_format: + interpolated_start_date.datetime_format = datetime_format + if interpolated_end_date and not interpolated_end_date.datetime_format: + interpolated_end_date.datetime_format = datetime_format + + start_date = interpolated_start_date.get_datetime(config=config) + end_date_provider = ( + partial(interpolated_end_date.get_datetime, config) if interpolated_end_date else connector_state_converter.get_end_provider() + ) + + if (datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity) or ( + not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity + ): + raise ValueError( + f"If step is defined, cursor_granularity should be as well and vice-versa. " + f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" + ) + + # When step is not defined, default to a step size from the starting date to the present moment + step_length = datetime.timedelta.max + interpolated_step = ( + InterpolatedString.create(datetime_based_cursor_model.step, parameters=datetime_based_cursor_model.parameters or {}) + if datetime_based_cursor_model.step + else None + ) + if interpolated_step: + evaluated_step = interpolated_step.eval(config) + if evaluated_step: + step_length = parse_duration(evaluated_step) + + return ( + ConcurrentCursor( + stream_name=stream_name, + stream_namespace=stream_namespace, + stream_state=stream_state, + message_repository=self._message_repository, # type: ignore # message_repository is always instantiated with a value by factory + connector_state_manager=state_manager, + connector_state_converter=connector_state_converter, + cursor_field=cursor_field, + slice_boundary_fields=slice_boundary_fields, + start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice + end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice + lookback_window=lookback_window, + slice_range=step_length, + cursor_granularity=cursor_granularity, + ), + connector_state_converter, + ) + + @staticmethod + def create_constant_backoff_strategy(model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any) -> ConstantBackoffStrategy: + return ConstantBackoffStrategy( + backoff_time_in_seconds=model.backoff_time_in_seconds, + config=config, + parameters=model.parameters or {}, + ) + + def create_cursor_pagination( + self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any + ) -> CursorPaginationStrategy: + if isinstance(decoder, PaginationDecoderDecorator): + if not isinstance(decoder.decoder, (JsonDecoder, XmlDecoder)): + raise ValueError( + f"Provided decoder of {type(decoder.decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead." + ) + decoder_to_use = decoder + else: + if not isinstance(decoder, (JsonDecoder, XmlDecoder)): + raise ValueError(f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead.") + decoder_to_use = PaginationDecoderDecorator(decoder=decoder) + + return CursorPaginationStrategy( + cursor_value=model.cursor_value, + decoder=decoder_to_use, + page_size=model.page_size, + stop_condition=model.stop_condition, + config=config, + parameters=model.parameters or {}, + ) + + def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: + """ + Generically creates a custom component based on the model type and a class_name reference to the custom Python class being + instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor + :param model: The Pydantic model of the custom component being created + :param config: The custom defined connector config + :return: The declarative component built from the Pydantic model to be used at runtime + """ + + custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) + component_fields = get_type_hints(custom_component_class) + model_args = model.dict() + model_args["config"] = config + + # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions + # we defer to these arguments over the component's definition + for key, arg in kwargs.items(): + model_args[key] = arg + + # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not + # defined in the schema. The fields and types are defined within the Python class implementation. 
Pydantic can only parse down to + # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components + for model_field, model_value in model_args.items(): + # If a custom component field doesn't have a type set, we try to use the type hints to infer the type + if isinstance(model_value, dict) and "type" not in model_value and model_field in component_fields: + derived_type = self._derive_component_type_from_type_hints(component_fields.get(model_field)) + if derived_type: + model_value["type"] = derived_type + + if self._is_component(model_value): + model_args[model_field] = self._create_nested_component(model, model_field, model_value, config) + elif isinstance(model_value, list): + vals = [] + for v in model_value: + if isinstance(v, dict) and "type" not in v and model_field in component_fields: + derived_type = self._derive_component_type_from_type_hints(component_fields.get(model_field)) + if derived_type: + v["type"] = derived_type + if self._is_component(v): + vals.append(self._create_nested_component(model, model_field, v, config)) + else: + vals.append(v) + model_args[model_field] = vals + + kwargs = {class_field: model_args[class_field] for class_field in component_fields.keys() if class_field in model_args} + return custom_component_class(**kwargs) + + @staticmethod + def _get_class_from_fully_qualified_class_name(full_qualified_class_name: str) -> Any: + split = full_qualified_class_name.split(".") + module = ".".join(split[:-1]) + class_name = split[-1] + try: + return getattr(importlib.import_module(module), class_name) + except AttributeError: + raise ValueError(f"Could not load class {full_qualified_class_name}.") + + @staticmethod + def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: + interface = field_type + while True: + origin = get_origin(interface) + if origin: + # Unnest types until we reach the raw type + # List[T] -> T + # Optional[List[T]] -> T + args = get_args(interface) + interface = args[0] + else: + break + if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): + return interface.__name__ + return None + + @staticmethod + def is_builtin_type(cls: Optional[Type[Any]]) -> bool: + if not cls: + return False + return cls.__module__ == "builtins" + + @staticmethod + def _extract_missing_parameters(error: TypeError) -> List[str]: + parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) + if parameter_search: + return re.findall(r"\'(.+?)\'", parameter_search.group(1)) + else: + return [] + + def _create_nested_component(self, model: Any, model_field: str, model_value: Any, config: Config) -> Any: + type_name = model_value.get("type", None) + if not type_name: + # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent + return model_value + + model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) + if model_type: + parsed_model = model_type.parse_obj(model_value) + try: + # To improve usability of the language, certain fields are shared between components. This can come in the form of + # a parent component passing some of its fields to a child component or the parent extracting fields from other child + # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base + # while constructing a SimpleRetriever. 
However, custom components don't support this behavior because they are created + # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that + # are needed by a component and could not be shared. + model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) + constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs + model_parameters = model_value.get("$parameters", {}) + matching_parameters = {kwarg: model_parameters[kwarg] for kwarg in constructor_kwargs if kwarg in model_parameters} + return self._create_component_from_model(model=parsed_model, config=config, **matching_parameters) + except TypeError as error: + missing_parameters = self._extract_missing_parameters(error) + if missing_parameters: + raise ValueError( + f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " + + ", ".join((f"{type_name}.$parameters.{parameter}" for parameter in missing_parameters)) + ) + raise TypeError(f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}") + else: + raise ValueError( + f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" + ) + + @staticmethod + def _is_component(model_value: Any) -> bool: + return isinstance(model_value, dict) and model_value.get("type") is not None + + def create_datetime_based_cursor(self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any) -> DatetimeBasedCursor: + start_datetime: Union[str, MinMaxDatetime] = ( + model.start_datetime if isinstance(model.start_datetime, str) else self.create_min_max_datetime(model.start_datetime, config) + ) + end_datetime: Union[str, MinMaxDatetime, None] = None + if model.is_data_feed and model.end_datetime: + raise ValueError("Data feed does not support end_datetime") + if model.is_data_feed and model.is_client_side_incremental: + raise ValueError("`Client side incremental` cannot be applied with `data feed`. 
Choose only 1 from them.") + if model.end_datetime: + end_datetime = ( + model.end_datetime if isinstance(model.end_datetime, str) else self.create_min_max_datetime(model.end_datetime, config) + ) + + end_time_option = ( + RequestOption( + inject_into=RequestOptionType(model.end_time_option.inject_into.value), + field_name=model.end_time_option.field_name, + parameters=model.parameters or {}, + ) + if model.end_time_option + else None + ) + start_time_option = ( + RequestOption( + inject_into=RequestOptionType(model.start_time_option.inject_into.value), + field_name=model.start_time_option.field_name, + parameters=model.parameters or {}, + ) + if model.start_time_option + else None + ) + + return DatetimeBasedCursor( + cursor_field=model.cursor_field, + cursor_datetime_formats=model.cursor_datetime_formats if model.cursor_datetime_formats else [], + cursor_granularity=model.cursor_granularity, + datetime_format=model.datetime_format, + end_datetime=end_datetime, + start_datetime=start_datetime, + step=model.step, + end_time_option=end_time_option, + lookback_window=model.lookback_window, + start_time_option=start_time_option, + partition_field_end=model.partition_field_end, + partition_field_start=model.partition_field_start, + message_repository=self._message_repository, + is_compare_strictly=model.is_compare_strictly, + config=config, + parameters=model.parameters or {}, + ) + + def create_declarative_stream(self, model: DeclarativeStreamModel, config: Config, **kwargs: Any) -> DeclarativeStream: + # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field + # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the + # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in + # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. + combined_slicers = self._merge_stream_slicers(model=model, config=config) + + primary_key = model.primary_key.__root__ if model.primary_key else None + stop_condition_on_cursor = ( + model.incremental_sync and hasattr(model.incremental_sync, "is_data_feed") and model.incremental_sync.is_data_feed + ) + client_side_incremental_sync = None + if ( + model.incremental_sync + and hasattr(model.incremental_sync, "is_client_side_incremental") + and model.incremental_sync.is_client_side_incremental + ): + supported_slicers = (DatetimeBasedCursor, GlobalSubstreamCursor, PerPartitionWithGlobalCursor) + if combined_slicers and not isinstance(combined_slicers, supported_slicers): + raise ValueError("Unsupported Slicer is used. 
PerPartitionWithGlobalCursor should be used here instead") + client_side_incremental_sync = { + "date_time_based_cursor": self._create_component_from_model(model=model.incremental_sync, config=config), + "substream_cursor": ( + combined_slicers if isinstance(combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)) else None + ), + } + + if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): + cursor_model = model.incremental_sync + + end_time_option = ( + RequestOption( + inject_into=RequestOptionType(cursor_model.end_time_option.inject_into.value), + field_name=cursor_model.end_time_option.field_name, + parameters=cursor_model.parameters or {}, + ) + if cursor_model.end_time_option + else None + ) + start_time_option = ( + RequestOption( + inject_into=RequestOptionType(cursor_model.start_time_option.inject_into.value), + field_name=cursor_model.start_time_option.field_name, + parameters=cursor_model.parameters or {}, + ) + if cursor_model.start_time_option + else None + ) + + request_options_provider = DatetimeBasedRequestOptionsProvider( + start_time_option=start_time_option, + end_time_option=end_time_option, + partition_field_start=cursor_model.partition_field_end, + partition_field_end=cursor_model.partition_field_end, + config=config, + parameters=model.parameters or {}, + ) + else: + request_options_provider = None + + transformations = [] + if model.transformations: + for transformation_model in model.transformations: + transformations.append(self._create_component_from_model(model=transformation_model, config=config)) + retriever = self._create_component_from_model( + model=model.retriever, + config=config, + name=model.name, + primary_key=primary_key, + stream_slicer=combined_slicers, + request_options_provider=request_options_provider, + stop_condition_on_cursor=stop_condition_on_cursor, + client_side_incremental_sync=client_side_incremental_sync, + transformations=transformations, + ) + cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None + + if model.state_migrations: + state_transformations = [ + self._create_component_from_model(state_migration, config, declarative_stream=model) + for state_migration in model.state_migrations + ] + else: + state_transformations = [] + + if model.schema_loader: + schema_loader = self._create_component_from_model(model=model.schema_loader, config=config) + else: + options = model.parameters or {} + if "name" not in options: + options["name"] = model.name + schema_loader = DefaultSchemaLoader(config=config, parameters=options) + + return DeclarativeStream( + name=model.name or "", + primary_key=primary_key, + retriever=retriever, + schema_loader=schema_loader, + stream_cursor_field=cursor_field or "", + state_migrations=state_transformations, + config=config, + parameters=model.parameters or {}, + ) + + def _merge_stream_slicers(self, model: DeclarativeStreamModel, config: Config) -> Optional[StreamSlicer]: + stream_slicer = None + if ( + hasattr(model.retriever, "partition_router") + and isinstance(model.retriever, SimpleRetrieverModel) + and model.retriever.partition_router + ): + stream_slicer_model = model.retriever.partition_router + + if isinstance(stream_slicer_model, list): + stream_slicer = CartesianProductStreamSlicer( + [self._create_component_from_model(model=slicer, config=config) for slicer in stream_slicer_model], parameters={} + ) + else: + stream_slicer = self._create_component_from_model(model=stream_slicer_model, config=config) + + if 
model.incremental_sync and stream_slicer: + incremental_sync_model = model.incremental_sync + if hasattr(incremental_sync_model, "global_substream_cursor") and incremental_sync_model.global_substream_cursor: + cursor_component = self._create_component_from_model(model=incremental_sync_model, config=config) + return GlobalSubstreamCursor(stream_cursor=cursor_component, partition_router=stream_slicer) + else: + cursor_component = self._create_component_from_model(model=incremental_sync_model, config=config) + return PerPartitionWithGlobalCursor( + cursor_factory=CursorFactory( + lambda: self._create_component_from_model(model=incremental_sync_model, config=config), + ), + partition_router=stream_slicer, + stream_cursor=cursor_component, + ) + elif model.incremental_sync: + return self._create_component_from_model(model=model.incremental_sync, config=config) if model.incremental_sync else None + elif stream_slicer: + # For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor` + return PerPartitionCursor( + cursor_factory=CursorFactory(create_function=partial(ChildPartitionResumableFullRefreshCursor, {})), + partition_router=stream_slicer, + ) + elif hasattr(model.retriever, "paginator") and model.retriever.paginator and not stream_slicer: + # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` + return ResumableFullRefreshCursor(parameters={}) + else: + return None + + def create_default_error_handler(self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any) -> DefaultErrorHandler: + backoff_strategies = [] + if model.backoff_strategies: + for backoff_strategy_model in model.backoff_strategies: + backoff_strategies.append(self._create_component_from_model(model=backoff_strategy_model, config=config)) + + response_filters = [] + if model.response_filters: + for response_filter_model in model.response_filters: + response_filters.append(self._create_component_from_model(model=response_filter_model, config=config)) + response_filters.append(HttpResponseFilter(config=config, parameters=model.parameters or {})) + + return DefaultErrorHandler( + backoff_strategies=backoff_strategies, + max_retries=model.max_retries, + response_filters=response_filters, + config=config, + parameters=model.parameters or {}, + ) + + def create_default_paginator( + self, + model: DefaultPaginatorModel, + config: Config, + *, + url_base: str, + decoder: Optional[Decoder] = None, + cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None, + ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: + if decoder: + if not isinstance(decoder, (JsonDecoder, XmlDecoder)): + raise ValueError(f"Provided decoder of {type(decoder)=} is not supported. 
Please set JsonDecoder or XmlDecoder instead.") + decoder_to_use = PaginationDecoderDecorator(decoder=decoder) + else: + decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) + page_size_option = ( + self._create_component_from_model(model=model.page_size_option, config=config) if model.page_size_option else None + ) + page_token_option = ( + self._create_component_from_model(model=model.page_token_option, config=config) if model.page_token_option else None + ) + pagination_strategy = self._create_component_from_model(model=model.pagination_strategy, config=config, decoder=decoder_to_use) + if cursor_used_for_stop_condition: + pagination_strategy = StopConditionPaginationStrategyDecorator( + pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) + ) + paginator = DefaultPaginator( + decoder=decoder_to_use, + page_size_option=page_size_option, + page_token_option=page_token_option, + pagination_strategy=pagination_strategy, + url_base=url_base, + config=config, + parameters=model.parameters or {}, + ) + if self._limit_pages_fetched_per_slice: + return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) + return paginator + + def create_dpath_extractor( + self, model: DpathExtractorModel, config: Config, decoder: Optional[Decoder] = None, **kwargs: Any + ) -> DpathExtractor: + if decoder: + decoder_to_use = decoder + else: + decoder_to_use = JsonDecoder(parameters={}) + model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] + return DpathExtractor(decoder=decoder_to_use, field_path=model_field_path, config=config, parameters=model.parameters or {}) + + @staticmethod + def create_exponential_backoff_strategy(model: ExponentialBackoffStrategyModel, config: Config) -> ExponentialBackoffStrategy: + return ExponentialBackoffStrategy(factor=model.factor or 5, parameters=model.parameters or {}, config=config) + + def create_http_requester(self, model: HttpRequesterModel, decoder: Decoder, config: Config, *, name: str) -> HttpRequester: + authenticator = ( + self._create_component_from_model(model=model.authenticator, config=config, url_base=model.url_base, name=name, decoder=decoder) + if model.authenticator + else None + ) + error_handler = ( + self._create_component_from_model(model=model.error_handler, config=config) + if model.error_handler + else DefaultErrorHandler(backoff_strategies=[], response_filters=[], config=config, parameters=model.parameters or {}) + ) + + request_options_provider = InterpolatedRequestOptionsProvider( + request_body_data=model.request_body_data, + request_body_json=model.request_body_json, + request_headers=model.request_headers, + request_parameters=model.request_parameters, + config=config, + parameters=model.parameters or {}, + ) + + assert model.use_cache is not None # for mypy + assert model.http_method is not None # for mypy + + use_cache = model.use_cache and not self._disable_cache + + return HttpRequester( + name=name, + url_base=model.url_base, + path=model.path, + authenticator=authenticator, + error_handler=error_handler, + http_method=HttpMethod[model.http_method.value], + request_options_provider=request_options_provider, + config=config, + disable_retries=self._disable_retries, + parameters=model.parameters or {}, + message_repository=self._message_repository, + use_cache=use_cache, + decoder=decoder, + stream_response=decoder.is_stream_response() if decoder else False, + ) + + @staticmethod + def create_http_response_filter(model: 
HttpResponseFilterModel, config: Config, **kwargs: Any) -> HttpResponseFilter: + if model.action: + action = ResponseAction(model.action.value) + else: + action = None + + failure_type = FailureType(model.failure_type.value) if model.failure_type else None + + http_codes = ( + set(model.http_codes) if model.http_codes else set() + ) # JSON schema notation has no set data type. The schema enforces an array of unique elements + + return HttpResponseFilter( + action=action, + failure_type=failure_type, + error_message=model.error_message or "", + error_message_contains=model.error_message_contains or "", + http_codes=http_codes, + predicate=model.predicate or "", + config=config, + parameters=model.parameters or {}, + ) + + @staticmethod + def create_inline_schema_loader(model: InlineSchemaLoaderModel, config: Config, **kwargs: Any) -> InlineSchemaLoader: + return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) + + @staticmethod + def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> JsonDecoder: + return JsonDecoder(parameters={}) + + @staticmethod + def create_jsonl_decoder(model: JsonlDecoderModel, config: Config, **kwargs: Any) -> JsonlDecoder: + return JsonlDecoder(parameters={}) + + @staticmethod + def create_iterable_decoder(model: IterableDecoderModel, config: Config, **kwargs: Any) -> IterableDecoder: + return IterableDecoder(parameters={}) + + @staticmethod + def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: + return XmlDecoder(parameters={}) + + @staticmethod + def create_json_file_schema_loader(model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any) -> JsonFileSchemaLoader: + return JsonFileSchemaLoader(file_path=model.file_path or "", config=config, parameters=model.parameters or {}) + + @staticmethod + def create_jwt_authenticator(model: JwtAuthenticatorModel, config: Config, **kwargs: Any) -> JwtAuthenticator: + jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) + jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) + return JwtAuthenticator( + config=config, + parameters=model.parameters or {}, + algorithm=JwtAlgorithm(model.algorithm.value), + secret_key=model.secret_key, + base64_encode_secret_key=model.base64_encode_secret_key, + token_duration=model.token_duration, + header_prefix=model.header_prefix, + kid=jwt_headers.kid, + typ=jwt_headers.typ, + cty=jwt_headers.cty, + iss=jwt_payload.iss, + sub=jwt_payload.sub, + aud=jwt_payload.aud, + additional_jwt_headers=model.additional_jwt_headers, + additional_jwt_payload=model.additional_jwt_payload, + ) + + @staticmethod + def create_list_partition_router(model: ListPartitionRouterModel, config: Config, **kwargs: Any) -> ListPartitionRouter: + request_option = ( + RequestOption( + inject_into=RequestOptionType(model.request_option.inject_into.value), + field_name=model.request_option.field_name, + parameters=model.parameters or {}, + ) + if model.request_option + else None + ) + return ListPartitionRouter( + cursor_field=model.cursor_field, + request_option=request_option, + values=model.values, + config=config, + parameters=model.parameters or {}, + ) + + @staticmethod + def create_min_max_datetime(model: MinMaxDatetimeModel, config: Config, **kwargs: Any) -> MinMaxDatetime: + return MinMaxDatetime( + datetime=model.datetime, + datetime_format=model.datetime_format or "", + max_datetime=model.max_datetime or "", + min_datetime=model.min_datetime or "", + 
parameters=model.parameters or {}, + ) + + @staticmethod + def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: + return NoAuth(parameters=model.parameters or {}) + + @staticmethod + def create_no_pagination(model: NoPaginationModel, config: Config, **kwargs: Any) -> NoPagination: + return NoPagination(parameters={}) + + def create_oauth_authenticator(self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any) -> DeclarativeOauth2Authenticator: + if model.refresh_token_updater: + # ignore type error because fixing it would have a lot of dependencies, revisit later + return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore + config, + InterpolatedString.create(model.token_refresh_endpoint, parameters=model.parameters or {}).eval(config), + access_token_name=InterpolatedString.create( + model.access_token_name or "access_token", parameters=model.parameters or {} + ).eval(config), + refresh_token_name=model.refresh_token_updater.refresh_token_name, + expires_in_name=InterpolatedString.create(model.expires_in_name or "expires_in", parameters=model.parameters or {}).eval( + config + ), + client_id=InterpolatedString.create(model.client_id, parameters=model.parameters or {}).eval(config), + client_secret=InterpolatedString.create(model.client_secret, parameters=model.parameters or {}).eval(config), + access_token_config_path=model.refresh_token_updater.access_token_config_path, + refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, + token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, + grant_type=InterpolatedString.create(model.grant_type or "refresh_token", parameters=model.parameters or {}).eval(config), + refresh_request_body=InterpolatedMapping(model.refresh_request_body or {}, parameters=model.parameters or {}).eval(config), + scopes=model.scopes, + token_expiry_date_format=model.token_expiry_date_format, + message_repository=self._message_repository, + refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, + refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, + refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, + ) + # ignore type error because fixing it would have a lot of dependencies, revisit later + return DeclarativeOauth2Authenticator( # type: ignore + access_token_name=model.access_token_name or "access_token", + client_id=model.client_id, + client_secret=model.client_secret, + expires_in_name=model.expires_in_name or "expires_in", + grant_type=model.grant_type or "refresh_token", + refresh_request_body=model.refresh_request_body, + refresh_token=model.refresh_token, + scopes=model.scopes, + token_expiry_date=model.token_expiry_date, + token_expiry_date_format=model.token_expiry_date_format, # type: ignore + token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), + token_refresh_endpoint=model.token_refresh_endpoint, + config=config, + parameters=model.parameters or {}, + message_repository=self._message_repository, + ) + + @staticmethod + def create_offset_increment(model: OffsetIncrementModel, config: Config, decoder: Decoder, **kwargs: Any) -> OffsetIncrement: + if isinstance(decoder, PaginationDecoderDecorator): + if not isinstance(decoder.decoder, (JsonDecoder, XmlDecoder)): + raise ValueError( + f"Provided decoder of {type(decoder.decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead." 
+ ) + decoder_to_use = decoder + else: + if not isinstance(decoder, (JsonDecoder, XmlDecoder)): + raise ValueError(f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead.") + decoder_to_use = PaginationDecoderDecorator(decoder=decoder) + return OffsetIncrement( + page_size=model.page_size, + config=config, + decoder=decoder_to_use, + inject_on_first_request=model.inject_on_first_request or False, + parameters=model.parameters or {}, + ) + + @staticmethod + def create_page_increment(model: PageIncrementModel, config: Config, **kwargs: Any) -> PageIncrement: + return PageIncrement( + page_size=model.page_size, + config=config, + start_from_page=model.start_from_page or 0, + inject_on_first_request=model.inject_on_first_request or False, + parameters=model.parameters or {}, + ) + + def create_parent_stream_config(self, model: ParentStreamConfigModel, config: Config, **kwargs: Any) -> ParentStreamConfig: + declarative_stream = self._create_component_from_model(model.stream, config=config) + request_option = self._create_component_from_model(model.request_option, config=config) if model.request_option else None + return ParentStreamConfig( + parent_key=model.parent_key, + request_option=request_option, + stream=declarative_stream, + partition_field=model.partition_field, + config=config, + incremental_dependency=model.incremental_dependency or False, + parameters=model.parameters or {}, + extra_fields=model.extra_fields, + ) + + @staticmethod + def create_record_filter(model: RecordFilterModel, config: Config, **kwargs: Any) -> RecordFilter: + return RecordFilter(condition=model.condition or "", config=config, parameters=model.parameters or {}) + + @staticmethod + def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: + return RequestPath(parameters={}) + + @staticmethod + def create_request_option(model: RequestOptionModel, config: Config, **kwargs: Any) -> RequestOption: + inject_into = RequestOptionType(model.inject_into.value) + return RequestOption(field_name=model.field_name, inject_into=inject_into, parameters={}) + + def create_record_selector( + self, + model: RecordSelectorModel, + config: Config, + *, + transformations: List[RecordTransformation], + decoder: Optional[Decoder] = None, + client_side_incremental_sync: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> RecordSelector: + assert model.schema_normalization is not None # for mypy + extractor = self._create_component_from_model(model=model.extractor, decoder=decoder, config=config) + record_filter = self._create_component_from_model(model.record_filter, config=config) if model.record_filter else None + if client_side_incremental_sync: + record_filter = ClientSideIncrementalRecordFilterDecorator( + config=config, + parameters=model.parameters, + condition=model.record_filter.condition if (model.record_filter and hasattr(model.record_filter, "condition")) else None, + **client_side_incremental_sync, + ) + schema_normalization = TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) + + return RecordSelector( + extractor=extractor, + config=config, + record_filter=record_filter, + transformations=transformations, + schema_normalization=schema_normalization, + parameters=model.parameters or {}, + ) + + @staticmethod + def create_remove_fields(model: RemoveFieldsModel, config: Config, **kwargs: Any) -> RemoveFields: + return RemoveFields(field_pointers=model.field_pointers, condition=model.condition or "", 
parameters={}) + + def create_selective_authenticator(self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any) -> DeclarativeAuthenticator: + authenticators = {name: self._create_component_from_model(model=auth, config=config) for name, auth in model.authenticators.items()} + # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error + return SelectiveAuthenticator( # type: ignore[abstract] + config=config, + authenticators=authenticators, + authenticator_selection_path=model.authenticator_selection_path, + **kwargs, + ) + + @staticmethod + def create_legacy_session_token_authenticator( + model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any + ) -> LegacySessionTokenAuthenticator: + return LegacySessionTokenAuthenticator( + api_url=url_base, + header=model.header, + login_url=model.login_url, + password=model.password or "", + session_token=model.session_token or "", + session_token_response_key=model.session_token_response_key or "", + username=model.username or "", + validate_session_url=model.validate_session_url, + config=config, + parameters=model.parameters or {}, + ) + + def create_simple_retriever( + self, + model: SimpleRetrieverModel, + config: Config, + *, + name: str, + primary_key: Optional[Union[str, List[str], List[List[str]]]], + stream_slicer: Optional[StreamSlicer], + request_options_provider: Optional[RequestOptionsProvider] = None, + stop_condition_on_cursor: bool = False, + client_side_incremental_sync: Optional[Dict[str, Any]] = None, + transformations: List[RecordTransformation], + ) -> SimpleRetriever: + decoder = self._create_component_from_model(model=model.decoder, config=config) if model.decoder else JsonDecoder(parameters={}) + requester = self._create_component_from_model(model=model.requester, decoder=decoder, config=config, name=name) + record_selector = self._create_component_from_model( + model=model.record_selector, + config=config, + decoder=decoder, + transformations=transformations, + client_side_incremental_sync=client_side_incremental_sync, + ) + url_base = model.requester.url_base if hasattr(model.requester, "url_base") else requester.get_url_base() + + # Define cursor only if per partition or common incremental support is needed + cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None + + if not isinstance(stream_slicer, DatetimeBasedCursor) or type(stream_slicer) is not DatetimeBasedCursor: + # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods). + # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement + # their own RequestOptionsProvider. 
However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's + # request_options_provider + request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={}) + elif not request_options_provider: + request_options_provider = DefaultRequestOptionsProvider(parameters={}) + + stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) + + cursor_used_for_stop_condition = cursor if stop_condition_on_cursor else None + paginator = ( + self._create_component_from_model( + model=model.paginator, + config=config, + url_base=url_base, + decoder=decoder, + cursor_used_for_stop_condition=cursor_used_for_stop_condition, + ) + if model.paginator + else NoPagination(parameters={}) + ) + + ignore_stream_slicer_parameters_on_paginated_requests = model.ignore_stream_slicer_parameters_on_paginated_requests or False + + if self._limit_slices_fetched or self._emit_connector_builder_messages: + return SimpleRetrieverTestReadDecorator( + name=name, + paginator=paginator, + primary_key=primary_key, + requester=requester, + record_selector=record_selector, + stream_slicer=stream_slicer, + request_option_provider=request_options_provider, + cursor=cursor, + config=config, + maximum_number_of_slices=self._limit_slices_fetched or 5, + ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, + parameters=model.parameters or {}, + ) + return SimpleRetriever( + name=name, + paginator=paginator, + primary_key=primary_key, + requester=requester, + record_selector=record_selector, + stream_slicer=stream_slicer, + request_option_provider=request_options_provider, + cursor=cursor, + config=config, + ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, + parameters=model.parameters or {}, + ) + + def _create_async_job_status_mapping( + self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any + ) -> Mapping[str, AsyncJobStatus]: + api_status_to_cdk_status = {} + for cdk_status, api_statuses in model.dict().items(): + if cdk_status == "type": + # This is an element of the dict because of the typing of the CDK but it is not a CDK status + continue + + for status in api_statuses: + if status in api_status_to_cdk_status: + raise ValueError( + f"API status {status} is already set for CDK status {cdk_status}. 
Please ensure API statuses are only provided once" + ) + api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) + return api_status_to_cdk_status + + def _get_async_job_status(self, status: str) -> AsyncJobStatus: + match status: + case "running": + return AsyncJobStatus.RUNNING + case "completed": + return AsyncJobStatus.COMPLETED + case "failed": + return AsyncJobStatus.FAILED + case "timeout": + return AsyncJobStatus.TIMED_OUT + case _: + raise ValueError(f"Unsupported CDK status {status}") + + def create_async_retriever( + self, + model: AsyncRetrieverModel, + config: Config, + *, + name: str, + primary_key: Optional[Union[str, List[str], List[List[str]]]], # this seems to be needed to match create_simple_retriever + stream_slicer: Optional[StreamSlicer], + client_side_incremental_sync: Optional[Dict[str, Any]] = None, + transformations: List[RecordTransformation], + **kwargs: Any, + ) -> AsyncRetriever: + + decoder = self._create_component_from_model(model=model.decoder, config=config) if model.decoder else JsonDecoder(parameters={}) + record_selector = self._create_component_from_model( + model=model.record_selector, + config=config, + decoder=decoder, + transformations=transformations, + client_side_incremental_sync=client_side_incremental_sync, + ) + stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) + creation_requester = self._create_component_from_model( + model=model.creation_requester, decoder=decoder, config=config, name=f"job creation - {name}" + ) + polling_requester = self._create_component_from_model( + model=model.polling_requester, decoder=decoder, config=config, name=f"job polling - {name}" + ) + job_download_components_name = f"job download - {name}" + download_requester = self._create_component_from_model( + model=model.download_requester, decoder=decoder, config=config, name=job_download_components_name + ) + download_retriever = SimpleRetriever( + requester=download_requester, + record_selector=RecordSelector( + extractor=ResponseToFileExtractor(), + record_filter=None, + transformations=[], + schema_normalization=TypeTransformer(TransformConfig.NoTransform), + config=config, + parameters={}, + ), + primary_key=None, + name=job_download_components_name, + paginator=( + self._create_component_from_model(model=model.download_paginator, decoder=decoder, config=config, url_base="") + if model.download_paginator + else NoPagination(parameters={}) + ), + config=config, + parameters={}, + ) + abort_requester = ( + self._create_component_from_model(model=model.abort_requester, decoder=decoder, config=config, name=f"job abort - {name}") + if model.abort_requester + else None + ) + delete_requester = ( + self._create_component_from_model(model=model.delete_requester, decoder=decoder, config=config, name=f"job delete - {name}") + if model.delete_requester + else None + ) + status_extractor = self._create_component_from_model(model=model.status_extractor, decoder=decoder, config=config, name=name) + urls_extractor = self._create_component_from_model(model=model.urls_extractor, decoder=decoder, config=config, name=name) + job_repository: AsyncJobRepository = AsyncHttpJobRepository( + creation_requester=creation_requester, + polling_requester=polling_requester, + download_retriever=download_retriever, + abort_requester=abort_requester, + delete_requester=delete_requester, + status_extractor=status_extractor, + status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), + urls_extractor=urls_extractor, + ) + + return 
AsyncRetriever( + job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( + job_repository, + stream_slices, + JobTracker(1), # FIXME eventually make the number of concurrent jobs in the API configurable. Until then, we limit to 1 + self._message_repository, + has_bulk_parent=False, # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk + ), + record_selector=record_selector, + stream_slicer=stream_slicer, + config=config, + parameters=model.parameters or {}, + ) + + @staticmethod + def create_spec(model: SpecModel, config: Config, **kwargs: Any) -> Spec: + return Spec( + connection_specification=model.connection_specification, + documentation_url=model.documentation_url, + advanced_auth=model.advanced_auth, + parameters={}, + ) + + def create_substream_partition_router( + self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any + ) -> SubstreamPartitionRouter: + parent_stream_configs = [] + if model.parent_stream_configs: + parent_stream_configs.extend( + [ + self._create_message_repository_substream_wrapper(model=parent_stream_config, config=config) + for parent_stream_config in model.parent_stream_configs + ] + ) + + return SubstreamPartitionRouter(parent_stream_configs=parent_stream_configs, parameters=model.parameters or {}, config=config) + + def _create_message_repository_substream_wrapper(self, model: ParentStreamConfigModel, config: Config) -> Any: + substream_factory = ModelToComponentFactory( + limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, + limit_slices_fetched=self._limit_slices_fetched, + emit_connector_builder_messages=self._emit_connector_builder_messages, + disable_retries=self._disable_retries, + disable_cache=self._disable_cache, + message_repository=LogAppenderMessageRepositoryDecorator( + {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}}, + self._message_repository, + self._evaluate_log_level(self._emit_connector_builder_messages), + ), + ) + return substream_factory._create_component_from_model(model=model, config=config) + + @staticmethod + def create_wait_time_from_header(model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any) -> WaitTimeFromHeaderBackoffStrategy: + return WaitTimeFromHeaderBackoffStrategy( + header=model.header, + parameters=model.parameters or {}, + config=config, + regex=model.regex, + max_waiting_time_in_seconds=model.max_waiting_time_in_seconds if model.max_waiting_time_in_seconds is not None else None, + ) + + @staticmethod + def create_wait_until_time_from_header( + model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any + ) -> WaitUntilTimeFromHeaderBackoffStrategy: + return WaitUntilTimeFromHeaderBackoffStrategy( + header=model.header, parameters=model.parameters or {}, config=config, min_wait=model.min_wait, regex=model.regex + ) + + def get_message_repository(self) -> MessageRepository: + return self._message_repository + + def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: + return Level.DEBUG if emit_connector_builder_messages else Level.INFO diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/__init__.py new file mode 100644 index 000000000000..86e472a42c52 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/__init__.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
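The async job status mapping built by `_create_async_job_status_mapping` above is just a dictionary inversion with a duplicate check. A minimal sketch with plain dicts (the helper name and the sample API statuses are illustrative, not CDK code):

```python
from typing import Dict, List


def invert_status_mapping(status_map: Dict[str, List[str]]) -> Dict[str, str]:
    # Invert {cdk_status: [api_status, ...]} into {api_status: cdk_status},
    # rejecting any API status that appears under more than one CDK status.
    api_to_cdk: Dict[str, str] = {}
    for cdk_status, api_statuses in status_map.items():
        for api_status in api_statuses:
            if api_status in api_to_cdk:
                raise ValueError(f"API status {api_status} is already set for CDK status {cdk_status}")
            api_to_cdk[api_status] = cdk_status
    return api_to_cdk


# "pending", "in_progress" and "done" are made-up API statuses; the keys mirror
# the match statement in _get_async_job_status.
print(invert_status_mapping({"running": ["pending", "in_progress"], "completed": ["done"]}))
# {'pending': 'running', 'in_progress': 'running', 'done': 'completed'}
```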
+# + +from airbyte_cdk.sources.declarative.partition_routers.cartesian_product_stream_slicer import CartesianProductStreamSlicer +from airbyte_cdk.sources.declarative.partition_routers.list_partition_router import ListPartitionRouter +from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import SinglePartitionRouter +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import SubstreamPartitionRouter + +__all__ = ["CartesianProductStreamSlicer", "ListPartitionRouter", "SinglePartitionRouter", "SubstreamPartitionRouter"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py new file mode 100644 index 000000000000..14898428ede3 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py @@ -0,0 +1,154 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import itertools +import logging +from collections import ChainMap +from collections.abc import Callable +from dataclasses import InitVar, dataclass +from typing import Any, Iterable, List, Mapping, Optional + +from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import SubstreamPartitionRouter +from airbyte_cdk.sources.types import StreamSlice, StreamState + + +def check_for_substream_in_slicers(slicers: Iterable[PartitionRouter], log_warning: Callable[[str], None]) -> None: + """ + Recursively checks for the presence of SubstreamPartitionRouter within slicers. + Logs a warning if a SubstreamPartitionRouter is found within a CartesianProductStreamSlicer. + + Args: + slicers (Iterable[PartitionRouter]): The list of slicers to check. + log_warning (Callable): Logging function to record warnings. + """ + for slicer in slicers: + if isinstance(slicer, SubstreamPartitionRouter): + log_warning("Parent state handling is not supported for CartesianProductStreamSlicer.") + return + elif isinstance(slicer, CartesianProductStreamSlicer): + # Recursively check sub-slicers within CartesianProductStreamSlicer + check_for_substream_in_slicers(slicer.stream_slicers, log_warning) + + +@dataclass +class CartesianProductStreamSlicer(PartitionRouter): + """ + Stream slicer that iterates over the cartesian product of input stream slicers + Given 2 stream slicers with the following slices: + A: [{"i": 0}, {"i": 1}, {"i": 2}] + B: [{"s": "hello"}, {"s": "world"}] + the resulting stream slices are + [ + {"i": 0, "s": "hello"}, + {"i": 0, "s": "world"}, + {"i": 1, "s": "hello"}, + {"i": 1, "s": "world"}, + {"i": 2, "s": "hello"}, + {"i": 2, "s": "world"}, + ] + + Attributes: + stream_slicers (List[PartitionRouter]): Underlying stream slicers. The RequestOptions (e.g: Request headers, parameters, etc..) returned by this slicer are the combination of the RequestOptions of its input slicers. If there are conflicts e.g: two slicers define the same header or request param, the conflict is resolved by taking the value from the first slicer, where ordering is determined by the order in which slicers were input to this composite slicer.
+ """ + + stream_slicers: List[PartitionRouter] + parameters: InitVar[Mapping[str, Any]] + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + check_for_substream_in_slicers(self.stream_slicers, self.logger.warning) + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return dict( + ChainMap( + *[ # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons + s.get_request_params(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + for s in self.stream_slicers + ] + ) + ) + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return dict( + ChainMap( + *[ # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons + s.get_request_headers(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + for s in self.stream_slicers + ] + ) + ) + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return dict( + ChainMap( + *[ # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons + s.get_request_body_data(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + for s in self.stream_slicers + ] + ) + ) + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return dict( + ChainMap( + *[ # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons + s.get_request_body_json(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + for s in self.stream_slicers + ] + ) + ) + + def stream_slices(self) -> Iterable[StreamSlice]: + sub_slices = (s.stream_slices() for s in self.stream_slicers) + product = itertools.product(*sub_slices) + for stream_slice_tuple in product: + partition = dict(ChainMap(*[s.partition for s in stream_slice_tuple])) # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons + cursor_slices = [s.cursor_slice for s in stream_slice_tuple if s.cursor_slice] + if len(cursor_slices) > 1: + raise ValueError(f"There should only be a single cursor slice. 
Found {cursor_slices}") + if cursor_slices: + cursor_slice = cursor_slices[0] + else: + cursor_slice = {} + yield StreamSlice(partition=partition, cursor_slice=cursor_slice) + + def set_initial_state(self, stream_state: StreamState) -> None: + """ + Parent stream states are not supported for cartesian product stream slicer + """ + pass + + def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: + """ + Parent stream states are not supported for cartesian product stream slicer + """ + pass + + @property + def logger(self) -> logging.Logger: + return logging.getLogger("airbyte.CartesianProductStreamSlicer") diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py new file mode 100644 index 000000000000..564a3119e25b --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py @@ -0,0 +1,101 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Iterable, List, Mapping, Optional, Union + +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.types import Config, StreamSlice, StreamState + + +@dataclass +class ListPartitionRouter(PartitionRouter): + """ + Partition router that iterates over the values of a list + If values is a string, then evaluate it as literal and assert the resulting literal is a list + + Attributes: + values (Union[str, List[str]]): The values to iterate over + cursor_field (Union[InterpolatedString, str]): The name of the cursor field + config (Config): The user-provided configuration as specified by the source's spec + request_option (Optional[RequestOption]): The request option to configure the HTTP request + """ + + values: Union[str, List[str]] + cursor_field: Union[InterpolatedString, str] + config: Config + parameters: InitVar[Mapping[str, Any]] + request_option: Optional[RequestOption] = None + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + if isinstance(self.values, str): + self.values = InterpolatedString.create(self.values, parameters=parameters).eval(self.config) + self._cursor_field = ( + InterpolatedString(string=self.cursor_field, parameters=parameters) if isinstance(self.cursor_field, str) else self.cursor_field + ) + + self._cursor = None + + def get_request_params( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response + return self._get_request_option(RequestOptionType.request_parameter, stream_slice) + + def get_request_headers( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response + return self._get_request_option(RequestOptionType.header, stream_slice) + + def get_request_body_data( + self, + 
stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response + return self._get_request_option(RequestOptionType.body_data, stream_slice) + + def get_request_body_json( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response + return self._get_request_option(RequestOptionType.body_json, stream_slice) + + def stream_slices(self) -> Iterable[StreamSlice]: + return [StreamSlice(partition={self._cursor_field.eval(self.config): slice_value}, cursor_slice={}) for slice_value in self.values] + + def _get_request_option(self, request_option_type: RequestOptionType, stream_slice: Optional[StreamSlice]) -> Mapping[str, Any]: + if self.request_option and self.request_option.inject_into == request_option_type and stream_slice: + slice_value = stream_slice.get(self._cursor_field.eval(self.config)) + if slice_value: + return {self.request_option.field_name.eval(self.config): slice_value} # type: ignore # field_name is always casted to InterpolatedString + else: + return {} + else: + return {} + + def set_initial_state(self, stream_state: StreamState) -> None: + """ + ListPartitionRouter doesn't have parent streams + """ + pass + + def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: + """ + ListPartitionRouter doesn't have parent streams + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/partition_router.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/partition_router.py new file mode 100644 index 000000000000..3a9bc3abfbf2 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/partition_router.py @@ -0,0 +1,62 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from dataclasses import dataclass +from typing import Mapping, Optional + +from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer +from airbyte_cdk.sources.types import StreamState + + +@dataclass +class PartitionRouter(StreamSlicer): + """ + Base class for partition routers. + Methods: + set_initial_state(stream_state): Set the state of the parent streams. + get_stream_state(): Get the state of the parent streams. + """ + + @abstractmethod + def set_initial_state(self, stream_state: StreamState) -> None: + """ + Set the state of the parent streams. + + This method should only be implemented if the slicer is based on some parent stream and needs to read this stream + incrementally using the state. + + Args: + stream_state (StreamState): The state of the streams to be set. The expected format is a dictionary that includes + 'parent_state' which is a dictionary of parent state names to their corresponding state. + Example: + { + "parent_state": { + "parent_stream_name_1": { ... }, + "parent_stream_name_2": { ... }, + ... + } + } + """ + + @abstractmethod + def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: + """ + Get the state of the parent streams.
+ + This method should only be implemented if the slicer is based on some parent stream and needs to read this stream + incrementally using the state. + + Returns: + Optional[Mapping[str, StreamState]]: The current state of the parent streams in a dictionary format. + The returned format will be: + { + "parent_stream_name1": { + "last_updated": "2023-05-27T00:00:00Z" + }, + "parent_stream_name2": { + "last_updated": "2023-05-27T00:00:00Z" + } + } + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py new file mode 100644 index 000000000000..32e6a353dedf --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py @@ -0,0 +1,63 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Iterable, Mapping, Optional + +from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter +from airbyte_cdk.sources.types import StreamSlice, StreamState + + +@dataclass +class SinglePartitionRouter(PartitionRouter): + """Partition router returning only a stream slice""" + + parameters: InitVar[Mapping[str, Any]] + + def get_request_params( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def get_request_headers( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def get_request_body_data( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def get_request_body_json( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def stream_slices(self) -> Iterable[StreamSlice]: + yield StreamSlice(partition={}, cursor_slice={}) + + def set_initial_state(self, stream_state: StreamState) -> None: + """ + SinglePartitionRouter doesn't have parent streams + """ + pass + + def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: + """ + SinglePartitionRouter doesn't have parent streams + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py new file mode 100644 index 000000000000..80bf6034d2a0 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -0,0 +1,295 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
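The `SinglePartitionRouter` defined above is the simplest way to see the `PartitionRouter` contract in practice: one empty slice and no request options. A short usage sketch, assuming the `airbyte_cdk` package is installed and exposes the import path added in this diff:

```python
from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import SinglePartitionRouter

router = SinglePartitionRouter(parameters={})

# Exactly one slice, with an empty partition and an empty cursor slice.
slices = list(router.stream_slices())
assert len(slices) == 1
assert slices[0].partition == {} and slices[0].cursor_slice == {}

# The router contributes nothing to outgoing requests.
assert router.get_request_params() == {}
assert router.get_request_headers() == {}
```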
+# +import copy +import logging +from dataclasses import InitVar, dataclass +from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, Optional, Union + +import dpath +from airbyte_cdk.models import AirbyteMessage +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState +from airbyte_cdk.utils import AirbyteTracedException + +if TYPE_CHECKING: + from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream + + +@dataclass +class ParentStreamConfig: + """ + Describes how to create a stream slice from a parent stream + + stream: The stream to read records from + parent_key: The key of the parent stream's records that will be the stream slice key + partition_field: The partition key + extra_fields: Additional field paths to include in the stream slice + request_option: How to inject the slice value on an outgoing HTTP request + incremental_dependency (bool): Indicates if the parent stream should be read incrementally. + """ + + stream: "DeclarativeStream" # Parent streams must be DeclarativeStream because we can't know which part of the stream slice is a partition for regular Stream + parent_key: Union[InterpolatedString, str] + partition_field: Union[InterpolatedString, str] + config: Config + parameters: InitVar[Mapping[str, Any]] + extra_fields: Optional[Union[List[List[str]], List[List[InterpolatedString]]]] = None # List of field paths (arrays of strings) + request_option: Optional[RequestOption] = None + incremental_dependency: bool = False + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self.parent_key = InterpolatedString.create(self.parent_key, parameters=parameters) + self.partition_field = InterpolatedString.create(self.partition_field, parameters=parameters) + if self.extra_fields: + # Create InterpolatedString for each field path in extra_keys + self.extra_fields = [ + [InterpolatedString.create(path, parameters=parameters) for path in key_path] for key_path in self.extra_fields + ] + + +@dataclass +class SubstreamPartitionRouter(PartitionRouter): + """ + Partition router that iterates over the parent's stream records and emits slices + Will populate the state with `partition_field` and `parent_slice` so they can be accessed by other components + + Attributes: + parent_stream_configs (List[ParentStreamConfig]): parent streams to iterate over and their config + """ + + parent_stream_configs: List[ParentStreamConfig] + config: Config + parameters: InitVar[Mapping[str, Any]] + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + if not self.parent_stream_configs: + raise ValueError("SubstreamPartitionRouter needs at least 1 parent stream") + self._parameters = parameters + + def get_request_params( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response + return self._get_request_option(RequestOptionType.request_parameter, stream_slice) + + def get_request_headers( + self, + stream_state: Optional[StreamState] = 
None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response + return self._get_request_option(RequestOptionType.header, stream_slice) + + def get_request_body_data( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response + return self._get_request_option(RequestOptionType.body_data, stream_slice) + + def get_request_body_json( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response + return self._get_request_option(RequestOptionType.body_json, stream_slice) + + def _get_request_option(self, option_type: RequestOptionType, stream_slice: Optional[StreamSlice]) -> Mapping[str, Any]: + params = {} + if stream_slice: + for parent_config in self.parent_stream_configs: + if parent_config.request_option and parent_config.request_option.inject_into == option_type: + key = parent_config.partition_field.eval(self.config) # type: ignore # partition_field is always casted to an interpolated string + value = stream_slice.get(key) + if value: + params.update({parent_config.request_option.field_name.eval(config=self.config): value}) # type: ignore # field_name is always casted to an interpolated string + return params + + def stream_slices(self) -> Iterable[StreamSlice]: + """ + Iterate over each parent stream's record and create a StreamSlice for each record. + + For each stream, iterate over its stream_slices. + For each stream slice, iterate over each record. + yield a stream slice for each such records. + + If a parent slice contains no record, emit a slice with parent_record=None. + + The template string can interpolate the following values: + - parent_stream_slice: mapping representing the parent's stream slice + - parent_record: mapping representing the parent record + - parent_stream_name: string representing the parent stream name + """ + if not self.parent_stream_configs: + yield from [] + else: + for parent_stream_config in self.parent_stream_configs: + parent_stream = parent_stream_config.stream + parent_field = parent_stream_config.parent_key.eval(self.config) # type: ignore # parent_key is always casted to an interpolated string + partition_field = parent_stream_config.partition_field.eval(self.config) # type: ignore # partition_field is always casted to an interpolated string + extra_fields = None + if parent_stream_config.extra_fields: + extra_fields = [[field_path_part.eval(self.config) for field_path_part in field_path] for field_path in parent_stream_config.extra_fields] # type: ignore # extra_fields is always casted to an interpolated string + + # read_stateless() assumes the parent is not concurrent. 
This is currently okay since the concurrent CDK does + # not support either substreams or RFR, but something that needs to be considered once we do + for parent_record in parent_stream.read_only_records(): + parent_partition = None + # Skip non-records (eg AirbyteLogMessage) + if isinstance(parent_record, AirbyteMessage): + self.logger.warning( + f"Parent stream {parent_stream.name} returns records of type AirbyteMessage. This SubstreamPartitionRouter is not able to checkpoint incremental parent state." + ) + if parent_record.type == MessageType.RECORD: + parent_record = parent_record.record.data # type: ignore[union-attr, assignment] # record is always a Record + else: + continue + elif isinstance(parent_record, Record): + parent_partition = parent_record.associated_slice.partition if parent_record.associated_slice else {} + parent_record = parent_record.data + elif not isinstance(parent_record, Mapping): + # The parent_record should only take the form of a Record, AirbyteMessage, or Mapping. Anything else is invalid + raise AirbyteTracedException(message=f"Parent stream returned records as invalid type {type(parent_record)}") + try: + partition_value = dpath.get(parent_record, parent_field) + except KeyError: + continue + + # Add extra fields + extracted_extra_fields = self._extract_extra_fields(parent_record, extra_fields) + + yield StreamSlice( + partition={partition_field: partition_value, "parent_slice": parent_partition or {}}, + cursor_slice={}, + extra_fields=extracted_extra_fields, + ) + + def _extract_extra_fields( + self, parent_record: Mapping[str, Any] | AirbyteMessage, extra_fields: Optional[List[List[str]]] = None + ) -> Mapping[str, Any]: + """ + Extracts additional fields specified by their paths from the parent record. + + Args: + parent_record (Mapping[str, Any]): The record from the parent stream to extract fields from. + extra_fields (Optional[List[List[str]]]): A list of field paths (as lists of strings) to extract from the parent record. + + Returns: + Mapping[str, Any]: A dictionary containing the extracted fields. + The keys are the joined field paths, and the values are the corresponding extracted values. + """ + extracted_extra_fields = {} + if extra_fields: + for extra_field_path in extra_fields: + try: + extra_field_value = dpath.get(parent_record, extra_field_path) + self.logger.debug(f"Extracted extra_field_path: {extra_field_path} with value: {extra_field_value}") + except KeyError: + self.logger.debug(f"Failed to extract extra_field_path: {extra_field_path}") + extra_field_value = None + extracted_extra_fields[".".join(extra_field_path)] = extra_field_value + return extracted_extra_fields + + def set_initial_state(self, stream_state: StreamState) -> None: + """ + Set the state of the parent streams. + + If the `parent_state` key is missing from `stream_state`, migrate the child stream state to the parent stream's state format. + This migration applies only to parent streams with incremental dependencies. + + Args: + stream_state (StreamState): The state of the streams to be set. 
+ + Example of state format: + { + "parent_state": { + "parent_stream_name1": { + "last_updated": "2023-05-27T00:00:00Z" + }, + "parent_stream_name2": { + "last_updated": "2023-05-27T00:00:00Z" + } + } + } + + Example of migrating to parent state format: + - Initial state: + { + "updated_at": "2023-05-27T00:00:00Z" + } + - After migration: + { + "updated_at": "2023-05-27T00:00:00Z", + "parent_state": { + "parent_stream_name": { + "parent_stream_cursor": "2023-05-27T00:00:00Z" + } + } + } + """ + if not stream_state: + return + + parent_state = stream_state.get("parent_state", {}) + + # If `parent_state` doesn't exist and at least one parent stream has an incremental dependency, + # copy the child state to parent streams with incremental dependencies. + incremental_dependency = any([parent_config.incremental_dependency for parent_config in self.parent_stream_configs]) + if not parent_state and not incremental_dependency: + return + + if not parent_state and incremental_dependency: + # Attempt to retrieve child state + substream_state = list(stream_state.values()) + substream_state = substream_state[0] if substream_state else {} + parent_state = {} + + # Copy child state to parent streams with incremental dependencies + if substream_state: + for parent_config in self.parent_stream_configs: + if parent_config.incremental_dependency: + parent_state[parent_config.stream.name] = {parent_config.stream.cursor_field: substream_state} + + # Set state for each parent stream with an incremental dependency + for parent_config in self.parent_stream_configs: + if parent_config.incremental_dependency: + parent_config.stream.state = parent_state.get(parent_config.stream.name, {}) + + def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: + """ + Get the state of the parent streams. + + Returns: + StreamState: The current state of the parent streams. + + Example of state format: + { + "parent_stream_name1": { + "last_updated": "2023-05-27T00:00:00Z" + }, + "parent_stream_name2": { + "last_updated": "2023-05-27T00:00:00Z" + } + } + """ + parent_state = {} + for parent_config in self.parent_stream_configs: + if parent_config.incremental_dependency: + parent_state[parent_config.stream.name] = copy.deepcopy(parent_config.stream.state) + return parent_state + + @property + def logger(self) -> logging.Logger: + return logging.getLogger("airbyte.SubstreamPartitionRouter") diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/__init__.py new file mode 100644 index 000000000000..e5266ea7cfd4 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/__init__.py @@ -0,0 +1,9 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.requesters.http_requester import HttpRequester +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption +from airbyte_cdk.sources.declarative.requesters.requester import Requester + +__all__ = ["HttpRequester", "RequestOption", "Requester"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py new file mode 100644 index 000000000000..490169b6b6ee --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py @@ -0,0 +1,11 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
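The child-to-parent state migration in `SubstreamPartitionRouter.set_initial_state` above is easier to follow when reduced to plain dictionaries. A hedged re-expression of that mapping (stream and cursor names are placeholders, not the CDK implementation):

```python
from typing import Any, Dict


def migrate_child_state(stream_state: Dict[str, Any], parent_stream_name: str, parent_cursor_field: str) -> Dict[str, Any]:
    # Mirrors the branch in set_initial_state: reuse an existing parent_state if present,
    # otherwise promote the (single) child cursor value to the parent stream's cursor.
    if not stream_state:
        return {}
    if "parent_state" in stream_state:
        return stream_state["parent_state"]
    child_cursor_values = list(stream_state.values())
    child_cursor_value = child_cursor_values[0] if child_cursor_values else {}
    return {parent_stream_name: {parent_cursor_field: child_cursor_value}} if child_cursor_value else {}


# "parent_stream_name" and "parent_stream_cursor" are placeholder names.
print(migrate_child_state({"updated_at": "2023-05-27T00:00:00Z"}, "parent_stream_name", "parent_stream_cursor"))
# {'parent_stream_name': {'parent_stream_cursor': '2023-05-27T00:00:00Z'}}
```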
+# + +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategy import BackoffStrategy +from airbyte_cdk.sources.declarative.requesters.error_handlers.composite_error_handler import CompositeErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.default_error_handler import DefaultErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.error_handler import ErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.http_response_filter import HttpResponseFilter + +__all__ = ["BackoffStrategy", "CompositeErrorHandler", "DefaultErrorHandler", "ErrorHandler", "HttpResponseFilter"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py new file mode 100644 index 000000000000..29647ae2959e --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py @@ -0,0 +1,21 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.constant_backoff_strategy import ConstantBackoffStrategy +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.exponential_backoff_strategy import ( + ExponentialBackoffStrategy, +) +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.wait_time_from_header_backoff_strategy import ( + WaitTimeFromHeaderBackoffStrategy, +) +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.wait_until_time_from_header_backoff_strategy import ( + WaitUntilTimeFromHeaderBackoffStrategy, +) + +__all__ = [ + "ConstantBackoffStrategy", + "ExponentialBackoffStrategy", + "WaitTimeFromHeaderBackoffStrategy", + "WaitUntilTimeFromHeaderBackoffStrategy", +] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py new file mode 100644 index 000000000000..96c50ef0bf6d --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py @@ -0,0 +1,40 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Optional, Union + +import requests +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.streams.http.error_handlers import BackoffStrategy +from airbyte_cdk.sources.types import Config + + +@dataclass +class ConstantBackoffStrategy(BackoffStrategy): + """ + Backoff strategy with a constant backoff interval + + Attributes: + backoff_time_in_seconds (float): time to backoff before retrying a retryable request. 
+ """ + + backoff_time_in_seconds: Union[float, InterpolatedString, str] + parameters: InitVar[Mapping[str, Any]] + config: Config + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + if not isinstance(self.backoff_time_in_seconds, InterpolatedString): + self.backoff_time_in_seconds = str(self.backoff_time_in_seconds) + if isinstance(self.backoff_time_in_seconds, float): + self.backoff_time_in_seconds = InterpolatedString.create(str(self.backoff_time_in_seconds), parameters=parameters) + else: + self.backoff_time_in_seconds = InterpolatedString.create(self.backoff_time_in_seconds, parameters=parameters) + + def backoff_time( + self, + response_or_exception: Optional[Union[requests.Response, requests.RequestException]], + attempt_count: int, + ) -> Optional[float]: + return self.backoff_time_in_seconds.eval(self.config) # type: ignore # backoff_time_in_seconds is always cast to an interpolated string diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py new file mode 100644 index 000000000000..b3a57675b727 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py @@ -0,0 +1,44 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Optional, Union + +import requests +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.streams.http.error_handlers import BackoffStrategy +from airbyte_cdk.sources.types import Config + + +@dataclass +class ExponentialBackoffStrategy(BackoffStrategy): + """ + Backoff strategy with an exponential backoff interval + + Attributes: + factor (float): multiplicative factor + """ + + parameters: InitVar[Mapping[str, Any]] + config: Config + factor: Union[float, InterpolatedString, str] = 5 + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + if not isinstance(self.factor, InterpolatedString): + self.factor = str(self.factor) + if isinstance(self.factor, float): + self._factor = InterpolatedString.create(str(self.factor), parameters=parameters) + else: + self._factor = InterpolatedString.create(self.factor, parameters=parameters) + + @property + def _retry_factor(self) -> float: + return self._factor.eval(self.config) # type: ignore # factor is always cast to an interpolated string + + def backoff_time( + self, + response_or_exception: Optional[Union[requests.Response, requests.RequestException]], + attempt_count: int, + ) -> Optional[float]: + return self._retry_factor * 2**attempt_count # type: ignore # factor is always cast to an interpolated string diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py new file mode 100644 index 000000000000..e7f5e1f88a64 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py @@ -0,0 +1,39 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import numbers +from re import Pattern +from typing import Optional + +import requests + + +def get_numeric_value_from_header(response: requests.Response, header: str, regex: Optional[Pattern[str]]) -> Optional[float]: + """ + Extract a header value from the response as a float + :param response: response to extract the header value from + :param header: Header to extract + :param regex: optional regex to apply on the header to obtain the value + :return: header value as float if it's a number. None otherwise + """ + header_value = response.headers.get(header, None) + if not header_value: + return None + if isinstance(header_value, str): + if regex: + match = regex.match(header_value) + if match: + header_value = match.group() + return _as_float(header_value) + elif isinstance(header_value, numbers.Number): + return float(header_value) # type: ignore[arg-type] + else: + return None + + +def _as_float(s: str) -> Optional[float]: + try: + return float(s) + except ValueError: + return None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py new file mode 100644 index 000000000000..79eb8a7fe23d --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py @@ -0,0 +1,57 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import re +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Optional, Union + +import requests +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.header_helper import get_numeric_value_from_header +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategy import BackoffStrategy +from airbyte_cdk.sources.types import Config +from airbyte_cdk.utils import AirbyteTracedException + + +@dataclass +class WaitTimeFromHeaderBackoffStrategy(BackoffStrategy): + """ + Extract wait time from http header + + Attributes: + header (str): header to read wait time from + regex (Optional[str]): optional regex to apply on the header to extract its value + max_waiting_time_in_seconds (Optional[float]): if the value extracted from the header is greater than this value, stop the stream + """ + + header: Union[InterpolatedString, str] + parameters: InitVar[Mapping[str, Any]] + config: Config + regex: Optional[Union[InterpolatedString, str]] = None + max_waiting_time_in_seconds: Optional[float] = None + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self.regex = InterpolatedString.create(self.regex, parameters=parameters) if self.regex else None + self.header = InterpolatedString.create(self.header, parameters=parameters) + + def backoff_time( + self, response_or_exception: Optional[Union[requests.Response, requests.RequestException]], attempt_count: int + ) -> Optional[float]: + header = self.header.eval(config=self.config) # type: ignore # header is always cast to an interpolated string + if self.regex: + evaled_regex = self.regex.eval(self.config) # type: ignore # header is always cast to an interpolated string + regex = re.compile(evaled_regex) + else: + regex = None + header_value = None + if
isinstance(response_or_exception, requests.Response): + header_value = get_numeric_value_from_header(response_or_exception, header, regex) + if self.max_waiting_time_in_seconds and header_value and header_value >= self.max_waiting_time_in_seconds: + raise AirbyteTracedException( + internal_message=f"Rate limit wait time {header_value} is greater than max waiting time of {self.max_waiting_time_in_seconds} seconds. Stopping the stream...", + message="The rate limit is greater than max waiting time has been reached.", + failure_type=FailureType.transient_error, + ) + return header_value diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py new file mode 100644 index 000000000000..861f8ba83193 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py @@ -0,0 +1,66 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import numbers +import re +import time +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Optional, Union + +import requests +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.header_helper import get_numeric_value_from_header +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategy import BackoffStrategy +from airbyte_cdk.sources.types import Config + + +@dataclass +class WaitUntilTimeFromHeaderBackoffStrategy(BackoffStrategy): + """ + Extract time at which we can retry the request from response header + and wait for the difference between now and that time + + Attributes: + header (str): header to read wait time from + min_wait (Optional[float]): minimum time to wait for safety + regex (Optional[str]): optional regex to apply on the header to extract its value + """ + + header: Union[InterpolatedString, str] + parameters: InitVar[Mapping[str, Any]] + config: Config + min_wait: Optional[Union[float, InterpolatedString, str]] = None + regex: Optional[Union[InterpolatedString, str]] = None + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self.header = InterpolatedString.create(self.header, parameters=parameters) + self.regex = InterpolatedString.create(self.regex, parameters=parameters) if self.regex else None + if not isinstance(self.min_wait, InterpolatedString): + self.min_wait = InterpolatedString.create(str(self.min_wait), parameters=parameters) + + def backoff_time( + self, response_or_exception: Optional[Union[requests.Response, requests.RequestException]], attempt_count: int + ) -> Optional[float]: + now = time.time() + header = self.header.eval(self.config) # type: ignore # header is always cast to an interpolated string + if self.regex: + evaled_regex = self.regex.eval(self.config) # type: ignore # header is always cast to an interpolated string + regex = re.compile(evaled_regex) + else: + regex = None + wait_until = None + if isinstance(response_or_exception, requests.Response): + wait_until = get_numeric_value_from_header(response_or_exception, header, regex) + min_wait = self.min_wait.eval(self.config) # type: ignore # header is always cast to an interpolated string + if wait_until is None or not 
wait_until: + return float(min_wait) if min_wait else None + if (isinstance(wait_until, str) and wait_until.isnumeric()) or isinstance(wait_until, numbers.Number): + wait_time = float(wait_until) - now + else: + return float(min_wait) + if min_wait: + return float(max(wait_time, min_wait)) + elif wait_time < 0: + return None + return wait_time diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py new file mode 100644 index 000000000000..7a44f7b94895 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py @@ -0,0 +1,17 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import ABC +from dataclasses import dataclass + +from airbyte_cdk.sources.streams.http.error_handlers import BackoffStrategy + + +@dataclass +class DecalarativeBackoffStrategy(BackoffStrategy, ABC): + """ + This interface exists to retain backwards compatibility with connectors that reference the declarative BackoffStrategy. As part of the effort to promote common interfaces to the Python CDK, this now extends the Python CDK backoff strategy interface. + + Backoff strategy defining how long to wait before retrying a request that resulted in an error. + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py new file mode 100644 index 000000000000..bc151feca3dc --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py @@ -0,0 +1,76 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
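+#
+# A minimal, hedged sketch of composing handlers directly in Python (the classes are the ones defined in this
+# package; the empty config/parameters dicts are placeholders, not a recommended setup):
+#
+#   handler = CompositeErrorHandler(
+#       error_handlers=[DefaultErrorHandler(config={}, parameters={})],
+#       parameters={},
+#   )
+#   resolution = handler.interpret_response(response_or_exception)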
+# + +from dataclasses import InitVar, dataclass +from typing import Any, List, Mapping, Optional, Union + +import requests +from airbyte_cdk.sources.streams.http.error_handlers import ErrorHandler +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ( + ErrorResolution, + ResponseAction, + create_fallback_error_resolution, +) + + +@dataclass +class CompositeErrorHandler(ErrorHandler): + """ + Error handler that sequentially iterates over a list of `ErrorHandler`s + + Sample config chaining 2 different retriers: + error_handler: + type: "CompositeErrorHandler" + error_handlers: + - response_filters: + - predicate: "{{ 'codase' in response }}" + action: RETRY + backoff_strategies: + - type: "ConstantBackoff" + backoff_time_in_seconds: 5 + - response_filters: + - http_codes: [ 403 ] + action: RETRY + backoff_strategies: + - type: "ConstantBackoff" + backoff_time_in_seconds: 10 + Attributes: + error_handlers (List[ErrorHandler]): list of error handlers + """ + + error_handlers: List[ErrorHandler] + parameters: InitVar[Mapping[str, Any]] + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + if not self.error_handlers: + raise ValueError("CompositeErrorHandler expects at least 1 underlying error handler") + + @property + def max_retries(self) -> Optional[int]: + return self.error_handlers[0].max_retries + + @property + def max_time(self) -> Optional[int]: + return max([error_handler.max_time or 0 for error_handler in self.error_handlers]) + + def interpret_response(self, response_or_exception: Optional[Union[requests.Response, Exception]]) -> ErrorResolution: + matched_error_resolution = None + for error_handler in self.error_handlers: + matched_error_resolution = error_handler.interpret_response(response_or_exception) + + if not isinstance(matched_error_resolution, ErrorResolution): + continue + + if matched_error_resolution.response_action == ResponseAction.SUCCESS: + return matched_error_resolution + + if ( + matched_error_resolution.response_action == ResponseAction.RETRY + or matched_error_resolution.response_action == ResponseAction.IGNORE + ): + return matched_error_resolution + if matched_error_resolution: + return matched_error_resolution + + return create_fallback_error_resolution(response_or_exception) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py new file mode 100644 index 000000000000..68ff5ecf99cd --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py @@ -0,0 +1,136 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass, field +from typing import Any, List, Mapping, MutableMapping, Optional, Union + +import requests +from airbyte_cdk.sources.declarative.requesters.error_handlers.default_http_response_filter import DefaultHttpResponseFilter +from airbyte_cdk.sources.declarative.requesters.error_handlers.http_response_filter import HttpResponseFilter +from airbyte_cdk.sources.streams.http.error_handlers import BackoffStrategy, ErrorHandler +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ( + SUCCESS_RESOLUTION, + ErrorResolution, + create_fallback_error_resolution, +) +from airbyte_cdk.sources.types import Config + + +@dataclass +class DefaultErrorHandler(ErrorHandler): + """ + Default error handler. 
+ + By default, the handler will only use the `DEFAULT_ERROR_MAPPING` that is part of the Python CDK's `HttpStatusErrorHandler`. + + If the response is successful, then a SUCCESS_RESOLUTION is returned. + Otherwise, iterate over the response_filters. + If any of the filter match the response, then return the appropriate status. + When `DefaultErrorHandler.backoff_time()` is invoked, iterate sequentially over the backoff_strategies and return the first non-None backoff time, else return None. + + Sample configs: + + 1. retry 10 times + ` + error_handler: + max_retries: 10 + ` + 2. backoff for 5 seconds + ` + error_handler: + backoff_strategies: + - type: "ConstantBackoff" + backoff_time_in_seconds: 5 + ` + 3. retry on HTTP 404 + ` + error_handler: + response_filters: + - http_codes: [ 404 ] + action: RETRY + ` + 4. ignore HTTP 404 + ` + error_handler: + response_filters: + - http_codes: [ 404 ] + action: IGNORE + ` + 5. retry if error message contains `retrythisrequest!` substring + ` + error_handler: + response_filters: + - error_message_contain: "retrythisrequest!" + action: IGNORE + ` + 6. retry if 'code' is a field present in the response body + ` + error_handler: + response_filters: + - predicate: "{{ 'code' in response }}" + action: IGNORE + ` + + 7. ignore 429 and retry on 404 + ` + error_handler: + - http_codes: [ 429 ] + action: IGNORE + - http_codes: [ 404 ] + action: RETRY + ` + + Attributes: + response_filters (Optional[List[HttpResponseFilter]]): response filters to iterate on + max_retries (Optional[int]): maximum retry attempts + backoff_strategies (Optional[List[BackoffStrategy]]): list of backoff strategies to use to determine how long + to wait before retrying + """ + + parameters: InitVar[Mapping[str, Any]] + config: Config + response_filters: Optional[List[HttpResponseFilter]] = None + max_retries: Optional[int] = 5 + max_time: int = 60 * 10 + _max_retries: int = field(init=False, repr=False, default=5) + _max_time: int = field(init=False, repr=False, default=60 * 10) + backoff_strategies: Optional[List[BackoffStrategy]] = None + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + + if not self.response_filters: + self.response_filters = [HttpResponseFilter(config=self.config, parameters={})] + + self._last_request_to_attempt_count: MutableMapping[requests.PreparedRequest, int] = {} + + def interpret_response(self, response_or_exception: Optional[Union[requests.Response, Exception]]) -> ErrorResolution: + + if self.response_filters: + for response_filter in self.response_filters: + matched_error_resolution = response_filter.matches(response_or_exception=response_or_exception) + if matched_error_resolution: + return matched_error_resolution + if isinstance(response_or_exception, requests.Response): + if response_or_exception.ok: + return SUCCESS_RESOLUTION + + default_reponse_filter = DefaultHttpResponseFilter(parameters={}, config=self.config) + default_response_filter_resolution = default_reponse_filter.matches(response_or_exception) + + return ( + default_response_filter_resolution + if default_response_filter_resolution + else create_fallback_error_resolution(response_or_exception) + ) + + def backoff_time( + self, response_or_exception: Optional[Union[requests.Response, requests.RequestException]], attempt_count: int = 0 + ) -> Optional[float]: + backoff = None + if self.backoff_strategies: + for backoff_strategy in self.backoff_strategies: + backoff = backoff_strategy.backoff_time(response_or_exception=response_or_exception, 
attempt_count=attempt_count) # type: ignore # attempt_count maintained for compatibility with low code CDK + if backoff: + return backoff + return backoff diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py new file mode 100644 index 000000000000..4e3f54169825 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py @@ -0,0 +1,30 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +from typing import Optional, Union + +import requests +from airbyte_cdk.sources.declarative.requesters.error_handlers.http_response_filter import HttpResponseFilter +from airbyte_cdk.sources.streams.http.error_handlers.default_error_mapping import DEFAULT_ERROR_MAPPING +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ErrorResolution, create_fallback_error_resolution + + +class DefaultHttpResponseFilter(HttpResponseFilter): + def matches(self, response_or_exception: Optional[Union[requests.Response, Exception]]) -> Optional[ErrorResolution]: + + default_mapped_error_resolution = None + + if isinstance(response_or_exception, (requests.Response, Exception)): + + mapped_key: Union[int, type] = ( + response_or_exception.status_code + if isinstance(response_or_exception, requests.Response) + else response_or_exception.__class__ + ) + + default_mapped_error_resolution = DEFAULT_ERROR_MAPPING.get(mapped_key) + + return ( + default_mapped_error_resolution if default_mapped_error_resolution else create_fallback_error_resolution(response_or_exception) + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py new file mode 100644 index 000000000000..a84747f91996 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py @@ -0,0 +1,17 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import ABC +from dataclasses import dataclass + +from airbyte_cdk.sources.streams.http.error_handlers import ErrorHandler + + +@dataclass +class DeclarativeErrorHandler(ErrorHandler, ABC): + """ + This interface exists to retain backwards compatability with connectors that reference the declarative ErrorHandler. As part of the effort to promote common interfaces to the Python CDK, this now extends the Python CDK ErrorHandler interface. + + `ErrorHandler` defines how to handle errors that occur during the request process, returning an ErrorResolution object that defines how to proceed. + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py new file mode 100644 index 000000000000..c452dcac5cdd --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py @@ -0,0 +1,139 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
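+#
+# A hedged configuration sketch for the HttpResponseFilter defined below; the response fields referenced by the
+# predicate and error_message are illustrative only and not part of any real API:
+#
+#   response_filters:
+#     - type: HttpResponseFilter
+#       http_codes: [403]
+#       action: FAIL
+#       failure_type: config_error
+#       error_message: "Access denied: {{ response.error.message }}"
+#     - type: HttpResponseFilter
+#       predicate: "{{ 'retry_after' in response }}"
+#       action: RETRY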
+# + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Optional, Set, Union + +import requests +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.streams.http.error_handlers import JsonErrorMessageParser +from airbyte_cdk.sources.streams.http.error_handlers.default_error_mapping import DEFAULT_ERROR_MAPPING +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ErrorResolution, ResponseAction +from airbyte_cdk.sources.types import Config + + +@dataclass +class HttpResponseFilter: + """ + Filter to select a response based on its HTTP status code, error message or a predicate. + If a response matches the filter, the response action, failure_type, and error message are returned as an ErrorResolution object. + For http_codes declared in the filter, the failure_type will default to `system_error`. + To override default failure_type use configured failure_type with ResponseAction.FAIL. + + Attributes: + action (Union[ResponseAction, str]): action to execute if a request matches + failure_type (Union[ResponseAction, str]): failure type of traced exception if a response matches the filter + http_codes (Set[int]): http code of matching requests + error_message_contains (str): error substring of matching requests + predicate (str): predicate to apply to determine if a request is matching + error_message (Union[InterpolatedString, str): error message to display if the response matches the filter + """ + + config: Config + parameters: InitVar[Mapping[str, Any]] + action: Optional[Union[ResponseAction, str]] = None + failure_type: Optional[Union[FailureType, str]] = None + http_codes: Optional[Set[int]] = None + error_message_contains: Optional[str] = None + predicate: Union[InterpolatedBoolean, str] = "" + error_message: Union[InterpolatedString, str] = "" + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + + if self.action is not None: + if self.http_codes is None and self.predicate is None and self.error_message_contains is None: + raise ValueError("HttpResponseFilter requires a filter condition if an action is specified") + elif isinstance(self.action, str): + self.action = ResponseAction[self.action] + self.http_codes = self.http_codes or set() + if isinstance(self.predicate, str): + self.predicate = InterpolatedBoolean(condition=self.predicate, parameters=parameters) + self.error_message = InterpolatedString.create(string_or_interpolated=self.error_message, parameters=parameters) + self._error_message_parser = JsonErrorMessageParser() + if self.failure_type and isinstance(self.failure_type, str): + self.failure_type = FailureType[self.failure_type] + + def matches(self, response_or_exception: Optional[Union[requests.Response, Exception]]) -> Optional[ErrorResolution]: + filter_action = self._matches_filter(response_or_exception) + mapped_key = ( + response_or_exception.status_code if isinstance(response_or_exception, requests.Response) else response_or_exception.__class__ + ) + + if isinstance(mapped_key, (int, Exception)): + default_mapped_error_resolution = self._match_default_error_mapping(mapped_key) + else: + default_mapped_error_resolution = None + + if filter_action is not None: + default_error_message = default_mapped_error_resolution.error_message if default_mapped_error_resolution else "" + error_message = None + if 
isinstance(response_or_exception, requests.Response): + error_message = self._create_error_message(response_or_exception) + error_message = error_message or default_error_message + + if self.failure_type and filter_action == ResponseAction.FAIL: + failure_type = self.failure_type + elif default_mapped_error_resolution: + failure_type = default_mapped_error_resolution.failure_type + else: + failure_type = FailureType.system_error + + return ErrorResolution( + response_action=filter_action, + failure_type=failure_type, + error_message=error_message, + ) + + if ( + (isinstance(self.http_codes, list) and len(self.http_codes)) is None + and self.predicate is None + and self.error_message_contains is None + ) and default_mapped_error_resolution: + return default_mapped_error_resolution + + return None + + def _match_default_error_mapping(self, mapped_key: Union[int, type[Exception]]) -> Optional[ErrorResolution]: + return DEFAULT_ERROR_MAPPING.get(mapped_key) + + def _matches_filter(self, response_or_exception: Optional[Union[requests.Response, Exception]]) -> Optional[ResponseAction]: + """ + Apply the HTTP filter on the response and return the action to execute if it matches + :param response: The HTTP response to evaluate + :return: The action to execute. None if the response does not match the filter + """ + if isinstance(response_or_exception, requests.Response) and ( + response_or_exception.status_code in self.http_codes # type: ignore # http_codes set is always initialized to a value in __post_init__ + or self._response_matches_predicate(response_or_exception) + or self._response_contains_error_message(response_or_exception) + ): + return self.action # type: ignore # action is always cast to a ResponseAction not a str + return None + + @staticmethod + def _safe_response_json(response: requests.Response) -> dict[str, Any]: + try: + return response.json() # type: ignore # Response.json() returns a dictionary even if the signature does not + except requests.exceptions.JSONDecodeError: + return {} + + def _create_error_message(self, response: requests.Response) -> Optional[str]: + """ + Construct an error message based on the specified message template of the filter. 
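+ Illustrative example (the template and response body are assumptions, not fixtures from this codebase):
+ with `error_message: "Rate limited: {{ response.error.reason }}"` and a response body of
+ `{"error": {"reason": "quota exceeded"}}`, this evaluates to "Rate limited: quota exceeded".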
+ :param response: The HTTP response which can be used during interpolation + :return: The evaluated error message string to be emitted + """ + return self.error_message.eval(self.config, response=self._safe_response_json(response), headers=response.headers) # type: ignore # error_message is always cast to an interpolated string + + def _response_matches_predicate(self, response: requests.Response) -> bool: + return bool(self.predicate.condition and self.predicate.eval(None, response=self._safe_response_json(response), headers=response.headers)) if self.predicate else False # type: ignore # predicate is always cast to an interpolated string + + def _response_contains_error_message(self, response: requests.Response) -> bool: + if not self.error_message_contains: + return False + else: + error_message = self._error_message_parser.parse_response_error_message(response=response) + return bool(error_message and self.error_message_contains in error_message) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/http_job_repository.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/http_job_repository.py new file mode 100644 index 000000000000..2c425bf84dd5 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/http_job_repository.py @@ -0,0 +1,206 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +import logging +import uuid +from dataclasses import dataclass, field +from datetime import timedelta +from typing import Any, Dict, Iterable, Mapping, Optional + +import requests +from airbyte_cdk import AirbyteMessage +from airbyte_cdk.logger import lazy_log +from airbyte_cdk.models import FailureType, Type +from airbyte_cdk.sources.declarative.async_job.job import AsyncJob +from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository +from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus +from airbyte_cdk.sources.declarative.extractors.dpath_extractor import DpathExtractor, RecordExtractor +from airbyte_cdk.sources.declarative.extractors.response_to_file_extractor import ResponseToFileExtractor +from airbyte_cdk.sources.declarative.requesters.requester import Requester +from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever +from airbyte_cdk.sources.types import Record, StreamSlice +from airbyte_cdk.utils import AirbyteTracedException +from requests import Response + +LOGGER = logging.getLogger("airbyte") + + +@dataclass +class AsyncHttpJobRepository(AsyncJobRepository): + creation_requester: Requester + polling_requester: Requester + download_retriever: SimpleRetriever + abort_requester: Optional[Requester] + delete_requester: Optional[Requester] + status_extractor: DpathExtractor + status_mapping: Mapping[str, AsyncJobStatus] + urls_extractor: DpathExtractor + + job_timeout: Optional[timedelta] = None + record_extractor: RecordExtractor = field(init=False, repr=False, default_factory=lambda: ResponseToFileExtractor()) + + def __post_init__(self) -> None: + self._create_job_response_by_id: Dict[str, Response] = {} + self._polling_job_response_by_id: Dict[str, Response] = {} + + def _get_validated_polling_response(self, stream_slice: StreamSlice) -> requests.Response: + """ + Validates and retrieves the pooling response for a given stream slice. + + Args: + stream_slice (StreamSlice): The stream slice to send the pooling request for. + + Returns: + requests.Response: The validated pooling response. 
+ + Raises: + AirbyteTracedException: If the polling request returns an empty response. + """ + + polling_response: Optional[requests.Response] = self.polling_requester.send_request(stream_slice=stream_slice) + if polling_response is None: + raise AirbyteTracedException( + internal_message="Polling Requester received an empty Response.", + failure_type=FailureType.system_error, + ) + return polling_response + + def _get_validated_job_status(self, response: requests.Response) -> AsyncJobStatus: + """ + Validates the job status extracted from the API response. + + Args: + response (requests.Response): The API response. + + Returns: + AsyncJobStatus: The validated job status. + + Raises: + ValueError: If the API status is unknown. + """ + + api_status = next(iter(self.status_extractor.extract_records(response)), None) + job_status = self.status_mapping.get(str(api_status), None) + if job_status is None: + raise ValueError( + f"API status `{api_status}` is unknown. Contact the connector developer to make sure this status is supported." + ) + + return job_status + + def _start_job_and_validate_response(self, stream_slice: StreamSlice) -> requests.Response: + """ + Starts a job and validates the response. + + Args: + stream_slice (StreamSlice): The stream slice to be used for the job. + + Returns: + requests.Response: The response from the job creation requester. + + Raises: + AirbyteTracedException: If no response is received from the creation requester. + """ + + response: Optional[requests.Response] = self.creation_requester.send_request(stream_slice=stream_slice) + if not response: + raise AirbyteTracedException( + internal_message="Always expect a response or an exception from creation_requester", + failure_type=FailureType.system_error, + ) + + return response + + def start(self, stream_slice: StreamSlice) -> AsyncJob: + """ + Starts a job for the given stream slice. + + Args: + stream_slice (StreamSlice): The stream slice to start the job for. + + Returns: + AsyncJob: The asynchronous job object representing the started job. + """ + + response: requests.Response = self._start_job_and_validate_response(stream_slice) + job_id: str = str(uuid.uuid4()) + self._create_job_response_by_id[job_id] = response + + return AsyncJob(api_job_id=job_id, job_parameters=stream_slice, timeout=self.job_timeout) + + def update_jobs_status(self, jobs: Iterable[AsyncJob]) -> None: + """ + Updates the status of multiple jobs. + + Because we don't have interpolation on random fields, we have this hack which consist on using the stream_slice to allow for + interpolation. We are looking at enabling interpolation on more field which would require a change to those three layers: + HttpRequester, RequestOptionProvider, RequestInputProvider. + + Args: + jobs (Iterable[AsyncJob]): An iterable of AsyncJob objects representing the jobs to update. 
+ + Returns: + None + """ + for job in jobs: + stream_slice = self._get_create_job_stream_slice(job) + polling_response: requests.Response = self._get_validated_polling_response(stream_slice) + job_status: AsyncJobStatus = self._get_validated_job_status(polling_response) + + if job_status != job.status(): + lazy_log(LOGGER, logging.DEBUG, lambda: f"Status of job {job.api_job_id()} changed from {job.status()} to {job_status}") + else: + lazy_log(LOGGER, logging.DEBUG, lambda: f"Status of job {job.api_job_id()} is still {job.status()}") + + job.update_status(job_status) + if job_status == AsyncJobStatus.COMPLETED: + self._polling_job_response_by_id[job.api_job_id()] = polling_response + + def fetch_records(self, job: AsyncJob) -> Iterable[Mapping[str, Any]]: + """ + Fetches records from the given job. + + Args: + job (AsyncJob): The job to fetch records from. + + Yields: + Iterable[Mapping[str, Any]]: A generator that yields records as dictionaries. + + """ + + for url in self.urls_extractor.extract_records(self._polling_job_response_by_id[job.api_job_id()]): + stream_slice: StreamSlice = StreamSlice(partition={"url": url}, cursor_slice={}) + for message in self.download_retriever.read_records({}, stream_slice): + if isinstance(message, Record): + yield message.data + elif isinstance(message, AirbyteMessage): + if message.type == Type.RECORD: + yield message.record.data # type: ignore # message.record won't be None here as the message is a record + elif isinstance(message, (dict, Mapping)): + yield message + else: + raise TypeError(f"Unknown type `{type(message)}` for message") + + yield from [] + + def abort(self, job: AsyncJob) -> None: + if not self.abort_requester: + return + + self.abort_requester.send_request(stream_slice=self._get_create_job_stream_slice(job)) + + def delete(self, job: AsyncJob) -> None: + if not self.delete_requester: + return + + self.delete_requester.send_request(stream_slice=self._get_create_job_stream_slice(job)) + self._clean_up_job(job.api_job_id()) + + def _clean_up_job(self, job_id: str) -> None: + del self._create_job_response_by_id[job_id] + del self._polling_job_response_by_id[job_id] + + def _get_create_job_stream_slice(self, job: AsyncJob) -> StreamSlice: + stream_slice = StreamSlice( + partition={"create_job_response": self._create_job_response_by_id[job.api_job_id()]}, + cursor_slice={}, + ) + return stream_slice diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/http_requester.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/http_requester.py new file mode 100644 index 000000000000..05d8bfa1b957 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/http_requester.py @@ -0,0 +1,321 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
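+#
+# A minimal, hedged sketch of the YAML that typically drives the HttpRequester below; the URL, path and config
+# key are placeholders rather than part of the API contract:
+#
+#   requester:
+#     type: HttpRequester
+#     url_base: "https://api.example.com/v1/"
+#     path: "items"
+#     http_method: "GET"
+#     authenticator:
+#       type: BearerAuthenticator
+#       api_token: "{{ config['api_key'] }}"
+#     error_handler:
+#       type: DefaultErrorHandler
+#       max_retries: 5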
+# + +import logging +import os +from dataclasses import InitVar, dataclass, field +from typing import Any, Callable, Mapping, MutableMapping, Optional, Union +from urllib.parse import urljoin + +import requests +from airbyte_cdk.sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator, NoAuth +from airbyte_cdk.sources.declarative.decoders import Decoder +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_options_provider import ( + InterpolatedRequestOptionsProvider, +) +from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester +from airbyte_cdk.sources.message import MessageRepository, NoopMessageRepository +from airbyte_cdk.sources.streams.http import HttpClient +from airbyte_cdk.sources.streams.http.error_handlers import ErrorHandler +from airbyte_cdk.sources.types import Config, StreamSlice, StreamState +from airbyte_cdk.utils.mapping_helpers import combine_mappings + + +@dataclass +class HttpRequester(Requester): + """ + Default implementation of a Requester + + Attributes: + name (str): Name of the stream. Only used for request/response caching + url_base (Union[InterpolatedString, str]): Base url to send requests to + path (Union[InterpolatedString, str]): Path to send requests to + http_method (Union[str, HttpMethod]): HTTP method to use when sending requests + request_options_provider (Optional[InterpolatedRequestOptionsProvider]): request option provider defining the options to set on outgoing requests + authenticator (DeclarativeAuthenticator): Authenticator defining how to authenticate to the source + error_handler (Optional[ErrorHandler]): Error handler defining how to detect and handle errors + backoff_strategies (Optional[List[BackoffStrategy]]): List of backoff strategies to use when retrying requests + config (Config): The user-provided configuration as specified by the source's spec + use_cache (bool): Indicates that data should be cached for this stream + """ + + name: str + url_base: Union[InterpolatedString, str] + path: Union[InterpolatedString, str] + config: Config + parameters: InitVar[Mapping[str, Any]] + authenticator: Optional[DeclarativeAuthenticator] = None + http_method: Union[str, HttpMethod] = HttpMethod.GET + request_options_provider: Optional[InterpolatedRequestOptionsProvider] = None + error_handler: Optional[ErrorHandler] = None + disable_retries: bool = False + message_repository: MessageRepository = NoopMessageRepository() + use_cache: bool = False + _exit_on_rate_limit: bool = False + stream_response: bool = False + decoder: Decoder = field(default_factory=lambda: JsonDecoder(parameters={})) + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._url_base = InterpolatedString.create(self.url_base, parameters=parameters) + self._path = InterpolatedString.create(self.path, parameters=parameters) + if self.request_options_provider is None: + self._request_options_provider = InterpolatedRequestOptionsProvider(config=self.config, parameters=parameters) + elif isinstance(self.request_options_provider, dict): + self._request_options_provider = InterpolatedRequestOptionsProvider(config=self.config, **self.request_options_provider) + else: + self._request_options_provider = self.request_options_provider + self._authenticator = self.authenticator or NoAuth(parameters=parameters) 
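+ # Normalize the configured http_method (a plain string such as "GET" in YAML manifests) into the HttpMethod
+ # enum before handing it to the underlying HttpClient; a missing authenticator falls back to NoAuth above.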
+ self._http_method = HttpMethod[self.http_method] if isinstance(self.http_method, str) else self.http_method + self.error_handler = self.error_handler + self._parameters = parameters + + if self.error_handler is not None and hasattr(self.error_handler, "backoff_strategies"): + backoff_strategies = self.error_handler.backoff_strategies + else: + backoff_strategies = None + + self._http_client = HttpClient( + name=self.name, + logger=self.logger, + error_handler=self.error_handler, + authenticator=self._authenticator, + use_cache=self.use_cache, + backoff_strategy=backoff_strategies, + disable_retries=self.disable_retries, + message_repository=self.message_repository, + ) + + @property + def exit_on_rate_limit(self) -> bool: + return self._exit_on_rate_limit + + @exit_on_rate_limit.setter + def exit_on_rate_limit(self, value: bool) -> None: + self._exit_on_rate_limit = value + + def get_authenticator(self) -> DeclarativeAuthenticator: + return self._authenticator + + def get_url_base(self) -> str: + return os.path.join(self._url_base.eval(self.config), "") + + def get_path( + self, *, stream_state: Optional[StreamState], stream_slice: Optional[StreamSlice], next_page_token: Optional[Mapping[str, Any]] + ) -> str: + kwargs = {"stream_state": stream_state, "stream_slice": stream_slice, "next_page_token": next_page_token} + path = str(self._path.eval(self.config, **kwargs)) + return path.lstrip("/") + + def get_method(self) -> HttpMethod: + return self._http_method + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + return self._request_options_provider.get_request_params( + stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ) + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._request_options_provider.get_request_headers( + stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ) + + # fixing request options provider types has a lot of dependencies + def get_request_body_data( # type: ignore + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Union[Mapping[str, Any], str]: + return ( + self._request_options_provider.get_request_body_data( + stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ) + or {} + ) + + # fixing request options provider types has a lot of dependencies + def get_request_body_json( # type: ignore + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Mapping[str, Any]]: + return self._request_options_provider.get_request_body_json( + stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ) + + @property + def logger(self) -> logging.Logger: + return logging.getLogger(f"airbyte.HttpRequester.{self.name}") + + def _get_request_options( + self, + stream_state: Optional[StreamState], + stream_slice: Optional[StreamSlice], + next_page_token: Optional[Mapping[str, Any]], + requester_method: Callable[..., Optional[Union[Mapping[str, Any], str]]], + auth_options_method: 
Callable[..., Optional[Union[Mapping[str, Any], str]]], + extra_options: Optional[Union[Mapping[str, Any], str]] = None, + ) -> Union[Mapping[str, Any], str]: + """ + Get the request_option from the requester, the authenticator and extra_options passed in. + Raise a ValueError if there's a key collision + Returned merged mapping otherwise + """ + return combine_mappings( + [ + requester_method(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + auth_options_method(), + extra_options, + ] + ) + + def _request_headers( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + extra_headers: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """ + Specifies request headers. + Authentication headers will overwrite any overlapping headers returned from this method. + """ + headers = self._get_request_options( + stream_state, + stream_slice, + next_page_token, + self.get_request_headers, + self.get_authenticator().get_auth_header, + extra_headers, + ) + if isinstance(headers, str): + raise ValueError("Request headers cannot be a string") + return {str(k): str(v) for k, v in headers.items()} + + def _request_params( + self, + stream_state: Optional[StreamState], + stream_slice: Optional[StreamSlice], + next_page_token: Optional[Mapping[str, Any]], + extra_params: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """ + Specifies the query parameters that should be set on an outgoing HTTP request given the inputs. + + E.g: you might want to define query parameters for paging if next_page_token is not None. + """ + options = self._get_request_options( + stream_state, stream_slice, next_page_token, self.get_request_params, self.get_authenticator().get_request_params, extra_params + ) + if isinstance(options, str): + raise ValueError("Request params cannot be a string") + + for k, v in options.items(): + if isinstance(v, (dict,)): + raise ValueError(f"Invalid value for `{k}` parameter. The values of request params cannot be an object.") + + return options + + def _request_body_data( + self, + stream_state: Optional[StreamState], + stream_slice: Optional[StreamSlice], + next_page_token: Optional[Mapping[str, Any]], + extra_body_data: Optional[Union[Mapping[str, Any], str]] = None, + ) -> Optional[Union[Mapping[str, Any], str]]: + """ + Specifies how to populate the body of the request with a non-JSON payload. + + If returns a ready text that it will be sent as is. + If returns a dict that it will be converted to a urlencoded form. + E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2" + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. + """ + # Warning: use self.state instead of the stream_state passed as argument! + return self._get_request_options( + stream_state, + stream_slice, + next_page_token, + self.get_request_body_data, + self.get_authenticator().get_request_body_data, + extra_body_data, + ) + + def _request_body_json( + self, + stream_state: Optional[StreamState], + stream_slice: Optional[StreamSlice], + next_page_token: Optional[Mapping[str, Any]], + extra_body_json: Optional[Mapping[str, Any]] = None, + ) -> Optional[Mapping[str, Any]]: + """ + Specifies how to populate the body of the request with a JSON payload. + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 
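+
+ For example, returning {"page": {"size": 100}} (an assumed payload shape) sends the request with that mapping
+ serialized as the JSON body; if the merged options resolve to a string instead of a mapping, a ValueError is raised.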
+ """ + # Warning: use self.state instead of the stream_state passed as argument! + options = self._get_request_options( + stream_state, + stream_slice, + next_page_token, + self.get_request_body_json, + self.get_authenticator().get_request_body_json, + extra_body_json, + ) + if isinstance(options, str): + raise ValueError("Request body json cannot be a string") + return options + + @classmethod + def _join_url(cls, url_base: str, path: str) -> str: + return urljoin(url_base, path) + + def send_request( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + path: Optional[str] = None, + request_headers: Optional[Mapping[str, Any]] = None, + request_params: Optional[Mapping[str, Any]] = None, + request_body_data: Optional[Union[Mapping[str, Any], str]] = None, + request_body_json: Optional[Mapping[str, Any]] = None, + log_formatter: Optional[Callable[[requests.Response], Any]] = None, + ) -> Optional[requests.Response]: + + request, response = self._http_client.send_request( + http_method=self.get_method().value, + url=self._join_url( + self.get_url_base(), + path or self.get_path(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + ), + request_kwargs={"stream": self.stream_response}, + headers=self._request_headers(stream_state, stream_slice, next_page_token, request_headers), + params=self._request_params(stream_state, stream_slice, next_page_token, request_params), + json=self._request_body_json(stream_state, stream_slice, next_page_token, request_body_json), + data=self._request_body_data(stream_state, stream_slice, next_page_token, request_body_data), + dedupe_query_params=True, + log_formatter=log_formatter, + exit_on_rate_limit=self._exit_on_rate_limit, + ) + + return response diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/__init__.py new file mode 100644 index 000000000000..cb2cfddb275c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/__init__.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.requesters.paginators.default_paginator import DefaultPaginator, PaginatorTestReadDecorator +from airbyte_cdk.sources.declarative.requesters.paginators.no_pagination import NoPagination +from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy import PaginationStrategy + +__all__ = ["DefaultPaginator", "NoPagination", "PaginationStrategy", "Paginator", "PaginatorTestReadDecorator"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py new file mode 100644 index 000000000000..c92e99770070 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py @@ -0,0 +1,242 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from dataclasses import InitVar, dataclass, field +from typing import Any, Mapping, MutableMapping, Optional, Union + +import requests +from airbyte_cdk.sources.declarative.decoders import Decoder, JsonDecoder, PaginationDecoderDecorator +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy import PaginationStrategy +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath +from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState + + +@dataclass +class DefaultPaginator(Paginator): + """ + Default paginator to request pages of results with a fixed size until the pagination strategy no longer returns a next_page_token + + Examples: + 1. + * fetches up to 10 records at a time by setting the "limit" request param to 10 + * updates the request path with "{{ response._metadata.next }}" + ``` + paginator: + type: "DefaultPaginator" + page_size_option: + type: RequestOption + inject_into: request_parameter + field_name: limit + page_token_option: + type: RequestPath + path: "location" + pagination_strategy: + type: "CursorPagination" + cursor_value: "{{ response._metadata.next }}" + page_size: 10 + ``` + + 2. + * fetches up to 5 records at a time by setting the "page_size" header to 5 + * increments a record counter and set the request parameter "offset" to the value of the counter + ``` + paginator: + type: "DefaultPaginator" + page_size_option: + type: RequestOption + inject_into: header + field_name: page_size + pagination_strategy: + type: "OffsetIncrement" + page_size: 5 + page_token_option: + option_type: "request_parameter" + field_name: "offset" + ``` + + 3. + * fetches up to 5 records at a time by setting the "page_size" request param to 5 + * increments a page counter and set the request parameter "page" to the value of the counter + ``` + paginator: + type: "DefaultPaginator" + page_size_option: + type: RequestOption + inject_into: request_parameter + field_name: page_size + pagination_strategy: + type: "PageIncrement" + page_size: 5 + page_token_option: + type: RequestOption + option_type: "request_parameter" + field_name: "page" + ``` + Attributes: + page_size_option (Optional[RequestOption]): the request option to set the page size. Cannot be injected in the path. 
+ page_token_option (Optional[RequestPath, RequestOption]): the request option to set the page token + pagination_strategy (PaginationStrategy): Strategy defining how to get the next page token + config (Config): connection config + url_base (Union[InterpolatedString, str]): endpoint's base url + decoder (Decoder): decoder to decode the response + """ + + pagination_strategy: PaginationStrategy + config: Config + url_base: Union[InterpolatedString, str] + parameters: InitVar[Mapping[str, Any]] + decoder: Decoder = field(default_factory=lambda: PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))) + page_size_option: Optional[RequestOption] = None + page_token_option: Optional[Union[RequestPath, RequestOption]] = None + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + if self.page_size_option and not self.pagination_strategy.get_page_size(): + raise ValueError("page_size_option cannot be set if the pagination strategy does not have a page_size") + if isinstance(self.url_base, str): + self.url_base = InterpolatedString(string=self.url_base, parameters=parameters) + self._token: Optional[Any] = self.pagination_strategy.initial_token + + def next_page_token( + self, response: requests.Response, last_page_size: int, last_record: Optional[Record] + ) -> Optional[Mapping[str, Any]]: + self._token = self.pagination_strategy.next_page_token(response, last_page_size, last_record) + if self._token: + return {"next_page_token": self._token} + else: + return None + + def path(self) -> Optional[str]: + if self._token and self.page_token_option and isinstance(self.page_token_option, RequestPath): + # Replace url base to only return the path + return str(self._token).replace(self.url_base.eval(self.config), "") # type: ignore # url_base is casted to a InterpolatedString in __post_init__ + else: + return None + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + return self._get_request_options(RequestOptionType.request_parameter) + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, str]: + return self._get_request_options(RequestOptionType.header) + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.body_data) + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.body_json) + + def reset(self, reset_value: Optional[Any] = None) -> None: + if reset_value: + self.pagination_strategy.reset(reset_value=reset_value) + else: + self.pagination_strategy.reset() + self._token = self.pagination_strategy.initial_token + + def _get_request_options(self, option_type: RequestOptionType) -> MutableMapping[str, Any]: + options = {} + + if ( + self.page_token_option + and self._token is not None + and isinstance(self.page_token_option, RequestOption) + and self.page_token_option.inject_into == option_type + ): + 
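+ # Inject the current page token under the configured field name (for example a "page" or "offset" request
+ # parameter), matching the request component selected by the option's inject_into value.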
options[self.page_token_option.field_name.eval(config=self.config)] = self._token # type: ignore # field_name is always cast to an interpolated string + if self.page_size_option and self.pagination_strategy.get_page_size() and self.page_size_option.inject_into == option_type: + options[self.page_size_option.field_name.eval(config=self.config)] = self.pagination_strategy.get_page_size() # type: ignore # field_name is always cast to an interpolated string + return options + + +class PaginatorTestReadDecorator(Paginator): + """ + In some cases, we want to limit the number of requests that are made to the backend source. This class allows for limiting the number of + pages that are queried throughout a read command. + """ + + _PAGE_COUNT_BEFORE_FIRST_NEXT_CALL = 1 + + def __init__(self, decorated: Paginator, maximum_number_of_pages: int = 5) -> None: + if maximum_number_of_pages and maximum_number_of_pages < 1: + raise ValueError(f"The maximum number of pages on a test read needs to be strictly positive. Got {maximum_number_of_pages}") + self._maximum_number_of_pages = maximum_number_of_pages + self._decorated = decorated + self._page_count = self._PAGE_COUNT_BEFORE_FIRST_NEXT_CALL + + def next_page_token( + self, response: requests.Response, last_page_size: int, last_record: Optional[Record] + ) -> Optional[Mapping[str, Any]]: + if self._page_count >= self._maximum_number_of_pages: + return None + + self._page_count += 1 + return self._decorated.next_page_token(response, last_page_size, last_record) + + def path(self) -> Optional[str]: + return self._decorated.path() + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._decorated.get_request_params(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, str]: + return self._decorated.get_request_headers(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Union[Mapping[str, Any], str]: + return self._decorated.get_request_body_data(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._decorated.get_request_body_json(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + + def reset(self, reset_value: Optional[Any] = None) -> None: + self._decorated.reset() + self._page_count = self._PAGE_COUNT_BEFORE_FIRST_NEXT_CALL diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py new file mode 100644 index 000000000000..4065902ffd09 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py @@ -0,0 +1,65 @@ +# +# Copyright (c) 2023 
Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, MutableMapping, Optional, Union + +import requests +from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator +from airbyte_cdk.sources.types import Record, StreamSlice, StreamState + + +@dataclass +class NoPagination(Paginator): + """ + Pagination implementation that never returns a next page. + """ + + parameters: InitVar[Mapping[str, Any]] + + def path(self) -> Optional[str]: + return None + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + return {} + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, str]: + return {} + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Union[Mapping[str, Any], str]: + return {} + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def next_page_token(self, response: requests.Response, last_page_size: int, last_record: Optional[Record]) -> Mapping[str, Any]: + return {} + + def reset(self, reset_value: Optional[Any] = None) -> None: + # No state to reset + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/paginator.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/paginator.py new file mode 100644 index 000000000000..aebc8241a2d0 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/paginator.py @@ -0,0 +1,52 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Mapping, Optional + +import requests +from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider +from airbyte_cdk.sources.types import Record + + +@dataclass +class Paginator(ABC, RequestOptionsProvider): + """ + Defines the token to use to fetch the next page of records from the API. + + If needed, the Paginator will set request options to be set on the HTTP request to fetch the next page of records. + If the next_page_token is the path to the next page of records, then it should be accessed through the `path` method + """ + + @abstractmethod + def reset(self, reset_value: Optional[Any] = None) -> None: + """ + Reset the pagination's inner state + """ + + @abstractmethod + def next_page_token( + self, response: requests.Response, last_page_size: int, last_record: Optional[Record] + ) -> Optional[Mapping[str, Any]]: + """ + Returns the next_page_token to use to fetch the next page of records. + + :param response: the response to process + :param last_page_size: the number of records read from the response + :param last_record: the last record extracted from the response + :return: A mapping {"next_page_token": } for the next page from the input response object. Returning None means there are no more pages to read in this response. 
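+
+ For example, a strategy that reads a cursor from the response might return {"next_page_token": "abc123"}; the
+ paginator's get_request_* methods then inject that token according to the configured page token option.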
+ """ + pass + + @abstractmethod + def path(self) -> Optional[str]: + """ + Returns the URL path to hit to fetch the next page of records + + e.g: if you wanted to hit https://myapi.com/v1/some_entity then this will return "some_entity" + + :return: path to hit to fetch the next request. Returning None means the path is not defined by the next_page_token + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py new file mode 100644 index 000000000000..03e5ecae532e --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py @@ -0,0 +1,19 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.cursor_pagination_strategy import CursorPaginationStrategy +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.offset_increment import OffsetIncrement +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.page_increment import PageIncrement +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.stop_condition import ( + CursorStopCondition, + StopConditionPaginationStrategyDecorator, +) + +__all__ = [ + "CursorPaginationStrategy", + "CursorStopCondition", + "OffsetIncrement", + "PageIncrement", + "StopConditionPaginationStrategyDecorator", +] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py new file mode 100644 index 000000000000..7ba3c1d096ac --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py @@ -0,0 +1,81 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from dataclasses import InitVar, dataclass, field +from typing import Any, Dict, Mapping, Optional, Union + +import requests +from airbyte_cdk.sources.declarative.decoders import Decoder, JsonDecoder, PaginationDecoderDecorator +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy import PaginationStrategy +from airbyte_cdk.sources.types import Config, Record + + +@dataclass +class CursorPaginationStrategy(PaginationStrategy): + """ + Pagination strategy that evaluates an interpolated string to define the next page token + + Attributes: + page_size (Optional[int]): the number of records to request + cursor_value (Union[InterpolatedString, str]): template string evaluating to the cursor value + config (Config): connection config + stop_condition (Optional[InterpolatedBoolean]): template string evaluating when to stop paginating + decoder (Decoder): decoder to decode the response + """ + + cursor_value: Union[InterpolatedString, str] + config: Config + parameters: InitVar[Mapping[str, Any]] + page_size: Optional[int] = None + stop_condition: Optional[Union[InterpolatedBoolean, str]] = None + decoder: Decoder = field(default_factory=lambda: PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))) + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._initial_cursor = None + if isinstance(self.cursor_value, str): + self._cursor_value = InterpolatedString.create(self.cursor_value, parameters=parameters) + else: + self._cursor_value = self.cursor_value + if isinstance(self.stop_condition, str): + self._stop_condition: Optional[InterpolatedBoolean] = InterpolatedBoolean(condition=self.stop_condition, parameters=parameters) + else: + self._stop_condition = self.stop_condition + + @property + def initial_token(self) -> Optional[Any]: + return self._initial_cursor + + def next_page_token(self, response: requests.Response, last_page_size: int, last_record: Optional[Record]) -> Optional[Any]: + decoded_response = next(self.decoder.decode(response)) + + # The default way that link is presented in requests.Response is a string of various links (last, next, etc). 
This + # is not indexable or useful for parsing the cursor, so we replace it with the link dictionary from response.links + headers: Dict[str, Any] = dict(response.headers) + headers["link"] = response.links + if self._stop_condition: + should_stop = self._stop_condition.eval( + self.config, + response=decoded_response, + headers=headers, + last_record=last_record, + last_page_size=last_page_size, + ) + if should_stop: + return None + token = self._cursor_value.eval( + config=self.config, + response=decoded_response, + headers=headers, + last_record=last_record, + last_page_size=last_page_size, + ) + return token if token else None + + def reset(self, reset_value: Optional[Any] = None) -> None: + self._initial_cursor = reset_value + + def get_page_size(self) -> Optional[int]: + return self.page_size diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py new file mode 100644 index 000000000000..295b09082b56 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py @@ -0,0 +1,82 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass, field +from typing import Any, Mapping, Optional, Union + +import requests +from airbyte_cdk.sources.declarative.decoders import Decoder, JsonDecoder, PaginationDecoderDecorator +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy import PaginationStrategy +from airbyte_cdk.sources.types import Config, Record + + +@dataclass +class OffsetIncrement(PaginationStrategy): + """ + Pagination strategy that returns the number of records reads so far and returns it as the next page token + Examples: + # page_size to be a constant integer value + pagination_strategy: + type: OffsetIncrement + page_size: 2 + + # page_size to be a constant string value + pagination_strategy: + type: OffsetIncrement + page_size: "2" + + # page_size to be an interpolated string value + pagination_strategy: + type: OffsetIncrement + page_size: "{{ parameters['items_per_page'] }}" + + Attributes: + page_size (InterpolatedString): the number of records to request + """ + + config: Config + page_size: Optional[Union[str, int]] + parameters: InitVar[Mapping[str, Any]] + decoder: Decoder = field(default_factory=lambda: PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))) + inject_on_first_request: bool = False + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._offset = 0 + page_size = str(self.page_size) if isinstance(self.page_size, int) else self.page_size + if page_size: + self._page_size: Optional[InterpolatedString] = InterpolatedString(page_size, parameters=parameters) + else: + self._page_size = None + + @property + def initial_token(self) -> Optional[Any]: + if self.inject_on_first_request: + return self._offset + return None + + def next_page_token(self, response: requests.Response, last_page_size: int, last_record: Optional[Record]) -> Optional[Any]: + decoded_response = next(self.decoder.decode(response)) + + # Stop paginating when there are fewer records than the page size or the current page has no records + if (self._page_size and last_page_size < self._page_size.eval(self.config, response=decoded_response)) or last_page_size == 0: + return None + 
else: + self._offset += last_page_size + return self._offset + + def reset(self, reset_value: Optional[Any] = 0) -> None: + if not isinstance(reset_value, int): + raise ValueError(f"Reset value {reset_value} for OffsetIncrement pagination strategy was not an integer") + else: + self._offset = reset_value + + def get_page_size(self) -> Optional[int]: + if self._page_size: + page_size = self._page_size.eval(self.config) + if not isinstance(page_size, int): + raise Exception(f"{page_size} is of type {type(page_size)}. Expected {int}") + return page_size + else: + return None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py new file mode 100644 index 000000000000..978ac1abaafd --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py @@ -0,0 +1,63 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Optional, Union + +import requests +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy import PaginationStrategy +from airbyte_cdk.sources.types import Config, Record + + +@dataclass +class PageIncrement(PaginationStrategy): + """ + Pagination strategy that returns the number of pages reads so far and returns it as the next page token + + Attributes: + page_size (int): the number of records to request + start_from_page (int): number of the initial page + """ + + config: Config + page_size: Optional[Union[str, int]] + parameters: InitVar[Mapping[str, Any]] + start_from_page: int = 0 + inject_on_first_request: bool = False + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._page = self.start_from_page + if isinstance(self.page_size, int) or (self.page_size is None): + self._page_size = self.page_size + else: + page_size = InterpolatedString(self.page_size, parameters=parameters).eval(self.config) + if not isinstance(page_size, int): + raise Exception(f"{page_size} is of type {type(page_size)}. 
Expected {int}") + self._page_size = page_size + + @property + def initial_token(self) -> Optional[Any]: + if self.inject_on_first_request: + return self._page + return None + + def next_page_token(self, response: requests.Response, last_page_size: int, last_record: Optional[Record]) -> Optional[Any]: + # Stop paginating when there are fewer records than the page size or the current page has no records + if (self._page_size and last_page_size < self._page_size) or last_page_size == 0: + return None + else: + self._page += 1 + return self._page + + def reset(self, reset_value: Optional[Any] = None) -> None: + if reset_value is None: + self._page = self.start_from_page + elif not isinstance(reset_value, int): + raise ValueError(f"Reset value {reset_value} for PageIncrement pagination strategy was not an integer") + else: + self._page = reset_value + + def get_page_size(self) -> Optional[int]: + return self._page_size diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py new file mode 100644 index 000000000000..135eb4812c6f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py @@ -0,0 +1,46 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from dataclasses import dataclass +from typing import Any, Optional + +import requests +from airbyte_cdk.sources.types import Record + + +@dataclass +class PaginationStrategy: + """ + Defines how to get the next page token + """ + + @property + @abstractmethod + def initial_token(self) -> Optional[Any]: + """ + Return the initial value of the token + """ + + @abstractmethod + def next_page_token(self, response: requests.Response, last_page_size: int, last_record: Optional[Record]) -> Optional[Any]: + """ + :param response: response to process + :param last_page_size: the number of records read from the response + :param last_record: the last record extracted from the response + :return: next page token. Returns None if there are no more pages to fetch + """ + pass + + @abstractmethod + def reset(self, reset_value: Optional[Any] = None) -> None: + """ + Reset the pagination's inner state + """ + + @abstractmethod + def get_page_size(self) -> Optional[int]: + """ + :return: page size: The number of records to fetch in a page. Returns None if unspecified + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py new file mode 100644 index 000000000000..ca79bfd39ac7 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py @@ -0,0 +1,53 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from abc import ABC, abstractmethod +from typing import Any, Optional + +import requests +from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy import PaginationStrategy +from airbyte_cdk.sources.types import Record + + +class PaginationStopCondition(ABC): + @abstractmethod + def is_met(self, record: Record) -> bool: + """ + Given a condition is met, the pagination will stop + + :param record: a record used to evaluate the condition + """ + raise NotImplementedError() + + +class CursorStopCondition(PaginationStopCondition): + def __init__(self, cursor: DeclarativeCursor): + self._cursor = cursor + + def is_met(self, record: Record) -> bool: + return not self._cursor.should_be_synced(record) + + +class StopConditionPaginationStrategyDecorator(PaginationStrategy): + def __init__(self, _delegate: PaginationStrategy, stop_condition: PaginationStopCondition): + self._delegate = _delegate + self._stop_condition = stop_condition + + def next_page_token(self, response: requests.Response, last_page_size: int, last_record: Optional[Record]) -> Optional[Any]: + # We evaluate in reverse order because the assumption is that most of the APIs using data feed structure will return records in + # descending order. In terms of performance/memory, we return the records lazily + if last_record and self._stop_condition.is_met(last_record): + return None + return self._delegate.next_page_token(response, last_page_size, last_record) + + def reset(self) -> None: + self._delegate.reset() + + def get_page_size(self) -> Optional[int]: + return self._delegate.get_page_size() + + @property + def initial_token(self) -> Optional[Any]: + return self._delegate.initial_token diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_option.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_option.py new file mode 100644 index 000000000000..d13d2056681d --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_option.py @@ -0,0 +1,38 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
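# Illustrative sketch of the decorator above: wrap any strategy so pagination halts once the last
# record crosses a boundary. The OlderThanCutoff condition and the "updated_at" field are hypothetical.
from airbyte_cdk.sources.declarative.requesters.paginators.strategies import (
    PageIncrement,
    StopConditionPaginationStrategyDecorator,
)
from airbyte_cdk.sources.declarative.requesters.paginators.strategies.stop_condition import PaginationStopCondition
from airbyte_cdk.sources.types import Record


class OlderThanCutoff(PaginationStopCondition):
    """Stops paginating once records (assumed to be returned newest first) are older than a cutoff."""

    def __init__(self, cutoff: str) -> None:
        self._cutoff = cutoff

    def is_met(self, record: Record) -> bool:
        return record.data.get("updated_at", "") < self._cutoff


strategy = StopConditionPaginationStrategyDecorator(
    PageIncrement(config={}, page_size=100, parameters={}, start_from_page=1),
    stop_condition=OlderThanCutoff("2023-01-01T00:00:00Z"),
)
# strategy.next_page_token(...) now returns None as soon as the stop condition is met for the last record.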
+# + +from dataclasses import InitVar, dataclass +from enum import Enum +from typing import Any, Mapping, Union + +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString + + +class RequestOptionType(Enum): + """ + Describes where to set a value on a request + """ + + request_parameter = "request_parameter" + header = "header" + body_data = "body_data" + body_json = "body_json" + + +@dataclass +class RequestOption: + """ + Describes an option to set on a request + + Attributes: + field_name (str): Describes the name of the parameter to inject + inject_into (RequestOptionType): Describes where in the HTTP request to inject the parameter + """ + + field_name: Union[InterpolatedString, str] + inject_into: RequestOptionType + parameters: InitVar[Mapping[str, Any]] + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self.field_name = InterpolatedString.create(self.field_name, parameters=parameters) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/__init__.py new file mode 100644 index 000000000000..c6540e939ed6 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/__init__.py @@ -0,0 +1,14 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.requesters.request_options.datetime_based_request_options_provider import ( + DatetimeBasedRequestOptionsProvider, +) +from airbyte_cdk.sources.declarative.requesters.request_options.default_request_options_provider import DefaultRequestOptionsProvider +from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_options_provider import ( + InterpolatedRequestOptionsProvider, +) +from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider + +__all__ = ["DatetimeBasedRequestOptionsProvider", "DefaultRequestOptionsProvider", "InterpolatedRequestOptionsProvider", "RequestOptionsProvider"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py new file mode 100644 index 000000000000..d9e86afcffb5 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py @@ -0,0 +1,78 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
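# Illustrative sketch: a RequestOption pairs an (interpolable) field name with the part of the HTTP
# request it should be injected into. The header name below is hypothetical.
from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType

api_key_header = RequestOption(field_name="X-Api-Key", inject_into=RequestOptionType.header, parameters={})
print(api_key_header.field_name.eval(config={}))  # -> "X-Api-Key"; __post_init__ turned field_name into an InterpolatedString
print(api_key_header.inject_into)                 # -> RequestOptionType.header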
+# + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, MutableMapping, Optional, Union + +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider +from airbyte_cdk.sources.types import Config, StreamSlice, StreamState + + +@dataclass +class DatetimeBasedRequestOptionsProvider(RequestOptionsProvider): + """ + Request options provider that extracts fields from the stream_slice and injects them into the respective location in the + outbound request being made + """ + + config: Config + parameters: InitVar[Mapping[str, Any]] + start_time_option: Optional[RequestOption] = None + end_time_option: Optional[RequestOption] = None + partition_field_start: Optional[str] = None + partition_field_end: Optional[str] = None + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._partition_field_start = InterpolatedString.create(self.partition_field_start or "start_time", parameters=parameters) + self._partition_field_end = InterpolatedString.create(self.partition_field_end or "end_time", parameters=parameters) + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.request_parameter, stream_slice) + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.header, stream_slice) + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Union[Mapping[str, Any], str]: + return self._get_request_options(RequestOptionType.body_data, stream_slice) + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.body_json, stream_slice) + + def _get_request_options(self, option_type: RequestOptionType, stream_slice: Optional[StreamSlice]) -> Mapping[str, Any]: + options: MutableMapping[str, Any] = {} + if not stream_slice: + return options + if self.start_time_option and self.start_time_option.inject_into == option_type: + options[self.start_time_option.field_name.eval(config=self.config)] = stream_slice.get( # type: ignore # field_name is always casted to an interpolated string + self._partition_field_start.eval(self.config) + ) + if self.end_time_option and self.end_time_option.inject_into == option_type: + options[self.end_time_option.field_name.eval(config=self.config)] = stream_slice.get(self._partition_field_end.eval(self.config)) # type: ignore # field_name is always casted to an interpolated string + return options diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py 
b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py new file mode 100644 index 000000000000..42d8ee70a4b4 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py @@ -0,0 +1,58 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Optional, Union + +from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider +from airbyte_cdk.sources.types import StreamSlice, StreamState + + +@dataclass +class DefaultRequestOptionsProvider(RequestOptionsProvider): + """ + Request options provider that extracts fields from the stream_slice and injects them into the respective location in the + outbound request being made + """ + + parameters: InitVar[Mapping[str, Any]] + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + pass + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Union[Mapping[str, Any], str]: + return {} + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py new file mode 100644 index 000000000000..4a6c7a860e17 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py @@ -0,0 +1,48 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
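# Illustrative sketch: DefaultRequestOptionsProvider above is a no-op, while the datetime-based
# provider from the previous file turns a stream slice into request options. The "created[gte]" and
# "created[lte]" parameter names are hypothetical.
from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType
from airbyte_cdk.sources.declarative.requesters.request_options import DatetimeBasedRequestOptionsProvider
from airbyte_cdk.sources.types import StreamSlice

provider = DatetimeBasedRequestOptionsProvider(
    config={},
    parameters={},
    start_time_option=RequestOption(field_name="created[gte]", inject_into=RequestOptionType.request_parameter, parameters={}),
    end_time_option=RequestOption(field_name="created[lte]", inject_into=RequestOptionType.request_parameter, parameters={}),
    partition_field_start="start_time",
    partition_field_end="end_time",
)

date_slice = StreamSlice(partition={}, cursor_slice={"start_time": "2024-01-01", "end_time": "2024-01-31"})
print(provider.get_request_params(stream_slice=date_slice))
# -> {"created[gte]": "2024-01-01", "created[lte]": "2024-01-31"}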
+# + +from dataclasses import InitVar, dataclass, field +from typing import Any, Mapping, Optional, Union + +from airbyte_cdk.sources.declarative.interpolation.interpolated_nested_mapping import InterpolatedNestedMapping, NestedMapping +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.types import Config, StreamSlice, StreamState + + +@dataclass +class InterpolatedNestedRequestInputProvider: + """ + Helper class that generically performs string interpolation on a provided deeply nested dictionary or string input + """ + + parameters: InitVar[Mapping[str, Any]] + request_inputs: Optional[Union[str, NestedMapping]] = field(default=None) + config: Config = field(default_factory=dict) + _interpolator: Optional[Union[InterpolatedString, InterpolatedNestedMapping]] = field(init=False, repr=False, default=None) + _request_inputs: Optional[Union[str, NestedMapping]] = field(init=False, repr=False, default=None) + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + + self._request_inputs = self.request_inputs or {} + if isinstance(self._request_inputs, str): + self._interpolator = InterpolatedString(self._request_inputs, default="", parameters=parameters) + else: + self._interpolator = InterpolatedNestedMapping(self._request_inputs, parameters=parameters) + + def eval_request_inputs( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """ + Returns the request inputs to set on an outgoing HTTP request + + :param stream_state: The stream state + :param stream_slice: The stream slice + :param next_page_token: The pagination token + :return: The request inputs to set on an outgoing HTTP request + """ + kwargs = {"stream_state": stream_state, "stream_slice": stream_slice, "next_page_token": next_page_token} + return self._interpolator.eval(self.config, **kwargs) # type: ignore # self._interpolator is always initialized with a value and will not be None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py new file mode 100644 index 000000000000..868cd84208ef --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py @@ -0,0 +1,59 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
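# Illustrative sketch of the nested input provider above: deeply nested request bodies are
# interpolated leaf by leaf. The filter structure is made up for the example.
from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_nested_request_input_provider import (
    InterpolatedNestedRequestInputProvider,
)
from airbyte_cdk.sources.types import StreamSlice

body_provider = InterpolatedNestedRequestInputProvider(
    config={"api_version": "v2"},
    request_inputs={
        "query": {"filter": {"updated_after": "{{ stream_slice['start'] }}"}},
        "version": "{{ config['api_version'] }}",
    },
    parameters={},
)
print(body_provider.eval_request_inputs(stream_slice=StreamSlice(partition={}, cursor_slice={"start": "2024-01-01"})))
# -> {"query": {"filter": {"updated_after": "2024-01-01"}}, "version": "v2"}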
+# + +from dataclasses import InitVar, dataclass, field +from typing import Any, Mapping, Optional, Tuple, Type, Union + +from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.types import Config, StreamSlice, StreamState + + +@dataclass +class InterpolatedRequestInputProvider: + """ + Helper class that generically performs string interpolation on the provided dictionary or string input + """ + + parameters: InitVar[Mapping[str, Any]] + request_inputs: Optional[Union[str, Mapping[str, str]]] = field(default=None) + config: Config = field(default_factory=dict) + _interpolator: Optional[Union[InterpolatedString, InterpolatedMapping]] = field(init=False, repr=False, default=None) + _request_inputs: Optional[Union[str, Mapping[str, str]]] = field(init=False, repr=False, default=None) + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + + self._request_inputs = self.request_inputs or {} + if isinstance(self._request_inputs, str): + self._interpolator = InterpolatedString(self._request_inputs, default="", parameters=parameters) + else: + self._interpolator = InterpolatedMapping(self._request_inputs, parameters=parameters) + + def eval_request_inputs( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + valid_key_types: Optional[Tuple[Type[Any]]] = None, + valid_value_types: Optional[Tuple[Type[Any], ...]] = None, + ) -> Mapping[str, Any]: + """ + Returns the request inputs to set on an outgoing HTTP request + + :param stream_state: The stream state + :param stream_slice: The stream slice + :param next_page_token: The pagination token + :param valid_key_types: A tuple of types that the interpolator should allow + :param valid_value_types: A tuple of types that the interpolator should allow + :return: The request inputs to set on an outgoing HTTP request + """ + kwargs = {"stream_state": stream_state, "stream_slice": stream_slice, "next_page_token": next_page_token} + interpolated_value = self._interpolator.eval( # type: ignore # self._interpolator is always initialized with a value and will not be None + self.config, valid_key_types=valid_key_types, valid_value_types=valid_value_types, **kwargs + ) + + if isinstance(interpolated_value, dict): + non_null_tokens = {k: v for k, v in interpolated_value.items() if v is not None} + return non_null_tokens + return interpolated_value # type: ignore[no-any-return] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py new file mode 100644 index 000000000000..413a8bb1f632 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py @@ -0,0 +1,144 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
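# Illustrative sketch of the flat input provider above: entries that interpolate to None are dropped,
# which is how optional query parameters disappear from a request. Parameter names are hypothetical.
from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_input_provider import (
    InterpolatedRequestInputProvider,
)

params_provider = InterpolatedRequestInputProvider(
    config={},
    request_inputs={"page_token": "{{ next_page_token['next'] if next_page_token else None }}", "status": "active"},
    parameters={},
)
print(params_provider.eval_request_inputs(next_page_token=None))             # "page_token" interpolates to None and is dropped
print(params_provider.eval_request_inputs(next_page_token={"next": "abc"}))  # both keys present, "page_token" == "abc"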
+# + +from dataclasses import InitVar, dataclass, field +from typing import Any, Mapping, MutableMapping, Optional, Union + +from airbyte_cdk.sources.declarative.interpolation.interpolated_nested_mapping import NestedMapping +from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_nested_request_input_provider import ( + InterpolatedNestedRequestInputProvider, +) +from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_input_provider import InterpolatedRequestInputProvider +from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider +from airbyte_cdk.sources.source import ExperimentalClassWarning +from airbyte_cdk.sources.types import Config, StreamSlice, StreamState +from deprecated import deprecated + +RequestInput = Union[str, Mapping[str, str]] +ValidRequestTypes = (str, list) + + +@dataclass +class InterpolatedRequestOptionsProvider(RequestOptionsProvider): + """ + Defines the request options to set on an outgoing HTTP request by evaluating `InterpolatedMapping`s + + Attributes: + config (Config): The user-provided configuration as specified by the source's spec + request_parameters (Union[str, Mapping[str, str]]): The request parameters to set on an outgoing HTTP request + request_headers (Union[str, Mapping[str, str]]): The request headers to set on an outgoing HTTP request + request_body_data (Union[str, Mapping[str, str]]): The body data to set on an outgoing HTTP request + request_body_json (Union[str, Mapping[str, str]]): The json content to set on an outgoing HTTP request + """ + + parameters: InitVar[Mapping[str, Any]] + config: Config = field(default_factory=dict) + request_parameters: Optional[RequestInput] = None + request_headers: Optional[RequestInput] = None + request_body_data: Optional[RequestInput] = None + request_body_json: Optional[NestedMapping] = None + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + if self.request_parameters is None: + self.request_parameters = {} + if self.request_headers is None: + self.request_headers = {} + if self.request_body_data is None: + self.request_body_data = {} + if self.request_body_json is None: + self.request_body_json = {} + + if self.request_body_json and self.request_body_data: + raise ValueError("RequestOptionsProvider should only contain either 'request_body_data' or 'request_body_json' not both") + + self._parameter_interpolator = InterpolatedRequestInputProvider( + config=self.config, request_inputs=self.request_parameters, parameters=parameters + ) + self._headers_interpolator = InterpolatedRequestInputProvider( + config=self.config, request_inputs=self.request_headers, parameters=parameters + ) + self._body_data_interpolator = InterpolatedRequestInputProvider( + config=self.config, request_inputs=self.request_body_data, parameters=parameters + ) + self._body_json_interpolator = InterpolatedNestedRequestInputProvider( + config=self.config, request_inputs=self.request_body_json, parameters=parameters + ) + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + interpolated_value = self._parameter_interpolator.eval_request_inputs( + stream_state, stream_slice, next_page_token, valid_key_types=(str,), valid_value_types=ValidRequestTypes + ) + if isinstance(interpolated_value, dict): + return interpolated_value + return {} + + def 
get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._headers_interpolator.eval_request_inputs(stream_state, stream_slice, next_page_token) + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Union[Mapping[str, Any], str]: + return self._body_data_interpolator.eval_request_inputs( + stream_state, + stream_slice, + next_page_token, + valid_key_types=(str,), + valid_value_types=ValidRequestTypes, + ) + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._body_json_interpolator.eval_request_inputs(stream_state, stream_slice, next_page_token) + + @deprecated("This class is temporary and used to incrementally deliver low-code to concurrent", category=ExperimentalClassWarning) + def request_options_contain_stream_state(self) -> bool: + """ + Temporary helper method used as we move low-code streams to the concurrent framework. This method determines if + the InterpolatedRequestOptionsProvider has is a dependency on a non-thread safe interpolation context such as + stream_state. + """ + + return ( + self._check_if_interpolation_uses_stream_state(self.request_parameters) + or self._check_if_interpolation_uses_stream_state(self.request_headers) + or self._check_if_interpolation_uses_stream_state(self.request_body_data) + or self._check_if_interpolation_uses_stream_state(self.request_body_json) + ) + + @staticmethod + def _check_if_interpolation_uses_stream_state(request_input: Optional[Union[RequestInput, NestedMapping]]) -> bool: + if not request_input: + return False + elif isinstance(request_input, str): + return "stream_state" in request_input + else: + for key, val in request_input.items(): + # Covers the case of RequestInput in the form of a string or Mapping[str, str]. It also covers the case + # of a NestedMapping where the value is a string. + # Note: Doesn't account for nested mappings for request_body_json, but I don't see stream_state used in that way + # in our code + if "stream_state" in key or (isinstance(val, str) and "stream_state" in val): + return True + return False diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py new file mode 100644 index 000000000000..f0a94ecb91ab --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py @@ -0,0 +1,79 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
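# Illustrative sketch of a typical InterpolatedRequestOptionsProvider, combining interpolated query
# parameters with static headers. Field names and Jinja expressions are assumptions for the example.
from airbyte_cdk.sources.declarative.requesters.request_options import InterpolatedRequestOptionsProvider
from airbyte_cdk.sources.types import StreamSlice

options_provider = InterpolatedRequestOptionsProvider(
    config={"page_size": 50},
    parameters={},
    request_parameters={"per_page": "{{ config['page_size'] }}", "updated_since": "{{ stream_slice['start_time'] }}"},
    request_headers={"X-Requested-With": "airbyte"},
)

date_slice = StreamSlice(partition={}, cursor_slice={"start_time": "2024-01-01"})
print(options_provider.get_request_params(stream_slice=date_slice))   # per_page / updated_since resolved from config and the slice
print(options_provider.get_request_headers(stream_slice=date_slice))  # -> {"X-Requested-With": "airbyte"}
print(options_provider.request_options_contain_stream_state())        # -> False, no template references stream_state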
+# + +from abc import abstractmethod +from dataclasses import dataclass +from typing import Any, Mapping, Optional, Union + +from airbyte_cdk.sources.types import StreamSlice, StreamState + + +@dataclass +class RequestOptionsProvider: + """ + Defines the request options to set on an outgoing HTTP request + + Options can be passed by + - request parameter + - request headers + - body data + - json content + """ + + @abstractmethod + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """ + Specifies the query parameters that should be set on an outgoing HTTP request given the inputs. + + E.g: you might want to define query parameters for paging if next_page_token is not None. + """ + pass + + @abstractmethod + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.""" + + @abstractmethod + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Union[Mapping[str, Any], str]: + """ + Specifies how to populate the body of the request with a non-JSON payload. + + If returns a ready text that it will be sent as is. + If returns a dict that it will be converted to a urlencoded form. + E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2" + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. + """ + + @abstractmethod + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """ + Specifies how to populate the body of the request with a JSON payload. + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_path.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_path.py new file mode 100644 index 000000000000..378ea6220c4f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_path.py @@ -0,0 +1,15 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping + + +@dataclass +class RequestPath: + """ + Describes that a component value should be inserted into the path + """ + + parameters: InitVar[Mapping[str, Any]] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/requester.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/requester.py new file mode 100644 index 000000000000..ef702216bb31 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/requester.py @@ -0,0 +1,135 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
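# Illustrative sketch: a minimal implementation of the RequestOptionsProvider interface above,
# injecting one static header and one query parameter. The class and header names are hypothetical.
from typing import Any, Mapping, Optional, Union

from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider
from airbyte_cdk.sources.types import StreamSlice, StreamState


class StaticHeaderOptionsProvider(RequestOptionsProvider):
    def get_request_params(
        self, *, stream_state: Optional[StreamState] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None
    ) -> Mapping[str, Any]:
        return {"page": (next_page_token or {}).get("page", 1)}

    def get_request_headers(
        self, *, stream_state: Optional[StreamState] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None
    ) -> Mapping[str, Any]:
        return {"X-Client": "airbyte-example"}

    def get_request_body_data(
        self, *, stream_state: Optional[StreamState] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None
    ) -> Union[Mapping[str, Any], str]:
        return {}

    def get_request_body_json(
        self, *, stream_state: Optional[StreamState] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None
    ) -> Mapping[str, Any]:
        return {}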
+# + +from abc import abstractmethod +from enum import Enum +from typing import Any, Callable, Mapping, MutableMapping, Optional, Union + +import requests +from airbyte_cdk.sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator +from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider +from airbyte_cdk.sources.types import StreamSlice, StreamState + + +class HttpMethod(Enum): + """ + Http Method to use when submitting an outgoing HTTP request + """ + + DELETE = "DELETE" + GET = "GET" + PATCH = "PATCH" + POST = "POST" + + +class Requester(RequestOptionsProvider): + @abstractmethod + def get_authenticator(self) -> DeclarativeAuthenticator: + """ + Specifies the authenticator to use when submitting requests + """ + pass + + @abstractmethod + def get_url_base(self) -> str: + """ + :return: URL base for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/" + """ + + @abstractmethod + def get_path( + self, + *, + stream_state: Optional[StreamState], + stream_slice: Optional[StreamSlice], + next_page_token: Optional[Mapping[str, Any]], + ) -> str: + """ + Returns the URL path for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity" + """ + + @abstractmethod + def get_method(self) -> HttpMethod: + """ + Specifies the HTTP method to use + """ + + @abstractmethod + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + """ + Specifies the query parameters that should be set on an outgoing HTTP request given the inputs. + + E.g: you might want to define query parameters for paging if next_page_token is not None. + """ + + @abstractmethod + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """ + Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method. + """ + + @abstractmethod + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Union[Mapping[str, Any], str]: + """ + Specifies how to populate the body of the request with a non-JSON payload. + + If returns a ready text that it will be sent as is. + If returns a dict that it will be converted to a urlencoded form. + E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2" + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. + """ + + @abstractmethod + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """ + Specifies how to populate the body of the request with a JSON payload. + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 
+ """ + + @abstractmethod + def send_request( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + path: Optional[str] = None, + request_headers: Optional[Mapping[str, Any]] = None, + request_params: Optional[Mapping[str, Any]] = None, + request_body_data: Optional[Union[Mapping[str, Any], str]] = None, + request_body_json: Optional[Mapping[str, Any]] = None, + log_formatter: Optional[Callable[[requests.Response], Any]] = None, + ) -> Optional[requests.Response]: + """ + Sends a request and returns the response. Might return no response if the error handler chooses to ignore the response or throw an exception in case of an error. + If path is set, the path configured on the requester itself is ignored. + If header, params and body are set, they are merged with the ones configured on the requester itself. + + If a log formatter is provided, it's used to log the performed request and response. If it's not provided, no logging is performed. + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/__init__.py new file mode 100644 index 000000000000..9ec5017fb38c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/__init__.py @@ -0,0 +1,9 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever +from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever, SimpleRetrieverTestReadDecorator +from airbyte_cdk.sources.declarative.retrievers.async_retriever import AsyncRetriever + +__all__ = ["Retriever", "SimpleRetriever", "SimpleRetrieverTestReadDecorator", "AsyncRetriever"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/async_retriever.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/async_retriever.py new file mode 100644 index 000000000000..a9f9686e7691 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/async_retriever.py @@ -0,0 +1,115 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + + +from dataclasses import InitVar, dataclass, field +from typing import Any, Callable, Iterable, Mapping, Optional + +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator, AsyncPartition +from airbyte_cdk.sources.declarative.extractors.record_selector import RecordSelector +from airbyte_cdk.sources.declarative.partition_routers import SinglePartitionRouter +from airbyte_cdk.sources.declarative.retrievers import Retriever +from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicer +from airbyte_cdk.sources.source import ExperimentalClassWarning +from airbyte_cdk.sources.streams.core import StreamData +from airbyte_cdk.sources.types import Config, StreamSlice, StreamState +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from deprecated.classic import deprecated + + +@deprecated("This class is experimental. 
Use at your own risk.", category=ExperimentalClassWarning) +@dataclass +class AsyncRetriever(Retriever): + config: Config + parameters: InitVar[Mapping[str, Any]] + job_orchestrator_factory: Callable[[Iterable[StreamSlice]], AsyncJobOrchestrator] + record_selector: RecordSelector + stream_slicer: StreamSlicer = field(default_factory=lambda: SinglePartitionRouter(parameters={})) + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._job_orchestrator_factory = self.job_orchestrator_factory + self.__job_orchestrator: Optional[AsyncJobOrchestrator] = None + self._parameters = parameters + + @property + def state(self) -> StreamState: + """ + As a first iteration for sendgrid, there is no state to be managed + """ + return {} + + @state.setter + def state(self, value: StreamState) -> None: + """ + As a first iteration for sendgrid, there is no state to be managed + """ + pass + + @property + def _job_orchestrator(self) -> AsyncJobOrchestrator: + if not self.__job_orchestrator: + raise AirbyteTracedException( + message="Invalid state within AsyncJobRetriever. Please contact Airbyte Support", + internal_message="AsyncPartitionRepository is expected to be accessed only after `stream_slices`", + failure_type=FailureType.system_error, + ) + + return self.__job_orchestrator + + def _get_stream_state(self) -> StreamState: + """ + Gets the current state of the stream. + + Returns: + StreamState: Mapping[str, Any] + """ + + return self.state + + def _validate_and_get_stream_slice_partition(self, stream_slice: Optional[StreamSlice] = None) -> AsyncPartition: + """ + Validates the stream_slice argument and returns the partition from it. + + Args: + stream_slice (Optional[StreamSlice]): The stream slice to validate and extract the partition from. + + Returns: + AsyncPartition: The partition extracted from the stream_slice. + + Raises: + AirbyteTracedException: If the stream_slice is not an instance of StreamSlice or if the partition is not present in the stream_slice. + + """ + if not isinstance(stream_slice, StreamSlice) or "partition" not in stream_slice.partition: + raise AirbyteTracedException( + message="Invalid arguments to AsyncJobRetriever.read_records: stream_slice is no optional. 
Please contact Airbyte Support", + failure_type=FailureType.system_error, + ) + return stream_slice["partition"] # type: ignore # stream_slice["partition"] has been added as an AsyncPartition as part of stream_slices + + def stream_slices(self) -> Iterable[Optional[StreamSlice]]: + slices = self.stream_slicer.stream_slices() + self.__job_orchestrator = self._job_orchestrator_factory(slices) + + for completed_partition in self._job_orchestrator.create_and_get_completed_partitions(): + yield StreamSlice( + partition=dict(completed_partition.stream_slice.partition) | {"partition": completed_partition}, + cursor_slice=completed_partition.stream_slice.cursor_slice, + ) + + def read_records( + self, + records_schema: Mapping[str, Any], + stream_slice: Optional[StreamSlice] = None, + ) -> Iterable[StreamData]: + + stream_state: StreamState = self._get_stream_state() + partition: AsyncPartition = self._validate_and_get_stream_slice_partition(stream_slice) + records: Iterable[Mapping[str, Any]] = self._job_orchestrator.fetch_records(partition) + + yield from self.record_selector.filter_and_transform( + all_data=records, + stream_state=stream_state, + records_schema=records_schema, + stream_slice=stream_slice, + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/retriever.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/retriever.py new file mode 100644 index 000000000000..155de5782aa0 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/retriever.py @@ -0,0 +1,54 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from typing import Any, Iterable, Mapping, Optional + +from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import StreamSlice +from airbyte_cdk.sources.streams.core import StreamData +from airbyte_cdk.sources.types import StreamState + + +class Retriever: + """ + Responsible for fetching a stream's records from an HTTP API source. + """ + + @abstractmethod + def read_records( + self, + records_schema: Mapping[str, Any], + stream_slice: Optional[StreamSlice] = None, + ) -> Iterable[StreamData]: + """ + Fetch a stream's records from an HTTP API source + + :param records_schema: json schema to describe record + :param stream_slice: The stream slice to read data for + :return: The records read from the API source + """ + + @abstractmethod + def stream_slices(self) -> Iterable[Optional[StreamSlice]]: + """Returns the stream slices""" + + @property + @abstractmethod + def state(self) -> StreamState: + """State getter, should return state in form that can serialized to a string and send to the output + as a STATE AirbyteMessage. + + A good example of a state is a cursor_value: + { + self.cursor_field: "cursor_value" + } + + State should try to be as small as possible but at the same time descriptive enough to restore + syncing process from the point where it stopped. + """ + + @state.setter + @abstractmethod + def state(self, value: StreamState) -> None: + """State setter, accept state serialized by state getter.""" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py new file mode 100644 index 000000000000..99639d8467ee --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py @@ -0,0 +1,506 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
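# Illustrative sketch of how a declarative stream drives any Retriever implementation (SimpleRetriever,
# AsyncRetriever, ...): slice first, then read each slice and checkpoint state. The helper is hypothetical.
def read_full_stream(retriever, records_schema):
    for stream_slice in retriever.stream_slices():
        yield from retriever.read_records(records_schema=records_schema, stream_slice=stream_slice)
        # retriever.state is now a serializable mapping that a source would emit as a STATE message checkpoint
        _ = retriever.state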
+# + +import json +from dataclasses import InitVar, dataclass, field +from functools import partial +from itertools import islice +from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union + +import requests +from airbyte_cdk.models import AirbyteMessage +from airbyte_cdk.sources.declarative.extractors.http_selector import HttpSelector +from airbyte_cdk.sources.declarative.incremental import ResumableFullRefreshCursor +from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import SinglePartitionRouter +from airbyte_cdk.sources.declarative.requesters.paginators.no_pagination import NoPagination +from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator +from airbyte_cdk.sources.declarative.requesters.request_options import DefaultRequestOptionsProvider, RequestOptionsProvider +from airbyte_cdk.sources.declarative.requesters.requester import Requester +from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever +from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer +from airbyte_cdk.sources.http_logger import format_http_message +from airbyte_cdk.sources.streams.core import StreamData +from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState +from airbyte_cdk.utils.mapping_helpers import combine_mappings + +FULL_REFRESH_SYNC_COMPLETE_KEY = "__ab_full_refresh_sync_complete" + + +@dataclass +class SimpleRetriever(Retriever): + """ + Retrieves records by synchronously sending requests to fetch records. + + The retriever acts as an orchestrator between the requester, the record selector, the paginator, and the stream slicer. + + For each stream slice, submit requests until there are no more pages of records to fetch. + + This retriever currently inherits from HttpStream to reuse the request submission and pagination machinery. + As a result, some of the parameters passed to some methods are unused. + The two will be decoupled in a future release. 
+ + Attributes: + stream_name (str): The stream's name + stream_primary_key (Optional[Union[str, List[str], List[List[str]]]]): The stream's primary key + requester (Requester): The HTTP requester + record_selector (HttpSelector): The record selector + paginator (Optional[Paginator]): The paginator + stream_slicer (Optional[StreamSlicer]): The stream slicer + cursor (Optional[cursor]): The cursor + parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation + """ + + requester: Requester + record_selector: HttpSelector + config: Config + parameters: InitVar[Mapping[str, Any]] + name: str + _name: Union[InterpolatedString, str] = field(init=False, repr=False, default="") + primary_key: Optional[Union[str, List[str], List[List[str]]]] + _primary_key: str = field(init=False, repr=False, default="") + paginator: Optional[Paginator] = None + stream_slicer: StreamSlicer = field(default_factory=lambda: SinglePartitionRouter(parameters={})) + request_option_provider: RequestOptionsProvider = field(default_factory=lambda: DefaultRequestOptionsProvider(parameters={})) + cursor: Optional[DeclarativeCursor] = None + ignore_stream_slicer_parameters_on_paginated_requests: bool = False + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._paginator = self.paginator or NoPagination(parameters=parameters) + self._last_response: Optional[requests.Response] = None + self._last_page_size: int = 0 + self._last_record: Optional[Record] = None + self._parameters = parameters + self._name = InterpolatedString(self._name, parameters=parameters) if isinstance(self._name, str) else self._name + + # This mapping is used during a resumable full refresh syncs to indicate whether a partition has started syncing + # records. Partitions serve as the key and map to True if they already began processing records + self._partition_started: MutableMapping[Any, bool] = dict() + + @property # type: ignore + def name(self) -> str: + """ + :return: Stream name + """ + return str(self._name.eval(self.config)) if isinstance(self._name, InterpolatedString) else self._name + + @name.setter + def name(self, value: str) -> None: + if not isinstance(value, property): + self._name = value + + def _get_mapping( + self, method: Callable[..., Optional[Union[Mapping[str, Any], str]]], **kwargs: Any + ) -> Tuple[Union[Mapping[str, Any], str], Set[str]]: + """ + Get mapping from the provided method, and get the keys of the mapping. + If the method returns a string, it will return the string and an empty set. + If the method returns a dict, it will return the dict and its keys. + """ + mapping = method(**kwargs) or {} + keys = set(mapping.keys()) if not isinstance(mapping, str) else set() + return mapping, keys + + def _get_request_options( + self, + stream_state: Optional[StreamData], + stream_slice: Optional[StreamSlice], + next_page_token: Optional[Mapping[str, Any]], + paginator_method: Callable[..., Optional[Union[Mapping[str, Any], str]]], + stream_slicer_method: Callable[..., Optional[Union[Mapping[str, Any], str]]], + ) -> Union[Mapping[str, Any], str]: + """ + Get the request_option from the paginator and the stream slicer. 
+ Raise a ValueError if there's a key collision + Returned merged mapping otherwise + """ + # FIXME we should eventually remove the usage of stream_state as part of the interpolation + mappings = [ + paginator_method(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + ] + if not next_page_token or not self.ignore_stream_slicer_parameters_on_paginated_requests: + mappings.append(stream_slicer_method(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token)) + return combine_mappings(mappings) + + def _request_headers( + self, + stream_state: Optional[StreamData] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """ + Specifies request headers. + Authentication headers will overwrite any overlapping headers returned from this method. + """ + headers = self._get_request_options( + stream_state, + stream_slice, + next_page_token, + self._paginator.get_request_headers, + self.stream_slicer.get_request_headers, + ) + if isinstance(headers, str): + raise ValueError("Request headers cannot be a string") + return {str(k): str(v) for k, v in headers.items()} + + def _request_params( + self, + stream_state: Optional[StreamData] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """ + Specifies the query parameters that should be set on an outgoing HTTP request given the inputs. + + E.g: you might want to define query parameters for paging if next_page_token is not None. + """ + params = self._get_request_options( + stream_state, + stream_slice, + next_page_token, + self._paginator.get_request_params, + self.request_option_provider.get_request_params, + ) + if isinstance(params, str): + raise ValueError("Request params cannot be a string") + return params + + def _request_body_data( + self, + stream_state: Optional[StreamData] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Union[Mapping[str, Any], str]: + """ + Specifies how to populate the body of the request with a non-JSON payload. + + If returns a ready text that it will be sent as is. + If returns a dict that it will be converted to a urlencoded form. + E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2" + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. + """ + return self._get_request_options( + stream_state, + stream_slice, + next_page_token, + self._paginator.get_request_body_data, + self.request_option_provider.get_request_body_data, + ) + + def _request_body_json( + self, + stream_state: Optional[StreamData] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Mapping[str, Any]]: + """ + Specifies how to populate the body of the request with a JSON payload. + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 
+ """ + body_json = self._get_request_options( + stream_state, + stream_slice, + next_page_token, + self._paginator.get_request_body_json, + self.request_option_provider.get_request_body_json, + ) + if isinstance(body_json, str): + raise ValueError("Request body json cannot be a string") + return body_json + + def _paginator_path( + self, + ) -> Optional[str]: + """ + If the paginator points to a path, follow it, else return nothing so the requester is used. + :param stream_state: + :param stream_slice: + :param next_page_token: + :return: + """ + return self._paginator.path() + + def _parse_response( + self, + response: Optional[requests.Response], + stream_state: StreamState, + records_schema: Mapping[str, Any], + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Record]: + if not response: + self._last_response = None + yield from [] + else: + self._last_response = response + record_generator = self.record_selector.select_records( + response=response, + stream_state=stream_state, + records_schema=records_schema, + stream_slice=stream_slice, + next_page_token=next_page_token, + ) + self._last_page_size = 0 + for record in record_generator: + self._last_page_size += 1 + self._last_record = record + yield record + + @property # type: ignore + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + """The stream's primary key""" + return self._primary_key + + @primary_key.setter + def primary_key(self, value: str) -> None: + if not isinstance(value, property): + self._primary_key = value + + def _next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + """ + Specifies a pagination strategy. + + The value returned from this method is passed to most other methods in this class. Use it to form a request e.g: set headers or query params. + + :return: The token for the next page from the input response object. Returning None means there are no more pages to read in this response. + """ + return self._paginator.next_page_token(response, self._last_page_size, self._last_record) + + def _fetch_next_page( + self, stream_state: Mapping[str, Any], stream_slice: StreamSlice, next_page_token: Optional[Mapping[str, Any]] = None + ) -> Optional[requests.Response]: + return self.requester.send_request( + path=self._paginator_path(), + stream_state=stream_state, + stream_slice=stream_slice, + next_page_token=next_page_token, + request_headers=self._request_headers(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + request_params=self._request_params(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + request_body_data=self._request_body_data( + stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ), + request_body_json=self._request_body_json( + stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ), + ) + + # This logic is similar to _read_pages in the HttpStream class. When making changes here, consider making changes there as well. 
+ def _read_pages( + self, + records_generator_fn: Callable[[Optional[requests.Response]], Iterable[StreamData]], + stream_state: Mapping[str, Any], + stream_slice: StreamSlice, + ) -> Iterable[StreamData]: + pagination_complete = False + next_page_token = None + while not pagination_complete: + response = self._fetch_next_page(stream_state, stream_slice, next_page_token) + yield from records_generator_fn(response) + + if not response: + pagination_complete = True + else: + next_page_token = self._next_page_token(response) + if not next_page_token: + pagination_complete = True + + # Always return an empty generator just in case no records were ever yielded + yield from [] + + def _read_single_page( + self, + records_generator_fn: Callable[[Optional[requests.Response]], Iterable[StreamData]], + stream_state: Mapping[str, Any], + stream_slice: StreamSlice, + ) -> Iterable[StreamData]: + response = self._fetch_next_page(stream_state, stream_slice) + yield from records_generator_fn(response) + + if not response: + next_page_token: Mapping[str, Any] = {FULL_REFRESH_SYNC_COMPLETE_KEY: True} + else: + next_page_token = self._next_page_token(response) or {FULL_REFRESH_SYNC_COMPLETE_KEY: True} + + if self.cursor: + self.cursor.close_slice(StreamSlice(cursor_slice=next_page_token, partition=stream_slice.partition)) + + # Always return an empty generator just in case no records were ever yielded + yield from [] + + def read_records( + self, + records_schema: Mapping[str, Any], + stream_slice: Optional[StreamSlice] = None, + ) -> Iterable[StreamData]: + """ + Fetch a stream's records from an HTTP API source + + :param records_schema: json schema to describe record + :param stream_slice: The stream slice to read data for + :return: The records read from the API source + """ + _slice = stream_slice or StreamSlice(partition={}, cursor_slice={}) # None-check + + most_recent_record_from_slice = None + record_generator = partial( + self._parse_records, + stream_state=self.state or {}, + stream_slice=_slice, + records_schema=records_schema, + ) + + if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor): + stream_state = self.state + + # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to fetch more records + # The platform deletes stream state for full refresh streams before starting a new job, so we don't need to worry about + # this value existing for the initial attempt + if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY): + return + cursor_value = stream_state.get("next_page_token") + + # The first attempt to read a page for the current partition should reset the paginator to the current + # cursor state which is initially assigned to the incoming state from the platform + partition_key = self._to_partition_key(_slice.partition) + if partition_key not in self._partition_started: + self._partition_started[partition_key] = True + self._paginator.reset(reset_value=cursor_value) + + yield from self._read_single_page(record_generator, stream_state, _slice) + else: + # Fixing paginator types has a long tail of dependencies + self._paginator.reset() + + for stream_data in self._read_pages(record_generator, self.state, _slice): + current_record = self._extract_record(stream_data, _slice) + if self.cursor and current_record: + self.cursor.observe(_slice, current_record) + + # Latest record read, not necessarily within slice boundaries. + # TODO Remove once all custom components implement `observe` method. 
+ # https://github.com/airbytehq/airbyte-internal-issues/issues/6955 + most_recent_record_from_slice = self._get_most_recent_record(most_recent_record_from_slice, current_record, _slice) + yield stream_data + + if self.cursor: + self.cursor.close_slice(_slice, most_recent_record_from_slice) + return + + def _get_most_recent_record( + self, current_most_recent: Optional[Record], current_record: Optional[Record], stream_slice: StreamSlice + ) -> Optional[Record]: + if self.cursor and current_record: + if not current_most_recent: + return current_record + else: + return current_most_recent if self.cursor.is_greater_than_or_equal(current_most_recent, current_record) else current_record + else: + return None + + @staticmethod + def _extract_record(stream_data: StreamData, stream_slice: StreamSlice) -> Optional[Record]: + """ + As we allow the output of _read_pages to be StreamData, it can be multiple things. Therefore, we need to filter out and normalize + to data to streamline the rest of the process. + """ + if isinstance(stream_data, Record): + # Record is not part of `StreamData` but is the most common implementation of `Mapping[str, Any]` which is part of `StreamData` + return stream_data + elif isinstance(stream_data, (dict, Mapping)): + return Record(dict(stream_data), stream_slice) + elif isinstance(stream_data, AirbyteMessage) and stream_data.record: + return Record(stream_data.record.data, stream_slice) + return None + + # stream_slices is defined with arguments on http stream and fixing this has a long tail of dependencies. Will be resolved by the decoupling of http stream and simple retriever + def stream_slices(self) -> Iterable[Optional[StreamSlice]]: # type: ignore + """ + Specifies the slices for this stream. See the stream slicing section of the docs for more information. + + :param sync_mode: + :param cursor_field: + :param stream_state: + :return: + """ + return self.stream_slicer.stream_slices() + + @property + def state(self) -> Mapping[str, Any]: + return self.cursor.get_stream_state() if self.cursor else {} + + @state.setter + def state(self, value: StreamState) -> None: + """State setter, accept state serialized by state getter.""" + if self.cursor: + self.cursor.set_initial_state(value) + + def _parse_records( + self, + response: Optional[requests.Response], + stream_state: Mapping[str, Any], + records_schema: Mapping[str, Any], + stream_slice: Optional[StreamSlice], + ) -> Iterable[StreamData]: + yield from self._parse_response( + response, + stream_slice=stream_slice, + stream_state=stream_state, + records_schema=records_schema, + ) + + def must_deduplicate_query_params(self) -> bool: + return True + + @staticmethod + def _to_partition_key(to_serialize: Any) -> str: + # separators have changed in Python 3.4. To avoid being impacted by further change, we explicitly specify our own value + return json.dumps(to_serialize, indent=None, separators=(",", ":"), sort_keys=True) + + +@dataclass +class SimpleRetrieverTestReadDecorator(SimpleRetriever): + """ + In some cases, we want to limit the number of requests that are made to the backend source. This class allows for limiting the number of + slices that are queried throughout a read command. + """ + + maximum_number_of_slices: int = 5 + + def __post_init__(self, options: Mapping[str, Any]) -> None: + super().__post_init__(options) + if self.maximum_number_of_slices and self.maximum_number_of_slices < 1: + raise ValueError( + f"The maximum number of slices on a test read needs to be strictly positive. 
Got {self.maximum_number_of_slices}" + ) + + # stream_slices is defined with arguments on http stream and fixing this has a long tail of dependencies. Will be resolved by the decoupling of http stream and simple retriever + def stream_slices(self) -> Iterable[Optional[StreamSlice]]: # type: ignore + return islice(super().stream_slices(), self.maximum_number_of_slices) + + def _fetch_next_page( + self, stream_state: Mapping[str, Any], stream_slice: StreamSlice, next_page_token: Optional[Mapping[str, Any]] = None + ) -> Optional[requests.Response]: + return self.requester.send_request( + path=self._paginator_path(), + stream_state=stream_state, + stream_slice=stream_slice, + next_page_token=next_page_token, + request_headers=self._request_headers(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + request_params=self._request_params(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + request_body_data=self._request_body_data( + stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ), + request_body_json=self._request_body_json( + stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ), + log_formatter=lambda response: format_http_message( + response, + f"Stream '{self.name}' request", + f"Request performed in order to extract records for stream '{self.name}'", + self.name, + ), + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/__init__.py new file mode 100644 index 000000000000..fee72f44fe18 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/__init__.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.schema.default_schema_loader import DefaultSchemaLoader +from airbyte_cdk.sources.declarative.schema.inline_schema_loader import InlineSchemaLoader +from airbyte_cdk.sources.declarative.schema.json_file_schema_loader import JsonFileSchemaLoader +from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader + +__all__ = ["JsonFileSchemaLoader", "DefaultSchemaLoader", "SchemaLoader", "InlineSchemaLoader"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/default_schema_loader.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/default_schema_loader.py new file mode 100644 index 000000000000..1aa70be18fb2 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/default_schema_loader.py @@ -0,0 +1,45 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from dataclasses import InitVar, dataclass +from typing import Any, Mapping + +from airbyte_cdk.sources.declarative.schema.json_file_schema_loader import JsonFileSchemaLoader +from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader +from airbyte_cdk.sources.types import Config + + +@dataclass +class DefaultSchemaLoader(SchemaLoader): + """ + Loads a schema from the default location or returns an empty schema for streams that have not defined their schema file yet. 
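+
+    Example (illustrative names): for a stream named "customers" in a connector package "source_example", the default
+    loader looks for ./source_example/schemas/customers.json and falls back to an empty schema ({}) if that file cannot
+    be found.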
+
+    Attributes:
+        config (Config): The user-provided configuration as specified by the source's spec
+        parameters (Mapping[str, Any]): Additional arguments to pass to the string interpolation if needed
+    """
+
+    config: Config
+    parameters: InitVar[Mapping[str, Any]]
+
+    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
+        self._parameters = parameters
+        self.default_loader = JsonFileSchemaLoader(parameters=parameters, config=self.config)
+
+    def get_json_schema(self) -> Mapping[str, Any]:
+        """
+        Attempts to retrieve a schema from the default filepath location or returns the empty schema if a schema cannot be found.
+
+        :return: The schema read from the default filepath, or an empty schema if no schema file can be found
+        """
+
+        try:
+            return self.default_loader.get_json_schema()
+        except OSError:
+            # A slight hack since we don't directly have the stream name. However, when building the default filepath we assume the
+            # runtime options store the stream name under the 'name' key, so we'll do the same here
+            stream_name = self._parameters.get("name", "")
+            logging.info(f"Could not find schema for stream {stream_name}, defaulting to the empty schema")
+            return {}
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/inline_schema_loader.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/inline_schema_loader.py
new file mode 100644
index 000000000000..72a46b7e595c
--- /dev/null
+++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/inline_schema_loader.py
@@ -0,0 +1,19 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+from dataclasses import InitVar, dataclass
+from typing import Any, Dict, Mapping
+
+from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader
+
+
+@dataclass
+class InlineSchemaLoader(SchemaLoader):
+    """Describes a stream's schema"""
+
+    schema: Dict[str, Any]
+    parameters: InitVar[Mapping[str, Any]]
+
+    def get_json_schema(self) -> Mapping[str, Any]:
+        return self.schema
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py
new file mode 100644
index 000000000000..af51fe5db01e
--- /dev/null
+++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py
@@ -0,0 +1,92 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+import json
+import pkgutil
+import sys
+from dataclasses import InitVar, dataclass, field
+from typing import Any, Mapping, Tuple, Union
+
+from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString
+from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader
+from airbyte_cdk.sources.types import Config
+from airbyte_cdk.sources.utils.schema_helpers import ResourceSchemaLoader
+
+
+def _default_file_path() -> str:
+    # Schema files are always in "source_<connector_name>/schemas/<stream_name>.json"
+    # The connector's module name can be inferred by looking at the loaded modules and finding the one starting with source_
+    source_modules = [
+        k for k, v in sys.modules.items() if "source_" in k and "airbyte_cdk" not in k
+    ]  # example: ['source_exchange_rates', 'source_exchange_rates.source']
+    if source_modules:
+        module = source_modules[0].split(".")[0]
+        return f"./{module}/schemas/{{{{parameters['name']}}}}.json"
+
+    # If we are not in a source_ module, the most likely scenario is we're processing a manifest from the connector builder
+    # server which does not require a json schema to be defined.
+ return "./{{parameters['name']}}.json" + + +@dataclass +class JsonFileSchemaLoader(ResourceSchemaLoader, SchemaLoader): + """ + Loads the schema from a json file + + Attributes: + file_path (Union[InterpolatedString, str]): The path to the json file describing the schema + name (str): The stream's name + config (Config): The user-provided configuration as specified by the source's spec + parameters (Mapping[str, Any]): Additional arguments to pass to the string interpolation if needed + """ + + config: Config + parameters: InitVar[Mapping[str, Any]] + file_path: Union[InterpolatedString, str] = field(default="") + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + if not self.file_path: + self.file_path = _default_file_path() + self.file_path = InterpolatedString.create(self.file_path, parameters=parameters) + + def get_json_schema(self) -> Mapping[str, Any]: + # todo: It is worth revisiting if we can replace file_path with just file_name if every schema is in the /schemas directory + # this would require that we find a creative solution to store or retrieve source_name in here since the files are mounted there + json_schema_path = self._get_json_filepath() + resource, schema_path = self.extract_resource_and_schema_path(json_schema_path) + raw_json_file = pkgutil.get_data(resource, schema_path) + + if not raw_json_file: + raise IOError(f"Cannot find file {json_schema_path}") + try: + raw_schema = json.loads(raw_json_file) + except ValueError as err: + raise RuntimeError(f"Invalid JSON file format for file {json_schema_path}") from err + self.package_name = resource + return self._resolve_schema_references(raw_schema) + + def _get_json_filepath(self) -> Any: + return self.file_path.eval(self.config) # type: ignore # file_path is always cast to an interpolated string + + @staticmethod + def extract_resource_and_schema_path(json_schema_path: str) -> Tuple[str, str]: + """ + When the connector is running on a docker container, package_data is accessible from the resource (source_), so we extract + the resource from the first part of the schema path and the remaining path is used to find the schema file. This is a slight + hack to identify the source name while we are in the airbyte_cdk module. + :param json_schema_path: The path to the schema JSON file + :return: Tuple of the resource name and the path to the schema file + """ + split_path = json_schema_path.split("/") + + if split_path[0] == "" or split_path[0] == ".": + split_path = split_path[1:] + + if len(split_path) == 0: + return "", "" + + if len(split_path) == 1: + return "", split_path[0] + + return split_path[0], "/".join(split_path[1:]) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/schema_loader.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/schema_loader.py new file mode 100644 index 000000000000..a6beb70ae50b --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/schema_loader.py @@ -0,0 +1,17 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from abc import abstractmethod +from dataclasses import dataclass +from typing import Any, Mapping + + +@dataclass +class SchemaLoader: + """Describes a stream's schema""" + + @abstractmethod + def get_json_schema(self) -> Mapping[str, Any]: + """Returns a mapping describing the stream's schema""" + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/spec/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/spec/__init__.py new file mode 100644 index 000000000000..1c13ed67cb66 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/spec/__init__.py @@ -0,0 +1,7 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.spec.spec import Spec + +__all__ = ["Spec"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/spec/spec.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/spec/spec.py new file mode 100644 index 000000000000..87c8911d6aa6 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/spec/spec.py @@ -0,0 +1,42 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Optional + +from airbyte_cdk.models import AdvancedAuth, ConnectorSpecification, ConnectorSpecificationSerializer # type: ignore [attr-defined] +from airbyte_cdk.sources.declarative.models.declarative_component_schema import AuthFlow + + +@dataclass +class Spec: + """ + Returns a connection specification made up of information about the connector and how it can be configured + + Attributes: + connection_specification (Mapping[str, Any]): information related to how a connector can be configured + documentation_url (Optional[str]): The link the Airbyte documentation about this connector + """ + + connection_specification: Mapping[str, Any] + parameters: InitVar[Mapping[str, Any]] + documentation_url: Optional[str] = None + advanced_auth: Optional[AuthFlow] = None + + def generate_spec(self) -> ConnectorSpecification: + """ + Returns the connector specification according the spec block defined in the low code connector manifest. + """ + + obj: dict[str, Mapping[str, Any] | str | AdvancedAuth] = {"connectionSpecification": self.connection_specification} + + if self.documentation_url: + obj["documentationUrl"] = self.documentation_url + if self.advanced_auth: + self.advanced_auth.auth_flow_type = self.advanced_auth.auth_flow_type.value # type: ignore # We know this is always assigned to an AuthFlow which has the auth_flow_type field + # Map CDK AuthFlow model to protocol AdvancedAuth model + obj["advanced_auth"] = self.advanced_auth.dict() + + # We remap these keys to camel case because that's the existing format expected by the rest of the platform + return ConnectorSpecificationSerializer.load(obj) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/__init__.py new file mode 100644 index 000000000000..7bacc3ca80fc --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/__init__.py @@ -0,0 +1,7 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer + +__all__ = ["StreamSlicer"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py new file mode 100644 index 000000000000..a1ecf68ae5a3 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py @@ -0,0 +1,30 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from dataclasses import dataclass +from typing import Iterable + +from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider +from airbyte_cdk.sources.types import StreamSlice + + +@dataclass +class StreamSlicer(RequestOptionsProvider): + """ + Slices the stream into a subset of records. + Slices enable state checkpointing and data retrieval parallelization. + + The stream slicer keeps track of the cursor state as a dict of cursor_field -> cursor_value + + See the stream slicing section of the docs for more information. + """ + + @abstractmethod + def stream_slices(self) -> Iterable[StreamSlice]: + """ + Defines stream slices + + :return: List of stream slices + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/__init__.py new file mode 100644 index 000000000000..e18712a01273 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/__init__.py @@ -0,0 +1,17 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +# RecordTransformation is depended upon by every class in this module (since it's the abc everything implements). For this reason, +# the order of imports matters i.e: this file must fully import RecordTransformation before importing anything which depends on RecordTransformation +# Otherwise there will be a circular dependency (load order will be init.py --> RemoveFields (which tries to import RecordTransformation) --> +# init.py --> circular dep error, since loading this file causes it to try to import itself down the line. +# so we add the split directive below to tell isort to sort imports while keeping RecordTransformation as the first import +from .transformation import RecordTransformation + +# isort: split +from .add_fields import AddFields +from .remove_fields import RemoveFields + +__all__ = ["AddFields", "RecordTransformation", "RemoveFields"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/add_fields.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/add_fields.py new file mode 100644 index 000000000000..2a69b78218fd --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/add_fields.py @@ -0,0 +1,128 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from dataclasses import InitVar, dataclass, field +from typing import Any, Dict, List, Mapping, Optional, Type, Union + +import dpath +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.transformations import RecordTransformation +from airbyte_cdk.sources.types import Config, FieldPointer, StreamSlice, StreamState + + +@dataclass(frozen=True) +class AddedFieldDefinition: + """Defines the field to add on a record""" + + path: FieldPointer + value: Union[InterpolatedString, str] + value_type: Optional[Type[Any]] + parameters: InitVar[Mapping[str, Any]] + + +@dataclass(frozen=True) +class ParsedAddFieldDefinition: + """Defines the field to add on a record""" + + path: FieldPointer + value: InterpolatedString + value_type: Optional[Type[Any]] + parameters: InitVar[Mapping[str, Any]] + + +@dataclass +class AddFields(RecordTransformation): + """ + Transformation which adds field to an output record. The path of the added field can be nested. Adding nested fields will create all + necessary parent objects (like mkdir -p). Adding fields to an array will extend the array to that index (filling intermediate + indices with null values). So if you add a field at index 5 to the array ["value"], it will become ["value", null, null, null, null, + "new_value"]. + + + This transformation has access to the following contextual values: + record: the record about to be output by the connector + config: the input configuration provided to a connector + stream_state: the current state of the stream + stream_slice: the current stream slice being read + + + + Examples of instantiating this transformation via YAML: + - type: AddFields + fields: + # hardcoded constant + - path: ["path"] + value: "static_value" + + # nested path + - path: ["path", "to", "field"] + value: "static" + + # from config + - path: ["shop_id"] + value: "{{ config.shop_id }}" + + # from state + - path: ["current_state"] + value: "{{ stream_state.cursor_field }}" # Or {{ stream_state['cursor_field'] }} + + # from record + - path: ["unnested_value"] + value: {{ record.nested.field }} + + # from stream_slice + - path: ["start_date"] + value: {{ stream_slice.start_date }} + + # by supplying any valid Jinja template directive or expression https://jinja.palletsprojects.com/en/3.1.x/templates/# + - path: ["two_times_two"] + value: {{ 2 * 2 }} + + Attributes: + fields (List[AddedFieldDefinition]): A list of transformations (path and corresponding value) that will be added to the record + """ + + fields: List[AddedFieldDefinition] + parameters: InitVar[Mapping[str, Any]] + _parsed_fields: List[ParsedAddFieldDefinition] = field(init=False, repr=False, default_factory=list) + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + for add_field in self.fields: + if len(add_field.path) < 1: + raise ValueError(f"Expected a non-zero-length path for the AddFields transformation {add_field}") + + if not isinstance(add_field.value, InterpolatedString): + if not isinstance(add_field.value, str): + raise f"Expected a string value for the AddFields transformation: {add_field}" + else: + self._parsed_fields.append( + ParsedAddFieldDefinition( + add_field.path, + InterpolatedString.create(add_field.value, parameters=parameters), + value_type=add_field.value_type, + parameters=parameters, + ) + ) + else: + self._parsed_fields.append( + ParsedAddFieldDefinition(add_field.path, add_field.value, value_type=add_field.value_type, parameters={}) + ) + + def transform( 
+ self, + record: Dict[str, Any], + config: Optional[Config] = None, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + ) -> None: + if config is None: + config = {} + kwargs = {"record": record, "stream_state": stream_state, "stream_slice": stream_slice} + for parsed_field in self._parsed_fields: + valid_types = (parsed_field.value_type,) if parsed_field.value_type else None + value = parsed_field.value.eval(config, valid_types=valid_types, **kwargs) + dpath.new(record, parsed_field.path, value) + + def __eq__(self, other: Any) -> bool: + return bool(self.__dict__ == other.__dict__) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py new file mode 100644 index 000000000000..53db3d49abd4 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py @@ -0,0 +1,22 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import dataclass +from typing import Any, Dict, Optional + +from airbyte_cdk.sources.declarative.transformations import RecordTransformation +from airbyte_cdk.sources.types import Config, StreamSlice, StreamState + + +@dataclass +class KeysToLowerTransformation(RecordTransformation): + def transform( + self, + record: Dict[str, Any], + config: Optional[Config] = None, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + ) -> None: + for key in set(record.keys()): + record[key.lower()] = record.pop(key) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/remove_fields.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/remove_fields.py new file mode 100644 index 000000000000..658d5dd2ccdb --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/remove_fields.py @@ -0,0 +1,70 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Dict, List, Mapping, Optional + +import dpath +import dpath.exceptions +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.declarative.transformations import RecordTransformation +from airbyte_cdk.sources.types import Config, FieldPointer, StreamSlice, StreamState + + +@dataclass +class RemoveFields(RecordTransformation): + """ + A transformation which removes fields from a record. The fields removed are designated using FieldPointers. + During transformation, if a field or any of its parents does not exist in the record, no error is thrown. + + If an input field pointer references an item in a list (e.g: ["k", 0] in the object {"k": ["a", "b", "c"]}) then + the object at that index is set to None rather than being not entirely removed from the list. TODO change this behavior. 
+ + It's possible to remove objects nested in lists e.g: removing [".", 0, "k"] from {".": [{"k": "V"}]} results in {".": [{}]} + + Usage syntax: + + ```yaml + my_stream: + + transformations: + - type: RemoveFields + field_pointers: + - ["path", "to", "field1"] + - ["path2"] + ``` + + Attributes: + field_pointers (List[FieldPointer]): pointers to the fields that should be removed + """ + + field_pointers: List[FieldPointer] + parameters: InitVar[Mapping[str, Any]] + condition: str = "" + + def __post_init__(self, parameters: Mapping[str, Any]) -> None: + self._filter_interpolator = InterpolatedBoolean(condition=self.condition, parameters=parameters) + + def transform( + self, + record: Dict[str, Any], + config: Optional[Config] = None, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + ) -> None: + """ + :param record: The record to be transformed + :return: the input record with the requested fields removed + """ + for pointer in self.field_pointers: + # the dpath library by default doesn't delete fields from arrays + try: + dpath.delete( + record, + pointer, + afilter=(lambda x: self._filter_interpolator.eval(config or {}, property=x)) if self.condition else None, + ) + except dpath.exceptions.PathNotFound: + # if the (potentially nested) property does not exist, silently skip + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/transformation.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/transformation.py new file mode 100644 index 000000000000..f5b22642964b --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/transformation.py @@ -0,0 +1,37 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from dataclasses import dataclass +from typing import Any, Dict, Optional + +from airbyte_cdk.sources.types import Config, StreamSlice, StreamState + + +@dataclass +class RecordTransformation: + """ + Implementations of this class define transformations that can be applied to records of a stream. + """ + + @abstractmethod + def transform( + self, + record: Dict[str, Any], + config: Optional[Config] = None, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + ) -> None: + """ + Transform a record by adding, deleting, or mutating fields directly from the record reference passed in argument. + + :param record: The input record to be transformed + :param config: The user-provided configuration as specified by the source's spec + :param stream_state: The stream state + :param stream_slice: The stream slice + :return: The transformed record + """ + + def __eq__(self, other: object) -> bool: + return other.__dict__ == self.__dict__ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/types.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/types.py new file mode 100644 index 000000000000..91900d1885de --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/types.py @@ -0,0 +1,18 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from __future__ import annotations + +from airbyte_cdk.sources.types import Config, ConnectionDefinition, FieldPointer, Record, StreamSlice, StreamState + +# Note: This package originally contained class definitions for low-code CDK types, but we promoted them into the Python CDK. 
+# We've migrated connectors in the repository to reference the new location, but these assignments are used to retain backwards +# compatibility for sources created by OSS customers or on forks. This can be removed when we start bumping major versions. + +FieldPointer = FieldPointer +Config = Config +ConnectionDefinition = ConnectionDefinition +StreamState = StreamState +Record = Record +StreamSlice = StreamSlice diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/yaml_declarative_source.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/yaml_declarative_source.py new file mode 100644 index 000000000000..a0443b037f10 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/yaml_declarative_source.py @@ -0,0 +1,60 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import pkgutil +from typing import Any, List, Mapping, Optional + +import yaml +from airbyte_cdk.models import AirbyteStateMessage, ConfiguredAirbyteCatalog +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ConcurrentDeclarativeSource +from airbyte_cdk.sources.types import ConnectionDefinition + + +class YamlDeclarativeSource(ConcurrentDeclarativeSource[List[AirbyteStateMessage]]): + """Declarative source defined by a yaml file""" + + def __init__( + self, + path_to_yaml: str, + debug: bool = False, + catalog: Optional[ConfiguredAirbyteCatalog] = None, + config: Optional[Mapping[str, Any]] = None, + state: Optional[List[AirbyteStateMessage]] = None, + ) -> None: + """ + :param path_to_yaml: Path to the yaml file describing the source + """ + self._path_to_yaml = path_to_yaml + source_config = self._read_and_parse_yaml_file(path_to_yaml) + + super().__init__( + catalog=catalog or ConfiguredAirbyteCatalog(streams=[]), + config=config or {}, + state=state or [], + source_config=source_config, + ) + + def _read_and_parse_yaml_file(self, path_to_yaml_file: str) -> ConnectionDefinition: + package = self.__class__.__module__.split(".")[0] + + yaml_config = pkgutil.get_data(package, path_to_yaml_file) + if yaml_config: + decoded_yaml = yaml_config.decode() + return self._parse(decoded_yaml) + else: + return {} + + def _emit_manifest_debug_message(self, extra_args: dict[str, Any]) -> None: + extra_args["path_to_yaml"] = self._path_to_yaml + self.logger.debug("declarative source created from parsed YAML manifest", extra=extra_args) + + @staticmethod + def _parse(connection_definition_str: str) -> ConnectionDefinition: + """ + Parses a yaml file into a manifest. Component references still exist in the manifest which will be + resolved during the creating of the DeclarativeSource. + :param connection_definition_str: yaml string to parse + :return: The ConnectionDefinition parsed from connection_definition_str + """ + return yaml.safe_load(connection_definition_str) # type: ignore # yaml.safe_load doesn't return a type but know it is a Mapping diff --git a/airbyte-cdk/python/airbyte_cdk/sources/embedded/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/embedded/__init__.py new file mode 100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/embedded/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
+# diff --git a/airbyte-cdk/python/airbyte_cdk/sources/embedded/base_integration.py b/airbyte-cdk/python/airbyte_cdk/sources/embedded/base_integration.py new file mode 100644 index 000000000000..79c9bd850a3a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/embedded/base_integration.py @@ -0,0 +1,50 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from typing import Generic, Iterable, Optional, TypeVar + +from airbyte_cdk.connector import TConfig +from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStateMessage, SyncMode, Type +from airbyte_cdk.sources.embedded.catalog import create_configured_catalog, get_stream, get_stream_names +from airbyte_cdk.sources.embedded.runner import SourceRunner +from airbyte_cdk.sources.embedded.tools import get_defined_id +from airbyte_cdk.sources.utils.schema_helpers import check_config_against_spec_or_exit + +TOutput = TypeVar("TOutput") + + +class BaseEmbeddedIntegration(ABC, Generic[TConfig, TOutput]): + def __init__(self, runner: SourceRunner[TConfig], config: TConfig): + check_config_against_spec_or_exit(config, runner.spec()) + + self.source = runner + self.config = config + + self.last_state: Optional[AirbyteStateMessage] = None + + @abstractmethod + def _handle_record(self, record: AirbyteRecordMessage, id: Optional[str]) -> Optional[TOutput]: + """ + Turn an Airbyte record into the appropriate output type for the integration. + """ + pass + + def _load_data(self, stream_name: str, state: Optional[AirbyteStateMessage] = None) -> Iterable[TOutput]: + catalog = self.source.discover(self.config) + stream = get_stream(catalog, stream_name) + if not stream: + raise ValueError(f"Stream {stream_name} not found, the following streams are available: {', '.join(get_stream_names(catalog))}") + if SyncMode.incremental not in stream.supported_sync_modes: + configured_catalog = create_configured_catalog(stream, sync_mode=SyncMode.full_refresh) + else: + configured_catalog = create_configured_catalog(stream, sync_mode=SyncMode.incremental) + + for message in self.source.read(self.config, configured_catalog, state): + if message.type == Type.RECORD: + output = self._handle_record(message.record, get_defined_id(stream, message.record.data)) # type: ignore[union-attr] # record has `data` + if output: + yield output + elif message.type is Type.STATE and message.state: + self.last_state = message.state diff --git a/airbyte-cdk/python/airbyte_cdk/sources/embedded/catalog.py b/airbyte-cdk/python/airbyte_cdk/sources/embedded/catalog.py new file mode 100644 index 000000000000..765e9b260233 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/embedded/catalog.py @@ -0,0 +1,45 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from typing import List, Optional + +from airbyte_cdk.models import ( + AirbyteCatalog, + AirbyteStream, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + DestinationSyncMode, + SyncMode, +) +from airbyte_cdk.sources.embedded.tools import get_first + + +def get_stream(catalog: AirbyteCatalog, stream_name: str) -> Optional[AirbyteStream]: + return get_first(catalog.streams, lambda s: s.name == stream_name) + + +def get_stream_names(catalog: AirbyteCatalog) -> List[str]: + return [stream.name for stream in catalog.streams] + + +def to_configured_stream( + stream: AirbyteStream, + sync_mode: SyncMode = SyncMode.full_refresh, + destination_sync_mode: DestinationSyncMode = DestinationSyncMode.append, + cursor_field: Optional[List[str]] = None, + primary_key: Optional[List[List[str]]] = None, +) -> ConfiguredAirbyteStream: + return ConfiguredAirbyteStream( + stream=stream, sync_mode=sync_mode, destination_sync_mode=destination_sync_mode, cursor_field=cursor_field, primary_key=primary_key + ) + + +def to_configured_catalog(configured_streams: List[ConfiguredAirbyteStream]) -> ConfiguredAirbyteCatalog: + return ConfiguredAirbyteCatalog(streams=configured_streams) + + +def create_configured_catalog(stream: AirbyteStream, sync_mode: SyncMode = SyncMode.full_refresh) -> ConfiguredAirbyteCatalog: + configured_streams = [to_configured_stream(stream, sync_mode=sync_mode, primary_key=stream.source_defined_primary_key)] + + return to_configured_catalog(configured_streams) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/embedded/runner.py b/airbyte-cdk/python/airbyte_cdk/sources/embedded/runner.py new file mode 100644 index 000000000000..c64e66ed581e --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/embedded/runner.py @@ -0,0 +1,41 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +import logging +from abc import ABC, abstractmethod +from typing import Generic, Iterable, Optional + +from airbyte_cdk.connector import TConfig +from airbyte_cdk.models import AirbyteCatalog, AirbyteMessage, AirbyteStateMessage, ConfiguredAirbyteCatalog, ConnectorSpecification +from airbyte_cdk.sources.source import Source + + +class SourceRunner(ABC, Generic[TConfig]): + @abstractmethod + def spec(self) -> ConnectorSpecification: + pass + + @abstractmethod + def discover(self, config: TConfig) -> AirbyteCatalog: + pass + + @abstractmethod + def read(self, config: TConfig, catalog: ConfiguredAirbyteCatalog, state: Optional[AirbyteStateMessage]) -> Iterable[AirbyteMessage]: + pass + + +class CDKRunner(SourceRunner[TConfig]): + def __init__(self, source: Source, name: str): + self._source = source + self._logger = logging.getLogger(name) + + def spec(self) -> ConnectorSpecification: + return self._source.spec(self._logger) + + def discover(self, config: TConfig) -> AirbyteCatalog: + return self._source.discover(self._logger, config) + + def read(self, config: TConfig, catalog: ConfiguredAirbyteCatalog, state: Optional[AirbyteStateMessage]) -> Iterable[AirbyteMessage]: + return self._source.read(self._logger, config, catalog, state=[state] if state else []) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/embedded/tools.py b/airbyte-cdk/python/airbyte_cdk/sources/embedded/tools.py new file mode 100644 index 000000000000..39d70c118cd0 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/embedded/tools.py @@ -0,0 +1,24 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+#
+
+from typing import Any, Callable, Dict, Iterable, Optional
+
+import dpath
+from airbyte_cdk.models import AirbyteStream
+
+
+def get_first(iterable: Iterable[Any], predicate: Callable[[Any], bool] = lambda m: True) -> Optional[Any]:
+    return next(filter(predicate, iterable), None)
+
+
+def get_defined_id(stream: AirbyteStream, data: Dict[str, Any]) -> Optional[str]:
+    if not stream.source_defined_primary_key:
+        return None
+    primary_key = []
+    for key in stream.source_defined_primary_key:
+        try:
+            primary_key.append(str(dpath.get(data, key)))
+        except KeyError:
+            primary_key.append("__not_found__")
+    return "_".join(primary_key)
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/README.md b/airbyte-cdk/python/airbyte_cdk/sources/file_based/README.md
new file mode 100644
index 000000000000..ea3c20d4ce99
--- /dev/null
+++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/README.md
@@ -0,0 +1,152 @@
+## Behavior
+
+The Airbyte protocol defines the actions `spec`, `discover`, `check` and `read` for a source to be compliant. Here is the high-level description of the flow for a file-based source:
+
+- spec: calls AbstractFileBasedSpec.documentation_url and AbstractFileBasedSpec.schema to return a ConnectorSpecification.
+- discover: calls Source.streams, and subsequently Stream.get_json_schema; this uses Source.open_file to open files during schema discovery.
+- check: Source.check_connection is called from the entrypoint code (in the main CDK).
+- read: Stream.read_records calls Stream.list_files which calls Source.list_matching_files, and then also uses Source.open_file to parse records from the file handle.
+
+## How to Implement Your Own
+
+To create a file-based source, a user must extend three classes – AbstractFileBasedSource, AbstractFileBasedSpec, and AbstractStreamReader – to create an implementation for the connector’s specific storage system. They then initialize a FileBasedSource with the instance of AbstractStreamReader specific to their storage system.
+
+The abstract classes house the vast majority of the logic required by file-based sources. For example, when extending AbstractStreamReader, users only have to implement three methods:
+
+- list_matching_files: lists files matching the glob pattern(s) provided in the config.
+- open_file: returns a file handle for reading.
+- config property setter: concrete implementations of AbstractFileBasedStreamReader's config setter should assert that `value` is of the correct config type for that type of StreamReader.
+
+The result is that an implementation of a source might look like this:
+
+```
+class CustomStreamReader(AbstractStreamReader):
+    def open_file(self, remote_file: RemoteFile) -> FileHandler:
+        <...>
+
+    def get_matching_files(
+        self,
+        globs: List[str],
+        logger: logging.Logger,
+    ) -> Iterable[RemoteFile]:
+        <...>
+
+    @config.setter
+    def config(self, value: Config):
+        assert isinstance(value, CustomConfig)
+        self._config = value
+
+
+class CustomConfig(AbstractFileBasedSpec):
+    @classmethod
+    def documentation_url(cls) -> AnyUrl:
+        return AnyUrl("https://docs.airbyte.com/integrations/sources/s3", scheme="https")
+
+    a_spec_field: str = Field(title="A Spec Field", description="This is where you describe the fields of the spec", order=0)
+    <...>
+```
+
+For more information, feel free to check the docstrings of each class or check specific implementations (like source-s3).
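+
+Putting the pieces together, a connector might wire its custom reader and spec into a `FileBasedSource` roughly as in the sketch below. `CustomStreamReader` and `CustomConfig` are the hypothetical classes from the example above, and the constructor arguments shown are assumptions that may differ between CDK versions (some connectors subclass `FileBasedSource` instead of instantiating it directly).
+
+```
+# Minimal wiring sketch; argument names are assumptions and may vary by CDK version.
+from airbyte_cdk.sources.file_based import FileBasedSource
+
+
+def build_source(catalog, config, state):
+    return FileBasedSource(
+        stream_reader=CustomStreamReader(),
+        spec_class=CustomConfig,
+        catalog=catalog,
+        config=config,
+        state=state,
+    )
+```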
+
+## Supported File Types
+
+### Avro
+
+Avro is a serialization format developed by [Apache](https://avro.apache.org/docs/). Avro configuration options for the file-based CDK:
+
+- `double_as_string`: Whether to convert double fields to strings. This is recommended if you have decimal numbers with a high degree of precision because there can be a loss of precision when handling floating point numbers.
+
+### CSV
+
+CSV is a format loosely described by [RFC 4180](https://www.rfc-editor.org/rfc/rfc4180). The format is quite flexible, which leads to a ton of options to consider (a full example of a CSV stream configuration appears at the end of this section):
+
+- `delimiter`: The character delimiting individual cells in the CSV data. As the name suggests, CSV is comma separated, so the default value is `,`.
+- `quote_char`: When quoted fields are used, it is possible for a field to span multiple lines, even when line breaks appear within such a field. The default quote character is `"`.
+- `escape_char`: The character used for escaping special characters.
+- `encoding`: The character encoding of the file. By default, `UTF-8`.
+- `double_quote`: Whether two quotes in a quoted CSV value denote a single quote in the data.
+- `quoting_behavior`: The quoting behavior determines when a value in a row should have quote marks added around it.
+- `skip_rows_before_header`: The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.
+- `skip_rows_after_header`: The number of rows to skip after the header row.
+- `autogenerate_column_names`: If your CSV does not have a header row, the file-based CDK will need this enabled to generate column names.
+- `null_values`: As CSV does not explicitly define a value for null values, the user can specify a set of case-sensitive strings that should be interpreted as null values.
+- `true_values`: As CSV does not explicitly define a value for a positive boolean, the user can specify a set of case-sensitive strings that should be interpreted as true values.
+- `false_values`: As CSV does not explicitly define a value for a negative boolean, the user can specify a set of case-sensitive strings that should be interpreted as false values.
+
+### JSONL
+
+[JSONL](https://jsonlines.org/) (or JSON Lines) is a format where each row is a JSON object. There are no configuration options for this format. For backward compatibility reasons, the JSONL parser currently supports multiline objects even though this is not part of the JSONL standard. Following some data gathering, we reserve the right to remove support for this. For files with multiline JSON objects, performance will be slower.
+
+### Parquet
+
+Parquet is a file format defined by [Apache](https://parquet.apache.org/). Configuration options are:
+
+- `decimal_as_float`: Whether to convert decimal fields to floats. There is a loss of precision when converting decimals to floats, so this is not recommended.
+
+### Document file types (PDF, DOCX, Markdown)
+
+For file share source connectors, the `unstructured` parser can be used to parse document file types. The textual content of the whole file will be parsed as a single record with a `content` field containing the text encoded as markdown.
+
+To use the unstructured parser, the libraries `poppler` and `tesseract` need to be installed on the system running the connector. For example, on Ubuntu, you can install them with the following command:
+
+```
+apt-get install -y tesseract-ocr poppler-utils
+```
+
+On Mac, you can install these via brew:
+
+```
+brew install poppler
+brew install tesseract
+```
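+
+Putting the CSV options above together, a CSV stream configuration might look roughly like the following sketch, shown here as a Python dict. The CSV option names mirror the list above; the surrounding keys (`name`, `globs`, `format`, `filetype`) and their exact nesting are assumptions that can differ between connectors and CDK versions.
+
+```
+# Illustrative only: stream name, globs and wrapper keys are hypothetical.
+csv_stream_config = {
+    "name": "my_csv_stream",
+    "globs": ["data/*.csv"],
+    "format": {
+        "filetype": "csv",
+        "delimiter": ",",
+        "quote_char": '"',
+        "encoding": "UTF-8",
+        "double_quote": True,
+        "skip_rows_before_header": 0,
+        "skip_rows_after_header": 0,
+        "null_values": ["", "NULL"],
+        "true_values": ["true", "1"],
+        "false_values": ["false", "0"],
+    },
+}
+```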
+
+## Schema
+
+Having a schema allows the file-based CDK to take action when there is a discrepancy between a record and the expected types of the record's fields.
+
+The schema can be either inferred or user-provided.
+
+- If the user defines a schema using JSON types, inference will not apply. Input schemas are a key/value pair of strings describing column name and data type. Supported types are `["string", "number", "integer", "object", "array", "boolean", "null"]`. For example, `{"col1": "string", "col2": "boolean"}`.
+- If the user enables schemaless sync, the schema will be `{"data": "object"}` and therefore emitted records will look like `{"data": {"col1": val1, …}}`. This is recommended if the contents between files in the stream vary significantly, and/or if data is very nested.
+- Otherwise, the file-based CDK will infer the schema depending on the file type. Some file formats define the schema as part of their metadata (like Parquet), some do so at the record level (like Avro) and some don't have any explicit typing (like JSON or CSV). Note that all CSV values are inferred as strings except where we are supporting legacy configurations. Any file format that does not define its schema at the metadata level will require the file-based CDK to iterate over a number of records. There is a limit on the number of bytes that will be consumed in order to infer the schema.
+
+### Validation Policies
+
+Users will be required to select one of 3 options that determine what happens when records that don’t conform to the schema are encountered.
+
+- Skip nonconforming records: check each record to see if it conforms to the user-input or inferred schema; skip the record if it doesn't conform. We keep a count of the number of records in each file that do and do not conform and emit a log message with these counts once we’re done reading the file.
+- Emit all records: emit all records, even if they do not conform to the user-provided or inferred schema. Columns that don't exist in the configured catalog probably won't be available in the destination's table since that's the current behavior.
+  Only error if there are conflicting field types or malformed rows.
+- Stop the sync and wait for schema re-discovery: if a record is encountered that does not conform to the configured catalog’s schema, we log a message and stop the whole sync. Note: this option is not recommended if the files have very different columns or datatypes, because the inferred schema may vary significantly at discover time.
+
+When the `schemaless` option is enabled, validation will be skipped.
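+
+As a rough illustration of the "Skip nonconforming records" policy, a validation pass over parsed records could look like the sketch below. It uses the `conforms_to_schema` helper from `airbyte_cdk.sources.file_based.schema_helpers`; the loop and counters are illustrative and not the CDK's actual implementation.
+
+```
+from airbyte_cdk.sources.file_based.schema_helpers import conforms_to_schema
+
+
+def filter_conforming_records(records, schema, logger):
+    """Yield only records that conform to the schema and log how many were skipped (illustrative sketch)."""
+    conforming, skipped = 0, 0
+    for record in records:
+        if conforms_to_schema(record, schema):
+            conforming += 1
+            yield record
+        else:
+            skipped += 1
+    logger.info(f"{conforming} records conformed to the schema, {skipped} records were skipped")
+```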
+
+## Breaking Changes (compared to previous S3 implementation)
+
+- [CSV] Mapping of type `array` and `object`: before, they were mapped as `large_string` and hence cast as strings. Given the new changes, if `array` or `object` is specified, the value will be cast as `array` and `object` respectively.
+- [CSV] Before, a string value matching `null_values` would not be considered null if the column type was a string. We will now start to cast string columns with values matching `null_values` to null.
+- [CSV] `decimal_point` option is deprecated: It is no longer possible to use a character other than `.` to separate the integer part from the non-integer part. If a float is formatted with another character, it will be considered a string.
+- [Parquet] `columns` option is deprecated: You can use Airbyte column selection in order to have the same behavior. We don't expect it, but this could have an impact on performance as the payload could be bigger.
+
+## Incremental syncs
+
+The file-based connectors support the following [sync modes](https://docs.airbyte.com/cloud/core-concepts#connection-sync-modes):
+
+| Feature                                        | Supported? |
+| :--------------------------------------------- | :--------- |
+| Full Refresh Sync                              | Yes        |
+| Incremental Sync                               | Yes        |
+| Replicate Incremental Deletes                  | No         |
+| Replicate Multiple Files \(pattern matching\)  | Yes        |
+| Replicate Multiple Streams \(distinct tables\) | Yes        |
+| Namespaces                                     | No         |
+
+We recommend you do not manually modify files that are already synced. The connector has file-level granularity, which means adding or modifying a row in a CSV file will trigger a re-sync of the content of that file.
+
+### Incremental sync
+
+After the initial sync, the connector only pulls files that were modified since the last sync.
+
+The connector checkpoints the connection state when it is done syncing all files for a given timestamp. The connection's state only keeps track of the last 10 000 files synced. If more than 10 000 files are synced, the connector won't be able to rely on the connection state to deduplicate files. In this case, the connector will initialize its cursor to the earlier of the earliest file in the history or 3 days ago.
+
+Both the maximum number of files and the time buffer can be configured by connector developers.
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/__init__.py
new file mode 100644
index 000000000000..6ea0ca31eaf1
--- /dev/null
+++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/__init__.py
@@ -0,0 +1,24 @@
+from .config.abstract_file_based_spec import AbstractFileBasedSpec
+from .config.csv_format import CsvFormat
+from .config.file_based_stream_config import FileBasedStreamConfig
+from .config.jsonl_format import JsonlFormat
+from .exceptions import CustomFileBasedException, ErrorListingFiles, FileBasedSourceError
+from .file_based_source import DEFAULT_CONCURRENCY, FileBasedSource
+from .file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
+from .remote_file import RemoteFile
+from .stream.cursor import DefaultFileBasedCursor
+
+__all__ = [
+    "AbstractFileBasedSpec",
+    "AbstractFileBasedStreamReader",
+    "CsvFormat",
+    "CustomFileBasedException",
+    "DefaultFileBasedCursor",
+    "ErrorListingFiles",
+    "FileBasedSource",
+    "FileBasedSourceError",
+    "FileBasedStreamConfig",
+    "FileReadMode",
+    "JsonlFormat",
+    "RemoteFile",
+]
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/__init__.py
new file mode 100644
index 000000000000..a05e5421000a
--- /dev/null
+++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/__init__.py
@@ -0,0 +1,4 @@
+from .abstract_file_based_availability_strategy import AbstractFileBasedAvailabilityStrategy, AbstractFileBasedAvailabilityStrategyWrapper
+from .default_file_based_availability_strategy import DefaultFileBasedAvailabilityStrategy
+
+__all__ = ["AbstractFileBasedAvailabilityStrategy", "AbstractFileBasedAvailabilityStrategyWrapper", "DefaultFileBasedAvailabilityStrategy"]
diff
--git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py new file mode 100644 index 000000000000..ba26745ea57c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py @@ -0,0 +1,57 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from abc import abstractmethod +from typing import TYPE_CHECKING, Optional, Tuple + +from airbyte_cdk.sources import Source +from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy +from airbyte_cdk.sources.streams.concurrent.availability_strategy import ( + AbstractAvailabilityStrategy, + StreamAvailability, + StreamAvailable, + StreamUnavailable, +) +from airbyte_cdk.sources.streams.core import Stream + +if TYPE_CHECKING: + from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream + + +class AbstractFileBasedAvailabilityStrategy(AvailabilityStrategy): + @abstractmethod + def check_availability(self, stream: Stream, logger: logging.Logger, _: Optional[Source]) -> Tuple[bool, Optional[str]]: + """ + Perform a connection check for the stream. + + Returns (True, None) if successful, otherwise (False, ). + """ + ... + + @abstractmethod + def check_availability_and_parsability( + self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source] + ) -> Tuple[bool, Optional[str]]: + """ + Performs a connection check for the stream, as well as additional checks that + verify that the connection is working as expected. + + Returns (True, None) if successful, otherwise (False, ). + """ + ... + + +class AbstractFileBasedAvailabilityStrategyWrapper(AbstractAvailabilityStrategy): + def __init__(self, stream: "AbstractFileBasedStream"): + self.stream = stream + + def check_availability(self, logger: logging.Logger) -> StreamAvailability: + is_available, reason = self.stream.availability_strategy.check_availability(self.stream, logger, None) + if is_available: + return StreamAvailable() + return StreamUnavailable(reason or "") + + def check_availability_and_parsability(self, logger: logging.Logger) -> Tuple[bool, Optional[str]]: + return self.stream.availability_strategy.check_availability_and_parsability(self.stream, logger, None) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py new file mode 100644 index 000000000000..079b1e2a11c7 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py @@ -0,0 +1,118 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import logging +import traceback +from typing import TYPE_CHECKING, Optional, Tuple + +from airbyte_cdk import AirbyteTracedException +from airbyte_cdk.sources import Source +from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy +from airbyte_cdk.sources.file_based.exceptions import CheckAvailabilityError, CustomFileBasedException, FileBasedSourceError +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import conforms_to_schema + +if TYPE_CHECKING: + from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream + + +class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy): + def __init__(self, stream_reader: AbstractFileBasedStreamReader): + self.stream_reader = stream_reader + + def check_availability(self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source]) -> Tuple[bool, Optional[str]]: # type: ignore[override] + """ + Perform a connection check for the stream (verify that we can list files from the stream). + + Returns (True, None) if successful, otherwise (False, ). + """ + try: + self._check_list_files(stream) + except CheckAvailabilityError: + return False, "".join(traceback.format_exc()) + + return True, None + + def check_availability_and_parsability( + self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source] + ) -> Tuple[bool, Optional[str]]: + """ + Perform a connection check for the stream. + + Returns (True, None) if successful, otherwise (False, ). + + For the stream: + - Verify the parser config is valid per check_config method of the parser. + - Verify that we can list files from the stream using the configured globs. + - Verify that we can read one file from the stream as long as the stream parser is not setting parser_max_n_files_for_parsability to 0. + + This method will also check that the files and their contents are consistent + with the configured options, as follows: + - If the files have extensions, verify that they don't disagree with the + configured file type. + - If the user provided a schema in the config, check that a subset of records in + one file conform to the schema via a call to stream.conforms_to_schema(schema). + """ + parser = stream.get_parser() + config_check_result, config_check_error_message = parser.check_config(stream.config) + if config_check_result is False: + return False, config_check_error_message + try: + file = self._check_list_files(stream) + if not parser.parser_max_n_files_for_parsability == 0: + self._check_parse_record(stream, file, logger) + else: + # If the parser is set to not check parsability, we still want to check that we can open the file. + handle = stream.stream_reader.open_file(file, parser.file_read_mode, None, logger) + handle.close() + except AirbyteTracedException as ate: + raise ate + except CheckAvailabilityError: + return False, "".join(traceback.format_exc()) + + return True, None + + def _check_list_files(self, stream: "AbstractFileBasedStream") -> RemoteFile: + """ + Check that we can list files from the stream. + + Returns the first file if successful, otherwise raises a CheckAvailabilityError. 
+ """ + try: + file = next(iter(stream.get_files())) + except StopIteration: + raise CheckAvailabilityError(FileBasedSourceError.EMPTY_STREAM, stream=stream.name) + except CustomFileBasedException as exc: + raise CheckAvailabilityError(str(exc), stream=stream.name) from exc + except Exception as exc: + raise CheckAvailabilityError(FileBasedSourceError.ERROR_LISTING_FILES, stream=stream.name) from exc + + return file + + def _check_parse_record(self, stream: "AbstractFileBasedStream", file: RemoteFile, logger: logging.Logger) -> None: + parser = stream.get_parser() + + try: + record = next(iter(parser.parse_records(stream.config, file, self.stream_reader, logger, discovered_schema=None))) + except StopIteration: + # The file is empty. We've verified that we can open it, so will + # consider the connection check successful even though it means + # we skip the schema validation check. + return + except AirbyteTracedException as ate: + raise ate + except Exception as exc: + raise CheckAvailabilityError(FileBasedSourceError.ERROR_READING_FILE, stream=stream.name, file=file.uri) from exc + + schema = stream.catalog_schema or stream.config.input_schema + if schema and stream.validation_policy.validate_schema_before_sync: + if not conforms_to_schema(record, schema): # type: ignore + raise CheckAvailabilityError( + FileBasedSourceError.ERROR_VALIDATING_RECORD, + stream=stream.name, + file=file.uri, + ) + + return None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py new file mode 100644 index 000000000000..38159698816c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py @@ -0,0 +1,134 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import copy +from abc import abstractmethod +from typing import Any, Dict, List, Literal, Optional, Union + +import dpath +from airbyte_cdk import OneOfOptionConfig +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.utils import schema_helpers +from pydantic.v1 import AnyUrl, BaseModel, Field + + +class DeliverRecords(BaseModel): + class Config(OneOfOptionConfig): + title = "Replicate Records" + description = "Recommended - Extract and load structured records into your destination of choice. This is the classic method of moving data in Airbyte. It allows for blocking and hashing individual fields or files from a structured schema. Data can be flattened, typed and deduped depending on the destination." + discriminator = "delivery_type" + + delivery_type: Literal["use_records_transfer"] = Field("use_records_transfer", const=True) + + +class DeliverRawFiles(BaseModel): + class Config(OneOfOptionConfig): + title = "Copy Raw Files" + description = "Copy raw files without parsing their contents. Bits are copied into the destination exactly as they appeared in the source. Recommended for use with unstructured text data, non-text and compressed files." 
+ discriminator = "delivery_type" + + delivery_type: Literal["use_file_transfer"] = Field("use_file_transfer", const=True) + + +class AbstractFileBasedSpec(BaseModel): + """ + Used during spec; allows the developer to configure the cloud provider specific options + that are needed when users configure a file-based source. + """ + + start_date: Optional[str] = Field( + title="Start Date", + description="UTC date and time in the format 2017-01-25T00:00:00.000000Z. Any file modified before this date will not be replicated.", + examples=["2021-01-01T00:00:00.000000Z"], + format="date-time", + pattern="^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{6}Z$", + pattern_descriptor="YYYY-MM-DDTHH:mm:ss.SSSSSSZ", + order=1, + ) + + streams: List[FileBasedStreamConfig] = Field( + title="The list of streams to sync", + description='Each instance of this configuration defines a stream. Use this to define which files belong in the stream, their format, and how they should be parsed and validated. When sending data to warehouse destination such as Snowflake or BigQuery, each stream is a separate table.', + order=10, + ) + + delivery_method: Union[DeliverRecords, DeliverRawFiles] = Field( + title="Delivery Method", + discriminator="delivery_type", + type="object", + order=7, + display_type="radio", + group="advanced", + default="use_records_transfer", + airbyte_hidden=True, + ) + + @classmethod + @abstractmethod + def documentation_url(cls) -> AnyUrl: + """ + :return: link to docs page for this source e.g. "https://docs.airbyte.com/integrations/sources/s3" + """ + + @classmethod + def schema(cls, *args: Any, **kwargs: Any) -> Dict[str, Any]: + """ + Generates the mapping comprised of the config fields + """ + schema = super().schema(*args, **kwargs) + transformed_schema: Dict[str, Any] = copy.deepcopy(schema) + schema_helpers.expand_refs(transformed_schema) + cls.replace_enum_allOf_and_anyOf(transformed_schema) + cls.remove_discriminator(transformed_schema) + + return transformed_schema + + @staticmethod + def remove_discriminator(schema: Dict[str, Any]) -> None: + """pydantic adds "discriminator" to the schema for oneOfs, which is not treated right by the platform as we inline all references""" + dpath.delete(schema, "properties/**/discriminator") + + @staticmethod + def replace_enum_allOf_and_anyOf(schema: Dict[str, Any]) -> Dict[str, Any]: + """ + allOfs are not supported by the UI, but pydantic is automatically writing them for enums. + Unpacks the enums under allOf and moves them up a level under the enum key + anyOfs are also not supported by the UI, so we replace them with the similar oneOf, with the + additional validation that an incoming config only matches exactly one of a field's types. 
+ """ + objects_to_check = schema["properties"]["streams"]["items"]["properties"]["format"] + objects_to_check["type"] = "object" + objects_to_check["oneOf"] = objects_to_check.pop("anyOf", []) + for format in objects_to_check["oneOf"]: + for key in format["properties"]: + object_property = format["properties"][key] + AbstractFileBasedSpec.move_enum_to_root(object_property) + + properties_to_change = ["validation_policy"] + for property_to_change in properties_to_change: + property_object = schema["properties"]["streams"]["items"]["properties"][property_to_change] + if "anyOf" in property_object: + schema["properties"]["streams"]["items"]["properties"][property_to_change]["type"] = "object" + schema["properties"]["streams"]["items"]["properties"][property_to_change]["oneOf"] = property_object.pop("anyOf") + AbstractFileBasedSpec.move_enum_to_root(property_object) + + csv_format_schemas = list( + filter( + lambda format: format["properties"]["filetype"]["default"] == "csv", + schema["properties"]["streams"]["items"]["properties"]["format"]["oneOf"], + ) + ) + if len(csv_format_schemas) != 1: + raise ValueError(f"Expecting only one CSV format but got {csv_format_schemas}") + csv_format_schemas[0]["properties"]["header_definition"]["oneOf"] = csv_format_schemas[0]["properties"]["header_definition"].pop( + "anyOf", [] + ) + csv_format_schemas[0]["properties"]["header_definition"]["type"] = "object" + return schema + + @staticmethod + def move_enum_to_root(object_property: Dict[str, Any]) -> None: + if "allOf" in object_property and "enum" in object_property["allOf"][0]: + object_property["enum"] = object_property["allOf"][0]["enum"] + object_property.pop("allOf") diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/avro_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/avro_format.py new file mode 100644 index 000000000000..ac8fafef577e --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/avro_format.py @@ -0,0 +1,24 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig +from pydantic.v1 import BaseModel, Field + + +class AvroFormat(BaseModel): + class Config(OneOfOptionConfig): + title = "Avro Format" + discriminator = "filetype" + + filetype: str = Field( + "avro", + const=True, + ) + + double_as_string: bool = Field( + title="Convert Double Fields to Strings", + description="Whether to convert double fields to strings. This is recommended if you have decimal numbers with a high degree of precision because there can be a loss precision when handling floating point numbers.", + default=False, + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py new file mode 100644 index 000000000000..317a01722078 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py @@ -0,0 +1,197 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import codecs +from enum import Enum +from typing import Any, Dict, List, Optional, Set, Union + +from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig +from pydantic.v1 import BaseModel, Field, root_validator, validator +from pydantic.v1.error_wrappers import ValidationError + + +class InferenceType(Enum): + NONE = "None" + PRIMITIVE_TYPES_ONLY = "Primitive Types Only" + + +class CsvHeaderDefinitionType(Enum): + FROM_CSV = "From CSV" + AUTOGENERATED = "Autogenerated" + USER_PROVIDED = "User Provided" + + +class CsvHeaderFromCsv(BaseModel): + class Config(OneOfOptionConfig): + title = "From CSV" + discriminator = "header_definition_type" + + header_definition_type: str = Field( + CsvHeaderDefinitionType.FROM_CSV.value, + const=True, + ) + + def has_header_row(self) -> bool: + return True + + +class CsvHeaderAutogenerated(BaseModel): + class Config(OneOfOptionConfig): + title = "Autogenerated" + discriminator = "header_definition_type" + + header_definition_type: str = Field( + CsvHeaderDefinitionType.AUTOGENERATED.value, + const=True, + ) + + def has_header_row(self) -> bool: + return False + + +class CsvHeaderUserProvided(BaseModel): + class Config(OneOfOptionConfig): + title = "User Provided" + discriminator = "header_definition_type" + + header_definition_type: str = Field( + CsvHeaderDefinitionType.USER_PROVIDED.value, + const=True, + ) + column_names: List[str] = Field( + title="Column Names", + description="The column names that will be used while emitting the CSV records", + ) + + def has_header_row(self) -> bool: + return False + + @validator("column_names") + def validate_column_names(cls, v: List[str]) -> List[str]: + if not v: + raise ValueError("At least one column name needs to be provided when using user provided headers") + return v + + +DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"] +DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"] + + +class CsvFormat(BaseModel): + class Config(OneOfOptionConfig): + title = "CSV Format" + discriminator = "filetype" + + filetype: str = Field( + "csv", + const=True, + ) + delimiter: str = Field( + title="Delimiter", + description="The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.", + default=",", + ) + quote_char: str = Field( + title="Quote Character", + default='"', + description="The character used for quoting CSV values. To disallow quoting, make this field blank.", + ) + escape_char: Optional[str] = Field( + title="Escape Character", + default=None, + description="The character used for escaping special characters. To disallow escaping, leave this field blank.", + ) + encoding: Optional[str] = Field( + default="utf8", + description='The character encoding of the CSV data. Leave blank to default to UTF8. See list of python encodings for allowable options.', + ) + double_quote: bool = Field( + title="Double Quote", default=True, description="Whether two quotes in a quoted CSV value denote a single quote in the data." + ) + null_values: Set[str] = Field( + title="Null Values", + default=[], + description="A set of case-sensitive strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field.", + ) + strings_can_be_null: bool = Field( + title="Strings Can Be Null", + default=True, + description="Whether strings can be interpreted as null values. If true, strings that match the null_values set will be interpreted as null. 
If false, strings that match the null_values set will be interpreted as the string itself.", + ) + skip_rows_before_header: int = Field( + title="Skip Rows Before Header", + default=0, + description="The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.", + ) + skip_rows_after_header: int = Field( + title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row." + ) + header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = Field( + title="CSV Header Definition", + default=CsvHeaderFromCsv(header_definition_type=CsvHeaderDefinitionType.FROM_CSV.value), + description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.", + ) + true_values: Set[str] = Field( + title="True Values", + default=DEFAULT_TRUE_VALUES, + description="A set of case-sensitive strings that should be interpreted as true values.", + ) + false_values: Set[str] = Field( + title="False Values", + default=DEFAULT_FALSE_VALUES, + description="A set of case-sensitive strings that should be interpreted as false values.", + ) + inference_type: InferenceType = Field( + title="Inference Type", + default=InferenceType.NONE, + description="How to infer the types of the columns. If none, inference default to strings.", + airbyte_hidden=True, + ) + ignore_errors_on_fields_mismatch: bool = Field( + title="Ignore errors on field mismatch", + default=False, + description="Whether to ignore errors that occur when the number of fields in the CSV does not match the number of columns in the schema.", + ) + + @validator("delimiter") + def validate_delimiter(cls, v: str) -> str: + if v == r"\t": + v = "\t" + if len(v) != 1: + raise ValueError("delimiter should only be one character") + if v in {"\r", "\n"}: + raise ValueError(f"delimiter cannot be {v}") + return v + + @validator("quote_char") + def validate_quote_char(cls, v: str) -> str: + if len(v) != 1: + raise ValueError("quote_char should only be one character") + return v + + @validator("escape_char") + def validate_escape_char(cls, v: str) -> str: + if v is not None and len(v) != 1: + raise ValueError("escape_char should only be one character") + return v + + @validator("encoding") + def validate_encoding(cls, v: str) -> str: + try: + codecs.lookup(v) + except LookupError: + raise ValueError(f"invalid encoding format: {v}") + return v + + @root_validator + def validate_optional_args(cls, values: Dict[str, Any]) -> Dict[str, Any]: + definition_type = values.get("header_definition_type") + column_names = values.get("user_provided_column_names") + if definition_type == CsvHeaderDefinitionType.USER_PROVIDED and not column_names: + raise ValidationError("`user_provided_column_names` should be defined if the definition 'User Provided'.", model=CsvFormat) + if definition_type != CsvHeaderDefinitionType.USER_PROVIDED and column_names: + raise ValidationError( + "`user_provided_column_names` should not be defined if the definition is not 'User Provided'.", model=CsvFormat + ) + return values diff --git 
a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/excel_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/excel_format.py new file mode 100644 index 000000000000..02a4f52d2a76 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/excel_format.py @@ -0,0 +1,17 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig +from pydantic.v1 import BaseModel, Field + + +class ExcelFormat(BaseModel): + class Config(OneOfOptionConfig): + title = "Excel Format" + discriminator = "filetype" + + filetype: str = Field( + "excel", + const=True, + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/file_based_stream_config.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/file_based_stream_config.py new file mode 100644 index 000000000000..5419f4a679dd --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/file_based_stream_config.py @@ -0,0 +1,94 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +from enum import Enum +from typing import Any, List, Mapping, Optional, Union + +from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat +from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat +from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat +from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat +from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat +from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat +from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError +from airbyte_cdk.sources.file_based.schema_helpers import type_mapping_to_jsonschema +from pydantic.v1 import BaseModel, Field, validator + +PrimaryKeyType = Optional[Union[str, List[str]]] + + +class ValidationPolicy(Enum): + emit_record = "Emit Record" + skip_record = "Skip Record" + wait_for_discover = "Wait for Discover" + + +class FileBasedStreamConfig(BaseModel): + name: str = Field(title="Name", description="The name of the stream.") + globs: Optional[List[str]] = Field( + default=["**"], + title="Globs", + description='The pattern used to specify which files should be selected from the file system. For more information on glob pattern matching look here.', + order=1, + ) + legacy_prefix: Optional[str] = Field( + title="Legacy Prefix", + description="The path prefix configured in v3 versions of the S3 connector. This option is deprecated in favor of a single glob.", + airbyte_hidden=True, + ) + validation_policy: ValidationPolicy = Field( + title="Validation Policy", + description="The name of the validation policy that dictates sync behavior when a record does not adhere to the stream schema.", + default=ValidationPolicy.emit_record, + ) + input_schema: Optional[str] = Field( + title="Input Schema", + description="The schema that will be used to validate records extracted from the file. This will override the stream schema that is auto-detected from incoming files.", + ) + primary_key: Optional[str] = Field( + title="Primary Key", + description="The column or columns (for a composite key) that serves as the unique identifier of a record. If empty, the primary key will default to the parser's default primary key.", + airbyte_hidden=True, # Users can create/modify primary keys in the connection configuration so we shouldn't duplicate it here. 
+ ) + days_to_sync_if_history_is_full: int = Field( + title="Days To Sync If History Is Full", + description="When the state history of the file store is full, syncs will only read files that were last modified in the provided day range.", + default=3, + ) + format: Union[AvroFormat, CsvFormat, JsonlFormat, ParquetFormat, UnstructuredFormat, ExcelFormat] = Field( + title="Format", + description="The configuration options that are used to alter how to read incoming files that deviate from the standard formatting.", + ) + schemaless: bool = Field( + title="Schemaless", + description="When enabled, syncs will not validate or structure records against the stream's schema.", + default=False, + ) + recent_n_files_to_read_for_schema_discovery: Optional[int] = Field( + title="Files To Read For Schema Discover", + description="The number of resent files which will be used to discover the schema for this stream.", + default=None, + gt=0, + ) + + @validator("input_schema", pre=True) + def validate_input_schema(cls, v: Optional[str]) -> Optional[str]: + if v: + if type_mapping_to_jsonschema(v): + return v + else: + raise ConfigValidationError(FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA) + return None + + def get_input_schema(self) -> Optional[Mapping[str, Any]]: + """ + User defined input_schema is defined as a string in the config. This method takes the string representation + and converts it into a Mapping[str, Any] which is used by file-based CDK components. + """ + if self.input_schema: + schema = type_mapping_to_jsonschema(self.input_schema) + if not schema: + raise ValueError(f"Unable to create JSON schema from input schema {self.input_schema}") + return schema + return None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/jsonl_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/jsonl_format.py new file mode 100644 index 000000000000..1d9ed54fe083 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/jsonl_format.py @@ -0,0 +1,17 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig +from pydantic.v1 import BaseModel, Field + + +class JsonlFormat(BaseModel): + class Config(OneOfOptionConfig): + title = "Jsonl Format" + discriminator = "filetype" + + filetype: str = Field( + "jsonl", + const=True, + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/parquet_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/parquet_format.py new file mode 100644 index 000000000000..7c40f8e3d431 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/parquet_format.py @@ -0,0 +1,24 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig +from pydantic.v1 import BaseModel, Field + + +class ParquetFormat(BaseModel): + class Config(OneOfOptionConfig): + title = "Parquet Format" + discriminator = "filetype" + + filetype: str = Field( + "parquet", + const=True, + ) + # This option is not recommended, but necessary for backwards compatibility + decimal_as_float: bool = Field( + title="Convert Decimal Fields to Floats", + description="Whether to convert decimal fields to floats. 
There is a loss of precision when converting decimals to floats, so this is not recommended.", + default=False, + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/unstructured_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/unstructured_format.py new file mode 100644 index 000000000000..7858ae61d327 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/unstructured_format.py @@ -0,0 +1,94 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import List, Literal, Optional, Union + +from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig +from pydantic.v1 import BaseModel, Field + + +class LocalProcessingConfigModel(BaseModel): + mode: Literal["local"] = Field("local", const=True) + + class Config(OneOfOptionConfig): + title = "Local" + description = "Process files locally, supporting `fast` and `ocr` modes. This is the default option." + discriminator = "mode" + + +class APIParameterConfigModel(BaseModel): + name: str = Field( + title="Parameter name", + description="The name of the unstructured API parameter to use", + examples=["combine_under_n_chars", "languages"], + ) + value: str = Field(title="Value", description="The value of the parameter", examples=["true", "hi_res"]) + + +class APIProcessingConfigModel(BaseModel): + mode: Literal["api"] = Field("api", const=True) + + api_key: str = Field( + default="", + always_show=True, + title="API Key", + airbyte_secret=True, + description="The API key to use matching the environment", + ) + + api_url: str = Field( + default="https://api.unstructured.io", + title="API URL", + always_show=True, + description="The URL of the unstructured API to use", + examples=["https://api.unstructured.com"], + ) + + parameters: Optional[List[APIParameterConfigModel]] = Field( + default=[], + always_show=True, + title="Additional URL Parameters", + description="List of parameters send to the API", + ) + + class Config(OneOfOptionConfig): + title = "via API" + description = "Process files via an API, using the `hi_res` mode. This option is useful for increased performance and accuracy, but requires an API key and a hosted instance of unstructured." + discriminator = "mode" + + +class UnstructuredFormat(BaseModel): + class Config(OneOfOptionConfig): + title = "Unstructured Document Format" + description = "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file." + discriminator = "filetype" + + filetype: str = Field( + "unstructured", + const=True, + ) + + skip_unprocessable_files: bool = Field( + default=True, + title="Skip Unprocessable Files", + description="If true, skip files that cannot be parsed and pass the error message along as the _ab_source_file_parse_error field. If false, fail the sync.", + always_show=True, + ) + + strategy: str = Field( + always_show=True, + order=0, + default="auto", + title="Parsing Strategy", + enum=["auto", "fast", "ocr_only", "hi_res"], + description="The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. 
See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf", + ) + + processing: Union[LocalProcessingConfigModel, APIProcessingConfigModel,] = Field( + default=LocalProcessingConfigModel(mode="local"), + title="Processing", + description="Processing configuration", + discriminator="mode", + type="object", + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/discovery_policy/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/discovery_policy/__init__.py new file mode 100644 index 000000000000..c50aa1a4e70f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/discovery_policy/__init__.py @@ -0,0 +1,4 @@ +from airbyte_cdk.sources.file_based.discovery_policy.abstract_discovery_policy import AbstractDiscoveryPolicy +from airbyte_cdk.sources.file_based.discovery_policy.default_discovery_policy import DefaultDiscoveryPolicy + +__all__ = ["AbstractDiscoveryPolicy", "DefaultDiscoveryPolicy"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py new file mode 100644 index 000000000000..ca2645787dc7 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py @@ -0,0 +1,23 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod + +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser + + +class AbstractDiscoveryPolicy(ABC): + """ + Used during discovery; allows the developer to configure the number of concurrent + requests to send to the source, and the number of files to use for schema discovery. + """ + + @property + @abstractmethod + def n_concurrent_requests(self) -> int: + ... + + @abstractmethod + def get_max_n_files_for_schema_inference(self, parser: FileTypeParser) -> int: + ... diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py new file mode 100644 index 000000000000..3ce098899fc0 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py @@ -0,0 +1,28 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.file_based.discovery_policy.abstract_discovery_policy import AbstractDiscoveryPolicy +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser + +DEFAULT_N_CONCURRENT_REQUESTS = 10 +DEFAULT_MAX_N_FILES_FOR_STREAM_SCHEMA_INFERENCE = 10 + + +class DefaultDiscoveryPolicy(AbstractDiscoveryPolicy): + """ + Default number of concurrent requests to send to the source on discover, and number + of files to use for schema inference. 
+ """ + + @property + def n_concurrent_requests(self) -> int: + return DEFAULT_N_CONCURRENT_REQUESTS + + def get_max_n_files_for_schema_inference(self, parser: FileTypeParser) -> int: + return min( + filter( + None, + (DEFAULT_MAX_N_FILES_FOR_STREAM_SCHEMA_INFERENCE, parser.parser_max_n_files_for_schema_inference), + ) + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/exceptions.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/exceptions.py new file mode 100644 index 000000000000..60adf3214e79 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/exceptions.py @@ -0,0 +1,127 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from enum import Enum +from typing import Any, List, Union + +from airbyte_cdk.models import AirbyteMessage, FailureType +from airbyte_cdk.utils import AirbyteTracedException + + +class FileBasedSourceError(Enum): + EMPTY_STREAM = "No files were identified in the stream. This may be because there are no files in the specified container, or because your glob patterns did not match any files. Please verify that your source contains files last modified after the start_date and that your glob patterns are not overly strict." + GLOB_PARSE_ERROR = ( + "Error parsing glob pattern. Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split." + ) + ENCODING_ERROR = "File encoding error. The configured encoding must match file encoding." + ERROR_CASTING_VALUE = "Could not cast the value to the expected type." + ERROR_CASTING_VALUE_UNRECOGNIZED_TYPE = "Could not cast the value to the expected type because the type is not recognized. Valid types are null, array, boolean, integer, number, object, and string." + ERROR_DECODING_VALUE = "Expected a JSON-decodeable value but could not decode record." + ERROR_LISTING_FILES = ( + "Error listing files. Please check the credentials provided in the config and verify that they provide permission to list files." + ) + ERROR_READING_FILE = ( + "Error opening file. Please check the credentials provided in the config and verify that they provide permission to read files." + ) + ERROR_PARSING_RECORD = "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable." + ERROR_PARSING_USER_PROVIDED_SCHEMA = "The provided schema could not be transformed into valid JSON Schema." + ERROR_VALIDATING_RECORD = "One or more records do not pass the schema validation policy. Please modify your input schema, or select a more lenient validation policy." + ERROR_PARSING_RECORD_MISMATCHED_COLUMNS = "A header field has resolved to `None`. This indicates that the CSV has more rows than the number of header fields. If you input your schema or headers, please verify that the number of columns corresponds to the number of columns in your CSV's rows." + ERROR_PARSING_RECORD_MISMATCHED_ROWS = "A row's value has resolved to `None`. This indicates that the CSV has more columns in the header field than the number of columns in the row(s). If you input your schema or headers, please verify that the number of columns corresponds to the number of columns in your CSV's rows." + STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY = ( + "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema." + ) + NULL_VALUE_IN_SCHEMA = "Error during schema inference: no type was detected for key." 
+ UNRECOGNIZED_TYPE = "Error during schema inference: unrecognized type." + SCHEMA_INFERENCE_ERROR = "Error inferring schema from files. Are the files valid?" + INVALID_SCHEMA_ERROR = "No fields were identified for this schema. This may happen if the stream is empty. Please check your configuration to verify that there are files that match the stream's glob patterns." + CONFIG_VALIDATION_ERROR = "Error creating stream config object." + MISSING_SCHEMA = "Expected `json_schema` in the configured catalog but it is missing." + UNDEFINED_PARSER = "No parser is defined for this file type." + UNDEFINED_VALIDATION_POLICY = "The validation policy defined in the config does not exist for the source." + + +class FileBasedErrorsCollector: + """ + The placeholder for all errors collected. + """ + + errors: List[AirbyteMessage] = [] + + def yield_and_raise_collected(self) -> Any: + if self.errors: + # emit collected logged messages + yield from self.errors + # clean the collector + self.errors.clear() + # raising the single exception + raise AirbyteTracedException( + internal_message="Please check the logged errors for more information.", + message="Some errors occured while reading from the source.", + failure_type=FailureType.config_error, + ) + + def collect(self, logged_error: AirbyteMessage) -> None: + self.errors.append(logged_error) + + +class BaseFileBasedSourceError(Exception): + def __init__(self, error: Union[FileBasedSourceError, str], **kwargs): # type: ignore # noqa + if isinstance(error, FileBasedSourceError): + error = FileBasedSourceError(error).value + super().__init__(f"{error} Contact Support if you need assistance.\n{' '.join([f'{k}={v}' for k, v in kwargs.items()])}") + + +class ConfigValidationError(BaseFileBasedSourceError): + pass + + +class InvalidSchemaError(BaseFileBasedSourceError): + pass + + +class MissingSchemaError(BaseFileBasedSourceError): + pass + + +class NoFilesMatchingError(BaseFileBasedSourceError): + pass + + +class RecordParseError(BaseFileBasedSourceError): + pass + + +class SchemaInferenceError(BaseFileBasedSourceError): + pass + + +class CheckAvailabilityError(BaseFileBasedSourceError): + pass + + +class UndefinedParserError(BaseFileBasedSourceError): + pass + + +class StopSyncPerValidationPolicy(BaseFileBasedSourceError): + pass + + +class ErrorListingFiles(BaseFileBasedSourceError): + pass + + +class CustomFileBasedException(AirbyteTracedException): + """ + A specialized exception for file-based connectors. + + This exception is designed to bypass the default error handling in the file-based CDK, allowing the use of custom error messages. + """ + + pass + + +class FileSizeLimitError(CustomFileBasedException): + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_based_source.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_based_source.py new file mode 100644 index 000000000000..65f9e5314f04 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_based_source.py @@ -0,0 +1,328 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import logging +import traceback +from abc import ABC +from collections import Counter +from typing import Any, Iterator, List, Mapping, Optional, Tuple, Type, Union + +from airbyte_cdk.logger import AirbyteLogFormatter, init_logger +from airbyte_cdk.models import ( + AirbyteMessage, + AirbyteStateMessage, + AirbyteStream, + ConfiguredAirbyteCatalog, + ConnectorSpecification, + FailureType, + Level, + SyncMode, +) +from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource +from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy, DefaultFileBasedAvailabilityStrategy +from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy +from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy +from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedErrorsCollector, FileBasedSourceError +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader +from airbyte_cdk.sources.file_based.file_types import default_parsers +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy +from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream +from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade +from airbyte_cdk.sources.file_based.stream.concurrent.cursor import ( + AbstractConcurrentFileBasedCursor, + FileBasedConcurrentCursor, + FileBasedFinalStateCursor, +) +from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor +from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.concurrent.cursor import CursorField +from airbyte_cdk.utils.analytics_message import create_analytics_message +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from pydantic.v1.error_wrappers import ValidationError + +DEFAULT_CONCURRENCY = 100 +MAX_CONCURRENCY = 100 +INITIAL_N_PARTITIONS = MAX_CONCURRENCY // 2 + + +class FileBasedSource(ConcurrentSourceAdapter, ABC): + # We make each source override the concurrency level to give control over when they are upgraded. 
+ _concurrency_level = None + + def __init__( + self, + stream_reader: AbstractFileBasedStreamReader, + spec_class: Type[AbstractFileBasedSpec], + catalog: Optional[ConfiguredAirbyteCatalog], + config: Optional[Mapping[str, Any]], + state: Optional[List[AirbyteStateMessage]], + availability_strategy: Optional[AbstractFileBasedAvailabilityStrategy] = None, + discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(), + parsers: Mapping[Type[Any], FileTypeParser] = default_parsers, + validation_policies: Mapping[ValidationPolicy, AbstractSchemaValidationPolicy] = DEFAULT_SCHEMA_VALIDATION_POLICIES, + cursor_cls: Type[Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]] = FileBasedConcurrentCursor, + ): + self.stream_reader = stream_reader + self.spec_class = spec_class + self.config = config + self.catalog = catalog + self.state = state + self.availability_strategy = availability_strategy or DefaultFileBasedAvailabilityStrategy(stream_reader) + self.discovery_policy = discovery_policy + self.parsers = parsers + self.validation_policies = validation_policies + self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {} + self.cursor_cls = cursor_cls + self.logger = init_logger(f"airbyte.{self.name}") + self.errors_collector: FileBasedErrorsCollector = FileBasedErrorsCollector() + self._message_repository: Optional[MessageRepository] = None + concurrent_source = ConcurrentSource.create( + MAX_CONCURRENCY, INITIAL_N_PARTITIONS, self.logger, self._slice_logger, self.message_repository + ) + self._state = None + super().__init__(concurrent_source) + + @property + def message_repository(self) -> MessageRepository: + if self._message_repository is None: + self._message_repository = InMemoryMessageRepository(Level(AirbyteLogFormatter.level_mapping[self.logger.level])) + return self._message_repository + + def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]: + """ + Check that the source can be accessed using the user-provided configuration. + + For each stream, verify that we can list and read files. + + Returns (True, None) if the connection check is successful. + + Otherwise, the "error" object should describe what went wrong. + """ + try: + streams = self.streams(config) + except Exception as config_exception: + raise AirbyteTracedException( + internal_message="Please check the logged errors for more information.", + message=FileBasedSourceError.CONFIG_VALIDATION_ERROR.value, + exception=AirbyteTracedException(exception=config_exception), + failure_type=FailureType.config_error, + ) + if len(streams) == 0: + return ( + False, + f"No streams are available for source {self.name}. This is probably an issue with the connector. Please verify that your " + f"configuration provides permissions to list and read files from the source. 
Contact support if you are unable to " + f"resolve this issue.", + ) + + errors = [] + tracebacks = [] + for stream in streams: + if not isinstance(stream, AbstractFileBasedStream): + raise ValueError(f"Stream {stream} is not a file-based stream.") + try: + parsed_config = self._get_parsed_config(config) + availability_method = ( + stream.availability_strategy.check_availability + if self._use_file_transfer(parsed_config) + else stream.availability_strategy.check_availability_and_parsability + ) + ( + stream_is_available, + reason, + ) = availability_method(stream, logger, self) + except AirbyteTracedException as ate: + errors.append(f"Unable to connect to stream {stream.name} - {ate.message}") + tracebacks.append(traceback.format_exc()) + except Exception: + errors.append(f"Unable to connect to stream {stream.name}") + tracebacks.append(traceback.format_exc()) + else: + if not stream_is_available and reason: + errors.append(reason) + + if len(errors) == 1 and len(tracebacks) == 1: + raise AirbyteTracedException( + internal_message=tracebacks[0], + message=f"{errors[0]}", + failure_type=FailureType.config_error, + ) + if len(errors) == 1 and len(tracebacks) == 0: + raise AirbyteTracedException( + message=f"{errors[0]}", + failure_type=FailureType.config_error, + ) + elif len(errors) > 1: + raise AirbyteTracedException( + internal_message="\n".join(tracebacks), + message=f"{len(errors)} streams with errors: {', '.join(error for error in errors)}", + failure_type=FailureType.config_error, + ) + + return not bool(errors), (errors or None) + + def streams(self, config: Mapping[str, Any]) -> List[Stream]: + """ + Return a list of this source's streams. + """ + + if self.catalog: + state_manager = ConnectorStateManager(state=self.state) + else: + # During `check` operations we don't have a catalog so cannot create a state manager. + # Since the state manager is only required for incremental syncs, this is fine. + state_manager = None + + try: + parsed_config = self._get_parsed_config(config) + self.stream_reader.config = parsed_config + streams: List[Stream] = [] + for stream_config in parsed_config.streams: + # Like state_manager, `catalog_stream` may be None during `check` + catalog_stream = self._get_stream_from_catalog(stream_config) + stream_state = ( + state_manager.get_stream_state(catalog_stream.name, catalog_stream.namespace) + if (state_manager and catalog_stream) + else None + ) + self._validate_input_schema(stream_config) + + sync_mode = self._get_sync_mode_from_catalog(stream_config.name) + + if sync_mode == SyncMode.full_refresh and hasattr(self, "_concurrency_level") and self._concurrency_level is not None: + cursor = FileBasedFinalStateCursor( + stream_config=stream_config, stream_namespace=None, message_repository=self.message_repository + ) + stream = FileBasedStreamFacade.create_from_stream( + stream=self._make_default_stream( + stream_config=stream_config, cursor=cursor, use_file_transfer=self._use_file_transfer(parsed_config) + ), + source=self, + logger=self.logger, + state=stream_state, + cursor=cursor, + ) + + elif ( + sync_mode == SyncMode.incremental + and issubclass(self.cursor_cls, AbstractConcurrentFileBasedCursor) + and hasattr(self, "_concurrency_level") + and self._concurrency_level is not None + ): + assert ( + state_manager is not None + ), "No ConnectorStateManager was created, but it is required for incremental syncs. This is unexpected. Please contact Support." 
+ + cursor = self.cursor_cls( + stream_config, + stream_config.name, + None, + stream_state, + self.message_repository, + state_manager, + CursorField(DefaultFileBasedStream.ab_last_mod_col), + ) + stream = FileBasedStreamFacade.create_from_stream( + stream=self._make_default_stream( + stream_config=stream_config, cursor=cursor, use_file_transfer=self._use_file_transfer(parsed_config) + ), + source=self, + logger=self.logger, + state=stream_state, + cursor=cursor, + ) + else: + cursor = self.cursor_cls(stream_config) + stream = self._make_default_stream( + stream_config=stream_config, cursor=cursor, use_file_transfer=self._use_file_transfer(parsed_config) + ) + + streams.append(stream) + return streams + + except ValidationError as exc: + raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) from exc + + def _make_default_stream( + self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor], use_file_transfer: bool = False + ) -> AbstractFileBasedStream: + return DefaultFileBasedStream( + config=stream_config, + catalog_schema=self.stream_schemas.get(stream_config.name), + stream_reader=self.stream_reader, + availability_strategy=self.availability_strategy, + discovery_policy=self.discovery_policy, + parsers=self.parsers, + validation_policy=self._validate_and_get_validation_policy(stream_config), + errors_collector=self.errors_collector, + cursor=cursor, + use_file_transfer=use_file_transfer, + ) + + def _get_stream_from_catalog(self, stream_config: FileBasedStreamConfig) -> Optional[AirbyteStream]: + if self.catalog: + for stream in self.catalog.streams or []: + if stream.stream.name == stream_config.name: + return stream.stream + return None + + def _get_sync_mode_from_catalog(self, stream_name: str) -> Optional[SyncMode]: + if self.catalog: + for catalog_stream in self.catalog.streams: + if stream_name == catalog_stream.stream.name: + return catalog_stream.sync_mode + self.logger.warning(f"No sync mode was found for {stream_name}.") + return None + + def read( + self, + logger: logging.Logger, + config: Mapping[str, Any], + catalog: ConfiguredAirbyteCatalog, + state: Optional[List[AirbyteStateMessage]] = None, + ) -> Iterator[AirbyteMessage]: + yield from super().read(logger, config, catalog, state) + # emit all the errors collected + yield from self.errors_collector.yield_and_raise_collected() + # count streams using a certain parser + parsed_config = self._get_parsed_config(config) + for parser, count in Counter(stream.format.filetype for stream in parsed_config.streams).items(): + yield create_analytics_message(f"file-cdk-{parser}-stream-count", count) + + def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification: + """ + Returns the specification describing what fields can be configured by a user when setting up a file-based source. 
+ """ + + return ConnectorSpecification( + documentationUrl=self.spec_class.documentation_url(), + connectionSpecification=self.spec_class.schema(), + ) + + def _get_parsed_config(self, config: Mapping[str, Any]) -> AbstractFileBasedSpec: + return self.spec_class(**config) + + def _validate_and_get_validation_policy(self, stream_config: FileBasedStreamConfig) -> AbstractSchemaValidationPolicy: + if stream_config.validation_policy not in self.validation_policies: + # This should never happen because we validate the config against the schema's validation_policy enum + raise ValidationError( + f"`validation_policy` must be one of {list(self.validation_policies.keys())}", model=FileBasedStreamConfig + ) + return self.validation_policies[stream_config.validation_policy] + + def _validate_input_schema(self, stream_config: FileBasedStreamConfig) -> None: + if stream_config.schemaless and stream_config.input_schema: + raise ValidationError("`input_schema` and `schemaless` options cannot both be set", model=FileBasedStreamConfig) + + @staticmethod + def _use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool: + use_file_transfer = ( + hasattr(parsed_config.delivery_method, "delivery_type") and parsed_config.delivery_method.delivery_type == "use_file_transfer" + ) + return use_file_transfer diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_based_stream_reader.py new file mode 100644 index 000000000000..d98513daebc9 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -0,0 +1,159 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from abc import ABC, abstractmethod +from datetime import datetime +from enum import Enum +from io import IOBase +from os import makedirs, path +from typing import Any, Dict, Iterable, List, Optional, Set + +from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from wcmatch.glob import GLOBSTAR, globmatch + + +class FileReadMode(Enum): + READ = "r" + READ_BINARY = "rb" + + +class AbstractFileBasedStreamReader(ABC): + DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" + + def __init__(self) -> None: + self._config = None + + @property + def config(self) -> Optional[AbstractFileBasedSpec]: + return self._config + + @config.setter + @abstractmethod + def config(self, value: AbstractFileBasedSpec) -> None: + """ + FileBasedSource reads the config from disk and parses it, and once parsed, the source sets the config on its StreamReader. + + Note: FileBasedSource only requires the keys defined in the abstract config, whereas concrete implementations of StreamReader + will require keys that (for example) allow it to authenticate with the 3rd party. + + Therefore, concrete implementations of AbstractFileBasedStreamReader's config setter should assert that `value` is of the correct + config type for that type of StreamReader. + """ + ... + + @abstractmethod + def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase: + """ + Return a file handle for reading. + + Many sources will be able to use smart_open to implement this method, + for example: + + client = boto3.Session(...) + return smart_open.open(remote_file.uri, transport_params={"client": client}) + """ + ... 
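For implementers, the `open_file` contract above can be satisfied with very little code. The sketch below assumes a hypothetical reader whose `RemoteFile.uri` values are plain local paths; a real connector would typically delegate to `smart_open` or a cloud SDK instead, as the docstring suggests. `open_local_file` is an illustrative name, not a CDK function.

```python
import logging
from io import IOBase
from typing import Optional

from airbyte_cdk.sources.file_based.file_based_stream_reader import FileReadMode
from airbyte_cdk.sources.file_based.remote_file import RemoteFile


def open_local_file(file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
    """Open a file the way a concrete `open_file` implementation might, assuming `file.uri` is a local path."""
    logger.debug("Opening %s in mode %s", file.uri, mode.value)
    if mode == FileReadMode.READ_BINARY:
        # Binary mode ignores the encoding, mirroring the behavior of the built-in `open`.
        return open(file.uri, mode.value)
    return open(file.uri, mode.value, encoding=encoding)
```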
+ + @abstractmethod + def get_matching_files( + self, + globs: List[str], + prefix: Optional[str], + logger: logging.Logger, + ) -> Iterable[RemoteFile]: + """ + Return all files that match any of the globs. + + Example: + + The source has files "a.json", "foo/a.json", "foo/bar/a.json" + + If globs = ["*.json"] then this method returns ["a.json"]. + + If globs = ["foo/*.json"] then this method returns ["foo/a.json"]. + + Utility method `self.filter_files_by_globs` and `self.get_prefixes_from_globs` + are available, which may be helpful when implementing this method. + """ + ... + + def filter_files_by_globs_and_start_date(self, files: List[RemoteFile], globs: List[str]) -> Iterable[RemoteFile]: + """ + Utility method for filtering files based on globs. + """ + start_date = datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT) if self.config and self.config.start_date else None + seen = set() + + for file in files: + if self.file_matches_globs(file, globs): + if file.uri not in seen and (not start_date or file.last_modified >= start_date): + seen.add(file.uri) + yield file + + @abstractmethod + def file_size(self, file: RemoteFile) -> int: + """Utility method to get size of the remote file. + + This is required for connectors that will support writing to + files. If the connector does not support writing files, then the + subclass can simply `return 0`. + """ + ... + + @staticmethod + def file_matches_globs(file: RemoteFile, globs: List[str]) -> bool: + # Use the GLOBSTAR flag to enable recursive ** matching + # (https://facelessuser.github.io/wcmatch/wcmatch/#globstar) + return any(globmatch(file.uri, g, flags=GLOBSTAR) for g in globs) + + @staticmethod + def get_prefixes_from_globs(globs: List[str]) -> Set[str]: + """ + Utility method for extracting prefixes from the globs. + """ + prefixes = {glob.split("*")[0] for glob in globs} + return set(filter(lambda x: bool(x), prefixes)) + + def use_file_transfer(self) -> bool: + if self.config: + use_file_transfer = ( + hasattr(self.config.delivery_method, "delivery_type") and self.config.delivery_method.delivery_type == "use_file_transfer" + ) + return use_file_transfer + return False + + @abstractmethod + def get_file(self, file: RemoteFile, local_directory: str, logger: logging.Logger) -> Dict[str, Any]: + """ + This is required for connectors that will support writing to + files. It will handle the logic to download,get,read,acquire or + whatever is more efficient to get a file from the source. + + Args: + file (RemoteFile): The remote file object containing URI and metadata. + local_directory (str): The local directory path where the file will be downloaded. + logger (logging.Logger): Logger for logging information and errors. + + Returns: + dict: A dictionary containing the following: + - "file_url" (str): The absolute path of the downloaded file. + - "bytes" (int): The file size in bytes. + - "file_relative_path" (str): The relative path of the file for local storage. Is relative to local_directory as + this a mounted volume in the pod container. + + """ + ... 
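Similarly, the `get_file` contract above (fetch the file locally, then report its absolute path, size, and relative path) can be sketched for the same hypothetical local-path case. The path handling deliberately mirrors the `_get_file_transfer_paths` helper defined just below; `get_local_file` is an illustrative name only, and a real reader would download from its remote store rather than copy.

```python
import logging
import shutil
from os import makedirs, path
from typing import Any, Dict

from airbyte_cdk.sources.file_based.remote_file import RemoteFile


def get_local_file(file: RemoteFile, local_directory: str, logger: logging.Logger) -> Dict[str, Any]:
    """Copy `file` into `local_directory` and return the dictionary described in the `get_file` docstring."""
    # Strip leading slashes so the file lands under local_directory, then make sure the target directory exists.
    file_relative_path = file.uri.lstrip("/")
    local_file_path = path.join(local_directory, file_relative_path)
    makedirs(path.dirname(local_file_path), exist_ok=True)

    logger.info("Fetching %s to %s", file.uri, local_file_path)
    shutil.copyfile(file.uri, local_file_path)  # a real connector would download from the remote source here

    return {
        "file_url": path.abspath(local_file_path),
        "bytes": path.getsize(local_file_path),
        "file_relative_path": file_relative_path,
    }
```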
+ + @staticmethod + def _get_file_transfer_paths(file: RemoteFile, local_directory: str) -> List[str]: + # Remove left slashes from source path format to make relative path for writing locally + file_relative_path = file.uri.lstrip("/") + local_file_path = path.join(local_directory, file_relative_path) + + # Ensure the local directory exists + makedirs(path.dirname(local_file_path), exist_ok=True) + absolute_file_path = path.abspath(local_file_path) + return [file_relative_path, local_file_path, absolute_file_path] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/__init__.py new file mode 100644 index 000000000000..083df8f17235 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/__init__.py @@ -0,0 +1,28 @@ +from typing import Any, Mapping, Type + +from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat +from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat +from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat +from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat +from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat +from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat + +from .avro_parser import AvroParser +from .csv_parser import CsvParser +from .excel_parser import ExcelParser +from .file_type_parser import FileTypeParser +from .jsonl_parser import JsonlParser +from .parquet_parser import ParquetParser +from .unstructured_parser import UnstructuredParser +from .file_transfer import FileTransfer + +default_parsers: Mapping[Type[Any], FileTypeParser] = { + AvroFormat: AvroParser(), + CsvFormat: CsvParser(), + ExcelFormat: ExcelParser(), + JsonlFormat: JsonlParser(), + ParquetFormat: ParquetParser(), + UnstructuredFormat: UnstructuredParser(), +} + +__all__ = ["AvroParser", "CsvParser", "ExcelParser", "JsonlParser", "ParquetParser", "UnstructuredParser", "FileTransfer", "default_parsers"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py new file mode 100644 index 000000000000..b033afa57fb3 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py @@ -0,0 +1,188 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import logging +from typing import Any, Dict, Iterable, Mapping, Optional, Tuple + +import fastavro +from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType + +AVRO_TYPE_TO_JSON_TYPE = { + "null": "null", + "boolean": "boolean", + "int": "integer", + "long": "integer", + "float": "number", + "double": "string", # double -> number conversions can lose precision + "bytes": "string", + "string": "string", +} + +AVRO_LOGICAL_TYPE_TO_JSON = { + "decimal": {"type": "string"}, + "uuid": {"type": "string"}, + "date": {"type": "string", "format": "date"}, + "time-millis": {"type": "integer"}, + "time-micros": {"type": "integer"}, + "timestamp-millis": {"type": "string", "format": "date-time"}, + "timestamp-micros": {"type": "string"}, + "local-timestamp-millis": {"type": "string", "format": "date-time"}, + "local-timestamp-micros": {"type": "string"}, + # fastavro does not support duration https://fastavro.readthedocs.io/en/latest/logical_types.html +} + + +class AvroParser(FileTypeParser): + ENCODING = None + + def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]: + """ + AvroParser does not require config checks, implicit pydantic validation is enough. + """ + return True, None + + async def infer_schema( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + ) -> SchemaType: + avro_format = config.format + if not isinstance(avro_format, AvroFormat): + raise ValueError(f"Expected AvroFormat, got {avro_format}") + + with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: + avro_reader = fastavro.reader(fp) + avro_schema = avro_reader.writer_schema + if not avro_schema["type"] == "record": + unsupported_type = avro_schema["type"] + raise ValueError(f"Only record based avro files are supported. Found {unsupported_type}") + json_schema = { + field["name"]: AvroParser._convert_avro_type_to_json(avro_format, field["name"], field["type"]) + for field in avro_schema["fields"] + } + return json_schema + + @classmethod + def _convert_avro_type_to_json(cls, avro_format: AvroFormat, field_name: str, avro_field: str) -> Mapping[str, Any]: + if isinstance(avro_field, str) and avro_field in AVRO_TYPE_TO_JSON_TYPE: + # Legacy behavior to retain backwards compatibility. Long term we should always represent doubles as strings
+ if avro_field == "double" and not avro_format.double_as_string: + return {"type": "number"} + return {"type": AVRO_TYPE_TO_JSON_TYPE[avro_field]} + if isinstance(avro_field, Mapping): + if avro_field["type"] == "record": + return { + "type": "object", + "properties": { + object_field["name"]: AvroParser._convert_avro_type_to_json(avro_format, object_field["name"], object_field["type"]) + for object_field in avro_field["fields"] + }, + } + elif avro_field["type"] == "array": + if "items" not in avro_field: + raise ValueError(f"{field_name} array type does not have a required field items") + return {"type": "array", "items": AvroParser._convert_avro_type_to_json(avro_format, "", avro_field["items"])} + elif avro_field["type"] == "enum": + if "symbols" not in avro_field: + raise ValueError(f"{field_name} enum type does not have a required field symbols") + if "name" not in avro_field: + raise ValueError(f"{field_name} enum type does not have a required field name") + return {"type": "string", "enum": avro_field["symbols"]} + elif avro_field["type"] == "map": + if "values" not in avro_field: + raise ValueError(f"{field_name} map type does not have a required field values") + return { + "type": "object", + "additionalProperties": AvroParser._convert_avro_type_to_json(avro_format, "", avro_field["values"]), + } + elif avro_field["type"] == "fixed" and avro_field.get("logicalType") != "duration": + if "size" not in avro_field: + raise ValueError(f"{field_name} fixed type does not have a required field size") + if not isinstance(avro_field["size"], int): + raise ValueError(f"{field_name} fixed type size value is not an integer") + return { + "type": "string", + "pattern": f"^[0-9A-Fa-f]{{{avro_field['size'] * 2}}}$", + } + elif avro_field.get("logicalType") == "decimal": + if "precision" not in avro_field: + raise ValueError(f"{field_name} decimal type does not have a required field precision") + if "scale" not in avro_field: + raise ValueError(f"{field_name} decimal type does not have a required field scale") + max_whole_number_range = avro_field["precision"] - avro_field["scale"] + decimal_range = avro_field["scale"] + + # This regex looks like a mess, but it validates at least one whole-number digit with optional fractional digits + # For example: ^-?\d{1,5}(?:\.\d{1,3})?$ would accept 12345.123 and reject 123456.12345 + return {"type": "string", "pattern": f"^-?\\d{{1,{max_whole_number_range}}}(?:\\.\\d{{1,{decimal_range}}})?$"} + elif "logicalType" in avro_field: + if avro_field["logicalType"] not in AVRO_LOGICAL_TYPE_TO_JSON: + raise ValueError(f"{avro_field['logicalType']} is not a valid Avro logical type") + return AVRO_LOGICAL_TYPE_TO_JSON[avro_field["logicalType"]] + else: + raise ValueError(f"Unsupported avro type: {avro_field}") + else: + raise ValueError(f"Unsupported avro type: {avro_field}") + + def parse_records( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + discovered_schema: Optional[Mapping[str, SchemaType]], + ) -> Iterable[Dict[str, Any]]: + avro_format = config.format or AvroFormat(filetype="avro") + if not isinstance(avro_format, AvroFormat): + raise ValueError(f"Expected AvroFormat, got {avro_format}") + + line_no = 0 + try: + with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: + avro_reader = fastavro.reader(fp) + schema = avro_reader.writer_schema + schema_field_name_to_type = 
{field["name"]: field["type"] for field in schema["fields"]} + for record in avro_reader: + line_no += 1 + yield { + record_field: self._to_output_value(avro_format, schema_field_name_to_type[record_field], record[record_field]) + for record_field, record_value in schema_field_name_to_type.items() + } + except Exception as exc: + raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from exc + + @property + def file_read_mode(self) -> FileReadMode: + return FileReadMode.READ_BINARY + + @staticmethod + def _to_output_value(avro_format: AvroFormat, record_type: Mapping[str, Any], record_value: Any) -> Any: + if isinstance(record_value, bytes): + return record_value.decode() + elif not isinstance(record_type, Mapping): + if record_type == "double" and avro_format.double_as_string: + return str(record_value) + return record_value + if record_type.get("logicalType") in ("decimal", "uuid"): + return str(record_value) + elif record_type.get("logicalType") == "date": + return record_value.isoformat() + elif record_type.get("logicalType") == "timestamp-millis": + return record_value.isoformat(sep="T", timespec="milliseconds") + elif record_type.get("logicalType") == "timestamp-micros": + return record_value.isoformat(sep="T", timespec="microseconds") + elif record_type.get("logicalType") == "local-timestamp-millis": + return record_value.isoformat(sep="T", timespec="milliseconds") + elif record_type.get("logicalType") == "local-timestamp-micros": + return record_value.isoformat(sep="T", timespec="microseconds") + else: + return record_value diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py new file mode 100644 index 000000000000..961fc8f14cad --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -0,0 +1,463 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import csv +import json +import logging +from abc import ABC, abstractmethod +from collections import defaultdict +from functools import partial +from io import IOBase +from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set, Tuple +from uuid import uuid4 + +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated, CsvHeaderUserProvided, InferenceType +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING, SchemaType +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from orjson import orjson + +DIALECT_NAME = "_config_dialect" + + +class _CsvReader: + def read_data( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + file_read_mode: FileReadMode, + ) -> Generator[Dict[str, Any], None, None]: + config_format = _extract_format(config) + lineno = 0 + + # Formats are configured individually per-stream so a unique dialect should be registered for each stream. + # We don't unregister the dialect because we are lazily parsing each csv file to generate records + # Give each stream's dialect a unique name; otherwise, when we are doing a concurrent sync we can end up + # with a race condition where a thread attempts to use a dialect before a separate thread has finished + # registering it. + dialect_name = f"{config.name}_{str(uuid4())}_{DIALECT_NAME}" + csv.register_dialect( + dialect_name, + delimiter=config_format.delimiter, + quotechar=config_format.quote_char, + escapechar=config_format.escape_char, + doublequote=config_format.double_quote, + quoting=csv.QUOTE_MINIMAL, + ) + with stream_reader.open_file(file, file_read_mode, config_format.encoding, logger) as fp: + try: + headers = self._get_headers(fp, config_format, dialect_name) + except UnicodeError: + raise AirbyteTracedException( + message=f"{FileBasedSourceError.ENCODING_ERROR.value} Expected encoding: {config_format.encoding}", + ) + + rows_to_skip = ( + config_format.skip_rows_before_header + + (1 if config_format.header_definition.has_header_row() else 0) + + config_format.skip_rows_after_header + ) + self._skip_rows(fp, rows_to_skip) + lineno += rows_to_skip + + reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=headers) # type: ignore + try: + for row in reader: + lineno += 1 + + # The row was not properly parsed if any of the values are None. 
This will most likely occur if there are more columns + # than headers or more headers dans columns + if None in row: + if config_format.ignore_errors_on_fields_mismatch: + logger.error(f"Skipping record in line {lineno} of file {file.uri}; invalid CSV row with missing column.") + else: + raise RecordParseError( + FileBasedSourceError.ERROR_PARSING_RECORD_MISMATCHED_COLUMNS, + filename=file.uri, + lineno=lineno, + ) + if None in row.values(): + if config_format.ignore_errors_on_fields_mismatch: + logger.error(f"Skipping record in line {lineno} of file {file.uri}; invalid CSV row with extra column.") + else: + raise RecordParseError( + FileBasedSourceError.ERROR_PARSING_RECORD_MISMATCHED_ROWS, filename=file.uri, lineno=lineno + ) + yield row + finally: + # due to RecordParseError or GeneratorExit + csv.unregister_dialect(dialect_name) + + def _get_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]: + """ + Assumes the fp is pointing to the beginning of the files and will reset it as such + """ + # Note that this method assumes the dialect has already been registered if we're parsing the headers + if isinstance(config_format.header_definition, CsvHeaderUserProvided): + return config_format.header_definition.column_names # type: ignore # should be CsvHeaderUserProvided given the type + + if isinstance(config_format.header_definition, CsvHeaderAutogenerated): + self._skip_rows(fp, config_format.skip_rows_before_header + config_format.skip_rows_after_header) + headers = self._auto_generate_headers(fp, dialect_name) + else: + # Then read the header + self._skip_rows(fp, config_format.skip_rows_before_header) + reader = csv.reader(fp, dialect=dialect_name) # type: ignore + headers = list(next(reader)) + + fp.seek(0) + return headers + + def _auto_generate_headers(self, fp: IOBase, dialect_name: str) -> List[str]: + """ + Generates field names as [f0, f1, ...] in the same way as pyarrow's csv reader with autogenerate_column_names=True. + See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html + """ + reader = csv.reader(fp, dialect=dialect_name) # type: ignore + number_of_columns = len(next(reader)) # type: ignore + return [f"f{i}" for i in range(number_of_columns)] + + @staticmethod + def _skip_rows(fp: IOBase, rows_to_skip: int) -> None: + """ + Skip rows before the header. This has to be done on the file object itself, not the reader + """ + for _ in range(rows_to_skip): + fp.readline() + + +class CsvParser(FileTypeParser): + _MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000 + + def __init__(self, csv_reader: Optional[_CsvReader] = None, csv_field_max_bytes: int = 2**31): + # Increase the maximum length of data that can be parsed in a single CSV field. The default is 128k, which is typically sufficient + # but given the use of Airbyte in loading a large variety of data it is best to allow for a larger maximum field size to avoid + # skipping data on load. https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072 + csv.field_size_limit(csv_field_max_bytes) + self._csv_reader = csv_reader if csv_reader else _CsvReader() + + def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]: + """ + CsvParser does not require config checks, implicit pydantic validation is enough. 
+ """ + return True, None + + async def infer_schema( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + ) -> SchemaType: + input_schema = config.get_input_schema() + if input_schema: + return input_schema + + # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual + # sources will likely require one. Rather than modify the interface now we can wait until the real use case + config_format = _extract_format(config) + type_inferrer_by_field: Dict[str, _TypeInferrer] = defaultdict( + lambda: _JsonTypeInferrer(config_format.true_values, config_format.false_values, config_format.null_values) + if config_format.inference_type != InferenceType.NONE + else _DisabledTypeInferrer() + ) + data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) + read_bytes = 0 + for row in data_generator: + for header, value in row.items(): + type_inferrer_by_field[header].add_value(value) + # This is not accurate as a representation of how many bytes were read because csv does some processing on the actual value + # before returning. Given we would like to be more accurate, we could wrap the IO file using a decorator + read_bytes += len(value) + read_bytes += len(row) - 1 # for separators + if read_bytes >= self._MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE: + break + + if not type_inferrer_by_field: + raise AirbyteTracedException( + message=f"Could not infer schema as there are no rows in {file.uri}. If having an empty CSV file is expected, ignore this. " + f"Else, please contact Airbyte.", + failure_type=FailureType.config_error, + ) + schema = {header.strip(): {"type": type_inferred.infer()} for header, type_inferred in type_inferrer_by_field.items()} + data_generator.close() + return schema + + def parse_records( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + discovered_schema: Optional[Mapping[str, SchemaType]], + ) -> Iterable[Dict[str, Any]]: + line_no = 0 + try: + config_format = _extract_format(config) + if discovered_schema: + property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()} # type: ignore # discovered_schema["properties"] is known to be a mapping + deduped_property_types = CsvParser._pre_propcess_property_types(property_types) + else: + deduped_property_types = {} + cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger, config.schemaless) + data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) + for row in data_generator: + line_no += 1 + yield CsvParser._to_nullable( + cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null + ) + except RecordParseError as parse_err: + raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from parse_err + finally: + data_generator.close() + + @property + def file_read_mode(self) -> FileReadMode: + return FileReadMode.READ + + @staticmethod + def _get_cast_function( + deduped_property_types: Mapping[str, str], config_format: CsvFormat, logger: logging.Logger, schemaless: bool + ) -> Callable[[Mapping[str, str]], Mapping[str, str]]: + # Only cast values if the schema is provided + if deduped_property_types and not schemaless: + return partial(CsvParser._cast_types, 
deduped_property_types=deduped_property_types, config_format=config_format, logger=logger) + else: + # If no schema is provided, yield the rows as they are + return _no_cast + + @staticmethod + def _to_nullable( + row: Mapping[str, str], deduped_property_types: Mapping[str, str], null_values: Set[str], strings_can_be_null: bool + ) -> Dict[str, Optional[str]]: + nullable = { + k: None if CsvParser._value_is_none(v, deduped_property_types.get(k), null_values, strings_can_be_null) else v + for k, v in row.items() + } + return nullable + + @staticmethod + def _value_is_none(value: Any, deduped_property_type: Optional[str], null_values: Set[str], strings_can_be_null: bool) -> bool: + return value in null_values and (strings_can_be_null or deduped_property_type != "string") + + @staticmethod + def _pre_propcess_property_types(property_types: Dict[str, Any]) -> Mapping[str, str]: + """ + Transform the property types to be non-nullable and remove duplicate types if any. + Sample input: + { + "col1": ["string", "null"], + "col2": ["string", "string", "null"], + "col3": "integer" + } + + Sample output: + { + "col1": "string", + "col2": "string", + "col3": "integer", + } + """ + output = {} + for prop, prop_type in property_types.items(): + if isinstance(prop_type, list): + prop_type_distinct = set(prop_type) + prop_type_distinct.remove("null") + if len(prop_type_distinct) != 1: + raise ValueError(f"Could not get non nullable type from {prop_type}") + output[prop] = next(iter(prop_type_distinct)) + else: + output[prop] = prop_type + return output + + @staticmethod + def _cast_types( + row: Dict[str, str], deduped_property_types: Mapping[str, str], config_format: CsvFormat, logger: logging.Logger + ) -> Dict[str, Any]: + """ + Casts the values in the input 'row' dictionary according to the types defined in the JSON schema. + + Array and object types are only handled if they can be deserialized as JSON. + + If any errors are encountered, the value will be emitted as a string. 
+ """ + warnings = [] + result = {} + + for key, value in row.items(): + prop_type = deduped_property_types.get(key) + cast_value: Any = value + + if prop_type in TYPE_PYTHON_MAPPING and prop_type is not None: + _, python_type = TYPE_PYTHON_MAPPING[prop_type] + + if python_type is None: + if value == "": + cast_value = None + else: + warnings.append(_format_warning(key, value, prop_type)) + + elif python_type == bool: + try: + cast_value = _value_to_bool(value, config_format.true_values, config_format.false_values) + except ValueError: + warnings.append(_format_warning(key, value, prop_type)) + + elif python_type == dict: + try: + # we don't re-use _value_to_object here because we type the column as object as long as there is only one object + cast_value = orjson.loads(value) + except orjson.JSONDecodeError: + warnings.append(_format_warning(key, value, prop_type)) + + elif python_type == list: + try: + cast_value = _value_to_list(value) + except (ValueError, json.JSONDecodeError): + warnings.append(_format_warning(key, value, prop_type)) + + elif python_type: + try: + cast_value = _value_to_python_type(value, python_type) + except ValueError: + warnings.append(_format_warning(key, value, prop_type)) + + result[key] = cast_value + + if warnings: + logger.warning( + f"{FileBasedSourceError.ERROR_CASTING_VALUE.value}: {','.join([w for w in warnings])}", + ) + return result + + +class _TypeInferrer(ABC): + @abstractmethod + def add_value(self, value: Any) -> None: + pass + + @abstractmethod + def infer(self) -> str: + pass + + +class _DisabledTypeInferrer(_TypeInferrer): + def add_value(self, value: Any) -> None: + pass + + def infer(self) -> str: + return "string" + + +class _JsonTypeInferrer(_TypeInferrer): + _NULL_TYPE = "null" + _BOOLEAN_TYPE = "boolean" + _INTEGER_TYPE = "integer" + _NUMBER_TYPE = "number" + _STRING_TYPE = "string" + + def __init__(self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str]) -> None: + self._boolean_trues = boolean_trues + self._boolean_falses = boolean_falses + self._null_values = null_values + self._values: Set[str] = set() + + def add_value(self, value: Any) -> None: + self._values.add(value) + + def infer(self) -> str: + types_by_value = {value: self._infer_type(value) for value in self._values} + types_excluding_null_values = [types for types in types_by_value.values() if self._NULL_TYPE not in types] + if not types_excluding_null_values: + # this is highly unusual but we will consider the column as a string + return self._STRING_TYPE + + types = set.intersection(*types_excluding_null_values) + if self._BOOLEAN_TYPE in types: + return self._BOOLEAN_TYPE + elif self._INTEGER_TYPE in types: + return self._INTEGER_TYPE + elif self._NUMBER_TYPE in types: + return self._NUMBER_TYPE + return self._STRING_TYPE + + def _infer_type(self, value: str) -> Set[str]: + inferred_types = set() + + if value in self._null_values: + inferred_types.add(self._NULL_TYPE) + if self._is_boolean(value): + inferred_types.add(self._BOOLEAN_TYPE) + if self._is_integer(value): + inferred_types.add(self._INTEGER_TYPE) + inferred_types.add(self._NUMBER_TYPE) + elif self._is_number(value): + inferred_types.add(self._NUMBER_TYPE) + + inferred_types.add(self._STRING_TYPE) + return inferred_types + + def _is_boolean(self, value: str) -> bool: + try: + _value_to_bool(value, self._boolean_trues, self._boolean_falses) + return True + except ValueError: + return False + + @staticmethod + def _is_integer(value: str) -> bool: + try: + _value_to_python_type(value, int) + 
return True + except ValueError: + return False + + @staticmethod + def _is_number(value: str) -> bool: + try: + _value_to_python_type(value, float) + return True + except ValueError: + return False + + +def _value_to_bool(value: str, true_values: Set[str], false_values: Set[str]) -> bool: + if value in true_values: + return True + if value in false_values: + return False + raise ValueError(f"Value {value} is not a valid boolean value") + + +def _value_to_list(value: str) -> List[Any]: + parsed_value = json.loads(value) + if isinstance(parsed_value, list): + return parsed_value + raise ValueError(f"Value {parsed_value} is not a valid list value") + + +def _value_to_python_type(value: str, python_type: type) -> Any: + return python_type(value) + + +def _format_warning(key: str, value: str, expected_type: Optional[Any]) -> str: + return f"{key}: value={value},expected_type={expected_type}" + + +def _no_cast(row: Mapping[str, str]) -> Mapping[str, str]: + return row + + +def _extract_format(config: FileBasedStreamConfig) -> CsvFormat: + config_format = config.format + if not isinstance(config_format, CsvFormat): + raise ValueError(f"Invalid format config: {config_format}") + return config_format diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/excel_parser.py new file mode 100644 index 000000000000..93add4108dea --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -0,0 +1,172 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +import logging +from io import IOBase +from pathlib import Path +from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union + +import pandas as pd +from airbyte_cdk.sources.file_based.config.file_based_stream_config import ExcelFormat, FileBasedStreamConfig +from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType +from numpy import datetime64 +from numpy import dtype as dtype_ +from numpy import issubdtype +from orjson import orjson +from pydantic.v1 import BaseModel + + +class ExcelParser(FileTypeParser): + ENCODING = None + + def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]: + """ + ExcelParser does not require config checks, implicit pydantic validation is enough. + """ + return True, None + + async def infer_schema( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + ) -> SchemaType: + """ + Infers the schema of the Excel file by examining its contents. + + Args: + config (FileBasedStreamConfig): Configuration for the file-based stream. + file (RemoteFile): The remote file to be read. + stream_reader (AbstractFileBasedStreamReader): Reader to read the file. + logger (logging.Logger): Logger for logging information and errors. + + Returns: + SchemaType: Inferred schema of the Excel file. 
+ """ + + # Validate the format of the config + self.validate_format(config.format, logger) + + fields: Dict[str, str] = {} + + with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: + df = self.open_and_parse_file(fp) + for column, df_type in df.dtypes.items(): + # Choose the broadest data type if the column's data type differs in dataframes + prev_frame_column_type = fields.get(column) + fields[column] = self.dtype_to_json_type(prev_frame_column_type, df_type) + + schema = { + field: ({"type": "string", "format": "date-time"} if fields[field] == "date-time" else {"type": fields[field]}) + for field in fields + } + return schema + + def parse_records( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + discovered_schema: Optional[Mapping[str, SchemaType]] = None, + ) -> Iterable[Dict[str, Any]]: + """ + Parses records from an Excel file based on the provided configuration. + + Args: + config (FileBasedStreamConfig): Configuration for the file-based stream. + file (RemoteFile): The remote file to be read. + stream_reader (AbstractFileBasedStreamReader): Reader to read the file. + logger (logging.Logger): Logger for logging information and errors. + discovered_schema (Optional[Mapping[str, SchemaType]]): Discovered schema for validation. + + Yields: + Iterable[Dict[str, Any]]: Parsed records from the Excel file. + """ + + # Validate the format of the config + self.validate_format(config.format, logger) + + try: + # Open and parse the file using the stream reader + with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: + df = self.open_and_parse_file(fp) + # Yield records as dictionaries + # DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson + # DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior + # see PR description: https://github.com/airbytehq/airbyte/pull/44444/ + yield from orjson.loads(df.to_json(orient="records", date_format="iso", date_unit="us")) + + except Exception as exc: + # Raise a RecordParseError if any exception occurs during parsing + raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri) from exc + + @property + def file_read_mode(self) -> FileReadMode: + """ + Returns the file read mode for the Excel file. + + Returns: + FileReadMode: The file read mode (binary). + """ + return FileReadMode.READ_BINARY + + @staticmethod + def dtype_to_json_type(current_type: Optional[str], dtype: dtype_) -> str: + """ + Convert Pandas DataFrame types to Airbyte Types. + + Args: + current_type (Optional[str]): One of the previous types based on earlier dataframes. + dtype: Pandas DataFrame type. + + Returns: + str: Corresponding Airbyte Type. + """ + number_types = ("int64", "float64") + if current_type == "string": + # Previous column values were of the string type, no need to look further. + return current_type + if dtype == object: + return "string" + if dtype in number_types and (not current_type or current_type == "number"): + return "number" + if dtype == "bool" and (not current_type or current_type == "boolean"): + return "boolean" + if issubdtype(dtype, datetime64): + return "date-time" + return "string" + + @staticmethod + def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None: + """ + Validates if the given format is of type ExcelFormat. 
+ + Args: + excel_format (Any): The format to be validated. + + Raises: + ConfigValidationError: If the format is not ExcelFormat. + """ + if not isinstance(excel_format, ExcelFormat): + logger.info(f"Expected ExcelFormat, got {excel_format}") + raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) + + @staticmethod + def open_and_parse_file(fp: Union[IOBase, str, Path]) -> pd.DataFrame: + """ + Opens and parses the Excel file. + + Args: + fp: File pointer to the Excel file. + + Returns: + pd.DataFrame: Parsed data from the Excel file. + """ + return pd.ExcelFile(fp, engine="calamine").parse() diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_transfer.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_transfer.py new file mode 100644 index 000000000000..e34818670289 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_transfer.py @@ -0,0 +1,31 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# +import logging +import os +from typing import Any, Dict, Iterable + +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader +from airbyte_cdk.sources.file_based.remote_file import RemoteFile + +AIRBYTE_STAGING_DIRECTORY = os.getenv("AIRBYTE_STAGING_DIRECTORY", "/staging/files") +DEFAULT_LOCAL_DIRECTORY = "/tmp/airbyte-file-transfer" + + +class FileTransfer: + def __init__(self) -> None: + self._local_directory = AIRBYTE_STAGING_DIRECTORY if os.path.exists(AIRBYTE_STAGING_DIRECTORY) else DEFAULT_LOCAL_DIRECTORY + + def get_file( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + ) -> Iterable[Dict[str, Any]]: + try: + yield stream_reader.get_file(file=file, local_directory=self._local_directory, logger=logger) + except Exception as ex: + logger.error("An error has occurred while getting file: %s", str(ex)) + raise ex diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py new file mode 100644 index 000000000000..d334621ada66 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py @@ -0,0 +1,83 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from abc import ABC, abstractmethod +from typing import Any, Dict, Iterable, Mapping, Optional, Tuple + +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType + +Record = Dict[str, Any] + + +class FileTypeParser(ABC): + """ + An abstract class containing methods that must be implemented for each + supported file type. + """ + + @property + def parser_max_n_files_for_schema_inference(self) -> Optional[int]: + """ + The discovery policy decides how many files are loaded for schema inference. This method can provide a parser-specific override. If it's defined, the smaller of the two values will be used. 
+ """ + return None + + @property + def parser_max_n_files_for_parsability(self) -> Optional[int]: + """ + The availability policy decides how many files are loaded for checking whether parsing works correctly. This method can provide a parser-specific override. If it's defined, the smaller of the two values will be used. + """ + return None + + def get_parser_defined_primary_key(self, config: FileBasedStreamConfig) -> Optional[str]: + """ + The parser can define a primary key. If no user-defined primary key is provided, this will be used. + """ + return None + + @abstractmethod + def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]: + """ + Check whether the config is valid for this file type. If it is, return True and None. If it's not, return False and an error message explaining why it's invalid. + """ + return True, None + + @abstractmethod + async def infer_schema( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + ) -> SchemaType: + """ + Infer the JSON Schema for this file. + """ + ... + + @abstractmethod + def parse_records( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + discovered_schema: Optional[Mapping[str, SchemaType]], + ) -> Iterable[Record]: + """ + Parse and emit each record. + """ + ... + + @property + @abstractmethod + def file_read_mode(self) -> FileReadMode: + """ + The mode in which the file should be opened for reading. + """ + ... diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py new file mode 100644 index 000000000000..f0603f4ecd2f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py @@ -0,0 +1,130 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json +import logging +from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union + +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import PYTHON_TYPE_MAPPING, SchemaType, merge_schemas +from orjson import orjson + + +class JsonlParser(FileTypeParser): + + MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000 + ENCODING = "utf8" + + def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]: + """ + JsonlParser does not require config checks, implicit pydantic validation is enough. + """ + return True, None + + async def infer_schema( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + ) -> SchemaType: + """ + Infers the schema for the file by inferring the schema for each line, and merging + it with the previously-inferred schema. 
+ """ + inferred_schema: Mapping[str, Any] = {} + + for entry in self._parse_jsonl_entries(file, stream_reader, logger, read_limit=True): + line_schema = self._infer_schema_for_record(entry) + inferred_schema = merge_schemas(inferred_schema, line_schema) + + return inferred_schema + + def parse_records( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + discovered_schema: Optional[Mapping[str, SchemaType]], + ) -> Iterable[Dict[str, Any]]: + """ + This code supports parsing json objects over multiple lines even though this does not align with the JSONL format. This is for + backward compatibility reasons i.e. the previous source-s3 parser did support this. The drawback is: + * performance as the way we support json over multiple lines is very brute forced + * given that we don't have `newlines_in_values` config to scope the possible inputs, we might parse the whole file before knowing if + the input is improperly formatted or if the json is over multiple lines + + The goal is to run the V4 of source-s3 in production, track the warning log emitted when there are multiline json objects and + deprecate this feature if it's not a valid use case. + """ + yield from self._parse_jsonl_entries(file, stream_reader, logger) + + @classmethod + def _infer_schema_for_record(cls, record: Dict[str, Any]) -> Dict[str, Any]: + record_schema = {} + for key, value in record.items(): + if value is None: + record_schema[key] = {"type": "null"} + else: + record_schema[key] = {"type": PYTHON_TYPE_MAPPING[type(value)]} + + return record_schema + + @property + def file_read_mode(self) -> FileReadMode: + return FileReadMode.READ + + def _parse_jsonl_entries( + self, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + read_limit: bool = False, + ) -> Iterable[Dict[str, Any]]: + with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: + read_bytes = 0 + + had_json_parsing_error = False + has_warned_for_multiline_json_object = False + yielded_at_least_once = False + + accumulator = None + for line in fp: + if not accumulator: + accumulator = self._instantiate_accumulator(line) + read_bytes += len(line) + accumulator += line # type: ignore [operator] # In reality, it's either bytes or string and we add the same type + try: + record = orjson.loads(accumulator) + if had_json_parsing_error and not has_warned_for_multiline_json_object: + logger.warning(f"File at {file.uri} is using multiline JSON. Performance could be greatly reduced") + has_warned_for_multiline_json_object = True + + yield record + yielded_at_least_once = True + accumulator = self._instantiate_accumulator(line) + except orjson.JSONDecodeError: + had_json_parsing_error = True + + if read_limit and yielded_at_least_once and read_bytes >= self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE: + logger.warning( + f"Exceeded the maximum number of bytes per file for schema inference ({self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE}). " + f"Inferring schema from an incomplete set of records." 
+ ) + break + + if had_json_parsing_error and not yielded_at_least_once: + raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line) + + @staticmethod + def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]: + if isinstance(line, bytes): + return bytes("", json.detect_encoding(line)) + elif isinstance(line, str): + return "" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py new file mode 100644 index 000000000000..7e3d3013c4e0 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py @@ -0,0 +1,233 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json +import logging +import os +from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union +from urllib.parse import unquote + +import pyarrow as pa +import pyarrow.parquet as pq +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat +from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType +from pyarrow import DictionaryArray, Scalar + + +class ParquetParser(FileTypeParser): + + ENCODING = None + + def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]: + """ + ParquetParser does not require config checks, implicit pydantic validation is enough. 
+ """ + return True, None + + async def infer_schema( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + ) -> SchemaType: + parquet_format = config.format + if not isinstance(parquet_format, ParquetFormat): + raise ValueError(f"Expected ParquetFormat, got {parquet_format}") + + with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: + parquet_file = pq.ParquetFile(fp) + parquet_schema = parquet_file.schema_arrow + + # Inferred non-partition schema + schema = {field.name: ParquetParser.parquet_type_to_schema_type(field.type, parquet_format) for field in parquet_schema} + # Inferred partition schema + partition_columns = {partition.split("=")[0]: {"type": "string"} for partition in self._extract_partitions(file.uri)} + + schema.update(partition_columns) + return schema + + def parse_records( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + discovered_schema: Optional[Mapping[str, SchemaType]], + ) -> Iterable[Dict[str, Any]]: + parquet_format = config.format + if not isinstance(parquet_format, ParquetFormat): + logger.info(f"Expected ParquetFormat, got {parquet_format}") + raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) + + line_no = 0 + try: + with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: + reader = pq.ParquetFile(fp) + partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)} + for row_group in range(reader.num_row_groups): + batch = reader.read_row_group(row_group) + for row in range(batch.num_rows): + line_no += 1 + yield { + **{ + column: ParquetParser._to_output_value(batch.column(column)[row], parquet_format) + for column in batch.column_names + }, + **partition_columns, + } + except Exception as exc: + raise RecordParseError( + FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=f"{row_group=}, {line_no=}" + ) from exc + + @staticmethod + def _extract_partitions(filepath: str) -> List[str]: + return [unquote(partition) for partition in filepath.split(os.sep) if "=" in partition] + + @property + def file_read_mode(self) -> FileReadMode: + return FileReadMode.READ_BINARY + + @staticmethod + def _to_output_value(parquet_value: Union[Scalar, DictionaryArray], parquet_format: ParquetFormat) -> Any: + """ + Convert an entry in a pyarrow table to a value that can be output by the source. + """ + if isinstance(parquet_value, DictionaryArray): + return ParquetParser._dictionary_array_to_python_value(parquet_value) + else: + return ParquetParser._scalar_to_python_value(parquet_value, parquet_format) + + @staticmethod + def _scalar_to_python_value(parquet_value: Scalar, parquet_format: ParquetFormat) -> Any: + """ + Convert a pyarrow scalar to a value that can be output by the source. 
+ """ + if parquet_value.as_py() is None: + return None + + # Convert date and datetime objects to isoformat strings + if pa.types.is_time(parquet_value.type) or pa.types.is_timestamp(parquet_value.type) or pa.types.is_date(parquet_value.type): + return parquet_value.as_py().isoformat() + + # Convert month_day_nano_interval to array + if parquet_value.type == pa.month_day_nano_interval(): + return json.loads(json.dumps(parquet_value.as_py())) + + # Decode binary strings to utf-8 + if ParquetParser._is_binary(parquet_value.type): + return parquet_value.as_py().decode("utf-8") + + if pa.types.is_decimal(parquet_value.type): + if parquet_format.decimal_as_float: + return float(parquet_value.as_py()) + else: + return str(parquet_value.as_py()) + + if pa.types.is_map(parquet_value.type): + return {k: v for k, v in parquet_value.as_py()} + + if pa.types.is_null(parquet_value.type): + return None + + # Convert duration to seconds, then convert to the appropriate unit + if pa.types.is_duration(parquet_value.type): + duration = parquet_value.as_py() + duration_seconds = duration.total_seconds() + if parquet_value.type.unit == "s": + return duration_seconds + elif parquet_value.type.unit == "ms": + return duration_seconds * 1000 + elif parquet_value.type.unit == "us": + return duration_seconds * 1_000_000 + elif parquet_value.type.unit == "ns": + return duration_seconds * 1_000_000_000 + duration.nanoseconds + else: + raise ValueError(f"Unknown duration unit: {parquet_value.type.unit}") + else: + return parquet_value.as_py() + + @staticmethod + def _dictionary_array_to_python_value(parquet_value: DictionaryArray) -> Dict[str, Any]: + """ + Convert a pyarrow dictionary array to a value that can be output by the source. + + Dictionaries are stored as two columns: indices and values + The indices column is an array of integers that maps to the values column + """ + + return { + "indices": parquet_value.indices.tolist(), + "values": parquet_value.dictionary.tolist(), + } + + @staticmethod + def parquet_type_to_schema_type(parquet_type: pa.DataType, parquet_format: ParquetFormat) -> Mapping[str, str]: + """ + Convert a pyarrow data type to an Airbyte schema type. 
+ Parquet data types are defined at https://arrow.apache.org/docs/python/api/datatypes.html + """ + + if pa.types.is_timestamp(parquet_type): + return {"type": "string", "format": "date-time"} + elif pa.types.is_date(parquet_type): + return {"type": "string", "format": "date"} + elif ParquetParser._is_string(parquet_type, parquet_format): + return {"type": "string"} + elif pa.types.is_boolean(parquet_type): + return {"type": "boolean"} + elif ParquetParser._is_integer(parquet_type): + return {"type": "integer"} + elif ParquetParser._is_float(parquet_type, parquet_format): + return {"type": "number"} + elif ParquetParser._is_object(parquet_type): + return {"type": "object"} + elif ParquetParser._is_list(parquet_type): + return {"type": "array"} + elif pa.types.is_null(parquet_type): + return {"type": "null"} + else: + raise ValueError(f"Unsupported parquet type: {parquet_type}") + + @staticmethod + def _is_binary(parquet_type: pa.DataType) -> bool: + return bool( + pa.types.is_binary(parquet_type) or pa.types.is_large_binary(parquet_type) or pa.types.is_fixed_size_binary(parquet_type) + ) + + @staticmethod + def _is_integer(parquet_type: pa.DataType) -> bool: + return bool(pa.types.is_integer(parquet_type) or pa.types.is_duration(parquet_type)) + + @staticmethod + def _is_float(parquet_type: pa.DataType, parquet_format: ParquetFormat) -> bool: + if pa.types.is_decimal(parquet_type): + return parquet_format.decimal_as_float + else: + return bool(pa.types.is_floating(parquet_type)) + + @staticmethod + def _is_string(parquet_type: pa.DataType, parquet_format: ParquetFormat) -> bool: + if pa.types.is_decimal(parquet_type): + return not parquet_format.decimal_as_float + else: + return bool( + pa.types.is_time(parquet_type) + or pa.types.is_string(parquet_type) + or pa.types.is_large_string(parquet_type) + or ParquetParser._is_binary(parquet_type) # Best we can do is return as a string since we do not support binary + ) + + @staticmethod + def _is_object(parquet_type: pa.DataType) -> bool: + return bool(pa.types.is_dictionary(parquet_type) or pa.types.is_struct(parquet_type) or pa.types.is_map(parquet_type)) + + @staticmethod + def _is_list(parquet_type: pa.DataType) -> bool: + return bool(pa.types.is_list(parquet_type) or pa.types.is_large_list(parquet_type) or parquet_type == pa.month_day_nano_interval()) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py new file mode 100644 index 000000000000..659fbd2c4734 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py @@ -0,0 +1,357 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# +import logging +import traceback +from datetime import datetime +from io import BytesIO, IOBase +from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union + +import backoff +import dpath +import requests +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.config.unstructured_format import ( + APIParameterConfigModel, + APIProcessingConfigModel, + LocalProcessingConfigModel, + UnstructuredFormat, +) +from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType +from airbyte_cdk.utils import is_cloud_environment +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, STR_TO_FILETYPE, FileType, detect_filetype + +unstructured_partition_pdf = None +unstructured_partition_docx = None +unstructured_partition_pptx = None + + +def optional_decode(contents: Union[str, bytes]) -> str: + if isinstance(contents, bytes): + return contents.decode("utf-8") + return contents + + +def _import_unstructured() -> None: + """Dynamically imported as needed, due to slow import speed.""" + global unstructured_partition_pdf + global unstructured_partition_docx + global unstructured_partition_pptx + from unstructured.partition.docx import partition_docx + from unstructured.partition.pdf import partition_pdf + from unstructured.partition.pptx import partition_pptx + + # separate global variables to properly propagate typing + unstructured_partition_pdf = partition_pdf + unstructured_partition_docx = partition_docx + unstructured_partition_pptx = partition_pptx + + +def user_error(e: Exception) -> bool: + """ + Return True if this exception is caused by user error, False otherwise. + """ + if not isinstance(e, RecordParseError): + return False + if not isinstance(e, requests.exceptions.RequestException): + return False + return bool(e.response and 400 <= e.response.status_code < 500) + + +CLOUD_DEPLOYMENT_MODE = "cloud" + + +class UnstructuredParser(FileTypeParser): + @property + def parser_max_n_files_for_schema_inference(self) -> Optional[int]: + """ + Just check one file as the schema is static + """ + return 1 + + @property + def parser_max_n_files_for_parsability(self) -> Optional[int]: + """ + Do not check any files for parsability because it might be an expensive operation and doesn't give much confidence whether the sync will succeed. + """ + return 0 + + def get_parser_defined_primary_key(self, config: FileBasedStreamConfig) -> Optional[str]: + """ + Return the document_key field as the primary key. + + his will pre-select the document key column as the primary key when setting up a connection, making it easier for the user to configure normalization in the destination. 
+ """ + return "document_key" + + async def infer_schema( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + ) -> SchemaType: + format = _extract_format(config) + with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle: + filetype = self._get_filetype(file_handle, file) + + if filetype not in self._supported_file_types() and not format.skip_unprocessable_files: + raise self._create_parse_error(file, self._get_file_type_error_message(filetype)) + + return { + "content": { + "type": "string", + "description": "Content of the file as markdown. Might be null if the file could not be parsed", + }, + "document_key": {"type": "string", "description": "Unique identifier of the document, e.g. the file path"}, + "_ab_source_file_parse_error": { + "type": "string", + "description": "Error message if the file could not be parsed even though the file is supported", + }, + } + + def parse_records( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + discovered_schema: Optional[Mapping[str, SchemaType]], + ) -> Iterable[Dict[str, Any]]: + format = _extract_format(config) + with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle: + try: + markdown = self._read_file(file_handle, file, format, logger) + yield { + "content": markdown, + "document_key": file.uri, + "_ab_source_file_parse_error": None, + } + except RecordParseError as e: + # RecordParseError is raised when the file can't be parsed because of a problem with the file content (either the file is not supported or the file is corrupted) + # if the skip_unprocessable_files flag is set, we log a warning and pass the error as part of the document + # otherwise, we raise the error to fail the sync + if format.skip_unprocessable_files: + exception_str = str(e) + logger.warn(f"File {file.uri} caused an error during parsing: {exception_str}.") + yield { + "content": None, + "document_key": file.uri, + "_ab_source_file_parse_error": exception_str, + } + logger.warn(f"File {file.uri} cannot be parsed. Skipping it.") + else: + raise e + + def _read_file(self, file_handle: IOBase, remote_file: RemoteFile, format: UnstructuredFormat, logger: logging.Logger) -> str: + _import_unstructured() + if (not unstructured_partition_pdf) or (not unstructured_partition_docx) or (not unstructured_partition_pptx): + # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point) + raise Exception("unstructured library is not available") + + filetype = self._get_filetype(file_handle, remote_file) + + if filetype == FileType.MD or filetype == FileType.TXT: + file_content: bytes = file_handle.read() + decoded_content: str = optional_decode(file_content) + return decoded_content + if filetype not in self._supported_file_types(): + raise self._create_parse_error(remote_file, self._get_file_type_error_message(filetype)) + if format.processing.mode == "local": + return self._read_file_locally(file_handle, filetype, format.strategy, remote_file) + elif format.processing.mode == "api": + try: + result: str = self._read_file_remotely_with_retries(file_handle, format.processing, filetype, format.strategy, remote_file) + except Exception as e: + # If a parser error happens during remotely processing the file, this means the file is corrupted. 
This case is handled by the parse_records method, so just rethrow. + # + # For other exceptions, re-throw as config error so the sync is stopped as problems with the external API need to be resolved by the user and are not considered part of the SLA. + # Once this parser leaves experimental stage, we should consider making this a system error instead for issues that might be transient. + if isinstance(e, RecordParseError): + raise e + raise AirbyteTracedException.from_exception(e, failure_type=FailureType.config_error) + + return result + + def _params_to_dict(self, params: Optional[List[APIParameterConfigModel]], strategy: str) -> Dict[str, Union[str, List[str]]]: + result_dict: Dict[str, Union[str, List[str]]] = {"strategy": strategy} + if params is None: + return result_dict + for item in params: + key = item.name + value = item.value + if key in result_dict: + existing_value = result_dict[key] + # If the key already exists, append the new value to its list + if isinstance(existing_value, list): + existing_value.append(value) + else: + result_dict[key] = [existing_value, value] + else: + # If the key doesn't exist, add it to the dictionary + result_dict[key] = value + + return result_dict + + def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]: + """ + Perform a connection check for the parser config: + - Verify that encryption is enabled if the API is hosted on a cloud instance. + - Verify that the API can extract text from a file. + + For local processing, we don't need to perform any additional checks, implicit pydantic validation is enough. + """ + format_config = _extract_format(config) + if isinstance(format_config.processing, LocalProcessingConfigModel): + if format_config.strategy == "hi_res": + return False, "Hi-res strategy is not supported for local processing" + return True, None + + if is_cloud_environment() and not format_config.processing.api_url.startswith("https://"): + return False, "Base URL must start with https://" + + try: + self._read_file_remotely( + BytesIO(b"# Airbyte source connection test"), + format_config.processing, + FileType.MD, + "auto", + RemoteFile(uri="test", last_modified=datetime.now()), + ) + except Exception: + return False, "".join(traceback.format_exc()) + + return True, None + + @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error) + def _read_file_remotely_with_retries( + self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile + ) -> str: + """ + Read a file remotely, retrying up to 5 times if the error is not caused by user error. This is useful for transient network errors or the API server being overloaded temporarily. + """ + return self._read_file_remotely(file_handle, format, filetype, strategy, remote_file) + + def _read_file_remotely( + self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile + ) -> str: + headers = {"accept": "application/json", "unstructured-api-key": format.api_key} + + data = self._params_to_dict(format.parameters, strategy) + + file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])} + + response = requests.post(f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data) + + if response.status_code == 422: + # 422 means the file couldn't be processed, but the API is working. Treat this as a parsing error (passing an error record to the destination). 
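# A self-contained sketch of the retry pattern used by _read_file_remotely_with_retries above:
# exponential backoff on requests exceptions, giving up immediately when the failure looks like
# a client-side (4xx) error, since retrying those cannot succeed. This is a simplified variant
# of the user_error() predicate defined earlier in this file; the URL below is a placeholder.
import backoff
import requests


def _give_up(exc: Exception) -> bool:
    # Stop retrying on 4xx responses; keep retrying timeouts, connection errors and 5xx responses.
    if isinstance(exc, requests.exceptions.RequestException) and exc.response is not None:
        return 400 <= exc.response.status_code < 500
    return False


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=_give_up)
def _post_with_retries(url: str, **kwargs) -> requests.Response:
    response = requests.post(url, **kwargs)
    response.raise_for_status()  # non-2xx raises HTTPError, which backoff may retry
    return response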
+ raise self._create_parse_error(remote_file, response.json()) + else: + # Other error statuses are raised as requests exceptions (retry everything except user errors) + response.raise_for_status() + + json_response = response.json() + + return self._render_markdown(json_response) + + def _read_file_locally(self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile) -> str: + _import_unstructured() + if (not unstructured_partition_pdf) or (not unstructured_partition_docx) or (not unstructured_partition_pptx): + # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point) + raise Exception("unstructured library is not available") + + file: Any = file_handle + + # before the parsing logic is entered, the file is read completely to make sure it is in local memory + file_handle.seek(0) + file_handle.read() + file_handle.seek(0) + + try: + if filetype == FileType.PDF: + # for PDF, read the file into a BytesIO object because some code paths in pdf parsing are doing an instance check on the file object and don't work with file-like objects + file_handle.seek(0) + with BytesIO(file_handle.read()) as file: + file_handle.seek(0) + elements = unstructured_partition_pdf(file=file, strategy=strategy) + elif filetype == FileType.DOCX: + elements = unstructured_partition_docx(file=file) + elif filetype == FileType.PPTX: + elements = unstructured_partition_pptx(file=file) + except Exception as e: + raise self._create_parse_error(remote_file, str(e)) + + return self._render_markdown([element.to_dict() for element in elements]) + + def _create_parse_error(self, remote_file: RemoteFile, message: str) -> RecordParseError: + return RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message) + + def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileType]: + """ + Detect the file type based on the file name and the file content. + + There are three strategies to determine the file type: + 1. Use the mime type if available (only some sources support it) + 2. Use the file name if available + 3. Use the file content + """ + if remote_file.mime_type and remote_file.mime_type in STR_TO_FILETYPE: + return STR_TO_FILETYPE[remote_file.mime_type] + + # set name to none, otherwise unstructured will try to get the modified date from the local file system + if hasattr(file, "name"): + file.name = None + + # detect_filetype is either using the file name or file content + # if possible, try to leverage the file name to detect the file type + # if the file name is not available, use the file content + file_type = detect_filetype( + filename=remote_file.uri, + ) + if file_type is not None and not file_type == FileType.UNK: + return file_type + + type_based_on_content = detect_filetype(file=file) + + # detect_filetype is reading to read the file content + file.seek(0) + + return type_based_on_content + + def _supported_file_types(self) -> List[Any]: + return [FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT] + + def _get_file_type_error_message(self, file_type: FileType) -> str: + supported_file_types = ", ".join([str(type) for type in self._supported_file_types()]) + return f"File type {file_type} is not supported. 
Supported file types are {supported_file_types}" + + def _render_markdown(self, elements: List[Any]) -> str: + return "\n\n".join((self._convert_to_markdown(el) for el in elements)) + + def _convert_to_markdown(self, el: Dict[str, Any]) -> str: + if dpath.get(el, "type") == "Title": + heading_str = "#" * (dpath.get(el, "metadata/category_depth", default=1) or 1) + return f"{heading_str} {dpath.get(el, 'text')}" + elif dpath.get(el, "type") == "ListItem": + return f"- {dpath.get(el, 'text')}" + elif dpath.get(el, "type") == "Formula": + return f"```\n{dpath.get(el, 'text')}\n```" + else: + return str(dpath.get(el, "text", default="")) + + @property + def file_read_mode(self) -> FileReadMode: + return FileReadMode.READ_BINARY + + +def _extract_format(config: FileBasedStreamConfig) -> UnstructuredFormat: + config_format = config.format + if not isinstance(config_format, UnstructuredFormat): + raise ValueError(f"Invalid format config: {config_format}") + return config_format diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/remote_file.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/remote_file.py new file mode 100644 index 000000000000..0197a35fdbb1 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/remote_file.py @@ -0,0 +1,18 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from datetime import datetime +from typing import Optional + +from pydantic.v1 import BaseModel + + +class RemoteFile(BaseModel): + """ + A file in a file-based stream. + """ + + uri: str + last_modified: datetime + mime_type: Optional[str] = None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_helpers.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_helpers.py new file mode 100644 index 000000000000..fb7141201d79 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_helpers.py @@ -0,0 +1,250 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
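# A dependency-free sketch of the element-to-markdown conversion performed by
# _convert_to_markdown()/_render_markdown() above: "Title" elements become headings whose depth
# follows metadata/category_depth, "ListItem" becomes a bullet, "Formula" becomes a fenced block,
# and anything else is passed through as plain text. Plain dict access stands in for dpath here.
from typing import Any, Dict, List


def element_to_markdown(el: Dict[str, Any]) -> str:
    kind = el.get("type")
    text = el.get("text", "")
    if kind == "Title":
        depth = (el.get("metadata") or {}).get("category_depth") or 1
        return f"{'#' * depth} {text}"
    if kind == "ListItem":
        return f"- {text}"
    if kind == "Formula":
        return f"```\n{text}\n```"
    return str(text)


def render_markdown(elements: List[Dict[str, Any]]) -> str:
    return "\n\n".join(element_to_markdown(el) for el in elements)


# render_markdown([{"type": "Title", "text": "Intro"}, {"type": "ListItem", "text": "first point"}])
# returns "# Intro\n\n- first point"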
+# + +import json +from copy import deepcopy +from enum import Enum +from functools import total_ordering +from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Type, Union + +from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, SchemaInferenceError + +JsonSchemaSupportedType = Union[List[str], Literal["string"], str] +SchemaType = Mapping[str, Mapping[str, JsonSchemaSupportedType]] + +schemaless_schema = {"type": "object", "properties": {"data": {"type": "object"}}} +file_transfer_schema = {"type": "object", "properties": {"data": {"type": "object"}, "file": {"type": "object"}}} + + +@total_ordering +class ComparableType(Enum): + NULL = 0 + BOOLEAN = 1 + INTEGER = 2 + NUMBER = 3 + STRING = 4 + OBJECT = 5 + + def __lt__(self, other: Any) -> bool: + if self.__class__ is other.__class__: + return self.value < other.value # type: ignore + else: + return NotImplemented + + +TYPE_PYTHON_MAPPING: Mapping[str, Tuple[str, Optional[Type[Any]]]] = { + "null": ("null", None), + "array": ("array", list), + "boolean": ("boolean", bool), + "float": ("number", float), + "integer": ("integer", int), + "number": ("number", float), + "object": ("object", dict), + "string": ("string", str), +} +PYTHON_TYPE_MAPPING = {t: k for k, (_, t) in TYPE_PYTHON_MAPPING.items()} + + +def get_comparable_type(value: Any) -> Optional[ComparableType]: + if value == "null": + return ComparableType.NULL + if value == "boolean": + return ComparableType.BOOLEAN + if value == "integer": + return ComparableType.INTEGER + if value == "number": + return ComparableType.NUMBER + if value == "string": + return ComparableType.STRING + if value == "object": + return ComparableType.OBJECT + else: + return None + + +def get_inferred_type(value: Any) -> Optional[ComparableType]: + if value is None: + return ComparableType.NULL + if isinstance(value, bool): + return ComparableType.BOOLEAN + if isinstance(value, int): + return ComparableType.INTEGER + if isinstance(value, float): + return ComparableType.NUMBER + if isinstance(value, str): + return ComparableType.STRING + if isinstance(value, dict): + return ComparableType.OBJECT + else: + return None + + +def merge_schemas(schema1: SchemaType, schema2: SchemaType) -> SchemaType: + """ + Returns a new dictionary that contains schema1 and schema2. + + Schemas are merged as follows + - If a key is in one schema but not the other, add it to the base schema with its existing type. + - If a key is in both schemas but with different types, use the wider type. + - If the type is a list in one schema but a different type of element in the other schema, raise an exception. + - If the type is an object in both schemas but the objects are different raise an exception. + - If the type is an object in one schema but not in the other schema, raise an exception. + + In other words, we support merging + - any atomic type with any other atomic type (choose the wider of the two) + - list with list (union) + and nothing else. 
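# Illustrative usage of the ordering defined by ComparableType above (import path as introduced
# by this patch): get_inferred_type() maps a Python value onto the enum, and the wider of two
# scalar JSON Schema types is simply max() under the ordering
# NULL < BOOLEAN < INTEGER < NUMBER < STRING < OBJECT.
from airbyte_cdk.sources.file_based.schema_helpers import ComparableType, get_comparable_type, get_inferred_type

assert get_inferred_type(3.14) is ComparableType.NUMBER
assert max(get_comparable_type("integer"), get_comparable_type("number")) is ComparableType.NUMBER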
+ """ + for k, t in list(schema1.items()) + list(schema2.items()): + if not isinstance(t, dict) or "type" not in t or not _is_valid_type(t["type"]): + raise SchemaInferenceError(FileBasedSourceError.UNRECOGNIZED_TYPE, key=k, type=t) + + merged_schema: Dict[str, Any] = deepcopy(schema1) # type: ignore # as of 2023-08-08, deepcopy can copy Mapping + for k2, t2 in schema2.items(): + t1 = merged_schema.get(k2) + if t1 is None: + merged_schema[k2] = t2 + elif t1 == t2: + continue + else: + merged_schema[k2] = _choose_wider_type(k2, t1, t2) + + return merged_schema + + +def _is_valid_type(t: JsonSchemaSupportedType) -> bool: + return t == "array" or get_comparable_type(t) is not None + + +def _choose_wider_type(key: str, t1: Mapping[str, Any], t2: Mapping[str, Any]) -> Mapping[str, Any]: + t1_type = t1["type"] + t2_type = t2["type"] + + if (t1_type == "array" or t2_type == "array") and t1 != t2: + raise SchemaInferenceError( + FileBasedSourceError.SCHEMA_INFERENCE_ERROR, + details="Cannot merge schema for unequal array types.", + key=key, + detected_types=f"{t1},{t2}", + ) + # Schemas can still be merged if a key contains a null value in either t1 or t2, but it is still an object + elif (t1_type == "object" or t2_type == "object") and t1_type != "null" and t2_type != "null" and t1 != t2: + raise SchemaInferenceError( + FileBasedSourceError.SCHEMA_INFERENCE_ERROR, + details="Cannot merge schema for unequal object types.", + key=key, + detected_types=f"{t1},{t2}", + ) + else: + comparable_t1 = get_comparable_type(TYPE_PYTHON_MAPPING[t1_type][0]) # accessing the type_mapping value + comparable_t2 = get_comparable_type(TYPE_PYTHON_MAPPING[t2_type][0]) # accessing the type_mapping value + if not comparable_t1 and comparable_t2: + raise SchemaInferenceError(FileBasedSourceError.UNRECOGNIZED_TYPE, key=key, detected_types=f"{t1},{t2}") + return max( + [t1, t2], key=lambda x: ComparableType(get_comparable_type(TYPE_PYTHON_MAPPING[x["type"]][0])) + ) # accessing the type_mapping value + + +def is_equal_or_narrower_type(value: Any, expected_type: str) -> bool: + if isinstance(value, list): + # We do not compare lists directly; the individual items are compared. + # If we hit this condition, it means that the expected type is not + # compatible with the inferred type. + return False + + inferred_type = ComparableType(get_inferred_type(value)) + + if inferred_type is None: + return False + + return ComparableType(inferred_type) <= ComparableType(get_comparable_type(expected_type)) + + +def conforms_to_schema(record: Mapping[str, Any], schema: Mapping[str, Any]) -> bool: + """ + Return true iff the record conforms to the supplied schema. + + The record conforms to the supplied schema iff: + - All columns in the record are in the schema. + - For every column in the record, that column's type is equal to or narrower than the same column's + type in the schema. 
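# Illustrative usage of merge_schemas() defined above (import path as introduced by this patch):
# keys present in both schemas are widened (integer + number -> number), keys present in only
# one schema are carried over unchanged. The column names are made up for the example.
from airbyte_cdk.sources.file_based.schema_helpers import merge_schemas

schema_a = {"id": {"type": "integer"}, "name": {"type": "string"}}
schema_b = {"id": {"type": "number"}, "created_at": {"type": "string"}}

assert merge_schemas(schema_a, schema_b) == {
    "id": {"type": "number"},
    "name": {"type": "string"},
    "created_at": {"type": "string"},
}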
+ """ + schema_columns = set(schema.get("properties", {}).keys()) + record_columns = set(record.keys()) + + if not record_columns.issubset(schema_columns): + return False + + for column, definition in schema.get("properties", {}).items(): + expected_type = definition.get("type") + value = record.get(column) + + if value is not None: + if isinstance(expected_type, list): + return any(is_equal_or_narrower_type(value, e) for e in expected_type) + elif expected_type == "object": + return isinstance(value, dict) + elif expected_type == "array": + if not isinstance(value, list): + return False + array_type = definition.get("items", {}).get("type") + if not all(is_equal_or_narrower_type(v, array_type) for v in value): + return False + elif not is_equal_or_narrower_type(value, expected_type): + return False + + return True + + +def _parse_json_input(input_schema: Union[str, Mapping[str, str]]) -> Optional[Mapping[str, str]]: + try: + if isinstance(input_schema, str): + schema: Mapping[str, str] = json.loads(input_schema) + else: + schema = input_schema + if not all(isinstance(s, str) for s in schema.values()): + raise ConfigValidationError( + FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA, details="Invalid input schema; nested schemas are not supported." + ) + + except json.decoder.JSONDecodeError: + return None + + return schema + + +def type_mapping_to_jsonschema(input_schema: Optional[Union[str, Mapping[str, str]]]) -> Optional[Mapping[str, Any]]: + """ + Return the user input schema (type mapping), transformed to JSON Schema format. + + Verify that the input schema: + - is a key:value map + - all values in the map correspond to a JsonSchema datatype + """ + if not input_schema: + return None + + result_schema = {} + + json_mapping = _parse_json_input(input_schema) or {} + + for col_name, type_name in json_mapping.items(): + col_name, type_name = col_name.strip(), type_name.strip() + if not (col_name and type_name): + raise ConfigValidationError( + FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA, + details=f"Invalid input schema; expected mapping in the format column_name: type, got {input_schema}.", + ) + + _json_schema_type = TYPE_PYTHON_MAPPING.get(type_name.casefold()) + + if not _json_schema_type: + raise ConfigValidationError( + FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA, details=f"Invalid type '{type_name}' for property '{col_name}'." 
+ ) + + json_schema_type = _json_schema_type[0] + result_schema[col_name] = {"type": json_schema_type} + + return {"type": "object", "properties": result_schema} diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py new file mode 100644 index 000000000000..d2cc0e63b214 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py @@ -0,0 +1,15 @@ +from airbyte_cdk.sources.file_based.schema_validation_policies.abstract_schema_validation_policy import AbstractSchemaValidationPolicy +from airbyte_cdk.sources.file_based.schema_validation_policies.default_schema_validation_policies import ( + DEFAULT_SCHEMA_VALIDATION_POLICIES, + EmitRecordPolicy, + SkipRecordPolicy, + WaitForDiscoverPolicy, +) + +__all__ = [ + "DEFAULT_SCHEMA_VALIDATION_POLICIES", + "AbstractSchemaValidationPolicy", + "EmitRecordPolicy", + "SkipRecordPolicy", + "WaitForDiscoverPolicy", +] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py new file mode 100644 index 000000000000..004139b78b10 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py @@ -0,0 +1,18 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from typing import Any, Mapping, Optional + + +class AbstractSchemaValidationPolicy(ABC): + name: str + validate_schema_before_sync = False # Whether to verify that records conform to the schema during the stream's availabilty check + + @abstractmethod + def record_passes_validation_policy(self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]) -> bool: + """ + Return True if the record passes the user's validation policy. + """ + raise NotImplementedError() diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py new file mode 100644 index 000000000000..02134d1b839f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py @@ -0,0 +1,41 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
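# Illustrative usage of two helpers from schema_helpers.py above (import path as introduced by
# this patch): type_mapping_to_jsonschema() expands a user-provided column-to-type mapping into
# a JSON Schema, and conforms_to_schema() checks a record against it, which is what the
# skip_record policy below relies on. The column names are made up for the example.
from airbyte_cdk.sources.file_based.schema_helpers import conforms_to_schema, type_mapping_to_jsonschema

schema = type_mapping_to_jsonschema({"id": "integer", "name": "string"})
# {"type": "object", "properties": {"id": {"type": "integer"}, "name": {"type": "string"}}}

assert conforms_to_schema({"id": 1, "name": "a"}, schema)
assert not conforms_to_schema({"id": "not-an-int", "name": "a"}, schema)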
+# + +from typing import Any, Mapping, Optional + +from airbyte_cdk.sources.file_based.config.file_based_stream_config import ValidationPolicy +from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, StopSyncPerValidationPolicy +from airbyte_cdk.sources.file_based.schema_helpers import conforms_to_schema +from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy + + +class EmitRecordPolicy(AbstractSchemaValidationPolicy): + name = "emit_record" + + def record_passes_validation_policy(self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]) -> bool: + return True + + +class SkipRecordPolicy(AbstractSchemaValidationPolicy): + name = "skip_record" + + def record_passes_validation_policy(self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]) -> bool: + return schema is not None and conforms_to_schema(record, schema) + + +class WaitForDiscoverPolicy(AbstractSchemaValidationPolicy): + name = "wait_for_discover" + validate_schema_before_sync = True + + def record_passes_validation_policy(self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]) -> bool: + if schema is None or not conforms_to_schema(record, schema): + raise StopSyncPerValidationPolicy(FileBasedSourceError.STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY) + return True + + +DEFAULT_SCHEMA_VALIDATION_POLICIES = { + ValidationPolicy.emit_record: EmitRecordPolicy(), + ValidationPolicy.skip_record: SkipRecordPolicy(), + ValidationPolicy.wait_for_discover: WaitForDiscoverPolicy(), +} diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/__init__.py new file mode 100644 index 000000000000..4b5c4bc2edd5 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/__init__.py @@ -0,0 +1,4 @@ +from airbyte_cdk.sources.file_based.stream.abstract_file_based_stream import AbstractFileBasedStream +from airbyte_cdk.sources.file_based.stream.default_file_based_stream import DefaultFileBasedStream + +__all__ = ["AbstractFileBasedStream", "DefaultFileBasedStream"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py new file mode 100644 index 000000000000..850c4c936d6c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py @@ -0,0 +1,173 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
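# Illustrative usage of the registry defined above: the stream's configured ValidationPolicy
# selects which policy object decides whether a record is emitted, skipped, or stops the sync.
# Import paths are the ones introduced by this patch; the record and schema are made up.
from airbyte_cdk.sources.file_based.config.file_based_stream_config import ValidationPolicy
from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES

policy = DEFAULT_SCHEMA_VALIDATION_POLICIES[ValidationPolicy.skip_record]
schema = {"type": "object", "properties": {"id": {"type": "integer"}}}

assert policy.record_passes_validation_policy({"id": 1}, schema)  # record is emitted
assert not policy.record_passes_validation_policy({"id": "oops"}, schema)  # record is skipped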
+# + +from abc import abstractmethod +from functools import cache, cached_property, lru_cache +from typing import Any, Dict, Iterable, List, Mapping, Optional, Type + +from airbyte_cdk import AirbyteMessage +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, PrimaryKeyType +from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy +from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError, RecordParseError, UndefinedParserError +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy +from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor +from airbyte_cdk.sources.file_based.types import StreamSlice +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.checkpoint import Cursor +from deprecated import deprecated + + +class AbstractFileBasedStream(Stream): + """ + A file-based stream in an Airbyte source. + + In addition to the base Stream attributes, a file-based stream has + - A config object (derived from the corresponding stream section in source config). + This contains the globs defining the stream's files. + - A StreamReader, which knows how to list and open files in the stream. + - A FileBasedAvailabilityStrategy, which knows how to verify that we can list and open + files in the stream. + - A DiscoveryPolicy that controls the number of concurrent requests sent to the source + during discover, and the number of files used for schema discovery. + - A dictionary of FileType:Parser that holds all the file types that can be handled + by the stream. + """ + + def __init__( + self, + config: FileBasedStreamConfig, + catalog_schema: Optional[Mapping[str, Any]], + stream_reader: AbstractFileBasedStreamReader, + availability_strategy: AbstractFileBasedAvailabilityStrategy, + discovery_policy: AbstractDiscoveryPolicy, + parsers: Dict[Type[Any], FileTypeParser], + validation_policy: AbstractSchemaValidationPolicy, + errors_collector: FileBasedErrorsCollector, + cursor: AbstractFileBasedCursor, + ): + super().__init__() + self.config = config + self.catalog_schema = catalog_schema + self.validation_policy = validation_policy + self.stream_reader = stream_reader + self._discovery_policy = discovery_policy + self._availability_strategy = availability_strategy + self._parsers = parsers + self.errors_collector = errors_collector + self._cursor = cursor + + @property + @abstractmethod + def primary_key(self) -> PrimaryKeyType: + ... + + @cache + def list_files(self) -> List[RemoteFile]: + """ + List all files that belong to the stream. + + The output of this method is cached so we don't need to list the files more than once. + This means we won't pick up changes to the files during a sync. This method uses the + get_files method which is implemented by the concrete stream class. + """ + return list(self.get_files()) + + @abstractmethod + def get_files(self) -> Iterable[RemoteFile]: + """ + List all files that belong to the stream as defined by the stream's globs. + """ + ... 
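# A minimal sketch of the caching behaviour of list_files() above: functools.cache memoizes the
# first listing for the lifetime of the stream object, so every slice in a sync sees the same
# snapshot of files even if the remote listing changes mid-sync. The class below is hypothetical.
from functools import cache
from typing import List


class _ListingExample:
    def __init__(self, listings: List[List[str]]) -> None:
        self._listings = iter(listings)

    @cache
    def list_files(self) -> List[str]:
        return next(self._listings)


stream = _ListingExample([["a.csv"], ["a.csv", "b.csv"]])
assert stream.list_files() == ["a.csv"]
assert stream.list_files() == ["a.csv"]  # cached: the second listing is never fetched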
+ + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[StreamSlice] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any] | AirbyteMessage]: + """ + Yield all records from all remote files in `list_files_for_this_sync`. + This method acts as an adapter between the generic Stream interface and the file-based's + stream since file-based streams manage their own states. + """ + if stream_slice is None: + raise ValueError("stream_slice must be set") + return self.read_records_from_slice(stream_slice) + + @abstractmethod + def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Mapping[str, Any] | AirbyteMessage]: + """ + Yield all records from all remote files in `list_files_for_this_sync`. + """ + ... + + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + """ + This method acts as an adapter between the generic Stream interface and the file-based's + stream since file-based streams manage their own states. + """ + return self.compute_slices() + + @abstractmethod + def compute_slices(self) -> Iterable[Optional[StreamSlice]]: + """ + Return a list of slices that will be used to read files in the current sync. + :return: The slices to use for the current sync. + """ + ... + + @abstractmethod + @lru_cache(maxsize=None) + def get_json_schema(self) -> Mapping[str, Any]: + """ + Return the JSON Schema for a stream. + """ + ... + + @abstractmethod + def infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]: + """ + Infer the schema for files in the stream. + """ + ... + + def get_parser(self) -> FileTypeParser: + try: + return self._parsers[type(self.config.format)] + except KeyError: + raise UndefinedParserError(FileBasedSourceError.UNDEFINED_PARSER, stream=self.name, format=type(self.config.format)) + + def record_passes_validation_policy(self, record: Mapping[str, Any]) -> bool: + if self.validation_policy: + return self.validation_policy.record_passes_validation_policy(record=record, schema=self.catalog_schema) + else: + raise RecordParseError( + FileBasedSourceError.UNDEFINED_VALIDATION_POLICY, stream=self.name, validation_policy=self.config.validation_policy + ) + + @cached_property + @deprecated(version="3.7.0") + def availability_strategy(self) -> AbstractFileBasedAvailabilityStrategy: + return self._availability_strategy + + @property + def name(self) -> str: + return self.config.name + + def get_cursor(self) -> Optional[Cursor]: + """ + This is a temporary hack. Because file-based, declarative, and concurrent have _slightly_ different cursor implementations + the file-based cursor isn't compatible with the cursor-based iteration flow in core.py top-level CDK. By setting this to + None, we defer to the regular incremental checkpoint flow. Once all cursors are consolidated under a common interface + then this override can be removed. 
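# A standalone sketch of the lookup in get_parser() above: parsers are registered by the *type*
# of the stream's format config, so supporting a new file format is a matter of adding one
# (format class, parser) pair to the mapping. The classes below are hypothetical stand-ins.
from typing import Any, Dict, Type


class CsvFormat: ...


class CsvParser: ...


_PARSERS: Dict[Type[Any], Any] = {CsvFormat: CsvParser()}


def get_parser_for(format_config: Any) -> Any:
    try:
        return _PARSERS[type(format_config)]
    except KeyError:
        raise ValueError(f"No parser registered for format {type(format_config).__name__}")


assert isinstance(get_parser_for(CsvFormat()), CsvParser)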
+ """ + return None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py new file mode 100644 index 000000000000..d335819d47a2 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/adapters.py @@ -0,0 +1,323 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +import copy +import logging +from functools import cache, lru_cache +from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union + +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, ConfiguredAirbyteStream, Level, SyncMode, Type +from airbyte_cdk.sources import AbstractSource +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.file_based.availability_strategy import ( + AbstractFileBasedAvailabilityStrategy, + AbstractFileBasedAvailabilityStrategyWrapper, +) +from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream +from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedFinalStateCursor +from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor +from airbyte_cdk.sources.file_based.types import StreamSlice +from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.source import ExperimentalClassWarning +from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream +from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage +from airbyte_cdk.sources.streams.concurrent.helpers import get_cursor_field_from_stream, get_primary_key_from_stream +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record +from airbyte_cdk.sources.streams.core import StreamData +from airbyte_cdk.sources.utils.schema_helpers import InternalConfig +from airbyte_cdk.sources.utils.slice_logger import SliceLogger +from deprecated.classic import deprecated + +if TYPE_CHECKING: + from airbyte_cdk.sources.file_based.stream.concurrent.cursor import AbstractConcurrentFileBasedCursor + +""" +This module contains adapters to help enabling concurrency on File-based Stream objects without needing to migrate to AbstractStream +""" + + +@deprecated("This class is experimental. 
Use at your own risk.", category=ExperimentalClassWarning) +class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBasedStream): + @classmethod + def create_from_stream( + cls, + stream: AbstractFileBasedStream, + source: AbstractSource, + logger: logging.Logger, + state: Optional[MutableMapping[str, Any]], + cursor: "AbstractConcurrentFileBasedCursor", + ) -> "FileBasedStreamFacade": + """ + Create a ConcurrentStream from a FileBasedStream object. + """ + pk = get_primary_key_from_stream(stream.primary_key) + cursor_field = get_cursor_field_from_stream(stream) + stream._cursor = cursor + + if not source.message_repository: + raise ValueError( + "A message repository is required to emit non-record messages. Please set the message repository on the source." + ) + + message_repository = source.message_repository + return FileBasedStreamFacade( + DefaultStream( + partition_generator=FileBasedStreamPartitionGenerator( + stream, + message_repository, + SyncMode.full_refresh if isinstance(cursor, FileBasedFinalStateCursor) else SyncMode.incremental, + [cursor_field] if cursor_field is not None else None, + state, + cursor, + ), + name=stream.name, + json_schema=stream.get_json_schema(), + availability_strategy=AbstractFileBasedAvailabilityStrategyWrapper(stream), + primary_key=pk, + cursor_field=cursor_field, + logger=logger, + namespace=stream.namespace, + cursor=cursor, + ), + stream, + cursor, + logger=logger, + slice_logger=source._slice_logger, + ) + + def __init__( + self, + stream: DefaultStream, + legacy_stream: AbstractFileBasedStream, + cursor: AbstractFileBasedCursor, + slice_logger: SliceLogger, + logger: logging.Logger, + ): + """ + :param stream: The underlying AbstractStream + """ + self._abstract_stream = stream + self._legacy_stream = legacy_stream + self._cursor = cursor + self._slice_logger = slice_logger + self._logger = logger + self.catalog_schema = legacy_stream.catalog_schema + self.config = legacy_stream.config + self.validation_policy = legacy_stream.validation_policy + + @property + def cursor_field(self) -> Union[str, List[str]]: + if self._abstract_stream.cursor_field is None: + return [] + else: + return self._abstract_stream.cursor_field + + @property + def name(self) -> str: + return self._abstract_stream.name + + @property + def supports_incremental(self) -> bool: + return self._legacy_stream.supports_incremental + + @property + @deprecated(version="3.7.0") + def availability_strategy(self) -> AbstractFileBasedAvailabilityStrategy: + return self._legacy_stream.availability_strategy + + @lru_cache(maxsize=None) + def get_json_schema(self) -> Mapping[str, Any]: + return self._abstract_stream.get_json_schema() + + @property + def primary_key(self) -> PrimaryKeyType: + return self._legacy_stream.config.primary_key or self.get_parser().get_parser_defined_primary_key(self._legacy_stream.config) + + def get_parser(self) -> FileTypeParser: + return self._legacy_stream.get_parser() + + def get_files(self) -> Iterable[RemoteFile]: + return self._legacy_stream.get_files() + + def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Mapping[str, Any]]: + yield from self._legacy_stream.read_records_from_slice(stream_slice) # type: ignore[misc] # Only Mapping[str, Any] is expected for legacy streams, not AirbyteMessage + + def compute_slices(self) -> Iterable[Optional[StreamSlice]]: + return self._legacy_stream.compute_slices() + + def infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]: + return 
self._legacy_stream.infer_schema(files) + + def get_underlying_stream(self) -> DefaultStream: + return self._abstract_stream + + def read( + self, + configured_stream: ConfiguredAirbyteStream, + logger: logging.Logger, + slice_logger: SliceLogger, + stream_state: MutableMapping[str, Any], + state_manager: ConnectorStateManager, + internal_config: InternalConfig, + ) -> Iterable[StreamData]: + yield from self._read_records() + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + try: + yield from self._read_records() + except Exception as exc: + if hasattr(self._cursor, "state"): + state = str(self._cursor.state) + else: + # This shouldn't happen if the ConcurrentCursor was used + state = "unknown; no state attribute was available on the cursor" + yield AirbyteMessage( + type=Type.LOG, log=AirbyteLogMessage(level=Level.ERROR, message=f"Cursor State at time of exception: {state}") + ) + raise exc + + def _read_records(self) -> Iterable[StreamData]: + for partition in self._abstract_stream.generate_partitions(): + if self._slice_logger.should_log_slice_message(self._logger): + yield self._slice_logger.create_slice_log_message(partition.to_slice()) + for record in partition.read(): + yield record.data + + +class FileBasedStreamPartition(Partition): + def __init__( + self, + stream: AbstractFileBasedStream, + _slice: Optional[Mapping[str, Any]], + message_repository: MessageRepository, + sync_mode: SyncMode, + cursor_field: Optional[List[str]], + state: Optional[MutableMapping[str, Any]], + cursor: "AbstractConcurrentFileBasedCursor", + ): + self._stream = stream + self._slice = _slice + self._message_repository = message_repository + self._sync_mode = sync_mode + self._cursor_field = cursor_field + self._state = state + self._cursor = cursor + self._is_closed = False + + def read(self) -> Iterable[Record]: + try: + for record_data in self._stream.read_records( + cursor_field=self._cursor_field, + sync_mode=SyncMode.full_refresh, + stream_slice=copy.deepcopy(self._slice), + stream_state=self._state, + ): + if isinstance(record_data, Mapping): + data_to_return = dict(record_data) + self._stream.transformer.transform(data_to_return, self._stream.get_json_schema()) + yield Record(data_to_return, self) + elif isinstance(record_data, AirbyteMessage) and record_data.type == Type.RECORD and record_data.record is not None: + # `AirbyteMessage`s of type `Record` should also be yielded so they are enqueued + # If stream is flagged for file_transfer the record should data in file key + record_message_data = record_data.record.file if self._use_file_transfer() else record_data.record.data + if not record_message_data: + raise ExceptionWithDisplayMessage("A record without data was found") + else: + yield Record(data=record_message_data, partition=self, is_file_transfer_message=self._use_file_transfer()) + else: + self._message_repository.emit_message(record_data) + except Exception as e: + display_message = self._stream.get_error_display_message(e) + if display_message: + raise ExceptionWithDisplayMessage(display_message) from e + else: + raise e + + def to_slice(self) -> Optional[Mapping[str, Any]]: + if self._slice is None: + return None + assert ( + len(self._slice["files"]) == 1 + ), f"Expected 1 file per partition but got {len(self._slice['files'])} for stream {self.stream_name()}" + file = self._slice["files"][0] + return 
{"files": [file]} + + def close(self) -> None: + self._cursor.close_partition(self) + self._is_closed = True + + def is_closed(self) -> bool: + return self._is_closed + + def __hash__(self) -> int: + if self._slice: + # Convert the slice to a string so that it can be hashed + if len(self._slice["files"]) != 1: + raise ValueError( + f"Slices for file-based streams should be of length 1, but got {len(self._slice['files'])}. This is unexpected. Please contact Support." + ) + else: + s = f"{self._slice['files'][0].last_modified.strftime('%Y-%m-%dT%H:%M:%S.%fZ')}_{self._slice['files'][0].uri}" + return hash((self._stream.name, s)) + else: + return hash(self._stream.name) + + def stream_name(self) -> str: + return self._stream.name + + @cache + def _use_file_transfer(self) -> bool: + return hasattr(self._stream, "use_file_transfer") and self._stream.use_file_transfer + + def __repr__(self) -> str: + return f"FileBasedStreamPartition({self._stream.name}, {self._slice})" + + +class FileBasedStreamPartitionGenerator(PartitionGenerator): + def __init__( + self, + stream: AbstractFileBasedStream, + message_repository: MessageRepository, + sync_mode: SyncMode, + cursor_field: Optional[List[str]], + state: Optional[MutableMapping[str, Any]], + cursor: "AbstractConcurrentFileBasedCursor", + ): + self._stream = stream + self._message_repository = message_repository + self._sync_mode = sync_mode + self._cursor_field = cursor_field + self._state = state + self._cursor = cursor + + def generate(self) -> Iterable[FileBasedStreamPartition]: + pending_partitions = [] + for _slice in self._stream.stream_slices(sync_mode=self._sync_mode, cursor_field=self._cursor_field, stream_state=self._state): + if _slice is not None: + for file in _slice.get("files", []): + pending_partitions.append( + FileBasedStreamPartition( + self._stream, + {"files": [copy.deepcopy(file)]}, + self._message_repository, + self._sync_mode, + self._cursor_field, + self._state, + self._cursor, + ) + ) + self._cursor.set_pending_partitions(pending_partitions) + yield from pending_partitions diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py new file mode 100644 index 000000000000..590f37bb6d63 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py @@ -0,0 +1,5 @@ +from .abstract_concurrent_file_based_cursor import AbstractConcurrentFileBasedCursor +from .file_based_concurrent_cursor import FileBasedConcurrentCursor +from .file_based_final_state_cursor import FileBasedFinalStateCursor + +__all__ = ["AbstractConcurrentFileBasedCursor", "FileBasedConcurrentCursor", "FileBasedFinalStateCursor"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py new file mode 100644 index 000000000000..d21a6a01e70e --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py @@ -0,0 +1,68 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
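# A simplified sketch of the partition generation above: every slice produced by the stream is
# flattened into one partition per file, and the whole set is registered with the cursor as
# "pending" before any partition is read, so the cursor can tell when all files are processed.
# Plain dicts and strings stand in for the CDK partition and file types here.
from typing import Dict, Iterable, List


def generate_partitions(slices: Iterable[Dict[str, List[str]]]) -> List[Dict[str, List[str]]]:
    pending = []
    for _slice in slices:
        for file in _slice.get("files", []):
            pending.append({"files": [file]})  # exactly one file per partition
    return pending


assert generate_partitions([{"files": ["a.csv", "b.csv"]}]) == [{"files": ["a.csv"]}, {"files": ["b.csv"]}]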
+# + +import logging +from abc import ABC, abstractmethod +from datetime import datetime +from typing import TYPE_CHECKING, Any, Iterable, List, MutableMapping + +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor +from airbyte_cdk.sources.file_based.types import StreamState +from airbyte_cdk.sources.streams.concurrent.cursor import Cursor +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record + +if TYPE_CHECKING: + from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition + + +class AbstractConcurrentFileBasedCursor(Cursor, AbstractFileBasedCursor, ABC): + def __init__(self, *args: Any, **kwargs: Any) -> None: + pass + + @property + @abstractmethod + def state(self) -> MutableMapping[str, Any]: + ... + + @abstractmethod + def observe(self, record: Record) -> None: + ... + + @abstractmethod + def close_partition(self, partition: Partition) -> None: + ... + + @abstractmethod + def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None: + ... + + @abstractmethod + def add_file(self, file: RemoteFile) -> None: + ... + + @abstractmethod + def get_files_to_sync(self, all_files: Iterable[RemoteFile], logger: logging.Logger) -> Iterable[RemoteFile]: + ... + + @abstractmethod + def get_state(self) -> MutableMapping[str, Any]: + ... + + @abstractmethod + def set_initial_state(self, value: StreamState) -> None: + ... + + @abstractmethod + def get_start_time(self) -> datetime: + ... + + @abstractmethod + def emit_state_message(self) -> None: + ... + + @abstractmethod + def ensure_at_least_one_state_emitted(self) -> None: + ... diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py new file mode 100644 index 000000000000..0e3acaf85366 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py @@ -0,0 +1,277 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
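# A minimal sketch of the cursor value format used by FileBasedConcurrentCursor below: the
# cursor key is the file's last-modified timestamp joined to its URI with "_", so ties on the
# timestamp fall back to lexicographic comparison of the URIs. The format string is the same
# one this patch uses elsewhere for file cursors.
from datetime import datetime

DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"


def cursor_key(last_modified: datetime, uri: str) -> str:
    return f"{last_modified.strftime(DATE_TIME_FORMAT)}_{uri}"


# cursor_key(datetime(2023, 6, 1, 12, 0), "s3://bucket/a.csv")
# returns "2023-06-01T12:00:00.000000Z_s3://bucket/a.csv"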
+# + +import logging +from datetime import datetime, timedelta +from threading import RLock +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, MutableMapping, Optional, Tuple + +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.stream.concurrent.cursor.abstract_concurrent_file_based_cursor import AbstractConcurrentFileBasedCursor +from airbyte_cdk.sources.file_based.stream.cursor import DefaultFileBasedCursor +from airbyte_cdk.sources.file_based.types import StreamState +from airbyte_cdk.sources.message.repository import MessageRepository +from airbyte_cdk.sources.streams.concurrent.cursor import CursorField +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record + +if TYPE_CHECKING: + from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition + +_NULL_FILE = "" + + +class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor): + CURSOR_FIELD = "_ab_source_file_last_modified" + DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = DefaultFileBasedCursor.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL + DEFAULT_MAX_HISTORY_SIZE = 10_000 + DATE_TIME_FORMAT = DefaultFileBasedCursor.DATE_TIME_FORMAT + zero_value = datetime.min + zero_cursor_value = f"0001-01-01T00:00:00.000000Z_{_NULL_FILE}" + + def __init__( + self, + stream_config: FileBasedStreamConfig, + stream_name: str, + stream_namespace: Optional[str], + stream_state: MutableMapping[str, Any], + message_repository: MessageRepository, + connector_state_manager: ConnectorStateManager, + cursor_field: CursorField, + ) -> None: + super().__init__() + self._stream_name = stream_name + self._stream_namespace = stream_namespace + self._state = stream_state + self._message_repository = message_repository + self._connector_state_manager = connector_state_manager + self._cursor_field = cursor_field + self._time_window_if_history_is_full = timedelta( + days=stream_config.days_to_sync_if_history_is_full or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL + ) + self._state_lock = RLock() + self._pending_files_lock = RLock() + self._pending_files: Optional[Dict[str, RemoteFile]] = None + self._file_to_datetime_history = stream_state.get("history", {}) if stream_state else {} + self._prev_cursor_value = self._compute_prev_sync_cursor(stream_state) + self._sync_start = self._compute_start_time() + + @property + def state(self) -> MutableMapping[str, Any]: + return self._state + + def observe(self, record: Record) -> None: + pass + + def close_partition(self, partition: Partition) -> None: + with self._pending_files_lock: + if self._pending_files is None: + raise RuntimeError("Expected pending partitions to be set but it was not. This is unexpected. Please contact Support.") + + def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None: + with self._pending_files_lock: + self._pending_files = {} + for partition in partitions: + _slice = partition.to_slice() + if _slice is None: + continue + for file in _slice["files"]: + if file.uri in self._pending_files.keys(): + raise RuntimeError(f"Already found file {_slice} in pending files. This is unexpected. 
Please contact Support.") + self._pending_files.update({file.uri: file}) + + def _compute_prev_sync_cursor(self, value: Optional[StreamState]) -> Tuple[datetime, str]: + if not value: + return self.zero_value, "" + prev_cursor_str = value.get(self._cursor_field.cursor_field_key) or self.zero_cursor_value + # So if we see a cursor greater than the earliest file, it means that we have likely synced all files. + # However, we take the earliest file as the cursor value for the purpose of checking which files to + # sync, in case new files have been uploaded in the meantime. + # This should be very rare, as it would indicate a race condition where a file with an earlier + # last_modified time was uploaded after a file with a later last_modified time. Since last_modified + # represents the start time that the file was uploaded, we can usually expect that all previous + # files have already been uploaded. If that's the case, they'll be in history and we'll skip + # re-uploading them. + earliest_file_cursor_value = self._get_cursor_key_from_file(self._compute_earliest_file_in_history()) + cursor_str = min(prev_cursor_str, earliest_file_cursor_value) + cursor_dt, cursor_uri = cursor_str.split("_", 1) + return datetime.strptime(cursor_dt, self.DATE_TIME_FORMAT), cursor_uri + + def _get_cursor_key_from_file(self, file: Optional[RemoteFile]) -> str: + if file: + return f"{datetime.strftime(file.last_modified, self.DATE_TIME_FORMAT)}_{file.uri}" + return self.zero_cursor_value + + def _compute_earliest_file_in_history(self) -> Optional[RemoteFile]: + with self._state_lock: + if self._file_to_datetime_history: + filename, last_modified = min(self._file_to_datetime_history.items(), key=lambda f: (f[1], f[0])) + return RemoteFile(uri=filename, last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT)) + else: + return None + + def add_file(self, file: RemoteFile) -> None: + """ + Add a file to the cursor. This method is called when a file is processed by the stream. + :param file: The file to add + """ + if self._pending_files is None: + raise RuntimeError("Expected pending partitions to be set but it was not. This is unexpected. Please contact Support.") + with self._pending_files_lock: + with self._state_lock: + if file.uri not in self._pending_files: + self._message_repository.emit_message( + AirbyteMessage( + type=Type.LOG, + log=AirbyteLogMessage( + level=Level.WARN, + message=f"The file {file.uri} was not found in the list of pending files. This is unexpected. Please contact Support", + ), + ) + ) + else: + self._pending_files.pop(file.uri) + self._file_to_datetime_history[file.uri] = file.last_modified.strftime(self.DATE_TIME_FORMAT) + if len(self._file_to_datetime_history) > self.DEFAULT_MAX_HISTORY_SIZE: + # Get the earliest file based on its last modified date and its uri + oldest_file = self._compute_earliest_file_in_history() + if oldest_file: + del self._file_to_datetime_history[oldest_file.uri] + else: + raise Exception( + "The history is full but there is no files in the history. This should never happen and might be indicative of a bug in the CDK." 
+ ) + self.emit_state_message() + + def emit_state_message(self) -> None: + with self._state_lock: + new_state = self.get_state() + self._connector_state_manager.update_state_for_stream( + self._stream_name, + self._stream_namespace, + new_state, + ) + state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace) + self._message_repository.emit_message(state_message) + + def _get_new_cursor_value(self) -> str: + with self._pending_files_lock: + with self._state_lock: + if self._pending_files: + # If there are partitions that haven't been synced, we don't know whether the files that have been synced + # represent a contiguous region. + # To avoid missing files, we only increment the cursor up to the oldest pending file, because we know + # that all older files have been synced. + return self._get_cursor_key_from_file(self._compute_earliest_pending_file()) + elif self._file_to_datetime_history: + # If all partitions have been synced, we know that the sync is up-to-date and so can advance + # the cursor to the newest file in history. + return self._get_cursor_key_from_file(self._compute_latest_file_in_history()) + else: + return f"{self.zero_value.strftime(self.DATE_TIME_FORMAT)}_" + + def _compute_earliest_pending_file(self) -> Optional[RemoteFile]: + if self._pending_files: + return min(self._pending_files.values(), key=lambda x: x.last_modified) + else: + return None + + def _compute_latest_file_in_history(self) -> Optional[RemoteFile]: + with self._state_lock: + if self._file_to_datetime_history: + filename, last_modified = max(self._file_to_datetime_history.items(), key=lambda f: (f[1], f[0])) + return RemoteFile(uri=filename, last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT)) + else: + return None + + def get_files_to_sync(self, all_files: Iterable[RemoteFile], logger: logging.Logger) -> Iterable[RemoteFile]: + """ + Given the list of files in the source, return the files that should be synced. + :param all_files: All files in the source + :param logger: + :return: The files that should be synced + """ + with self._state_lock: + if self._is_history_full(): + logger.warning( + f"The state history is full. " + f"This sync and future syncs won't be able to use the history to filter out duplicate files. " + f"It will instead use the time window of {self._time_window_if_history_is_full} to filter out files." + ) + for f in all_files: + if self._should_sync_file(f, logger): + yield f + + def _should_sync_file(self, file: RemoteFile, logger: logging.Logger) -> bool: + with self._state_lock: + if file.uri in self._file_to_datetime_history: + # If the file's uri is in the history, we should sync the file if it has been modified since it was synced + updated_at_from_history = datetime.strptime(self._file_to_datetime_history[file.uri], self.DATE_TIME_FORMAT) + if file.last_modified < updated_at_from_history: + self._message_repository.emit_message( + AirbyteMessage( + type=Type.LOG, + log=AirbyteLogMessage( + level=Level.WARN, + message=f"The file {file.uri}'s last modified date is older than the last time it was synced. This is unexpected. 
Skipping the file.", + ), + ) + ) + return False + else: + return file.last_modified > updated_at_from_history + + prev_cursor_timestamp, prev_cursor_uri = self._prev_cursor_value + if self._is_history_full(): + if file.last_modified > prev_cursor_timestamp: + # If the history is partial and the file's datetime is strictly greater than the cursor, we should sync it + return True + elif file.last_modified == prev_cursor_timestamp: + # If the history is partial and the file's datetime is equal to the earliest file in the history, + # we should sync it if its uri is greater than or equal to the cursor value. + return file.uri > prev_cursor_uri + else: + return file.last_modified >= self._sync_start + else: + # The file is not in the history and the history is complete. We know we need to sync the file + return True + + def _is_history_full(self) -> bool: + """ + Returns true if the state's history is full, meaning new entries will start to replace old entries. + """ + with self._state_lock: + if self._file_to_datetime_history is None: + raise RuntimeError("The history object has not been set. This is unexpected. Please contact Support.") + return len(self._file_to_datetime_history) >= self.DEFAULT_MAX_HISTORY_SIZE + + def _compute_start_time(self) -> datetime: + if not self._file_to_datetime_history: + return datetime.min + else: + earliest = min(self._file_to_datetime_history.values()) + earliest_dt = datetime.strptime(earliest, self.DATE_TIME_FORMAT) + if self._is_history_full(): + time_window = datetime.now() - self._time_window_if_history_is_full + earliest_dt = min(earliest_dt, time_window) + return earliest_dt + + def get_start_time(self) -> datetime: + return self._sync_start + + def get_state(self) -> MutableMapping[str, Any]: + """ + Get the state of the cursor. + """ + with self._state_lock: + return {"history": self._file_to_datetime_history, self._cursor_field.cursor_field_key: self._get_new_cursor_value()} + + def set_initial_state(self, value: StreamState) -> None: + pass + + def ensure_at_least_one_state_emitted(self) -> None: + self.emit_state_message() diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py new file mode 100644 index 000000000000..7181ecd15d7f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py @@ -0,0 +1,71 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
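# A condensed, standalone sketch of the _should_sync_file() decision above, taking the same
# inputs: the per-file history, whether that history is full, the (timestamp, uri) cursor from
# the previous sync, and the computed sync start time. It mirrors the branching only, without
# the logging and state locking.
from datetime import datetime
from typing import Dict, Tuple


def should_sync(
    uri: str,
    last_modified: datetime,
    history: Dict[str, datetime],
    history_is_full: bool,
    prev_cursor: Tuple[datetime, str],
    sync_start: datetime,
) -> bool:
    if uri in history:
        # Re-sync a known file only if it was modified after the version already synced.
        return last_modified > history[uri]
    if not history_is_full:
        # The history is complete and the file is not in it, so it must be new.
        return True
    prev_ts, prev_uri = prev_cursor
    if last_modified > prev_ts:
        return True
    if last_modified == prev_ts:
        # Tie on the timestamp: fall back to the URI ordering used by the cursor key.
        return uri > prev_uri
    return last_modified >= sync_start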
+# + +import logging +from datetime import datetime +from typing import TYPE_CHECKING, Any, Iterable, List, MutableMapping, Optional + +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.stream.concurrent.cursor.abstract_concurrent_file_based_cursor import AbstractConcurrentFileBasedCursor +from airbyte_cdk.sources.file_based.types import StreamState +from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record + +if TYPE_CHECKING: + from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition + + +class FileBasedFinalStateCursor(AbstractConcurrentFileBasedCursor): + """Cursor that is used to guarantee at least one state message is emitted for a concurrent file-based stream.""" + + def __init__( + self, stream_config: FileBasedStreamConfig, message_repository: MessageRepository, stream_namespace: Optional[str], **kwargs: Any + ): + self._stream_name = stream_config.name + self._stream_namespace = stream_namespace + self._message_repository = message_repository + # Normally the connector state manager operates at the source-level. However, we only need it to write the sentinel + # state message rather than manage overall source state. This is also only temporary as we move to the resumable + # full refresh world where every stream uses a FileBasedConcurrentCursor with incremental state. 
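
As a side note on the guarantee described in the comment above, here is a minimal, self-contained sketch of the "emit at least one state message per stream" pattern. The repository stand-in, the sentinel key, and the message shape are illustrative placeholders, not the CDK's actual classes.

```python
from typing import Any, Dict, List

# Toy stand-ins, used only to illustrate the guarantee described above:
# even a stream with no real cursor state emits one (sentinel) state message per sync.
emitted_messages: List[Dict[str, Any]] = []

def emit_message(message: Dict[str, Any]) -> None:
    emitted_messages.append(message)

def ensure_at_least_one_state_emitted(stream_name: str) -> None:
    # Placeholder sentinel payload; the CDK uses its own NO_CURSOR_STATE_KEY constant.
    sentinel_state = {"__no_cursor_state": True}
    emit_message({"type": "STATE", "stream": stream_name, "state": sentinel_state})

ensure_at_least_one_state_emitted("users")
print(emitted_messages)  # exactly one sentinel STATE message for the "users" stream
```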
+ self._connector_state_manager = ConnectorStateManager() + + @property + def state(self) -> MutableMapping[str, Any]: + return {NO_CURSOR_STATE_KEY: True} + + def observe(self, record: Record) -> None: + pass + + def close_partition(self, partition: Partition) -> None: + pass + + def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None: + pass + + def add_file(self, file: RemoteFile) -> None: + pass + + def get_files_to_sync(self, all_files: Iterable[RemoteFile], logger: logging.Logger) -> Iterable[RemoteFile]: + return all_files + + def get_state(self) -> MutableMapping[str, Any]: + return {} + + def set_initial_state(self, value: StreamState) -> None: + return None + + def get_start_time(self) -> datetime: + return datetime.min + + def emit_state_message(self) -> None: + pass + + def ensure_at_least_one_state_emitted(self) -> None: + self._connector_state_manager.update_state_for_stream(self._stream_name, self._stream_namespace, self.state) + state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace) + self._message_repository.emit_message(state_message) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/cursor/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/cursor/__init__.py new file mode 100644 index 000000000000..c1bf15a5d01f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/cursor/__init__.py @@ -0,0 +1,4 @@ +from .abstract_file_based_cursor import AbstractFileBasedCursor +from .default_file_based_cursor import DefaultFileBasedCursor + +__all__ = ["AbstractFileBasedCursor", "DefaultFileBasedCursor"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py new file mode 100644 index 000000000000..f38a5364135c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py @@ -0,0 +1,64 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from abc import ABC, abstractmethod +from datetime import datetime +from typing import Any, Iterable, MutableMapping + +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.types import StreamState + + +class AbstractFileBasedCursor(ABC): + """ + Abstract base class for cursors used by file-based streams. + """ + + @abstractmethod + def __init__(self, stream_config: FileBasedStreamConfig, **kwargs: Any): + """ + Common interface for all cursors. + """ + ... + + @abstractmethod + def add_file(self, file: RemoteFile) -> None: + """ + Add a file to the cursor. This method is called when a file is processed by the stream. + :param file: The file to add + """ + ... + + @abstractmethod + def set_initial_state(self, value: StreamState) -> None: + """ + Set the initial state of the cursor. The cursor cannot be initialized at construction time because the stream doesn't know its state yet. + :param value: The stream state + """ + + @abstractmethod + def get_state(self) -> MutableMapping[str, Any]: + """ + Get the state of the cursor. + """ + ... + + @abstractmethod + def get_start_time(self) -> datetime: + """ + Returns the start time of the current sync. + """ + ... 
+ + @abstractmethod + def get_files_to_sync(self, all_files: Iterable[RemoteFile], logger: logging.Logger) -> Iterable[RemoteFile]: + """ + Given the list of files in the source, return the files that should be synced. + :param all_files: All files in the source + :param logger: + :return: The files that should be synced + """ + ... diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py new file mode 100644 index 000000000000..58d64acbf63d --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py @@ -0,0 +1,132 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from datetime import datetime, timedelta +from typing import Any, Iterable, MutableMapping, Optional + +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.stream.cursor.abstract_file_based_cursor import AbstractFileBasedCursor +from airbyte_cdk.sources.file_based.types import StreamState + + +class DefaultFileBasedCursor(AbstractFileBasedCursor): + DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = 3 + DEFAULT_MAX_HISTORY_SIZE = 10_000 + DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" + CURSOR_FIELD = "_ab_source_file_last_modified" + + def __init__(self, stream_config: FileBasedStreamConfig, **_: Any): + super().__init__(stream_config) + self._file_to_datetime_history: MutableMapping[str, str] = {} + self._time_window_if_history_is_full = timedelta( + days=stream_config.days_to_sync_if_history_is_full or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL + ) + + if self._time_window_if_history_is_full <= timedelta(): + raise ValueError(f"days_to_sync_if_history_is_full must be a positive timedelta, got {self._time_window_if_history_is_full}") + + self._start_time = self._compute_start_time() + self._initial_earliest_file_in_history: Optional[RemoteFile] = None + + def set_initial_state(self, value: StreamState) -> None: + self._file_to_datetime_history = value.get("history", {}) + self._start_time = self._compute_start_time() + self._initial_earliest_file_in_history = self._compute_earliest_file_in_history() + + def add_file(self, file: RemoteFile) -> None: + self._file_to_datetime_history[file.uri] = file.last_modified.strftime(self.DATE_TIME_FORMAT) + if len(self._file_to_datetime_history) > self.DEFAULT_MAX_HISTORY_SIZE: + # Get the earliest file based on its last modified date and its uri + oldest_file = self._compute_earliest_file_in_history() + if oldest_file: + del self._file_to_datetime_history[oldest_file.uri] + else: + raise Exception( + "The history is full but there is no files in the history. This should never happen and might be indicative of a bug in the CDK." + ) + + def get_state(self) -> StreamState: + state = {"history": self._file_to_datetime_history, self.CURSOR_FIELD: self._get_cursor()} + return state + + def _get_cursor(self) -> Optional[str]: + """ + Returns the cursor value. + + Files are synced in order of last-modified with secondary sort on filename, so the cursor value is + a string joining the last-modified timestamp of the last synced file and the name of the file. 
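
To make the cursor format above concrete, here is a small stand-alone example (file names and timestamps are made up) that reproduces the `{last_modified}_{uri}` value from a toy history mapping:

```python
# Toy history mapping file uri -> last-modified string, mirroring _file_to_datetime_history.
history = {
    "data/2023-08-01.csv": "2023-08-01T10:00:00.000000Z",
    "data/2023-08-02.csv": "2023-08-02T10:00:00.000000Z",
}

# max() over (timestamp, uri) picks the most recently modified file, with the uri as a
# tie-breaker, and the cursor value joins the two with an underscore.
uri, timestamp = max(history.items(), key=lambda item: (item[1], item[0]))
cursor_value = f"{timestamp}_{uri}"
print(cursor_value)  # 2023-08-02T10:00:00.000000Z_data/2023-08-02.csv
```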
+ """ + if self._file_to_datetime_history.items(): + filename, timestamp = max(self._file_to_datetime_history.items(), key=lambda x: (x[1], x[0])) + return f"{timestamp}_{filename}" + return None + + def _is_history_full(self) -> bool: + """ + Returns true if the state's history is full, meaning new entries will start to replace old entries. + """ + return len(self._file_to_datetime_history) >= self.DEFAULT_MAX_HISTORY_SIZE + + def _should_sync_file(self, file: RemoteFile, logger: logging.Logger) -> bool: + if file.uri in self._file_to_datetime_history: + # If the file's uri is in the history, we should sync the file if it has been modified since it was synced + updated_at_from_history = datetime.strptime(self._file_to_datetime_history[file.uri], self.DATE_TIME_FORMAT) + if file.last_modified < updated_at_from_history: + logger.warning( + f"The file {file.uri}'s last modified date is older than the last time it was synced. This is unexpected. Skipping the file." + ) + else: + return file.last_modified > updated_at_from_history + return file.last_modified > updated_at_from_history + if self._is_history_full(): + if self._initial_earliest_file_in_history is None: + return True + if file.last_modified > self._initial_earliest_file_in_history.last_modified: + # If the history is partial and the file's datetime is strictly greater than the earliest file in the history, + # we should sync it + return True + elif file.last_modified == self._initial_earliest_file_in_history.last_modified: + # If the history is partial and the file's datetime is equal to the earliest file in the history, + # we should sync it if its uri is strictly greater than the earliest file in the history + return file.uri > self._initial_earliest_file_in_history.uri + else: + # Otherwise, only sync the file if it has been modified since the start of the time window + return file.last_modified >= self.get_start_time() + else: + # The file is not in the history and the history is complete. We know we need to sync the file + return True + + def get_files_to_sync(self, all_files: Iterable[RemoteFile], logger: logging.Logger) -> Iterable[RemoteFile]: + if self._is_history_full(): + logger.warning( + f"The state history is full. " + f"This sync and future syncs won't be able to use the history to filter out duplicate files. " + f"It will instead use the time window of {self._time_window_if_history_is_full} to filter out files." 
+ ) + for f in all_files: + if self._should_sync_file(f, logger): + yield f + + def get_start_time(self) -> datetime: + return self._start_time + + def _compute_earliest_file_in_history(self) -> Optional[RemoteFile]: + if self._file_to_datetime_history: + filename, last_modified = min(self._file_to_datetime_history.items(), key=lambda f: (f[1], f[0])) + return RemoteFile(uri=filename, last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT)) + else: + return None + + def _compute_start_time(self) -> datetime: + if not self._file_to_datetime_history: + return datetime.min + else: + earliest = min(self._file_to_datetime_history.values()) + earliest_dt = datetime.strptime(earliest, self.DATE_TIME_FORMAT) + if self._is_history_full(): + time_window = datetime.now() - self._time_window_if_history_is_full + earliest_dt = min(earliest_dt, time_window) + return earliest_dt diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py new file mode 100644 index 000000000000..2b92f103be60 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -0,0 +1,342 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import asyncio +import itertools +import traceback +from copy import deepcopy +from functools import cache +from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Union + +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType +from airbyte_cdk.sources.file_based.exceptions import ( + FileBasedSourceError, + InvalidSchemaError, + MissingSchemaError, + RecordParseError, + SchemaInferenceError, + StopSyncPerValidationPolicy, +) +from airbyte_cdk.sources.file_based.file_types import FileTransfer +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType, file_transfer_schema, merge_schemas, schemaless_schema +from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream +from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor +from airbyte_cdk.sources.file_based.types import StreamSlice +from airbyte_cdk.sources.streams import IncrementalMixin +from airbyte_cdk.sources.streams.core import JsonSchema +from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + + +class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin): + + """ + The default file-based stream. 
+ """ + + FILE_TRANSFER_KW = "use_file_transfer" + DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" + ab_last_mod_col = "_ab_source_file_last_modified" + ab_file_name_col = "_ab_source_file_url" + modified = "modified" + source_file_url = "source_file_url" + airbyte_columns = [ab_last_mod_col, ab_file_name_col] + use_file_transfer = False + + def __init__(self, **kwargs: Any): + if self.FILE_TRANSFER_KW in kwargs: + self.use_file_transfer = kwargs.pop(self.FILE_TRANSFER_KW, False) + super().__init__(**kwargs) + + @property + def state(self) -> MutableMapping[str, Any]: + return self._cursor.get_state() + + @state.setter + def state(self, value: MutableMapping[str, Any]) -> None: + """State setter, accept state serialized by state getter.""" + self._cursor.set_initial_state(value) + + @property # type: ignore # mypy complains wrong type, but AbstractFileBasedCursor is parent of file-based cursors + def cursor(self) -> Optional[AbstractFileBasedCursor]: + return self._cursor + + @cursor.setter + def cursor(self, value: AbstractFileBasedCursor) -> None: + if self._cursor is not None: + raise RuntimeError(f"Cursor for stream {self.name} is already set. This is unexpected. Please contact Support.") + self._cursor = value + + @property + def primary_key(self) -> PrimaryKeyType: + return self.config.primary_key or self.get_parser().get_parser_defined_primary_key(self.config) + + def _filter_schema_invalid_properties(self, configured_catalog_json_schema: Dict[str, Any]) -> Dict[str, Any]: + if self.use_file_transfer: + return { + "type": "object", + "properties": {"file_path": {"type": "string"}, "file_size": {"type": "string"}, self.ab_file_name_col: {"type": "string"}}, + } + else: + return super()._filter_schema_invalid_properties(configured_catalog_json_schema) + + def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]: + # Sort files by last_modified, uri and return them grouped by last_modified + all_files = self.list_files() + files_to_read = self._cursor.get_files_to_sync(all_files, self.logger) + sorted_files_to_read = sorted(files_to_read, key=lambda f: (f.last_modified, f.uri)) + slices = [{"files": list(group[1])} for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)] + return slices + + def transform_record(self, record: dict[str, Any], file: RemoteFile, last_updated: str) -> dict[str, Any]: + # adds _ab_source_file_last_modified and _ab_source_file_url to the record + record[self.ab_last_mod_col] = last_updated + record[self.ab_file_name_col] = file.uri + return record + + def transform_record_for_file_transfer(self, record: dict[str, Any], file: RemoteFile) -> dict[str, Any]: + # timstamp() returns a float representing the number of seconds since the unix epoch + record[self.modified] = int(file.last_modified.timestamp()) * 1000 + record[self.source_file_url] = file.uri + return record + + def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[AirbyteMessage]: + """ + Yield all records from all remote files in `list_files_for_this_sync`. + + If an error is encountered reading records from a file, log a message and do not attempt + to sync the rest of the file. 
+ """ + schema = self.catalog_schema + if schema is None: + # On read requests we should always have the catalog available + raise MissingSchemaError(FileBasedSourceError.MISSING_SCHEMA, stream=self.name) + # The stream only supports a single file type, so we can use the same parser for all files + parser = self.get_parser() + for file in stream_slice["files"]: + # only serialize the datetime once + file_datetime_string = file.last_modified.strftime(self.DATE_TIME_FORMAT) + n_skipped = line_no = 0 + + try: + if self.use_file_transfer: + self.logger.info(f"{self.name}: {file} file-based syncing") + # todo: complete here the code to not rely on local parser + file_transfer = FileTransfer() + for record in file_transfer.get_file(self.config, file, self.stream_reader, self.logger): + line_no += 1 + if not self.record_passes_validation_policy(record): + n_skipped += 1 + continue + record = self.transform_record_for_file_transfer(record, file) + yield stream_data_to_airbyte_message(self.name, record, is_file_transfer_message=True) + else: + for record in parser.parse_records(self.config, file, self.stream_reader, self.logger, schema): + line_no += 1 + if self.config.schemaless: + record = {"data": record} + elif not self.record_passes_validation_policy(record): + n_skipped += 1 + continue + record = self.transform_record(record, file, file_datetime_string) + yield stream_data_to_airbyte_message(self.name, record) + self._cursor.add_file(file) + + except StopSyncPerValidationPolicy: + yield AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.WARN, + message=f"Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream={self.name} file={file.uri} validation_policy={self.config.validation_policy.value} n_skipped={n_skipped}", + ), + ) + break + + except RecordParseError: + # Increment line_no because the exception was raised before we could increment it + line_no += 1 + self.errors_collector.collect( + AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.ERROR, + message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}", + stack_trace=traceback.format_exc(), + ), + ), + ) + + except AirbyteTracedException as exc: + # Re-raise the exception to stop the whole sync immediately as this is a fatal error + raise exc + + except Exception: + yield AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.ERROR, + message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}", + stack_trace=traceback.format_exc(), + ), + ) + + finally: + if n_skipped: + yield AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.WARN, + message=f"Records in file did not pass validation policy. stream={self.name} file={file.uri} n_skipped={n_skipped} validation_policy={self.validation_policy.name}", + ), + ) + + @property + def cursor_field(self) -> Union[str, List[str]]: + """ + Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field. + :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor. 
+ """ + return self.ab_last_mod_col + + @cache + def get_json_schema(self) -> JsonSchema: + extra_fields = { + self.ab_last_mod_col: {"type": "string"}, + self.ab_file_name_col: {"type": "string"}, + } + try: + schema = self._get_raw_json_schema() + except InvalidSchemaError as config_exception: + raise AirbyteTracedException( + internal_message="Please check the logged errors for more information.", + message=FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value, + exception=AirbyteTracedException(exception=config_exception), + failure_type=FailureType.config_error, + ) + except AirbyteTracedException as ate: + raise ate + except Exception as exc: + raise SchemaInferenceError(FileBasedSourceError.SCHEMA_INFERENCE_ERROR, stream=self.name) from exc + else: + return {"type": "object", "properties": {**extra_fields, **schema["properties"]}} + + def _get_raw_json_schema(self) -> JsonSchema: + if self.use_file_transfer: + return file_transfer_schema + elif self.config.input_schema: + return self.config.get_input_schema() # type: ignore + elif self.config.schemaless: + return schemaless_schema + else: + files = self.list_files() + first_n_files = len(files) + + if self.config.recent_n_files_to_read_for_schema_discovery: + self.logger.info( + msg=( + f"Only first {self.config.recent_n_files_to_read_for_schema_discovery} files will be used to infer schema " + f"for stream {self.name} due to limitation in config." + ) + ) + first_n_files = self.config.recent_n_files_to_read_for_schema_discovery + + if first_n_files == 0: + self.logger.warning(msg=f"No files were identified in the stream {self.name}. Setting default schema for the stream.") + return schemaless_schema + + max_n_files_for_schema_inference = self._discovery_policy.get_max_n_files_for_schema_inference(self.get_parser()) + + if first_n_files > max_n_files_for_schema_inference: + # Use the most recent files for schema inference, so we pick up schema changes during discovery. + self.logger.warning(msg=f"Refusing to infer schema for {first_n_files} files; using {max_n_files_for_schema_inference} files.") + first_n_files = max_n_files_for_schema_inference + + files = sorted(files, key=lambda x: x.last_modified, reverse=True)[:first_n_files] + + inferred_schema = self.infer_schema(files) + + if not inferred_schema: + raise InvalidSchemaError( + FileBasedSourceError.INVALID_SCHEMA_ERROR, + details=f"Empty schema. Please check that the files are valid for format {self.config.format}", + stream=self.name, + ) + + schema = {"type": "object", "properties": inferred_schema} + + return schema + + def get_files(self) -> Iterable[RemoteFile]: + """ + Return all files that belong to the stream as defined by the stream's globs. 
+ """ + return self.stream_reader.get_matching_files(self.config.globs or [], self.config.legacy_prefix, self.logger) + + def infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]: + loop = asyncio.get_event_loop() + schema = loop.run_until_complete(self._infer_schema(files)) + # as infer schema returns a Mapping that is assumed to be immutable, we need to create a deepcopy to avoid modifying the reference + return self._fill_nulls(deepcopy(schema)) + + @staticmethod + def _fill_nulls(schema: Mapping[str, Any]) -> Mapping[str, Any]: + if isinstance(schema, dict): + for k, v in schema.items(): + if k == "type": + if isinstance(v, list): + if "null" not in v: + schema[k] = ["null"] + v + elif v != "null": + schema[k] = ["null", v] + else: + DefaultFileBasedStream._fill_nulls(v) + elif isinstance(schema, list): + for item in schema: + DefaultFileBasedStream._fill_nulls(item) + return schema + + async def _infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]: + """ + Infer the schema for a stream. + + Each file type has a corresponding `infer_schema` handler. + Dispatch on file type. + """ + base_schema: SchemaType = {} + pending_tasks: Set[asyncio.tasks.Task[SchemaType]] = set() + + n_started, n_files = 0, len(files) + files_iterator = iter(files) + while pending_tasks or n_started < n_files: + while len(pending_tasks) <= self._discovery_policy.n_concurrent_requests and (file := next(files_iterator, None)): + pending_tasks.add(asyncio.create_task(self._infer_file_schema(file))) + n_started += 1 + # Return when the first task is completed so that we can enqueue a new task as soon as the + # number of concurrent tasks drops below the number allowed. + done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED) + for task in done: + try: + base_schema = merge_schemas(base_schema, task.result()) + except AirbyteTracedException as ate: + raise ate + except Exception as exc: + self.logger.error(f"An error occurred inferring the schema. \n {traceback.format_exc()}", exc_info=exc) + + return base_schema + + async def _infer_file_schema(self, file: RemoteFile) -> SchemaType: + try: + return await self.get_parser().infer_schema(self.config, file, self.stream_reader, self.logger) + except AirbyteTracedException as ate: + raise ate + except Exception as exc: + raise SchemaInferenceError( + FileBasedSourceError.SCHEMA_INFERENCE_ERROR, + file=file.uri, + format=str(self.config.format), + stream=self.name, + ) from exc diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/types.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/types.py new file mode 100644 index 000000000000..b83bf37a37a7 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/types.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from __future__ import annotations + +from typing import Any, Mapping, MutableMapping + +StreamSlice = Mapping[str, Any] +StreamState = MutableMapping[str, Any] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/http_config.py b/airbyte-cdk/python/airbyte_cdk/sources/http_config.py new file mode 100644 index 000000000000..289ed9a923fb --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/http_config.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +# The goal of this variable is to make an implicit dependency explicit. 
As part of of the Concurrent CDK work, we are facing a situation +# where the connection pool size is too small to serve all the threads (see https://github.com/airbytehq/airbyte/issues/32072). In +# order to fix that, we will increase the requests library pool_maxsize. As there are many pieces of code that sets a requests.Session, we +# are creating this variable here so that a change in one affects the other. This can be removed once we merge how we do HTTP requests in +# one piece of code or once we make connection pool size configurable for each piece of code +MAX_CONNECTION_POOL_SIZE = 20 diff --git a/airbyte-cdk/python/airbyte_cdk/sources/http_logger.py b/airbyte-cdk/python/airbyte_cdk/sources/http_logger.py new file mode 100644 index 000000000000..7158c8003e5b --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/http_logger.py @@ -0,0 +1,47 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Optional, Union + +import requests +from airbyte_cdk.sources.message import LogMessage + + +def format_http_message( + response: requests.Response, title: str, description: str, stream_name: Optional[str], is_auxiliary: bool = None +) -> LogMessage: + request = response.request + log_message = { + "http": { + "title": title, + "description": description, + "request": { + "method": request.method, + "body": { + "content": _normalize_body_string(request.body), + }, + "headers": dict(request.headers), + }, + "response": { + "body": { + "content": response.text, + }, + "headers": dict(response.headers), + "status_code": response.status_code, + }, + }, + "log": { + "level": "debug", + }, + "url": {"full": request.url}, + } + if is_auxiliary is not None: + log_message["http"]["is_auxiliary"] = is_auxiliary + if stream_name: + log_message["airbyte_cdk"] = {"stream": {"name": stream_name}} + return log_message + + +def _normalize_body_string(body_str: Optional[Union[str, bytes]]) -> Optional[str]: + return body_str.decode() if isinstance(body_str, (bytes, bytearray)) else body_str diff --git a/airbyte-cdk/python/airbyte_cdk/sources/message/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/message/__init__.py new file mode 100644 index 000000000000..c545c0d736ab --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/message/__init__.py @@ -0,0 +1,13 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# + +from .repository import ( + InMemoryMessageRepository, + LogAppenderMessageRepositoryDecorator, + LogMessage, + MessageRepository, + NoopMessageRepository, +) + +__all__ = ["InMemoryMessageRepository", "LogAppenderMessageRepositoryDecorator", "LogMessage", "MessageRepository", "NoopMessageRepository"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/message/repository.py b/airbyte-cdk/python/airbyte_cdk/sources/message/repository.py new file mode 100644 index 000000000000..bf90830900ae --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/message/repository.py @@ -0,0 +1,123 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import json +import logging +from abc import ABC, abstractmethod +from collections import deque +from typing import Callable, Deque, Iterable, List, Optional + +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type +from airbyte_cdk.sources.utils.types import JsonType +from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets + +_LOGGER = logging.getLogger("MessageRepository") +_SUPPORTED_MESSAGE_TYPES = {Type.CONTROL, Type.LOG} +LogMessage = dict[str, JsonType] + +_SEVERITY_BY_LOG_LEVEL = { + Level.FATAL: 1, + Level.ERROR: 2, + Level.WARN: 3, + Level.INFO: 4, + Level.DEBUG: 5, + Level.TRACE: 5, +} + + +def _is_severe_enough(threshold: Level, level: Level) -> bool: + if threshold not in _SEVERITY_BY_LOG_LEVEL: + _LOGGER.warning(f"Log level {threshold} for threshold is not supported. This is probably a CDK bug. Please contact Airbyte.") + return True + + if level not in _SEVERITY_BY_LOG_LEVEL: + _LOGGER.warning( + f"Log level {level} is not supported. This is probably a source bug. Please contact the owner of the source or Airbyte." + ) + return True + + return _SEVERITY_BY_LOG_LEVEL[threshold] >= _SEVERITY_BY_LOG_LEVEL[level] + + +class MessageRepository(ABC): + @abstractmethod + def emit_message(self, message: AirbyteMessage) -> None: + raise NotImplementedError() + + @abstractmethod + def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None: + """ + Computing messages can be resource consuming. This method is specialized for logging because we want to allow for lazy evaluation if + the log level is less severe than what is configured + """ + raise NotImplementedError() + + @abstractmethod + def consume_queue(self) -> Iterable[AirbyteMessage]: + raise NotImplementedError() + + +class NoopMessageRepository(MessageRepository): + def emit_message(self, message: AirbyteMessage) -> None: + pass + + def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None: + pass + + def consume_queue(self) -> Iterable[AirbyteMessage]: + return [] + + +class InMemoryMessageRepository(MessageRepository): + def __init__(self, log_level: Level = Level.INFO) -> None: + self._message_queue: Deque[AirbyteMessage] = deque() + self._log_level = log_level + + def emit_message(self, message: AirbyteMessage) -> None: + self._message_queue.append(message) + + def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None: + if _is_severe_enough(self._log_level, level): + self.emit_message( + AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=level, message=filter_secrets(json.dumps(message_provider())))) + ) + + def consume_queue(self) -> Iterable[AirbyteMessage]: + while self._message_queue: + yield self._message_queue.popleft() + + +class LogAppenderMessageRepositoryDecorator(MessageRepository): + def __init__(self, dict_to_append: LogMessage, decorated: MessageRepository, log_level: Level = Level.INFO): + self._dict_to_append = dict_to_append + self._decorated = decorated + self._log_level = log_level + + def emit_message(self, message: AirbyteMessage) -> None: + self._decorated.emit_message(message) + + def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None: + if _is_severe_enough(self._log_level, level): + message = message_provider() + self._append_second_to_first(message, self._dict_to_append) + self._decorated.log_message(level, lambda: message) + + def consume_queue(self) -> Iterable[AirbyteMessage]: + return self._decorated.consume_queue() + + def 
_append_second_to_first(self, first: LogMessage, second: LogMessage, path: Optional[List[str]] = None) -> LogMessage: + if path is None: + path = [] + + for key in second: + if key in first: + if isinstance(first[key], dict) and isinstance(second[key], dict): + self._append_second_to_first(first[key], second[key], path + [str(key)]) # type: ignore # type is verified above + else: + if first[key] != second[key]: + _LOGGER.warning("Conflict at %s" % ".".join(path + [str(key)])) + first[key] = second[key] + else: + first[key] = second[key] + return first diff --git a/airbyte-cdk/python/airbyte_cdk/sources/source.py b/airbyte-cdk/python/airbyte_cdk/sources/source.py new file mode 100644 index 000000000000..975770c88949 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/source.py @@ -0,0 +1,85 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +import logging +from abc import ABC, abstractmethod +from typing import Any, Generic, Iterable, List, Mapping, Optional, TypeVar + +from airbyte_cdk.connector import BaseConnector, DefaultConnectorMixin, TConfig +from airbyte_cdk.models import ( + AirbyteCatalog, + AirbyteMessage, + AirbyteStateMessage, + AirbyteStateMessageSerializer, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteCatalogSerializer, +) + +TState = TypeVar("TState") +TCatalog = TypeVar("TCatalog") + + +class ExperimentalClassWarning(DeprecationWarning): + pass + + +class BaseSource(BaseConnector[TConfig], ABC, Generic[TConfig, TState, TCatalog]): + @abstractmethod + def read_state(self, state_path: str) -> TState: + ... + + @abstractmethod + def read_catalog(self, catalog_path: str) -> TCatalog: + ... + + @abstractmethod + def read(self, logger: logging.Logger, config: TConfig, catalog: TCatalog, state: Optional[TState] = None) -> Iterable[AirbyteMessage]: + """ + Returns a generator of the AirbyteMessages generated by reading the source with the given configuration, catalog, and state. + """ + + @abstractmethod + def discover(self, logger: logging.Logger, config: TConfig) -> AirbyteCatalog: + """ + Returns an AirbyteCatalog representing the available streams and fields in this integration. For example, given valid credentials to a + Postgres database, returns an Airbyte catalog where each postgres table is a stream, and each table column is a field. + """ + + +class Source( + DefaultConnectorMixin, + BaseSource[Mapping[str, Any], List[AirbyteStateMessage], ConfiguredAirbyteCatalog], + ABC, +): + # can be overridden to change an input state. + @classmethod + def read_state(cls, state_path: str) -> List[AirbyteStateMessage]: + """ + Retrieves the input state of a sync by reading from the specified JSON file. Incoming state can be deserialized into either + a JSON object for legacy state input or as a list of AirbyteStateMessages for the per-stream state format. Regardless of the + incoming input type, it will always be transformed and output as a list of AirbyteStateMessage(s). 
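
The sketch below shows the general shape of that transformation using only the standard library; the real implementation deserializes each entry into an `AirbyteStateMessage` via the CDK serializer, so the plain dicts and the sample payload here are illustrative assumptions.

```python
import json
from typing import Any, Dict, List

# Hypothetical on-disk state file in the per-stream format: a JSON list of state messages.
raw = '[{"type": "STREAM", "stream": {"stream_descriptor": {"name": "users"}, "stream_state": {"updated_at": "2023-08-01"}}}]'

def read_state(serialized: str) -> List[Dict[str, Any]]:
    messages = json.loads(serialized) or []
    for message in messages:
        # Mirror the validation described above: each entry must carry stream, global, or legacy data state.
        if not any(key in message for key in ("stream", "global", "data")):
            raise ValueError("AirbyteStateMessage should contain either a stream, global, or state field")
    return messages

print(read_state(raw))
```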
+ :param state_path: The filepath to where the stream states are located + :return: The complete stream state based on the connector's previous sync + """ + parsed_state_messages = [] + if state_path: + state_obj = BaseConnector._read_json_file(state_path) + if state_obj: + for state in state_obj: # type: ignore # `isinstance(state_obj, List)` ensures that this is a list + parsed_message = AirbyteStateMessageSerializer.load(state) + if not parsed_message.stream and not parsed_message.data and not parsed_message.global_: + raise ValueError("AirbyteStateMessage should contain either a stream, global, or state field") + parsed_state_messages.append(parsed_message) + return parsed_state_messages + + # can be overridden to change an input catalog + @classmethod + def read_catalog(cls, catalog_path: str) -> ConfiguredAirbyteCatalog: + return ConfiguredAirbyteCatalogSerializer.load(cls._read_json_file(catalog_path)) + + @property + def name(self) -> str: + """Source name""" + return self.__class__.__name__ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/__init__.py new file mode 100644 index 000000000000..030502822f94 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +# Initialize Streams Package +from .core import NO_CURSOR_STATE_KEY, IncrementalMixin, CheckpointMixin, Stream + +__all__ = ["NO_CURSOR_STATE_KEY", "IncrementalMixin", "CheckpointMixin", "Stream"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/availability_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/availability_strategy.py new file mode 100644 index 000000000000..f2042bc1cb92 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/availability_strategy.py @@ -0,0 +1,78 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +import typing +from abc import ABC, abstractmethod +from typing import Any, Mapping, Optional, Tuple + +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.streams.core import Stream, StreamData + +if typing.TYPE_CHECKING: + from airbyte_cdk.sources import Source + + +class AvailabilityStrategy(ABC): + """ + Abstract base class for checking stream availability. + """ + + @abstractmethod + def check_availability(self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None) -> Tuple[bool, Optional[str]]: + """ + Checks stream availability. + + :param stream: stream + :param logger: source logger + :param source: (optional) source + :return: A tuple of (boolean, str). If boolean is true, then the stream + is available, and no str is required. Otherwise, the stream is unavailable + for some reason and the str should describe what went wrong and how to + resolve the unavailability, if possible. + """ + + @staticmethod + def get_first_stream_slice(stream: Stream) -> Optional[Mapping[str, Any]]: + """ + Gets the first stream_slice from a given stream's stream_slices. 
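
The pattern described here, wrapping the slices in `iter()` and taking `next()`, can be sketched independently of the CDK as follows (the `stream_slices` stand-in is hypothetical):

```python
from typing import Any, Iterable, List, Mapping, Optional

def stream_slices() -> List[Optional[Mapping[str, Any]]]:
    # Stand-in for Stream.stream_slices(); some implementations return lists rather than
    # iterators, which is why the helper below wraps the result in iter().
    return [{"start": "2023-01-01"}, {"start": "2023-02-01"}]

def get_first_stream_slice(slices: Iterable[Optional[Mapping[str, Any]]]) -> Optional[Mapping[str, Any]]:
    iterator = iter(slices)
    return next(iterator)  # raises StopIteration if the stream produced no slices

try:
    print(get_first_stream_slice(stream_slices()))  # {'start': '2023-01-01'}
    print(get_first_stream_slice([]))
except StopIteration:
    print("no slices to check availability against")
```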
+ :param stream: stream + :raises StopIteration: if there is no first slice to return (the stream_slices generator is empty) + :return: first stream slice from 'stream_slices' generator (`None` is a valid stream slice) + """ + # We wrap the return output of stream_slices() because some implementations return types that are iterable, + # but not iterators such as lists or tuples + slices = iter( + stream.stream_slices( + cursor_field=stream.cursor_field, # type: ignore[arg-type] + sync_mode=SyncMode.full_refresh, + ) + ) + return next(slices) + + @staticmethod + def get_first_record_for_slice(stream: Stream, stream_slice: Optional[Mapping[str, Any]]) -> StreamData: + """ + Gets the first record for a stream_slice of a stream. + + :param stream: stream instance from which to read records + :param stream_slice: stream_slice parameters for slicing the stream + :raises StopIteration: if there is no first record to return (the read_records generator is empty) + :return: StreamData containing the first record in the slice + """ + # Store the original value of exit_on_rate_limit + original_exit_on_rate_limit = stream.exit_on_rate_limit + + try: + # Ensure exit_on_rate_limit is safely set to True if possible + stream.exit_on_rate_limit = True + + # We wrap the return output of read_records() because some implementations return types that are iterable, + # but not iterators such as lists or tuples + records_for_slice = iter(stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice)) + + return next(records_for_slice) + finally: + # Restore the original exit_on_rate_limit value + stream.exit_on_rate_limit = original_exit_on_rate_limit diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/call_rate.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/call_rate.py new file mode 100644 index 000000000000..eb33754504ee --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/call_rate.py @@ -0,0 +1,523 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import abc +import dataclasses +import datetime +import logging +import time +from datetime import timedelta +from threading import RLock +from typing import TYPE_CHECKING, Any, Mapping, Optional +from urllib import parse + +import requests +import requests_cache +from pyrate_limiter import InMemoryBucket, Limiter +from pyrate_limiter import Rate as PyRateRate +from pyrate_limiter import RateItem, TimeClock +from pyrate_limiter.exceptions import BucketFullException + +# prevents mypy from complaining about missing session attributes in LimiterMixin +if TYPE_CHECKING: + MIXIN_BASE = requests.Session +else: + MIXIN_BASE = object + +logger = logging.getLogger("airbyte") + + +@dataclasses.dataclass +class Rate: + """Call rate limit""" + + limit: int + interval: timedelta + + +class CallRateLimitHit(Exception): + def __init__(self, error: str, item: Any, weight: int, rate: str, time_to_wait: timedelta): + """Constructor + + :param error: error message + :param item: object passed into acquire_call + :param weight: how many credits were requested + :param rate: string representation of the rate violated + :param time_to_wait: how long should wait util more call will be available + """ + self.item = item + self.weight = weight + self.rate = rate + self.time_to_wait = time_to_wait + super().__init__(error) + + +class AbstractCallRatePolicy(abc.ABC): + """Call rate policy interface. + Should be configurable with different rules, like N per M for endpoint X. Endpoint X is matched with APIBudget. 
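
A simplified, self-contained illustration of the request-matching idea used by these policies (match on method, URL without its query string, and a subset check on query parameters) might look like the following; the URL and helper function are assumptions for the example, not CDK code:

```python
from typing import Any, Mapping, Optional
from urllib import parse

import requests

def matches(prepared: requests.PreparedRequest, method: Optional[str], url: Optional[str], params: Mapping[str, Any]) -> bool:
    # Method must match exactly when specified.
    if method is not None and prepared.method != method:
        return False
    # Compare the URL with its query string stripped.
    if url is not None and prepared.url is not None and prepared.url.split("?")[0] != url:
        return False
    # All expected query params must be present with the same (stringified) values.
    query = dict(parse.parse_qsl(parse.urlsplit(prepared.url).query))
    return {str(k): str(v) for k, v in params.items()}.items() <= query.items()

request = requests.Request("GET", "https://api.example.com/items", params={"page": "1", "limit": "50"}).prepare()
print(matches(request, "GET", "https://api.example.com/items", {"page": 1}))   # True
print(matches(request, "POST", "https://api.example.com/items", {}))           # False
```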
+ """ + + @abc.abstractmethod + def matches(self, request: Any) -> bool: + """Tells if this policy matches specific request and should apply to it + + :param request: + :return: True if policy should apply to this request, False - otherwise + """ + + @abc.abstractmethod + def try_acquire(self, request: Any, weight: int) -> None: + """Try to acquire request + + :param request: a request object representing a single call to API + :param weight: number of requests to deduct from credit + :return: + """ + + @abc.abstractmethod + def update(self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime]) -> None: + """Update call rate counting with current values + + :param available_calls: + :param call_reset_ts: + """ + + +class RequestMatcher(abc.ABC): + """Callable that help to match a request object with call rate policies.""" + + @abc.abstractmethod + def __call__(self, request: Any) -> bool: + """ + + :param request: + :return: True if matches the provided request object, False - otherwise + """ + + +class HttpRequestMatcher(RequestMatcher): + """Simple implementation of RequestMatcher for http requests case""" + + def __init__( + self, + method: Optional[str] = None, + url: Optional[str] = None, + params: Optional[Mapping[str, Any]] = None, + headers: Optional[Mapping[str, Any]] = None, + ): + """Constructor + + :param method: + :param url: + :param params: + :param headers: + """ + self._method = method + self._url = url + self._params = {str(k): str(v) for k, v in (params or {}).items()} + self._headers = {str(k): str(v) for k, v in (headers or {}).items()} + + @staticmethod + def _match_dict(obj: Mapping[str, Any], pattern: Mapping[str, Any]) -> bool: + """Check that all elements from pattern dict present and have the same values in obj dict + + :param obj: + :param pattern: + :return: + """ + return pattern.items() <= obj.items() + + def __call__(self, request: Any) -> bool: + """ + + :param request: + :return: True if matches the provided request object, False - otherwise + """ + if isinstance(request, requests.Request): + prepared_request = request.prepare() + elif isinstance(request, requests.PreparedRequest): + prepared_request = request + else: + return False + + if self._method is not None: + if prepared_request.method != self._method: + return False + if self._url is not None and prepared_request.url is not None: + url_without_params = prepared_request.url.split("?")[0] + if url_without_params != self._url: + return False + if self._params is not None: + parsed_url = parse.urlsplit(prepared_request.url) + params = dict(parse.parse_qsl(str(parsed_url.query))) + if not self._match_dict(params, self._params): + return False + if self._headers is not None: + if not self._match_dict(prepared_request.headers, self._headers): + return False + return True + + +class BaseCallRatePolicy(AbstractCallRatePolicy, abc.ABC): + def __init__(self, matchers: list[RequestMatcher]): + self._matchers = matchers + + def matches(self, request: Any) -> bool: + """Tell if this policy matches specific request and should apply to it + + :param request: + :return: True if policy should apply to this request, False - otherwise + """ + + if not self._matchers: + return True + return any(matcher(request) for matcher in self._matchers) + + +class UnlimitedCallRatePolicy(BaseCallRatePolicy): + """ + This policy is for explicit unlimited call rates. + It can be used when we want to match a specific group of requests and don't apply any limits. 
+ + Example: + + APICallBudget( + [ + UnlimitedCallRatePolicy( + matchers=[HttpRequestMatcher(url="/some/method", headers={"sandbox": true})], + ), + FixedWindowCallRatePolicy( + matchers=[HttpRequestMatcher(url="/some/method")], + next_reset_ts=datetime.now(), + period=timedelta(hours=1) + call_limit=1000, + ), + ] + ) + + The code above will limit all calls to /some/method except calls that have header sandbox=True + """ + + def try_acquire(self, request: Any, weight: int) -> None: + """Do nothing""" + + def update(self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime]) -> None: + """Do nothing""" + + +class FixedWindowCallRatePolicy(BaseCallRatePolicy): + def __init__(self, next_reset_ts: datetime.datetime, period: timedelta, call_limit: int, matchers: list[RequestMatcher]): + """A policy that allows {call_limit} calls within a {period} time interval + + :param next_reset_ts: next call rate reset time point + :param period: call rate reset period + :param call_limit: + :param matchers: + """ + + self._next_reset_ts = next_reset_ts + self._offset = period + self._call_limit = call_limit + self._calls_num = 0 + self._lock = RLock() + super().__init__(matchers=matchers) + + def try_acquire(self, request: Any, weight: int) -> None: + if weight > self._call_limit: + raise ValueError("Weight can not exceed the call limit") + if not self.matches(request): + raise ValueError("Request does not match the policy") + + with self._lock: + self._update_current_window() + + if self._calls_num + weight > self._call_limit: + reset_in = self._next_reset_ts - datetime.datetime.now() + error_message = ( + f"reached maximum number of allowed calls {self._call_limit} " f"per {self._offset} interval, next reset in {reset_in}." + ) + raise CallRateLimitHit( + error=error_message, + item=request, + weight=weight, + rate=f"{self._call_limit} per {self._offset}", + time_to_wait=reset_in, + ) + + self._calls_num += weight + + def update(self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime]) -> None: + """Update call rate counters, by default, only reacts to decreasing updates of available_calls and changes to call_reset_ts. + We ignore updates with available_calls > current_available_calls to support call rate limits that are lower than API limits. + + :param available_calls: + :param call_reset_ts: + """ + with self._lock: + self._update_current_window() + current_available_calls = self._call_limit - self._calls_num + + if available_calls is not None and current_available_calls > available_calls: + logger.debug( + "got rate limit update from api, adjusting available calls from %s to %s", current_available_calls, available_calls + ) + self._calls_num = self._call_limit - available_calls + + if call_reset_ts is not None and call_reset_ts != self._next_reset_ts: + logger.debug("got rate limit update from api, adjusting reset time from %s to %s", self._next_reset_ts, call_reset_ts) + self._next_reset_ts = call_reset_ts + + def _update_current_window(self) -> None: + now = datetime.datetime.now() + if now > self._next_reset_ts: + logger.debug("started new window, %s calls available now", self._call_limit) + self._next_reset_ts = self._next_reset_ts + self._offset + self._calls_num = 0 + + +class MovingWindowCallRatePolicy(BaseCallRatePolicy): + """ + Policy to control requests rate implemented on top of PyRateLimiter lib. 
+ The main difference between this policy and FixedWindowCallRatePolicy is that the rate-limiting window + is moving along requests that we made, and there is no moment when we reset an available number of calls. + This strategy requires saving of timestamps of all requests within a window. + """ + + def __init__(self, rates: list[Rate], matchers: list[RequestMatcher]): + """Constructor + + :param rates: list of rates, the order is important and must be ascending + :param matchers: + """ + if not rates: + raise ValueError("The list of rates can not be empty") + pyrate_rates = [PyRateRate(limit=rate.limit, interval=int(rate.interval.total_seconds() * 1000)) for rate in rates] + self._bucket = InMemoryBucket(pyrate_rates) + # Limiter will create the background task that clears old requests in the bucket + self._limiter = Limiter(self._bucket) + super().__init__(matchers=matchers) + + def try_acquire(self, request: Any, weight: int) -> None: + if not self.matches(request): + raise ValueError("Request does not match the policy") + + try: + self._limiter.try_acquire(request, weight=weight) + except BucketFullException as exc: + item = self._limiter.bucket_factory.wrap_item(request, weight) + assert isinstance(item, RateItem) + + with self._limiter.lock: + time_to_wait = self._bucket.waiting(item) + assert isinstance(time_to_wait, int) + + raise CallRateLimitHit( + error=str(exc.meta_info["error"]), + item=request, + weight=int(exc.meta_info["weight"]), + rate=str(exc.meta_info["rate"]), + time_to_wait=timedelta(milliseconds=time_to_wait), + ) + + def update(self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime]) -> None: + """Adjust call bucket to reflect the state of the API server + + :param available_calls: + :param call_reset_ts: + :return: + """ + if available_calls is not None and call_reset_ts is None: # we do our best to sync buckets with API + if available_calls == 0: + with self._limiter.lock: + items_to_add = self._bucket.count() < self._bucket.rates[0].limit + if items_to_add > 0: + now: int = TimeClock().now() # type: ignore[no-untyped-call] + self._bucket.put(RateItem(name="dummy", timestamp=now, weight=items_to_add)) + # TODO: add support if needed, it might be that it is not possible to make a good solution for this case + # if available_calls is not None and call_reset_ts is not None: + # ts = call_reset_ts.timestamp() + + +class AbstractAPIBudget(abc.ABC): + """Interface to some API where a client allowed to have N calls per T interval. + + Important: APIBudget is not doing any API calls, the end user code is responsible to call this interface + to respect call rate limitation of the API. + + It supports multiple policies applied to different group of requests. To distinct these groups we use RequestMatchers. + Individual policy represented by MovingWindowCallRatePolicy and currently supports only moving window strategy. 
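
As a conceptual aid, the moving-window behaviour can be sketched with a plain `deque` of call timestamps; this is not the PyRateLimiter-backed CDK implementation, just an illustration of why the window "moves" with the calls:

```python
import time
from collections import deque
from datetime import timedelta
from typing import Deque

class MovingWindow:
    """Allow at most `limit` calls within the trailing `interval` (conceptual sketch only)."""

    def __init__(self, limit: int, interval: timedelta) -> None:
        self._limit = limit
        self._interval = interval.total_seconds()
        self._timestamps: Deque[float] = deque()

    def try_acquire(self) -> bool:
        now = time.monotonic()
        # Drop timestamps that fell out of the trailing window.
        while self._timestamps and now - self._timestamps[0] > self._interval:
            self._timestamps.popleft()
        if len(self._timestamps) >= self._limit:
            return False  # the caller would sleep and retry, much like APIBudget._do_acquire
        self._timestamps.append(now)
        return True

window = MovingWindow(limit=2, interval=timedelta(seconds=1))
print([window.try_acquire() for _ in range(3)])  # [True, True, False]
```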
+ """ + + @abc.abstractmethod + def acquire_call(self, request: Any, block: bool = True, timeout: Optional[float] = None) -> None: + """Try to get a call from budget, will block by default + + :param request: + :param block: when true (default) will block the current thread until call credit is available + :param timeout: if set will limit maximum time in block, otherwise will wait until credit is available + :raises: CallRateLimitHit - when no credits left and if timeout was set the waiting time exceed the timeout + """ + + @abc.abstractmethod + def get_matching_policy(self, request: Any) -> Optional[AbstractCallRatePolicy]: + """Find matching call rate policy for specific request""" + + @abc.abstractmethod + def update_from_response(self, request: Any, response: Any) -> None: + """Update budget information based on response from API + + :param request: the initial request that triggered this response + :param response: response from the API + """ + + +class APIBudget(AbstractAPIBudget): + """Default APIBudget implementation""" + + def __init__(self, policies: list[AbstractCallRatePolicy], maximum_attempts_to_acquire: int = 100000) -> None: + """Constructor + + :param policies: list of policies in this budget + :param maximum_attempts_to_acquire: number of attempts before throwing hit ratelimit exception, we put some big number here + to avoid situations when many threads compete with each other for a few lots over a significant amount of time + """ + + self._policies = policies + self._maximum_attempts_to_acquire = maximum_attempts_to_acquire + + def get_matching_policy(self, request: Any) -> Optional[AbstractCallRatePolicy]: + for policy in self._policies: + if policy.matches(request): + return policy + return None + + def acquire_call(self, request: Any, block: bool = True, timeout: Optional[float] = None) -> None: + """Try to get a call from budget, will block by default. + Matchers will be called sequentially in the same order they were added. 
+ The first matcher that returns True will + + :param request: + :param block: when true (default) will block the current thread until call credit is available + :param timeout: if provided will limit maximum time in block, otherwise will wait until credit is available + :raises: CallRateLimitHit - when no calls left and if timeout was set the waiting time exceed the timeout + """ + + policy = self.get_matching_policy(request) + if policy: + self._do_acquire(request=request, policy=policy, block=block, timeout=timeout) + elif self._policies: + logger.info("no policies matched with requests, allow call by default") + + def update_from_response(self, request: Any, response: Any) -> None: + """Update budget information based on response from API + + :param request: the initial request that triggered this response + :param response: response from the API + """ + pass + + def _do_acquire(self, request: Any, policy: AbstractCallRatePolicy, block: bool, timeout: Optional[float]) -> None: + """Internal method to try to acquire a call credit + + :param request: + :param policy: + :param block: + :param timeout: + """ + last_exception = None + # sometimes we spend all budget before a second attempt, so we have few more here + for attempt in range(1, self._maximum_attempts_to_acquire): + try: + policy.try_acquire(request, weight=1) + return + except CallRateLimitHit as exc: + last_exception = exc + if block: + if timeout is not None: + time_to_wait = min(timedelta(seconds=timeout), exc.time_to_wait) + else: + time_to_wait = exc.time_to_wait + + time_to_wait = max(timedelta(0), time_to_wait) # sometimes we get negative duration + logger.info("reached call limit %s. going to sleep for %s", exc.rate, time_to_wait) + time.sleep(time_to_wait.total_seconds()) + else: + raise + + if last_exception: + logger.info("we used all %s attempts to acquire and failed", self._maximum_attempts_to_acquire) + raise last_exception + + +class HttpAPIBudget(APIBudget): + """Implementation of AbstractAPIBudget for HTTP""" + + def __init__( + self, + ratelimit_reset_header: str = "ratelimit-reset", + ratelimit_remaining_header: str = "ratelimit-remaining", + status_codes_for_ratelimit_hit: tuple[int] = (429,), + **kwargs: Any, + ): + """Constructor + + :param ratelimit_reset_header: name of the header that has a timestamp of the next reset of call budget + :param ratelimit_remaining_header: name of the header that has the number of calls left + :param status_codes_for_ratelimit_hit: list of HTTP status codes that signal about rate limit being hit + """ + self._ratelimit_reset_header = ratelimit_reset_header + self._ratelimit_remaining_header = ratelimit_remaining_header + self._status_codes_for_ratelimit_hit = status_codes_for_ratelimit_hit + super().__init__(**kwargs) + + def update_from_response(self, request: Any, response: Any) -> None: + policy = self.get_matching_policy(request) + if not policy: + return + + if isinstance(response, requests.Response): + available_calls = self.get_calls_left_from_response(response) + reset_ts = self.get_reset_ts_from_response(response) + policy.update(available_calls=available_calls, call_reset_ts=reset_ts) + + def get_reset_ts_from_response(self, response: requests.Response) -> Optional[datetime.datetime]: + if response.headers.get(self._ratelimit_reset_header): + return datetime.datetime.fromtimestamp(int(response.headers[self._ratelimit_reset_header])) + return None + + def get_calls_left_from_response(self, response: requests.Response) -> Optional[int]: + if 
response.headers.get(self._ratelimit_remaining_header): + return int(response.headers[self._ratelimit_remaining_header]) + + if response.status_code in self._status_codes_for_ratelimit_hit: + return 0 + + return None + + +class LimiterMixin(MIXIN_BASE): + """Mixin class that adds rate-limiting behavior to requests.""" + + def __init__( + self, + api_budget: AbstractAPIBudget, + **kwargs: Any, + ): + self._api_budget = api_budget + super().__init__(**kwargs) # type: ignore # Base Session doesn't take any kwargs + + def send(self, request: requests.PreparedRequest, **kwargs: Any) -> requests.Response: + """Send a request with rate-limiting.""" + self._api_budget.acquire_call(request) + response = super().send(request, **kwargs) + self._api_budget.update_from_response(request, response) + return response + + +class LimiterSession(LimiterMixin, requests.Session): + """Session that adds rate-limiting behavior to requests.""" + + +class CachedLimiterSession(requests_cache.CacheMixin, LimiterMixin, requests.Session): + """Session class with caching and rate-limiting behavior.""" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/__init__.py new file mode 100644 index 000000000000..0b122acbab6f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + + +from .checkpoint_reader import ( + CheckpointMode, + CheckpointReader, + CursorBasedCheckpointReader, + FullRefreshCheckpointReader, + IncrementalCheckpointReader, + LegacyCursorBasedCheckpointReader, + ResumableFullRefreshCheckpointReader +) +from .cursor import Cursor +from .resumable_full_refresh_cursor import ResumableFullRefreshCursor + + +__all__ = [ + "CheckpointMode", + "CheckpointReader", + "Cursor", + "CursorBasedCheckpointReader", + "FullRefreshCheckpointReader", + "IncrementalCheckpointReader", + "LegacyCursorBasedCheckpointReader", + "ResumableFullRefreshCheckpointReader", + "ResumableFullRefreshCursor" +] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py new file mode 100644 index 000000000000..1b6d63247206 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py @@ -0,0 +1,311 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from abc import ABC, abstractmethod +from enum import Enum +from typing import Any, Iterable, Mapping, Optional + +from airbyte_cdk.sources.types import StreamSlice + +from .cursor import Cursor + + +class CheckpointMode(Enum): + INCREMENTAL = "incremental" + RESUMABLE_FULL_REFRESH = "resumable_full_refresh" + FULL_REFRESH = "full_refresh" + + +FULL_REFRESH_COMPLETE_STATE: Mapping[str, Any] = {"__ab_full_refresh_sync_complete": True} + + +class CheckpointReader(ABC): + """ + CheckpointReader manages how to iterate over a stream's partitions and serves as the bridge for interpreting the current state + of the stream that should be emitted back to the platform. + """ + + @abstractmethod + def next(self) -> Optional[Mapping[str, Any]]: + """ + Returns the next slice that will be used to fetch the next group of records. Returning None indicates that the reader + has finished iterating over all slices. 
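+
+        Illustrative sketch of a typical read loop that drives a checkpoint reader (an assumption about usage, not a
+        guarantee of this interface):
+
+            while (next_slice := reader.next()) is not None:
+                # ... read records for next_slice, calling reader.observe(...) as the stream state changes ...
+                checkpoint = reader.get_checkpoint()  # emit a state message whenever this is not None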
+ """ + + @abstractmethod + def observe(self, new_state: Mapping[str, Any]) -> None: + """ + Updates the internal state of the checkpoint reader based on the incoming stream state from a connector. + + WARNING: This is used to retain backwards compatibility with streams using the legacy get_stream_state() method. + In order to uptake Resumable Full Refresh, connectors must migrate streams to use the state setter/getter methods. + """ + + @abstractmethod + def get_checkpoint(self) -> Optional[Mapping[str, Any]]: + """ + Retrieves the current state value of the stream. The connector does not emit state messages if the checkpoint value is None. + """ + + +class IncrementalCheckpointReader(CheckpointReader): + """ + IncrementalCheckpointReader handles iterating through a stream based on partitioned windows of data that are determined + before syncing data. + """ + + def __init__(self, stream_state: Mapping[str, Any], stream_slices: Iterable[Optional[Mapping[str, Any]]]): + self._state: Optional[Mapping[str, Any]] = stream_state + self._stream_slices = iter(stream_slices) + self._has_slices = False + + def next(self) -> Optional[Mapping[str, Any]]: + try: + next_slice = next(self._stream_slices) + self._has_slices = True + return next_slice + except StopIteration: + # This is used to avoid sending a duplicate state message at the end of a sync since the stream has already + # emitted state at the end of each slice. If we want to avoid this extra complexity, we can also just accept + # that every sync emits a final duplicate state + if self._has_slices: + self._state = None + return None + + def observe(self, new_state: Mapping[str, Any]) -> None: + self._state = new_state + + def get_checkpoint(self) -> Optional[Mapping[str, Any]]: + return self._state + + +class CursorBasedCheckpointReader(CheckpointReader): + """ + CursorBasedCheckpointReader is used by streams that implement a Cursor in order to manage state. This allows the checkpoint + reader to delegate the complexity of fetching state to the cursor and focus on the iteration over a stream's partitions. + + This reader supports the Cursor interface used by Python and low-code sources. Not to be confused with Cursor interface + that belongs to the Concurrent CDK. + """ + + def __init__(self, cursor: Cursor, stream_slices: Iterable[Optional[Mapping[str, Any]]], read_state_from_cursor: bool = False): + self._cursor = cursor + self._stream_slices = iter(stream_slices) + # read_state_from_cursor is used to delineate that partitions should determine when to stop syncing dynamically according + # to the value of the state at runtime. This currently only applies to streams that use resumable full refresh. 
+        self._read_state_from_cursor = read_state_from_cursor
+        self._current_slice: Optional[StreamSlice] = None
+        self._finished_sync = False
+        self._previous_state: Optional[Mapping[str, Any]] = None
+
+    def next(self) -> Optional[Mapping[str, Any]]:
+        try:
+            self.current_slice = self._find_next_slice()
+            return self.current_slice
+        except StopIteration:
+            self._finished_sync = True
+            return None
+
+    def observe(self, new_state: Mapping[str, Any]) -> None:
+        # Cursor-based checkpoint readers don't need to observe the new state because it has already been updated by the cursor
+        # while processing records
+        pass
+
+    def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
+        # This is used to avoid sending duplicate state messages
+        new_state = self._cursor.get_stream_state()
+        if new_state != self._previous_state:
+            self._previous_state = new_state
+            return new_state
+        else:
+            return None
+
+    def _find_next_slice(self) -> StreamSlice:
+        """
+        _find_next_slice() returns the next slice of data that should be synced for the current stream according to its cursor.
+        This function supports iterating over a stream's slices across two dimensions. The first dimension is the stream's
+        partitions, like parent records for a substream. The inner dimension iterates over the cursor value, like a date
+        range for incremental streams or a pagination checkpoint for resumable full refresh.
+
+        The basic algorithm for iterating through a stream's slices is:
+        1. The first time next() is invoked, get the first partition.
+        2. If the current partition is already complete as a result of a previous sync attempt, continue iterating until
+           we find an un-synced partition.
+        3. For streams whose cursor value is determined dynamically using stream state:
+           1. Get the state for the current partition.
+           2. If the current partition's state is complete, continue iterating over partitions.
+           3. If the current partition's state is still in progress, emit the next cursor value.
+           4. If the current partition is complete as delineated by the sentinel value, get the next incomplete partition.
+        4. When the stream has processed all partitions, the iterator raises a StopIteration exception signaling there are no
+           more slices left for extracting records.
+        """
+
+        if self._read_state_from_cursor:
+            if self.current_slice is None:
+                # current_slice being None represents the first time we are iterating over a stream's slices. The first slice to
+                # sync has not been assigned yet and must first be read from the iterator
+                next_slice = self.read_and_convert_slice()
+                state_for_slice = self._cursor.select_state(next_slice)
+                if state_for_slice == FULL_REFRESH_COMPLETE_STATE:
+                    # Skip every slice that already has the terminal complete value indicating that a previous attempt
+                    # successfully synced the slice
+                    has_more = True
+                    while has_more:
+                        next_slice = self.read_and_convert_slice()
+                        state_for_slice = self._cursor.select_state(next_slice)
+                        has_more = state_for_slice == FULL_REFRESH_COMPLETE_STATE
+                return StreamSlice(cursor_slice=state_for_slice or {}, partition=next_slice.partition, extra_fields=next_slice.extra_fields)
+            else:
+                state_for_slice = self._cursor.select_state(self.current_slice)
+                if state_for_slice == FULL_REFRESH_COMPLETE_STATE:
+                    # If the current slice is complete, move to the next slice and skip any slices that already
+                    # have the terminal complete value indicating that they were successfully synced by a previous attempt.
+ # Dummy initialization for mypy since we'll iterate at least once to get the next slice + next_candidate_slice = StreamSlice(cursor_slice={}, partition={}) + has_more = True + while has_more: + next_candidate_slice = self.read_and_convert_slice() + state_for_slice = self._cursor.select_state(next_candidate_slice) + has_more = state_for_slice == FULL_REFRESH_COMPLETE_STATE + return StreamSlice( + cursor_slice=state_for_slice or {}, + partition=next_candidate_slice.partition, + extra_fields=next_candidate_slice.extra_fields, + ) + # The reader continues to process the current partition if it's state is still in progress + return StreamSlice( + cursor_slice=state_for_slice or {}, partition=self.current_slice.partition, extra_fields=self.current_slice.extra_fields + ) + else: + # Unlike RFR cursors that iterate dynamically according to how stream state is updated, most cursors operate + # on a fixed set of slices determined before reading records. They just iterate to the next slice + return self.read_and_convert_slice() + + @property + def current_slice(self) -> Optional[StreamSlice]: + return self._current_slice + + @current_slice.setter + def current_slice(self, value: StreamSlice) -> None: + self._current_slice = value + + def read_and_convert_slice(self) -> StreamSlice: + next_slice = next(self._stream_slices) + if not isinstance(next_slice, StreamSlice): + raise ValueError( + f"{self.current_slice} should be of type StreamSlice. This is likely a bug in the CDK, please contact Airbyte support" + ) + return next_slice + + +class LegacyCursorBasedCheckpointReader(CursorBasedCheckpointReader): + """ + This (unfortunate) class operates like an adapter to retain backwards compatibility with legacy sources that take in stream_slice + in the form of a Mapping instead of the StreamSlice object. Internally, the reader still operates over StreamSlices, but it + is instantiated with and emits stream slices in the form of a Mapping[str, Any]. The logic of how partitions and cursors + are iterated over is synonymous with CursorBasedCheckpointReader. + + We also retain the existing top level fields defined by the connector so the fields are present on dependent methods. For example, + the resulting mapping structure passed back to the stream's read_records() method looks like: + { + "cursor_slice": { + "next_page_token": 10 + }, + "partition": { + "repository": "airbytehq/airbyte" + }, + "next_page_token": 10, + "repository": "airbytehq/airbyte" + } + """ + + def __init__(self, cursor: Cursor, stream_slices: Iterable[Optional[Mapping[str, Any]]], read_state_from_cursor: bool = False): + super().__init__(cursor=cursor, stream_slices=stream_slices, read_state_from_cursor=read_state_from_cursor) + + def next(self) -> Optional[Mapping[str, Any]]: + try: + self.current_slice = self._find_next_slice() + + if "partition" in dict(self.current_slice): + raise ValueError("Stream is configured to use invalid stream slice key 'partition'") + elif "cursor_slice" in dict(self.current_slice): + raise ValueError("Stream is configured to use invalid stream slice key 'cursor_slice'") + + # We convert StreamSlice to a regular mapping because legacy connectors operate on the basic Mapping object. 
We + # also duplicate all fields at the top level for backwards compatibility for existing Python sources + return { + "partition": self.current_slice.partition, + "cursor_slice": self.current_slice.cursor_slice, + **dict(self.current_slice), + } + except StopIteration: + self._finished_sync = True + return None + + def read_and_convert_slice(self) -> StreamSlice: + next_mapping_slice = next(self._stream_slices) + if not isinstance(next_mapping_slice, Mapping): + raise ValueError( + f"{self.current_slice} should be of type Mapping. This is likely a bug in the CDK, please contact Airbyte support" + ) + + # The legacy reader is instantiated with an iterable of stream slice mappings. We convert each into a StreamSlice + # to sanely process them during the sync and to reuse the existing Python defined cursors + return StreamSlice( + partition=next_mapping_slice, + cursor_slice={}, + ) + + +class ResumableFullRefreshCheckpointReader(CheckpointReader): + """ + ResumableFullRefreshCheckpointReader allows for iteration over an unbounded set of records based on the pagination strategy + of the stream. Because the number of pages is unknown, the stream's current state is used to determine whether to continue + fetching more pages or stopping the sync. + """ + + def __init__(self, stream_state: Mapping[str, Any]): + # The first attempt of an RFR stream has an empty {} incoming state, but should still make a first attempt to read records + # from the first page in next(). + self._first_page = bool(stream_state == {}) + self._state: Mapping[str, Any] = stream_state + + def next(self) -> Optional[Mapping[str, Any]]: + if self._first_page: + self._first_page = False + return self._state + elif self._state == FULL_REFRESH_COMPLETE_STATE: + return None + else: + return self._state + + def observe(self, new_state: Mapping[str, Any]) -> None: + self._state = new_state + + def get_checkpoint(self) -> Optional[Mapping[str, Any]]: + return self._state or {} + + +class FullRefreshCheckpointReader(CheckpointReader): + """ + FullRefreshCheckpointReader iterates over data that cannot be checkpointed incrementally during the sync because the stream + is not capable of managing state. At the end of a sync, a final state message is emitted to signal completion. + """ + + def __init__(self, stream_slices: Iterable[Optional[Mapping[str, Any]]]): + self._stream_slices = iter(stream_slices) + self._final_checkpoint = False + + def next(self) -> Optional[Mapping[str, Any]]: + try: + return next(self._stream_slices) + except StopIteration: + self._final_checkpoint = True + return None + + def observe(self, new_state: Mapping[str, Any]) -> None: + pass + + def get_checkpoint(self) -> Optional[Mapping[str, Any]]: + if self._final_checkpoint: + return {"__ab_no_cursor_state_message": True} + return None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/cursor.py new file mode 100644 index 000000000000..6d758bf4edb8 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/cursor.py @@ -0,0 +1,77 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from typing import Any, Optional + +from airbyte_cdk.sources.types import Record, StreamSlice, StreamState + + +class Cursor(ABC): + """ + Cursors are components that allow for checkpointing the current state of a sync. 
They keep track of what data has been consumed + and allows for syncs to be resumed from a specific point based on that information. + """ + + @abstractmethod + def set_initial_state(self, stream_state: StreamState) -> None: + """ + Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called + before calling anything else + + :param stream_state: The state of the stream as returned by get_stream_state + """ + + def observe(self, stream_slice: StreamSlice, record: Record) -> None: + """ + Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read. + + :param stream_slice: The current slice, which may or may not contain the most recently observed record + :param record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the + stream state may need to be deferred depending on whether the source reliably orders records by the cursor field. + """ + pass + + @abstractmethod + def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: + """ + Update state based on the stream slice. Note that `stream_slice.cursor_slice` and `most_recent_record.associated_slice` are expected + to be the same but we make it explicit here that `stream_slice` should be leveraged to update the state. We do not pass in the + latest record, since cursor instances should maintain the relevant internal state on their own. + + :param stream_slice: slice to close + """ + + @abstractmethod + def get_stream_state(self) -> StreamState: + """ + Returns the current stream state. We would like to restrict it's usage since it does expose internal of state. As of 2023-06-14, it + is used for two things: + * Interpolation of the requests + * Transformation of records + * Saving the state + + For the first case, we are probably stuck with exposing the stream state. For the second, we can probably expose a method that + allows for emitting the state to the platform. + """ + + @abstractmethod + def should_be_synced(self, record: Record) -> bool: + """ + Evaluating if a record should be synced allows for filtering and stop condition on pagination + """ + + @abstractmethod + def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: + """ + Evaluating which record is greater in terms of cursor. This is used to avoid having to capture all the records to close a slice + """ + + @abstractmethod + def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: + """ + Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in + a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of + a specific parent delineated by the incoming slice's partition object. + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py new file mode 100644 index 000000000000..e0dee4a92b89 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
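+
+# Illustrative note (added for clarity; the example values are not from the original module): a partition mapping is
+# serialized to a canonical JSON string so that it can be used as a dictionary key, e.g.
+#   PerPartitionKeySerializer.to_partition_key({"b": 1, "a": 2})  ->  '{"a":2,"b":1}'
+# and to_partition() reverses the operation.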
+
+import json
+from typing import Any, Mapping
+
+
+class PerPartitionKeySerializer:
+    """
+    We are concerned about the performance of looping through the `states` list and evaluating equality on the partition. To reduce
+    this concern, we wanted to use dictionaries to map `partition -> cursor`. However, partitions are dicts, and dicts can't be used
+    as dictionary keys since they are not hashable. By serializing the partition to a canonical JSON string, we can use that string
+    as the dictionary key, since strings are hashable.
+    """
+
+    @staticmethod
+    def to_partition_key(to_serialize: Any) -> str:
+        # separators changed in Python 3.4. To avoid being impacted by further changes, we explicitly specify our own values
+        return json.dumps(to_serialize, indent=None, separators=(",", ":"), sort_keys=True)
+
+    @staticmethod
+    def to_partition(to_deserialize: Any) -> Mapping[str, Any]:
+        return json.loads(to_deserialize)  # type: ignore # The partition is known to be a dict, but the type hint is Any
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py
new file mode 100644
index 000000000000..86abd253f137
--- /dev/null
+++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+
+from dataclasses import dataclass
+from typing import Any, Optional
+
+from airbyte_cdk.sources.streams.checkpoint import Cursor
+from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
+
+
+@dataclass
+class ResumableFullRefreshCursor(Cursor):
+    """
+    Cursor that allows for the checkpointing of sync progress according to a synthetic cursor based on the pagination state
+    of the stream. Resumable full refresh syncs are only intended to retain state in between sync attempts of the same job,
+    with the platform responsible for removing said state.
+    """
+
+    def __init__(self) -> None:
+        self._cursor: StreamState = {}
+
+    def get_stream_state(self) -> StreamState:
+        return self._cursor
+
+    def set_initial_state(self, stream_state: StreamState) -> None:
+        self._cursor = stream_state
+
+    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
+        """
+        Resumable full refresh manages state using a page number, so it does not need to update state by observing incoming records.
+        """
+        pass
+
+    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
+        self._cursor = stream_slice.cursor_slice
+
+    def should_be_synced(self, record: Record) -> bool:
+        """
+        Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages
+        that don't have filterable bounds. We should always return them.
+        """
+        return True
+
+    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
+        """
+        RFR records don't have an ordering that can be compared between one another.
+ """ + return False + + def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: + # A top-level RFR cursor only manages the state of a single partition + return self._cursor diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py new file mode 100644 index 000000000000..8ebadcafbd19 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py @@ -0,0 +1,106 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from dataclasses import dataclass +from typing import Any, Mapping, MutableMapping, Optional + +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.streams.checkpoint import Cursor +from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import PerPartitionKeySerializer +from airbyte_cdk.sources.types import Record, StreamSlice, StreamState +from airbyte_cdk.utils import AirbyteTracedException + +FULL_REFRESH_COMPLETE_STATE: Mapping[str, Any] = {"__ab_full_refresh_sync_complete": True} + + +@dataclass +class SubstreamResumableFullRefreshCursor(Cursor): + def __init__(self) -> None: + self._per_partition_state: MutableMapping[str, StreamState] = {} + self._partition_serializer = PerPartitionKeySerializer() + + def get_stream_state(self) -> StreamState: + return {"states": list(self._per_partition_state.values())} + + def set_initial_state(self, stream_state: StreamState) -> None: + """ + Set the initial state for the cursors. + + This method initializes the state for each partition cursor using the provided stream state. + If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state. + + To simplify processing and state management, we do not maintain the checkpointed state of the parent partitions. + Instead, we are tracking whether a parent has already successfully synced on a prior attempt and skipping over it + allowing the sync to continue making progress. And this works for RFR because the platform will dispose of this + state on the next sync job. + + Args: + stream_state (StreamState): The state of the streams to be set. The format of the stream state should be: + { + "states": [ + { + "partition": { + "partition_key": "value_0" + }, + "cursor": { + "__ab_full_refresh_sync_complete": True + } + }, + { + "partition": { + "partition_key": "value_1" + }, + "cursor": {}, + }, + ] + } + """ + if not stream_state: + return + + if "states" not in stream_state: + raise AirbyteTracedException( + internal_message=f"Could not sync parse the following state: {stream_state}", + message="The state for is format invalid. Validate that the migration steps included a reset and that it was performed " + "properly. Otherwise, please contact Airbyte support.", + failure_type=FailureType.config_error, + ) + + for state in stream_state["states"]: + self._per_partition_state[self._to_partition_key(state["partition"])] = state + + def observe(self, stream_slice: StreamSlice, record: Record) -> None: + """ + Substream resumable full refresh manages state by closing the slice after syncing a parent so observe is not used. 
+ """ + pass + + def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: + self._per_partition_state[self._to_partition_key(stream_slice.partition)] = { + "partition": stream_slice.partition, + "cursor": FULL_REFRESH_COMPLETE_STATE, + } + + def should_be_synced(self, record: Record) -> bool: + """ + Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages + that don't have filterable bounds. We should always return them. + """ + return True + + def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: + """ + RFR record don't have ordering to be compared between one another. + """ + return False + + def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: + if not stream_slice: + raise ValueError("A partition needs to be provided in order to extract a state") + + return self._per_partition_state.get(self._to_partition_key(stream_slice.partition), {}).get("cursor") + + def _to_partition_key(self, partition: Mapping[str, Any]) -> str: + return self._partition_serializer.to_partition_key(partition) + + def _to_dict(self, partition_key: str) -> Mapping[str, Any]: + return self._partition_serializer.to_partition(partition_key) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/README.md b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/README.md new file mode 100644 index 000000000000..6970c3acd05f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/README.md @@ -0,0 +1,7 @@ +## Breaking Changes & Limitations + +- [bigger scope than Concurrent CDK] checkpointing state was acting on the number of records per slice. This has been changed to consider the number of records per syncs +- `Source.read_state` and `Source._emit_legacy_state_format` are now classmethods to allow for developers to have access to the state before instantiating the source +- send_per_stream_state is always True for Concurrent CDK +- Using stream_state during read_records: The concern is that today, stream_instance.get_updated_state is called on every record and read_records on every slice. The implication is that the argument stream_state passed to read_records will have the value after the last stream_instance.get_updated_state of the previous slice. For Concurrent CDK, this is not possible as slices are processed in an unordered way. +- Cursor fields can only be data-time formatted as epoch. Eventually, we want to move to ISO 8601 as it provides more flexibility but for the first iteration on Stripe, it was easier to use the same format that was already used diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/__init__.py new file mode 100644 index 000000000000..c941b3045795 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/abstract_stream.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/abstract_stream.py new file mode 100644 index 000000000000..da99ae10bd83 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/abstract_stream.py @@ -0,0 +1,92 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from abc import ABC, abstractmethod +from typing import Any, Iterable, Mapping, Optional + +from airbyte_cdk.models import AirbyteStream +from airbyte_cdk.sources.source import ExperimentalClassWarning +from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability +from airbyte_cdk.sources.streams.concurrent.cursor import Cursor +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from deprecated.classic import deprecated + + +@deprecated("This class is experimental. Use at your own risk.", category=ExperimentalClassWarning) +class AbstractStream(ABC): + """ + AbstractStream is an experimental interface for streams developed as part of the Concurrent CDK. + This interface is not yet stable and may change in the future. Use at your own risk. + + Why create a new interface instead of adding concurrency capabilities the existing Stream? + We learnt a lot since the initial design of the Stream interface, and we wanted to take the opportunity to improve. + + High level, the changes we are targeting are: + - Removing superfluous or leaky parameters from the methods' interfaces + - Using composition instead of inheritance to add new capabilities + + To allow us to iterate fast while ensuring backwards compatibility, we are creating a new interface with a facade object that will bridge the old and the new interfaces. + Source connectors that wish to leverage concurrency need to implement this new interface. An example will be available shortly + + Current restrictions on sources that implement this interface. Not all of these restrictions will be lifted in the future, but most will as we iterate on the design. + - Only full refresh is supported. This will be addressed in the future. + - The read method does not accept a cursor_field. Streams must be internally aware of the cursor field to use. User-defined cursor fields can be implemented by modifying the connector's main method to instantiate the streams with the configured cursor field. + - Streams cannot return user-friendly messages by overriding Stream.get_error_display_message. This will be addressed in the future. + - The Stream's behavior cannot depend on a namespace + - TypeTransformer is not supported. This will be addressed in the future. + - Nested cursor and primary keys are not supported + """ + + @abstractmethod + def generate_partitions(self) -> Iterable[Partition]: + """ + Generates the partitions that will be read by this stream. + :return: An iterable of partitions. + """ + + @property + @abstractmethod + def name(self) -> str: + """ + :return: The stream name + """ + + @property + @abstractmethod + def cursor_field(self) -> Optional[str]: + """ + Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field. + :return: The name of the field used as a cursor. Nested cursor fields are not supported. + """ + + @abstractmethod + def check_availability(self) -> StreamAvailability: + """ + :return: The stream's availability + """ + + @abstractmethod + def get_json_schema(self) -> Mapping[str, Any]: + """ + :return: A dict of the JSON schema representing this stream. + """ + + @abstractmethod + def as_airbyte_stream(self) -> AirbyteStream: + """ + :return: A dict of the JSON schema representing this stream. + """ + + @abstractmethod + def log_stream_sync_configuration(self) -> None: + """ + Logs the stream's configuration for debugging purposes. 
+ """ + + @property + @abstractmethod + def cursor(self) -> Cursor: + """ + :return: The cursor associated with this stream. + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py new file mode 100644 index 000000000000..18cacbc500d5 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py @@ -0,0 +1,37 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + +from abc import ABC, abstractmethod +from typing import Generic, Optional, TypeVar + +from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage + +StreamType = TypeVar("StreamType") + + +class AbstractStreamFacade(Generic[StreamType], ABC): + @abstractmethod + def get_underlying_stream(self) -> StreamType: + """ + Return the underlying stream facade object. + """ + ... + + @property + def source_defined_cursor(self) -> bool: + # Streams must be aware of their cursor at instantiation time + return True + + def get_error_display_message(self, exception: BaseException) -> Optional[str]: + """ + Retrieves the user-friendly display message that corresponds to an exception. + This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage. + + A display message will be returned if the exception is an instance of ExceptionWithDisplayMessage. + + :param exception: The exception that was raised + :return: A user-friendly message that indicates the cause of the error + """ + if isinstance(exception, ExceptionWithDisplayMessage): + return exception.display_message + else: + return None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/adapters.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/adapters.py new file mode 100644 index 000000000000..5fc775a1a1e1 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/adapters.py @@ -0,0 +1,425 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import copy +import json +import logging +from functools import lru_cache +from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union + +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, ConfiguredAirbyteStream, Level, SyncMode, Type +from airbyte_cdk.sources import AbstractSource, Source +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.source import ExperimentalClassWarning +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy +from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade +from airbyte_cdk.sources.streams.concurrent.availability_strategy import AbstractAvailabilityStrategy, AlwaysAvailableAvailabilityStrategy +from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, FinalStateCursor +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream +from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage +from airbyte_cdk.sources.streams.concurrent.helpers import get_cursor_field_from_stream, get_primary_key_from_stream +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record +from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import DateTimeStreamStateConverter +from airbyte_cdk.sources.streams.core import StreamData +from airbyte_cdk.sources.types import StreamSlice +from airbyte_cdk.sources.utils.schema_helpers import InternalConfig +from airbyte_cdk.sources.utils.slice_logger import SliceLogger +from deprecated.classic import deprecated + +""" +This module contains adapters to help enabling concurrency on Stream objects without needing to migrate to AbstractStream +""" + + +@deprecated("This class is experimental. Use at your own risk.", category=ExperimentalClassWarning) +class StreamFacade(AbstractStreamFacade[DefaultStream], Stream): + """ + The StreamFacade is a Stream that wraps an AbstractStream and exposes it as a Stream. + + All methods either delegate to the wrapped AbstractStream or provide a default implementation. + The default implementations define restrictions imposed on Streams migrated to the new interface. For instance, only source-defined cursors are supported. + """ + + @classmethod + def create_from_stream( + cls, + stream: Stream, + source: AbstractSource, + logger: logging.Logger, + state: Optional[MutableMapping[str, Any]], + cursor: Cursor, + ) -> Stream: + """ + Create a ConcurrentStream from a Stream object. + :param source: The source + :param stream: The stream + :param max_workers: The maximum number of worker thread to use + :return: + """ + pk = get_primary_key_from_stream(stream.primary_key) + cursor_field = get_cursor_field_from_stream(stream) + + if not source.message_repository: + raise ValueError( + "A message repository is required to emit non-record messages. Please set the message repository on the source." 
+ ) + + message_repository = source.message_repository + return StreamFacade( + DefaultStream( + partition_generator=StreamPartitionGenerator( + stream, + message_repository, + SyncMode.full_refresh if isinstance(cursor, FinalStateCursor) else SyncMode.incremental, + [cursor_field] if cursor_field is not None else None, + state, + cursor, + ), + name=stream.name, + namespace=stream.namespace, + json_schema=stream.get_json_schema(), + availability_strategy=AlwaysAvailableAvailabilityStrategy(), + primary_key=pk, + cursor_field=cursor_field, + logger=logger, + cursor=cursor, + ), + stream, + cursor, + slice_logger=source._slice_logger, + logger=logger, + ) + + @property + def state(self) -> MutableMapping[str, Any]: + raise NotImplementedError("This should not be called as part of the Concurrent CDK code. Please report the problem to Airbyte") + + @state.setter + def state(self, value: Mapping[str, Any]) -> None: + if "state" in dir(self._legacy_stream): + self._legacy_stream.state = value # type: ignore # validating `state` is attribute of stream using `if` above + + def __init__(self, stream: DefaultStream, legacy_stream: Stream, cursor: Cursor, slice_logger: SliceLogger, logger: logging.Logger): + """ + :param stream: The underlying AbstractStream + """ + self._abstract_stream = stream + self._legacy_stream = legacy_stream + self._cursor = cursor + self._slice_logger = slice_logger + self._logger = logger + + def read( + self, + configured_stream: ConfiguredAirbyteStream, + logger: logging.Logger, + slice_logger: SliceLogger, + stream_state: MutableMapping[str, Any], + state_manager: ConnectorStateManager, + internal_config: InternalConfig, + ) -> Iterable[StreamData]: + yield from self._read_records() + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + try: + yield from self._read_records() + except Exception as exc: + if hasattr(self._cursor, "state"): + state = str(self._cursor.state) + else: + # This shouldn't happen if the ConcurrentCursor was used + state = "unknown; no state attribute was available on the cursor" + yield AirbyteMessage( + type=Type.LOG, log=AirbyteLogMessage(level=Level.ERROR, message=f"Cursor State at time of exception: {state}") + ) + raise exc + + def _read_records(self) -> Iterable[StreamData]: + for partition in self._abstract_stream.generate_partitions(): + if self._slice_logger.should_log_slice_message(self._logger): + yield self._slice_logger.create_slice_log_message(partition.to_slice()) + for record in partition.read(): + yield record.data + + @property + def name(self) -> str: + return self._abstract_stream.name + + @property + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + # This method is not expected to be called directly. 
It is only implemented for backward compatibility with the old interface + return self.as_airbyte_stream().source_defined_primary_key # type: ignore # source_defined_primary_key is known to be an Optional[List[List[str]]] + + @property + def cursor_field(self) -> Union[str, List[str]]: + if self._abstract_stream.cursor_field is None: + return [] + else: + return self._abstract_stream.cursor_field + + @property + def cursor(self) -> Optional[Cursor]: # type: ignore[override] # StreamFaced expects to use only airbyte_cdk.sources.streams.concurrent.cursor.Cursor + return self._cursor + + @lru_cache(maxsize=None) + def get_json_schema(self) -> Mapping[str, Any]: + return self._abstract_stream.get_json_schema() + + @property + def supports_incremental(self) -> bool: + return self._legacy_stream.supports_incremental + + def check_availability(self, logger: logging.Logger, source: Optional["Source"] = None) -> Tuple[bool, Optional[str]]: + """ + Verifies the stream is available. Delegates to the underlying AbstractStream and ignores the parameters + :param logger: (ignored) + :param source: (ignored) + :return: + """ + availability = self._abstract_stream.check_availability() + return availability.is_available(), availability.message() + + def as_airbyte_stream(self) -> AirbyteStream: + return self._abstract_stream.as_airbyte_stream() + + def log_stream_sync_configuration(self) -> None: + self._abstract_stream.log_stream_sync_configuration() + + def get_underlying_stream(self) -> DefaultStream: + return self._abstract_stream + + +class SliceEncoder(json.JSONEncoder): + def default(self, obj: Any) -> Any: + if hasattr(obj, "__json_serializable__"): + return obj.__json_serializable__() + + # Let the base class default method raise the TypeError + return super().default(obj) + + +class StreamPartition(Partition): + """ + This class acts as an adapter between the new Partition interface and the Stream's stream_slice interface + + StreamPartitions are instantiated from a Stream and a stream_slice. + + This class can be used to help enable concurrency on existing connectors without having to rewrite everything as AbstractStream. + In the long-run, it would be preferable to update the connectors, but we don't have the tooling or need to justify the effort at this time. + """ + + def __init__( + self, + stream: Stream, + _slice: Optional[Mapping[str, Any]], + message_repository: MessageRepository, + sync_mode: SyncMode, + cursor_field: Optional[List[str]], + state: Optional[MutableMapping[str, Any]], + cursor: Cursor, + ): + """ + :param stream: The stream to delegate to + :param _slice: The partition's stream_slice + :param message_repository: The message repository to use to emit non-record messages + """ + self._stream = stream + self._slice = _slice + self._message_repository = message_repository + self._sync_mode = sync_mode + self._cursor_field = cursor_field + self._state = state + self._cursor = cursor + self._is_closed = False + + def read(self) -> Iterable[Record]: + """ + Read messages from the stream. + If the StreamData is a Mapping, it will be converted to a Record. + Otherwise, the message will be emitted on the message repository. + """ + try: + # using `stream_state=self._state` have a very different behavior than the current one as today the state is updated slice + # by slice incrementally. We don't have this guarantee with Concurrent CDK. 
For HttpStream, `stream_state` is passed to: + # * fetch_next_page + # * parse_response + # Both are not used for Stripe so we should be good for the first iteration of Concurrent CDK. However, Stripe still do + # `if not stream_state` to know if it calls the Event stream or not + for record_data in self._stream.read_records( + cursor_field=self._cursor_field, + sync_mode=SyncMode.full_refresh, + stream_slice=copy.deepcopy(self._slice), + stream_state=self._state, + ): + if isinstance(record_data, Mapping): + data_to_return = dict(record_data) + self._stream.transformer.transform(data_to_return, self._stream.get_json_schema()) + yield Record(data_to_return, self) + else: + self._message_repository.emit_message(record_data) + except Exception as e: + display_message = self._stream.get_error_display_message(e) + if display_message: + raise ExceptionWithDisplayMessage(display_message) from e + else: + raise e + + def to_slice(self) -> Optional[Mapping[str, Any]]: + return self._slice + + def __hash__(self) -> int: + if self._slice: + # Convert the slice to a string so that it can be hashed + s = json.dumps(self._slice, sort_keys=True, cls=SliceEncoder) + return hash((self._stream.name, s)) + else: + return hash(self._stream.name) + + def stream_name(self) -> str: + return self._stream.name + + def close(self) -> None: + self._cursor.close_partition(self) + self._is_closed = True + + def is_closed(self) -> bool: + return self._is_closed + + def __repr__(self) -> str: + return f"StreamPartition({self._stream.name}, {self._slice})" + + +class StreamPartitionGenerator(PartitionGenerator): + """ + This class acts as an adapter between the new PartitionGenerator and Stream.stream_slices + + This class can be used to help enable concurrency on existing connectors without having to rewrite everything as AbstractStream. + In the long-run, it would be preferable to update the connectors, but we don't have the tooling or need to justify the effort at this time. + """ + + def __init__( + self, + stream: Stream, + message_repository: MessageRepository, + sync_mode: SyncMode, + cursor_field: Optional[List[str]], + state: Optional[MutableMapping[str, Any]], + cursor: Cursor, + ): + """ + :param stream: The stream to delegate to + :param message_repository: The message repository to use to emit non-record messages + """ + self.message_repository = message_repository + self._stream = stream + self._sync_mode = sync_mode + self._cursor_field = cursor_field + self._state = state + self._cursor = cursor + + def generate(self) -> Iterable[Partition]: + for s in self._stream.stream_slices(sync_mode=self._sync_mode, cursor_field=self._cursor_field, stream_state=self._state): + yield StreamPartition( + self._stream, copy.deepcopy(s), self.message_repository, self._sync_mode, self._cursor_field, self._state, self._cursor + ) + + +class CursorPartitionGenerator(PartitionGenerator): + """ + This class generates partitions using the concurrent cursor and iterates through state slices to generate partitions. + + It is used when synchronizing a stream in incremental or full-refresh mode where state information is maintained + across partitions. Each partition represents a subset of the stream's data and is determined by the cursor's state. 
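+
+    Illustrative example (assumes a datetime-based cursor and the default boundary names; not a guarantee of this class):
+    each generated partition wraps a slice such as
+
+        StreamSlice(partition={}, cursor_slice={"start": "2024-01-01T00:00:00Z", "end": "2024-01-02T00:00:00Z"})
+
+    where the keys come from slice_boundary_fields (falling back to "start"/"end") and the values are produced by the
+    connector_state_converter's output_format().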
+ """ + + _START_BOUNDARY = 0 + _END_BOUNDARY = 1 + + def __init__( + self, + stream: Stream, + message_repository: MessageRepository, + cursor: Cursor, + connector_state_converter: DateTimeStreamStateConverter, + cursor_field: Optional[List[str]], + slice_boundary_fields: Optional[Tuple[str, str]], + ): + """ + Initialize the CursorPartitionGenerator with a stream, sync mode, and cursor. + + :param stream: The stream to delegate to for partition generation. + :param message_repository: The message repository to use to emit non-record messages. + :param sync_mode: The synchronization mode. + :param cursor: A Cursor object that maintains the state and the cursor field. + """ + self._stream = stream + self.message_repository = message_repository + self._sync_mode = SyncMode.full_refresh + self._cursor = cursor + self._cursor_field = cursor_field + self._state = self._cursor.state + self._slice_boundary_fields = slice_boundary_fields + self._connector_state_converter = connector_state_converter + + def generate(self) -> Iterable[Partition]: + """ + Generate partitions based on the slices in the cursor's state. + + This method iterates through the list of slices found in the cursor's state, and for each slice, it generates + a `StreamPartition` object. + + :return: An iterable of StreamPartition objects. + """ + + start_boundary = self._slice_boundary_fields[self._START_BOUNDARY] if self._slice_boundary_fields else "start" + end_boundary = self._slice_boundary_fields[self._END_BOUNDARY] if self._slice_boundary_fields else "end" + + for slice_start, slice_end in self._cursor.generate_slices(): + stream_slice = StreamSlice( + partition={}, + cursor_slice={ + start_boundary: self._connector_state_converter.output_format(slice_start), + end_boundary: self._connector_state_converter.output_format(slice_end), + }, + ) + + yield StreamPartition( + self._stream, + copy.deepcopy(stream_slice), + self.message_repository, + self._sync_mode, + self._cursor_field, + self._state, + self._cursor, + ) + + +@deprecated("Availability strategy has been soft deprecated. Do not use. Class is subject to removal", category=ExperimentalClassWarning) +class AvailabilityStrategyFacade(AvailabilityStrategy): + def __init__(self, abstract_availability_strategy: AbstractAvailabilityStrategy): + self._abstract_availability_strategy = abstract_availability_strategy + + def check_availability(self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None) -> Tuple[bool, Optional[str]]: + """ + Checks stream availability. + + Important to note that the stream and source parameters are not used by the underlying AbstractAvailabilityStrategy. + + :param stream: (unused) + :param logger: logger object to use + :param source: (unused) + :return: A tuple of (boolean, str). If boolean is true, then the stream + """ + stream_availability = self._abstract_availability_strategy.check_availability(logger) + return stream_availability.is_available(), stream_availability.message() diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/availability_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/availability_strategy.py new file mode 100644 index 000000000000..098b24cef17d --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/availability_strategy.py @@ -0,0 +1,87 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import logging +from abc import ABC, abstractmethod +from typing import Optional + +from airbyte_cdk.sources.source import ExperimentalClassWarning +from deprecated.classic import deprecated + + +class StreamAvailability(ABC): + @abstractmethod + def is_available(self) -> bool: + """ + :return: True if the stream is available. False if the stream is not + """ + + @abstractmethod + def message(self) -> Optional[str]: + """ + :return: A message describing why the stream is not available. If the stream is available, this should return None. + """ + + +class StreamAvailable(StreamAvailability): + def is_available(self) -> bool: + return True + + def message(self) -> Optional[str]: + return None + + +class StreamUnavailable(StreamAvailability): + def __init__(self, message: str): + self._message = message + + def is_available(self) -> bool: + return False + + def message(self) -> Optional[str]: + return self._message + + +# Singleton instances of StreamAvailability to avoid the overhead of creating new dummy objects +STREAM_AVAILABLE = StreamAvailable() + + +@deprecated("This class is experimental. Use at your own risk.", category=ExperimentalClassWarning) +class AbstractAvailabilityStrategy(ABC): + """ + AbstractAvailabilityStrategy is an experimental interface developed as part of the Concurrent CDK. + This interface is not yet stable and may change in the future. Use at your own risk. + + Why create a new interface instead of using the existing AvailabilityStrategy? + The existing AvailabilityStrategy is tightly coupled with Stream and Source, which yields to circular dependencies and makes it difficult to move away from the Stream interface to AbstractStream. + """ + + @abstractmethod + def check_availability(self, logger: logging.Logger) -> StreamAvailability: + """ + Checks stream availability. + + :param logger: logger object to use + :return: A StreamAvailability object describing the stream's availability + """ + + +@deprecated("This class is experimental. Use at your own risk.", category=ExperimentalClassWarning) +class AlwaysAvailableAvailabilityStrategy(AbstractAvailabilityStrategy): + """ + An availability strategy that always indicates a stream is available. + + This strategy is used to avoid breaking changes and serves as a soft + deprecation of the availability strategy, allowing a smoother transition + without disrupting existing functionality. + """ + + def check_availability(self, logger: logging.Logger) -> StreamAvailability: + """ + Checks stream availability. + + :param logger: logger object to use + :return: A StreamAvailability object describing the stream's availability + """ + return StreamAvailable() diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/cursor.py new file mode 100644 index 000000000000..e212693b0270 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/cursor.py @@ -0,0 +1,375 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
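+#
+# Illustrative note (added for clarity; the field name below is only an example): the cursors in this module are keyed
+# off a single cursor field, e.g. CursorField("updated_at").extract_value(record) returns record.data["updated_at"]
+# and raises a ValueError when the field is missing from the record.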
+# + +import functools +from abc import ABC, abstractmethod +from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional, Protocol, Tuple + +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record +from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import AbstractStreamStateConverter + + +def _extract_value(mapping: Mapping[str, Any], path: List[str]) -> Any: + return functools.reduce(lambda a, b: a[b], path, mapping) + + +class GapType(Protocol): + """ + This is the representation of gaps between two cursor values. Examples: + * if cursor values are datetimes, GapType is timedelta + * if cursor values are integer, GapType will also be integer + """ + + pass + + +class CursorValueType(Protocol): + """Protocol for annotating comparable types.""" + + @abstractmethod + def __lt__(self: "CursorValueType", other: "CursorValueType") -> bool: + pass + + @abstractmethod + def __ge__(self: "CursorValueType", other: "CursorValueType") -> bool: + pass + + @abstractmethod + def __add__(self: "CursorValueType", other: GapType) -> "CursorValueType": + pass + + @abstractmethod + def __sub__(self: "CursorValueType", other: GapType) -> "CursorValueType": + pass + + +class CursorField: + def __init__(self, cursor_field_key: str) -> None: + self.cursor_field_key = cursor_field_key + + def extract_value(self, record: Record) -> CursorValueType: + cursor_value = record.data.get(self.cursor_field_key) + if cursor_value is None: + raise ValueError(f"Could not find cursor field {self.cursor_field_key} in record") + return cursor_value # type: ignore # we assume that the value the path points at is a comparable + + +class Cursor(ABC): + @property + @abstractmethod + def state(self) -> MutableMapping[str, Any]: + ... + + @abstractmethod + def observe(self, record: Record) -> None: + """ + Indicate to the cursor that the record has been emitted + """ + raise NotImplementedError() + + @abstractmethod + def close_partition(self, partition: Partition) -> None: + """ + Indicate to the cursor that the partition has been successfully processed + """ + raise NotImplementedError() + + @abstractmethod + def ensure_at_least_one_state_emitted(self) -> None: + """ + State messages are emitted when a partition is closed. However, the platform expects at least one state to be emitted per sync per + stream. Hence, if no partitions are generated, this method needs to be called. + """ + raise NotImplementedError() + + def generate_slices(self) -> Iterable[Tuple[Any, Any]]: + """ + Default placeholder implementation of generate_slices. + Subclasses can override this method to provide actual behavior. + """ + yield from () + + +class FinalStateCursor(Cursor): + """Cursor that is used to guarantee at least one state message is emitted for a concurrent stream.""" + + def __init__( + self, + stream_name: str, + stream_namespace: Optional[str], + message_repository: MessageRepository, + ) -> None: + self._stream_name = stream_name + self._stream_namespace = stream_namespace + self._message_repository = message_repository + # Normally the connector state manager operates at the source-level. 
However, we only need it to write the sentinel + # state message rather than manage overall source state. This is also only temporary as we move to the resumable + # full refresh world where every stream uses a FileBasedConcurrentCursor with incremental state. + self._connector_state_manager = ConnectorStateManager() + self._has_closed_at_least_one_slice = False + + @property + def state(self) -> MutableMapping[str, Any]: + return {NO_CURSOR_STATE_KEY: True} + + def observe(self, record: Record) -> None: + pass + + def close_partition(self, partition: Partition) -> None: + pass + + def ensure_at_least_one_state_emitted(self) -> None: + """ + Used primarily for full refresh syncs that do not have a valid cursor value to emit at the end of a sync + """ + + self._connector_state_manager.update_state_for_stream(self._stream_name, self._stream_namespace, self.state) + state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace) + self._message_repository.emit_message(state_message) + + +class ConcurrentCursor(Cursor): + _START_BOUNDARY = 0 + _END_BOUNDARY = 1 + + def __init__( + self, + stream_name: str, + stream_namespace: Optional[str], + stream_state: Any, + message_repository: MessageRepository, + connector_state_manager: ConnectorStateManager, + connector_state_converter: AbstractStreamStateConverter, + cursor_field: CursorField, + slice_boundary_fields: Optional[Tuple[str, str]], + start: Optional[CursorValueType], + end_provider: Callable[[], CursorValueType], + lookback_window: Optional[GapType] = None, + slice_range: Optional[GapType] = None, + cursor_granularity: Optional[GapType] = None, + ) -> None: + self._stream_name = stream_name + self._stream_namespace = stream_namespace + self._message_repository = message_repository + self._connector_state_converter = connector_state_converter + self._connector_state_manager = connector_state_manager + self._cursor_field = cursor_field + # To see some example where the slice boundaries might not be defined, check https://github.com/airbytehq/airbyte/blob/1ce84d6396e446e1ac2377362446e3fb94509461/airbyte-integrations/connectors/source-stripe/source_stripe/streams.py#L363-L379 + self._slice_boundary_fields = slice_boundary_fields + self._start = start + self._end_provider = end_provider + self.start, self._concurrent_state = self._get_concurrent_state(stream_state) + self._lookback_window = lookback_window + self._slice_range = slice_range + self._most_recent_cursor_value_per_partition: MutableMapping[Partition, Any] = {} + self._has_closed_at_least_one_slice = False + self._cursor_granularity = cursor_granularity + + @property + def state(self) -> MutableMapping[str, Any]: + return self._concurrent_state + + @property + def cursor_field(self) -> CursorField: + return self._cursor_field + + @property + def slice_boundary_fields(self) -> Optional[Tuple[str, str]]: + return self._slice_boundary_fields + + def _get_concurrent_state(self, state: MutableMapping[str, Any]) -> Tuple[CursorValueType, MutableMapping[str, Any]]: + if self._connector_state_converter.is_state_message_compatible(state): + return self._start or self._connector_state_converter.zero_value, self._connector_state_converter.deserialize(state) + return self._connector_state_converter.convert_from_sequential_state(self._cursor_field, state, self._start) + + def observe(self, record: Record) -> None: + most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get(record.partition) + cursor_value = 
self._extract_cursor_value(record) + + if most_recent_cursor_value is None or most_recent_cursor_value < cursor_value: + self._most_recent_cursor_value_per_partition[record.partition] = cursor_value + + def _extract_cursor_value(self, record: Record) -> Any: + return self._connector_state_converter.parse_value(self._cursor_field.extract_value(record)) + + def close_partition(self, partition: Partition) -> None: + slice_count_before = len(self.state.get("slices", [])) + self._add_slice_to_state(partition) + if slice_count_before < len(self.state["slices"]): # only emit if at least one slice has been processed + self._merge_partitions() + self._emit_state_message() + self._has_closed_at_least_one_slice = True + + def _add_slice_to_state(self, partition: Partition) -> None: + most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get(partition) + + if self._slice_boundary_fields: + if "slices" not in self.state: + raise RuntimeError( + f"The state for stream {self._stream_name} should have at least one slice to delineate the sync start time, but no slices are present. This is unexpected. Please contact Support." + ) + self.state["slices"].append( + { + self._connector_state_converter.START_KEY: self._extract_from_slice( + partition, self._slice_boundary_fields[self._START_BOUNDARY] + ), + self._connector_state_converter.END_KEY: self._extract_from_slice( + partition, self._slice_boundary_fields[self._END_BOUNDARY] + ), + self._connector_state_converter.MOST_RECENT_RECORD_KEY: most_recent_cursor_value, + } + ) + elif most_recent_cursor_value: + if self._has_closed_at_least_one_slice: + # If we track state value using records cursor field, we can only do that if there is one partition. This is because we save + # the state every time we close a partition. We assume that if there are multiple slices, they need to be providing + # boundaries. There are cases where partitions could not have boundaries: + # * The cursor should be per-partition + # * The stream state is actually the parent stream state + # There might be other cases not listed above. Those are not supported today hence the stream should not use this cursor for + # state management. For the specific user that was affected with this issue, we need to: + # * Fix state tracking (which is currently broken) + # * Make the new version available + # * (Probably) ask the user to reset the stream to avoid data loss + raise ValueError( + "Given that slice_boundary_fields is not defined and that per-partition state is not supported, only one slice is " + "expected. Please contact the Airbyte team." 
+ ) + + self.state["slices"].append( + { + self._connector_state_converter.START_KEY: self.start, + self._connector_state_converter.END_KEY: most_recent_cursor_value, + self._connector_state_converter.MOST_RECENT_RECORD_KEY: most_recent_cursor_value, + } + ) + + def _emit_state_message(self) -> None: + self._connector_state_manager.update_state_for_stream( + self._stream_name, + self._stream_namespace, + self._connector_state_converter.convert_to_state_message(self._cursor_field, self.state), + ) + state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace) + self._message_repository.emit_message(state_message) + + def _merge_partitions(self) -> None: + self.state["slices"] = self._connector_state_converter.merge_intervals(self.state["slices"]) + + def _extract_from_slice(self, partition: Partition, key: str) -> CursorValueType: + try: + _slice = partition.to_slice() + if not _slice: + raise KeyError(f"Could not find key `{key}` in empty slice") + return self._connector_state_converter.parse_value(_slice[key]) # type: ignore # we expect the devs to specify a key that would return a CursorValueType + except KeyError as exception: + raise KeyError(f"Partition is expected to have key `{key}` but it could not be found") from exception + + def ensure_at_least_one_state_emitted(self) -> None: + """ + The platform expects to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be + called. + """ + self._emit_state_message() + + def generate_slices(self) -> Iterable[Tuple[CursorValueType, CursorValueType]]: + """ + Generates slices based on a few parameters: + * lookback_window: Buffer to remove from END_KEY of the highest slice + * slice_range: Max difference between two slices. If the difference between two slices is greater, multiple slices will be created + * start: `_split_per_slice_range` will clip any value to `self._start`, which means that: + * if upper is less than self._start, no slices will be generated + * if lower is less than self._start, self._start will be used as the lower boundary (lookback_window will not be considered in that case) + + Note that the slices will overlap at their boundaries. We therefore expect to have at least the lower or the upper boundary to be + inclusive in the API that is queried.
+ """ + self._merge_partitions() + + if self._start is not None and self._is_start_before_first_slice(): + yield from self._split_per_slice_range( + self._start, + self.state["slices"][0][self._connector_state_converter.START_KEY], + False, + ) + + if len(self.state["slices"]) == 1: + yield from self._split_per_slice_range( + self._calculate_lower_boundary_of_last_slice(self.state["slices"][0][self._connector_state_converter.END_KEY]), + self._end_provider(), + True, + ) + elif len(self.state["slices"]) > 1: + for i in range(len(self.state["slices"]) - 1): + if self._cursor_granularity: + yield from self._split_per_slice_range( + self.state["slices"][i][self._connector_state_converter.END_KEY] + self._cursor_granularity, + self.state["slices"][i + 1][self._connector_state_converter.START_KEY], + False, + ) + else: + yield from self._split_per_slice_range( + self.state["slices"][i][self._connector_state_converter.END_KEY], + self.state["slices"][i + 1][self._connector_state_converter.START_KEY], + False, + ) + yield from self._split_per_slice_range( + self._calculate_lower_boundary_of_last_slice(self.state["slices"][-1][self._connector_state_converter.END_KEY]), + self._end_provider(), + True, + ) + else: + raise ValueError("Expected at least one slice") + + def _is_start_before_first_slice(self) -> bool: + return self._start is not None and self._start < self.state["slices"][0][self._connector_state_converter.START_KEY] + + def _calculate_lower_boundary_of_last_slice(self, lower_boundary: CursorValueType) -> CursorValueType: + if self._lookback_window: + return lower_boundary - self._lookback_window + return lower_boundary + + def _split_per_slice_range( + self, lower: CursorValueType, upper: CursorValueType, upper_is_end: bool + ) -> Iterable[Tuple[CursorValueType, CursorValueType]]: + if lower >= upper: + return + + if self._start and upper < self._start: + return + + lower = max(lower, self._start) if self._start else lower + if not self._slice_range or self._evaluate_upper_safely(lower, self._slice_range) >= upper: + if self._cursor_granularity and not upper_is_end: + yield lower, upper - self._cursor_granularity + else: + yield lower, upper + else: + stop_processing = False + current_lower_boundary = lower + while not stop_processing: + current_upper_boundary = min(self._evaluate_upper_safely(current_lower_boundary, self._slice_range), upper) + has_reached_upper_boundary = current_upper_boundary >= upper + if self._cursor_granularity and (not upper_is_end or not has_reached_upper_boundary): + yield current_lower_boundary, current_upper_boundary - self._cursor_granularity + else: + yield current_lower_boundary, current_upper_boundary + current_lower_boundary = current_upper_boundary + if current_upper_boundary >= upper: + stop_processing = True + + def _evaluate_upper_safely(self, lower: CursorValueType, step: GapType) -> CursorValueType: + """ + Given that we set the default step at datetime.timedelta.max, we will generate an OverflowError when evaluating the next start_date + This method assumes that users would never enter a step that would generate an overflow. Given that would be the case, the code + would have broken anyway. 
+ """ + try: + return lower + step + except OverflowError: + return self._end_provider() diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/default_stream.py new file mode 100644 index 000000000000..a48d897e191e --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -0,0 +1,91 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from functools import lru_cache +from logging import Logger +from typing import Any, Iterable, List, Mapping, Optional + +from airbyte_cdk.models import AirbyteStream, SyncMode +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.availability_strategy import AbstractAvailabilityStrategy, StreamAvailability +from airbyte_cdk.sources.streams.concurrent.cursor import Cursor +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator + + +class DefaultStream(AbstractStream): + def __init__( + self, + partition_generator: PartitionGenerator, + name: str, + json_schema: Mapping[str, Any], + availability_strategy: AbstractAvailabilityStrategy, + primary_key: List[str], + cursor_field: Optional[str], + logger: Logger, + cursor: Cursor, + namespace: Optional[str] = None, + ) -> None: + self._stream_partition_generator = partition_generator + self._name = name + self._json_schema = json_schema + self._availability_strategy = availability_strategy + self._primary_key = primary_key + self._cursor_field = cursor_field + self._logger = logger + self._cursor = cursor + self._namespace = namespace + + def generate_partitions(self) -> Iterable[Partition]: + yield from self._stream_partition_generator.generate() + + @property + def name(self) -> str: + return self._name + + @property + def namespace(self) -> Optional[str]: + return self._namespace + + def check_availability(self) -> StreamAvailability: + return self._availability_strategy.check_availability(self._logger) + + @property + def cursor_field(self) -> Optional[str]: + return self._cursor_field + + @lru_cache(maxsize=None) + def get_json_schema(self) -> Mapping[str, Any]: + return self._json_schema + + def as_airbyte_stream(self) -> AirbyteStream: + stream = AirbyteStream(name=self.name, json_schema=dict(self._json_schema), supported_sync_modes=[SyncMode.full_refresh]) + + if self._namespace: + stream.namespace = self._namespace + + if self._cursor_field: + stream.source_defined_cursor = True + stream.is_resumable = True + stream.supported_sync_modes.append(SyncMode.incremental) + stream.default_cursor_field = [self._cursor_field] + + keys = self._primary_key + if keys and len(keys) > 0: + stream.source_defined_primary_key = [[key] for key in keys] + + return stream + + def log_stream_sync_configuration(self) -> None: + self._logger.debug( + f"Syncing stream instance: {self.name}", + extra={ + "primary_key": self._primary_key, + "cursor_field": self.cursor_field, + }, + ) + + @property + def cursor(self) -> Cursor: + return self._cursor diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/exceptions.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/exceptions.py new file mode 100644 index 000000000000..a0cf699a46d0 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/exceptions.py @@ -0,0 +1,18 @@ +# +# Copyright 
(c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Any + + +class ExceptionWithDisplayMessage(Exception): + """ + Exception that can be used to display a custom message to the user. + """ + + def __init__(self, display_message: str, **kwargs: Any): + super().__init__(**kwargs) + self.display_message = display_message + + def __str__(self) -> str: + return f'ExceptionWithDisplayMessage: "{self.display_message}"' diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/helpers.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/helpers.py new file mode 100644 index 000000000000..ad7722726498 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/helpers.py @@ -0,0 +1,31 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + +from typing import List, Optional, Union + +from airbyte_cdk.sources.streams import Stream + + +def get_primary_key_from_stream(stream_primary_key: Optional[Union[str, List[str], List[List[str]]]]) -> List[str]: + if stream_primary_key is None: + return [] + elif isinstance(stream_primary_key, str): + return [stream_primary_key] + elif isinstance(stream_primary_key, list): + if len(stream_primary_key) > 0 and all(isinstance(k, str) for k in stream_primary_key): + return stream_primary_key # type: ignore # We verified all items in the list are strings + else: + raise ValueError(f"Nested primary keys are not supported. Found {stream_primary_key}") + else: + raise ValueError(f"Invalid type for primary key: {stream_primary_key}") + + +def get_cursor_field_from_stream(stream: Stream) -> Optional[str]: + if isinstance(stream.cursor_field, list): + if len(stream.cursor_field) > 1: + raise ValueError(f"Nested cursor fields are not supported. Got {stream.cursor_field} for {stream.name}") + elif len(stream.cursor_field) == 0: + return None + else: + return stream.cursor_field[0] + else: + return stream.cursor_field diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py new file mode 100644 index 000000000000..8e63c16a4b2c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py @@ -0,0 +1,57 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import time +from queue import Queue + +from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel +from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException +from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem + + +class PartitionEnqueuer: + """ + Generates partitions from a partition generator and puts them in a queue. + """ + + def __init__(self, queue: Queue[QueueItem], thread_pool_manager: ThreadPoolManager, sleep_time_in_seconds: float = 0.1) -> None: + """ + :param queue: The queue to put the partitions in. + :param throttler: The throttler to use to throttle the partition generation. + """ + self._queue = queue + self._thread_pool_manager = thread_pool_manager + self._sleep_time_in_seconds = sleep_time_in_seconds + + def generate_partitions(self, stream: AbstractStream) -> None: + """ + Generate partitions from a partition generator and put them in a queue. 
+ When all the partitions are added to the queue, a sentinel is added to the queue to indicate that all the partitions have been generated. + + If an exception is encountered, the exception will be caught and put in the queue. This is very important because if we don't, the + main thread will have no way to know that something went wrong and will wait until the timeout is reached. + + This method is meant to be called in a separate thread. + """ + try: + for partition in stream.generate_partitions(): + # Adding partitions to the queue generates futures. To avoid having too many futures, we throttle here. We understand that + # we might add more futures than the limit by throttling in the threads while it is the main thread that actually adds the + # future, but we expect the delta between the max futures length and the actual count to be small enough that it would not be an + # issue. We do this in the threads because we want the main thread to always be processing QueueItems because if it does not, the + # queue size could grow and cause OOM issues. + # + # Also note that we do not expect this to create deadlocks where all worker threads wait because we have fewer + # PartitionEnqueuer threads than worker threads. + # + # Also note that prune_to_validate_has_reached_futures_limit has a lock while pruning which might create a bottleneck in + # terms of performance. + while self._thread_pool_manager.prune_to_validate_has_reached_futures_limit(): + time.sleep(self._sleep_time_in_seconds) + self._queue.put(partition) + self._queue.put(PartitionGenerationCompletedSentinel(stream)) + except Exception as e: + self._queue.put(StreamThreadException(e, stream.name)) + self._queue.put(PartitionGenerationCompletedSentinel(stream)) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partition_reader.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partition_reader.py new file mode 100644 index 000000000000..eec69d569d8a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partition_reader.py @@ -0,0 +1,42 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +from queue import Queue + +from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel, QueueItem + + +class PartitionReader: + """ + Generates records from a partition and puts them in a queue. + """ + + _IS_SUCCESSFUL = True + + def __init__(self, queue: Queue[QueueItem]) -> None: + """ + :param queue: The queue to put the records in. + """ + self._queue = queue + + def process_partition(self, partition: Partition) -> None: + """ + Process a partition and put the records in the output queue. + When all the records for the partition have been added to the queue, a sentinel is added to the queue to indicate that the partition has been processed. + + If an exception is encountered, the exception will be caught and put in the queue. This is very important because if we don't, the + main thread will have no way to know that something went wrong and will wait until the timeout is reached. + + This method is meant to be called from a thread.
+ :param partition: The partition to read data from + :return: None + """ + try: + for record in partition.read(): + self._queue.put(record) + self._queue.put(PartitionCompleteSentinel(partition, self._IS_SUCCESSFUL)) + except Exception as e: + self._queue.put(StreamThreadException(e, partition.stream_name())) + self._queue.put(PartitionCompleteSentinel(partition, not self._IS_SUCCESSFUL)) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/__init__.py new file mode 100644 index 000000000000..c941b3045795 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/partition.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/partition.py new file mode 100644 index 000000000000..09f83d8f85f2 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/partition.py @@ -0,0 +1,63 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from typing import Any, Iterable, Mapping, Optional + +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record + + +class Partition(ABC): + """ + A partition is responsible for reading a specific set of data from a source. + """ + + @abstractmethod + def read(self) -> Iterable[Record]: + """ + Reads the data from the partition. + :return: An iterable of records. + """ + pass + + @abstractmethod + def to_slice(self) -> Optional[Mapping[str, Any]]: + """ + Converts the partition to a slice that can be serialized and deserialized. + + Note: it would have been interesting to have a type of `Mapping[str, Comparable]` to simplify typing but some slices can have nested + values ([example](https://github.com/airbytehq/airbyte/blob/1ce84d6396e446e1ac2377362446e3fb94509461/airbyte-integrations/connectors/source-stripe/source_stripe/streams.py#L584-L596)) + :return: A mapping representing a slice + """ + pass + + @abstractmethod + def stream_name(self) -> str: + """ + Returns the name of the stream that this partition is reading from. + :return: The name of the stream. + """ + pass + + @abstractmethod + def close(self) -> None: + """ + Closes the partition. + """ + pass + + @abstractmethod + def is_closed(self) -> bool: + """ + Returns whether the partition is closed. + :return: + """ + pass + + @abstractmethod + def __hash__(self) -> int: + """ + Returns a hash of the partition. + Partitions must be hashable so that they can be used as keys in a dictionary. + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py new file mode 100644 index 000000000000..eff978564772 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py @@ -0,0 +1,18 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from typing import Iterable + +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition + + +class PartitionGenerator(ABC): + @abstractmethod + def generate(self) -> Iterable[Partition]: + """ + Generates partitions for a given sync mode. 
+ :return: An iterable of partitions + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/record.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/record.py new file mode 100644 index 000000000000..0b34ae130071 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/record.py @@ -0,0 +1,27 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import TYPE_CHECKING, Any, Mapping + +if TYPE_CHECKING: + from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition + + +class Record: + """ + Represents a record read from a stream. + """ + + def __init__(self, data: Mapping[str, Any], partition: "Partition", is_file_transfer_message: bool = False): + self.data = data + self.partition = partition + self.is_file_transfer_message = is_file_transfer_message + + def __eq__(self, other: Any) -> bool: + if not isinstance(other, Record): + return False + return self.data == other.data and self.partition.stream_name() == other.partition.stream_name() + + def __repr__(self) -> str: + return f"Record(data={self.data}, stream_name={self.partition.stream_name()})" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/types.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/types.py new file mode 100644 index 000000000000..c36d9d944cce --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/partitions/types.py @@ -0,0 +1,34 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Any, Union + +from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record + + +class PartitionCompleteSentinel: + """ + A sentinel object indicating all records for a partition were produced. + Includes a pointer to the partition that was processed. + """ + + def __init__(self, partition: Partition, is_successful: bool = True): + """ + :param partition: The partition that was processed + """ + self.partition = partition + self.is_successful = is_successful + + def __eq__(self, other: Any) -> bool: + if isinstance(other, PartitionCompleteSentinel): + return self.partition == other.partition + return False + + +""" +Typedef representing the items that can be added to the ThreadBasedConcurrentStream +""" +QueueItem = Union[Record, Partition, PartitionCompleteSentinel, PartitionGenerationCompletedSentinel, Exception] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py new file mode 100644 index 000000000000..e80def360f27 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py @@ -0,0 +1,159 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
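To show how the Partition, PartitionGenerator and Record contracts above fit together, here is a hedged in-memory sketch; the stream name, slice shape and rows are invented and not part of this change. A PartitionEnqueuer could consume the generator and a PartitionReader could read each partition.

from typing import Any, Iterable, List, Mapping, Optional

from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record


class InMemoryPartition(Partition):
    """Illustrative partition that serves a fixed list of rows."""

    def __init__(self, stream_name: str, _slice: Mapping[str, Any], rows: List[Mapping[str, Any]]) -> None:
        self._stream_name = stream_name
        self._slice = _slice
        self._rows = rows
        self._closed = False

    def read(self) -> Iterable[Record]:
        for row in self._rows:
            yield Record(data=row, partition=self)

    def to_slice(self) -> Optional[Mapping[str, Any]]:
        return self._slice

    def stream_name(self) -> str:
        return self._stream_name

    def close(self) -> None:
        self._closed = True

    def is_closed(self) -> bool:
        return self._closed

    def __hash__(self) -> int:
        # Hash on the serialized slice so partitions can be used as dictionary keys.
        return hash((self._stream_name, str(self._slice)))


class InMemoryPartitionGenerator(PartitionGenerator):
    """Illustrative generator that yields a pre-built list of partitions."""

    def __init__(self, partitions: List[Partition]) -> None:
        self._partitions = partitions

    def generate(self) -> Iterable[Partition]:
        yield from self._partitions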
+# + +from abc import ABC, abstractmethod +from enum import Enum +from typing import TYPE_CHECKING, Any, List, MutableMapping, Optional, Tuple + +if TYPE_CHECKING: + from airbyte_cdk.sources.streams.concurrent.cursor import CursorField + + +class ConcurrencyCompatibleStateType(Enum): + date_range = "date-range" + + +class AbstractStreamStateConverter(ABC): + START_KEY = "start" + END_KEY = "end" + MOST_RECENT_RECORD_KEY = "most_recent_cursor_value" + + @abstractmethod + def _from_state_message(self, value: Any) -> Any: + pass + + @abstractmethod + def _to_state_message(self, value: Any) -> Any: + pass + + def __init__(self, is_sequential_state: bool = True): + self._is_sequential_state = is_sequential_state + + def convert_to_state_message(self, cursor_field: "CursorField", stream_state: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + """ + Convert the state message from the concurrency-compatible format to the stream's original format. + + e.g. + { "created": "2021-01-18T21:18:20.000Z" } + """ + if self.is_state_message_compatible(stream_state) and self._is_sequential_state: + legacy_state = stream_state.get("legacy", {}) + latest_complete_time = self._get_latest_complete_time(stream_state.get("slices", [])) + if latest_complete_time is not None: + legacy_state.update({cursor_field.cursor_field_key: self._to_state_message(latest_complete_time)}) + return legacy_state or {} + else: + return self.serialize(stream_state, ConcurrencyCompatibleStateType.date_range) + + def _get_latest_complete_time(self, slices: List[MutableMapping[str, Any]]) -> Any: + """ + Get the latest time before which all records have been processed. + """ + if not slices: + raise RuntimeError("Expected at least one slice but there were none. This is unexpected; please contact Support.") + merged_intervals = self.merge_intervals(slices) + first_interval = merged_intervals[0] + + return first_interval.get("most_recent_cursor_value") or first_interval[self.START_KEY] + + def deserialize(self, state: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + """ + Perform any transformations needed for compatibility with the converter. + """ + for stream_slice in state.get("slices", []): + stream_slice[self.START_KEY] = self._from_state_message(stream_slice[self.START_KEY]) + stream_slice[self.END_KEY] = self._from_state_message(stream_slice[self.END_KEY]) + return state + + def serialize(self, state: MutableMapping[str, Any], state_type: ConcurrencyCompatibleStateType) -> MutableMapping[str, Any]: + """ + Perform any transformations needed for compatibility with the converter. 
+ """ + serialized_slices = [] + for stream_slice in state.get("slices", []): + serialized_slice = { + self.START_KEY: self._to_state_message(stream_slice[self.START_KEY]), + self.END_KEY: self._to_state_message(stream_slice[self.END_KEY]), + } + if stream_slice.get(self.MOST_RECENT_RECORD_KEY): + serialized_slice[self.MOST_RECENT_RECORD_KEY] = self._to_state_message(stream_slice[self.MOST_RECENT_RECORD_KEY]) + serialized_slices.append(serialized_slice) + return {"slices": serialized_slices, "state_type": state_type.value} + + @staticmethod + def is_state_message_compatible(state: MutableMapping[str, Any]) -> bool: + return bool(state) and state.get("state_type") in [t.value for t in ConcurrencyCompatibleStateType] + + @abstractmethod + def convert_from_sequential_state( + self, + cursor_field: "CursorField", # to deprecate as it is only needed for sequential state + stream_state: MutableMapping[str, Any], + start: Optional[Any], + ) -> Tuple[Any, MutableMapping[str, Any]]: + """ + Convert the state message to the format required by the ConcurrentCursor. + + e.g. + { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "metadata": { … }, + "slices": [ + {starts: 0, end: 1617030403, finished_processing: true}] + } + """ + ... + + @abstractmethod + def increment(self, value: Any) -> Any: + """ + Increment a timestamp by a single unit. + """ + ... + + def merge_intervals(self, intervals: List[MutableMapping[str, Any]]) -> List[MutableMapping[str, Any]]: + """ + Compute and return a list of merged intervals. + + Intervals may be merged if the start time of the second interval is 1 unit or less (as defined by the + `increment` method) than the end time of the first interval. + """ + if not intervals: + return [] + + sorted_intervals = sorted(intervals, key=lambda interval: (interval[self.START_KEY], interval[self.END_KEY])) + merged_intervals = [sorted_intervals[0]] + + for current_interval in sorted_intervals[1:]: + last_interval = merged_intervals[-1] + last_interval_end = last_interval[self.END_KEY] + current_interval_start = current_interval[self.START_KEY] + + if self.increment(last_interval_end) >= current_interval_start: + last_interval[self.END_KEY] = max(last_interval_end, current_interval[self.END_KEY]) + last_interval_cursor_value = last_interval.get("most_recent_cursor_value") + current_interval_cursor_value = current_interval.get("most_recent_cursor_value") + + last_interval["most_recent_cursor_value"] = ( + max(current_interval_cursor_value, last_interval_cursor_value) + if current_interval_cursor_value and last_interval_cursor_value + else current_interval_cursor_value or last_interval_cursor_value + ) + else: + # Add a new interval if no overlap + merged_intervals.append(current_interval) + + return merged_intervals + + @abstractmethod + def parse_value(self, value: Any) -> Any: + """ + Parse the value of the cursor field into a comparable value. + """ + ... + + @property + @abstractmethod + def zero_value(self) -> Any: + ... diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py new file mode 100644 index 000000000000..f6f181e6bbfc --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py @@ -0,0 +1,194 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from abc import abstractmethod +from datetime import datetime, timedelta, timezone +from typing import Any, Callable, List, MutableMapping, Optional, Tuple + +import pendulum + +# FIXME We would eventually like the Concurrent package do be agnostic of the declarative package. However, this is a breaking change and +# the goal in the short term is only to fix the issue we are seeing for source-declarative-manifest. +from airbyte_cdk.sources.declarative.datetime.datetime_parser import DatetimeParser +from airbyte_cdk.sources.streams.concurrent.cursor import CursorField +from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import ( + AbstractStreamStateConverter, + ConcurrencyCompatibleStateType, +) +from pendulum.datetime import DateTime + + +class DateTimeStreamStateConverter(AbstractStreamStateConverter): + def _from_state_message(self, value: Any) -> Any: + return self.parse_timestamp(value) + + def _to_state_message(self, value: Any) -> Any: + return self.output_format(value) + + @property + @abstractmethod + def _zero_value(self) -> Any: + ... + + @property + def zero_value(self) -> datetime: + return self.parse_timestamp(self._zero_value) + + @classmethod + def get_end_provider(cls) -> Callable[[], datetime]: + return lambda: datetime.now(timezone.utc) + + @abstractmethod + def increment(self, timestamp: datetime) -> datetime: + ... + + @abstractmethod + def parse_timestamp(self, timestamp: Any) -> datetime: + ... + + @abstractmethod + def output_format(self, timestamp: datetime) -> Any: + ... + + def parse_value(self, value: Any) -> Any: + """ + Parse the value of the cursor field into a comparable value. + """ + return self.parse_timestamp(value) + + def _compare_intervals(self, end_time: Any, start_time: Any) -> bool: + return bool(self.increment(end_time) >= start_time) + + def convert_from_sequential_state( + self, cursor_field: CursorField, stream_state: MutableMapping[str, Any], start: Optional[datetime] + ) -> Tuple[datetime, MutableMapping[str, Any]]: + """ + Convert the state message to the format required by the ConcurrentCursor. + + e.g. + { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "metadata": { … }, + "slices": [ + {"start": "2021-01-18T21:18:20.000+00:00", "end": "2021-01-18T21:18:20.000+00:00"}, + ] + } + """ + sync_start = self._get_sync_start(cursor_field, stream_state, start) + if self.is_state_message_compatible(stream_state): + return sync_start, stream_state + + # Create a slice to represent the records synced during prior syncs. + # The start and end are the same to avoid confusion as to whether the records for this slice + # were actually synced + slices = [{self.START_KEY: start if start is not None else sync_start, self.END_KEY: sync_start}] + + return sync_start, { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "slices": slices, + "legacy": stream_state, + } + + def _get_sync_start(self, cursor_field: CursorField, stream_state: MutableMapping[str, Any], start: Optional[datetime]) -> datetime: + sync_start = start if start is not None else self.zero_value + prev_sync_low_water_mark = ( + self.parse_timestamp(stream_state[cursor_field.cursor_field_key]) if cursor_field.cursor_field_key in stream_state else None + ) + if prev_sync_low_water_mark and prev_sync_low_water_mark >= sync_start: + return prev_sync_low_water_mark + else: + return sync_start + + +class EpochValueConcurrentStreamStateConverter(DateTimeStreamStateConverter): + """ + e.g. 
+ { "created": 1617030403 } + => + { + "state_type": "date-range", + "metadata": { … }, + "slices": [ + {starts: 0, end: 1617030403, finished_processing: true} + ] + } + """ + + _zero_value = 0 + + def increment(self, timestamp: datetime) -> datetime: + return timestamp + timedelta(seconds=1) + + def output_format(self, timestamp: datetime) -> int: + return int(timestamp.timestamp()) + + def parse_timestamp(self, timestamp: int) -> datetime: + dt_object = pendulum.from_timestamp(timestamp) + if not isinstance(dt_object, DateTime): + raise ValueError(f"DateTime object was expected but got {type(dt_object)} from pendulum.parse({timestamp})") + return dt_object # type: ignore # we are manually type checking because pendulum.parse may return different types + + +class IsoMillisConcurrentStreamStateConverter(DateTimeStreamStateConverter): + """ + e.g. + { "created": "2021-01-18T21:18:20.000Z" } + => + { + "state_type": "date-range", + "metadata": { … }, + "slices": [ + {starts: "2020-01-18T21:18:20.000Z", end: "2021-01-18T21:18:20.000Z", finished_processing: true} + ] + } + """ + + _zero_value = "0001-01-01T00:00:00.000Z" + + def __init__(self, is_sequential_state: bool = True, cursor_granularity: Optional[timedelta] = None): + super().__init__(is_sequential_state=is_sequential_state) + self._cursor_granularity = cursor_granularity or timedelta(milliseconds=1) + + def increment(self, timestamp: datetime) -> datetime: + return timestamp + self._cursor_granularity + + def output_format(self, timestamp: datetime) -> Any: + return timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" + + def parse_timestamp(self, timestamp: str) -> datetime: + dt_object = pendulum.parse(timestamp) + if not isinstance(dt_object, DateTime): + raise ValueError(f"DateTime object was expected but got {type(dt_object)} from pendulum.parse({timestamp})") + return dt_object # type: ignore # we are manually type checking because pendulum.parse may return different types + + +class CustomFormatConcurrentStreamStateConverter(IsoMillisConcurrentStreamStateConverter): + """ + Datetime State converter that emits state according to the supplied datetime format. The converter supports reading + incoming state in any valid datetime format via Pendulum. + """ + + def __init__( + self, + datetime_format: str, + input_datetime_formats: Optional[List[str]] = None, + is_sequential_state: bool = True, + cursor_granularity: Optional[timedelta] = None, + ): + super().__init__(is_sequential_state=is_sequential_state, cursor_granularity=cursor_granularity) + self._datetime_format = datetime_format + self._input_datetime_formats = input_datetime_formats if input_datetime_formats else [] + self._input_datetime_formats += [self._datetime_format] + self._parser = DatetimeParser() + + def output_format(self, timestamp: datetime) -> str: + return self._parser.format(timestamp, self._datetime_format) + + def parse_timestamp(self, timestamp: str) -> datetime: + for datetime_format in self._input_datetime_formats: + try: + return self._parser.parse(timestamp, datetime_format) + except ValueError: + pass + raise ValueError(f"No format in {self._input_datetime_formats} matching {timestamp}") diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/core.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/core.py new file mode 100644 index 000000000000..c7a0cf02ec5a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/core.py @@ -0,0 +1,644 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# +import copy +import inspect +import itertools +import logging +from abc import ABC, abstractmethod +from dataclasses import dataclass +from functools import cached_property, lru_cache +from typing import Any, Dict, Iterable, Iterator, List, Mapping, MutableMapping, Optional, Union + +import airbyte_cdk.sources.utils.casing as casing +from airbyte_cdk.models import AirbyteMessage, AirbyteStream, ConfiguredAirbyteStream, DestinationSyncMode, SyncMode +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.streams.checkpoint import ( + CheckpointMode, + CheckpointReader, + Cursor, + CursorBasedCheckpointReader, + FullRefreshCheckpointReader, + IncrementalCheckpointReader, + LegacyCursorBasedCheckpointReader, + ResumableFullRefreshCheckpointReader, +) +from airbyte_cdk.sources.types import StreamSlice + +# list of all possible HTTP methods which can be used for sending of request bodies +from airbyte_cdk.sources.utils.schema_helpers import InternalConfig, ResourceSchemaLoader +from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger +from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer +from deprecated import deprecated + +# A stream's read method can return one of the following types: +# Mapping[str, Any]: The content of an AirbyteRecordMessage +# AirbyteMessage: An AirbyteMessage. Could be of any type +StreamData = Union[Mapping[str, Any], AirbyteMessage] + +JsonSchema = Mapping[str, Any] + +NO_CURSOR_STATE_KEY = "__ab_no_cursor_state_message" + + +def package_name_from_class(cls: object) -> str: + """Find the package name given a class name""" + module = inspect.getmodule(cls) + if module is not None: + return module.__name__.split(".")[0] + else: + raise ValueError(f"Could not find package name for class {cls}") + + +class CheckpointMixin(ABC): + """Mixin for a stream that implements reading and writing the internal state used to checkpoint sync progress to the platform + + class CheckpointedStream(Stream, CheckpointMixin): + @property + def state(self): + return self._state + + @state.setter + def state(self, value): + self._state[self.cursor_field] = value[self.cursor_field] + """ + + @property + @abstractmethod + def state(self) -> MutableMapping[str, Any]: + """State getter, should return state in form that can serialized to a string and send to the output + as a STATE AirbyteMessage. + + A good example of a state is a cursor_value: + { + self.cursor_field: "cursor_value" + } + + State should try to be as small as possible but at the same time descriptive enough to restore + syncing process from the point where it stopped. + """ + + @state.setter + @abstractmethod + def state(self, value: MutableMapping[str, Any]) -> None: + """State setter, accept state serialized by state getter.""" + + +@deprecated(version="0.87.0", reason="Deprecated in favor of the CheckpointMixin which offers similar functionality") +class IncrementalMixin(CheckpointMixin, ABC): + """Mixin to make stream incremental. + + class IncrementalStream(Stream, IncrementalMixin): + @property + def state(self): + return self._state + + @state.setter + def state(self, value): + self._state[self.cursor_field] = value[self.cursor_field] + """ + + +@dataclass +class StreamClassification: + is_legacy_format: bool + has_multiple_slices: bool + + +# Moved to class declaration since get_updated_state is called on every record for incremental syncs, and thus the @deprecated decorator as well. 
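The deprecation above nudges connectors toward managing state through the CheckpointMixin property pair instead of overriding get_updated_state(). A hedged sketch of that pattern follows; the stream name, cursor field and hard-coded records are invented for illustration.

from typing import Any, Iterable, List, Mapping, MutableMapping, Optional

from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.streams.core import CheckpointMixin, Stream, StreamData


class ExampleEventsStream(Stream, CheckpointMixin):
    """Hypothetical stream that checkpoints through the modern state property."""

    primary_key = "id"
    cursor_field = "updated_at"

    def __init__(self) -> None:
        self._state: MutableMapping[str, Any] = {}

    @property
    def state(self) -> MutableMapping[str, Any]:
        return self._state

    @state.setter
    def state(self, value: MutableMapping[str, Any]) -> None:
        self._state = value

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: Optional[List[str]] = None,
        stream_slice: Optional[Mapping[str, Any]] = None,
        stream_state: Optional[Mapping[str, Any]] = None,
    ) -> Iterable[StreamData]:
        # Hard-coded records stand in for an API call; state is updated as records are read,
        # so the CDK can checkpoint without ever calling get_updated_state().
        for record in [{"id": 1, "updated_at": "2023-01-01"}, {"id": 2, "updated_at": "2023-01-02"}]:
            self.state = {self.cursor_field: record[self.cursor_field]}
            yield record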
+@deprecated( + version="0.1.49", + reason="Deprecated method get_updated_state, You should use explicit state property instead, see IncrementalMixin docs.", + action="ignore", +) +class Stream(ABC): + """ + Base abstract class for an Airbyte Stream. Makes no assumption of the Stream's underlying transport protocol. + """ + + _configured_json_schema: Optional[Dict[str, Any]] = None + _exit_on_rate_limit: bool = False + + # Use self.logger in subclasses to log any messages + @property + def logger(self) -> logging.Logger: + return logging.getLogger(f"airbyte.streams.{self.name}") + + # TypeTransformer object to perform output data transformation + transformer: TypeTransformer = TypeTransformer(TransformConfig.NoTransform) + + cursor: Optional[Cursor] = None + + has_multiple_slices = False + + @cached_property + def name(self) -> str: + """ + :return: Stream name. By default this is the implementing class name, but it can be overridden as needed. + """ + return casing.camel_to_snake(self.__class__.__name__) + + def get_error_display_message(self, exception: BaseException) -> Optional[str]: + """ + Retrieves the user-friendly display message that corresponds to an exception. + This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage. + + The default implementation of this method does not return user-friendly messages for any exception type, but it should be overriden as needed. + + :param exception: The exception that was raised + :return: A user-friendly message that indicates the cause of the error + """ + return None + + def read( # type: ignore # ignoring typing for ConnectorStateManager because of circular dependencies + self, + configured_stream: ConfiguredAirbyteStream, + logger: logging.Logger, + slice_logger: SliceLogger, + stream_state: MutableMapping[str, Any], + state_manager, + internal_config: InternalConfig, + ) -> Iterable[StreamData]: + sync_mode = configured_stream.sync_mode + cursor_field = configured_stream.cursor_field + self.configured_json_schema = configured_stream.stream.json_schema + + # WARNING: When performing a read() that uses incoming stream state, we MUST use the self.state that is defined as + # opposed to the incoming stream_state value. Because some connectors like ones using the file-based CDK modify + # state before setting the value on the Stream attribute, the most up-to-date state is derived from Stream.state + # instead of the stream_state parameter. This does not apply to legacy connectors using get_updated_state(). + try: + stream_state = self.state # type: ignore # we know the field might not exist... 
+ except AttributeError: + pass + + should_checkpoint = bool(state_manager) + checkpoint_reader = self._get_checkpoint_reader( + logger=logger, cursor_field=cursor_field, sync_mode=sync_mode, stream_state=stream_state + ) + + next_slice = checkpoint_reader.next() + record_counter = 0 + stream_state_tracker = copy.deepcopy(stream_state) + while next_slice is not None: + if slice_logger.should_log_slice_message(logger): + yield slice_logger.create_slice_log_message(next_slice) + records = self.read_records( + sync_mode=sync_mode, # todo: change this interface to no longer rely on sync_mode for behavior + stream_slice=next_slice, + stream_state=stream_state, + cursor_field=cursor_field or None, + ) + for record_data_or_message in records: + yield record_data_or_message + if isinstance(record_data_or_message, Mapping) or ( + hasattr(record_data_or_message, "type") and record_data_or_message.type == MessageType.RECORD + ): + record_data = record_data_or_message if isinstance(record_data_or_message, Mapping) else record_data_or_message.record + + # Thanks I hate it. RFR fundamentally doesn't fit with the concept of the legacy Stream.get_updated_state() + # method because RFR streams rely on pagination as a cursor. Stream.get_updated_state() was designed to make + # the CDK manage state using specifically the last seen record. don't @ brian.lai + # + # Also, because the legacy incremental state case decouples observing incoming records from emitting state, it + # requires that we separate CheckpointReader.observe() and CheckpointReader.get_checkpoint() which could + # otherwise be combined. + if self.cursor_field: + # Some connectors have streams that implement get_updated_state(), but do not define a cursor_field. This + # should be fixed on the stream implementation, but we should also protect against this in the CDK as well + stream_state_tracker = self.get_updated_state(stream_state_tracker, record_data) + self._observe_state(checkpoint_reader, stream_state_tracker) + record_counter += 1 + + checkpoint_interval = self.state_checkpoint_interval + checkpoint = checkpoint_reader.get_checkpoint() + if should_checkpoint and checkpoint_interval and record_counter % checkpoint_interval == 0 and checkpoint is not None: + airbyte_state_message = self._checkpoint_state(checkpoint, state_manager=state_manager) + yield airbyte_state_message + + if internal_config.is_limit_reached(record_counter): + break + self._observe_state(checkpoint_reader) + checkpoint_state = checkpoint_reader.get_checkpoint() + if should_checkpoint and checkpoint_state is not None: + airbyte_state_message = self._checkpoint_state(checkpoint_state, state_manager=state_manager) + yield airbyte_state_message + + next_slice = checkpoint_reader.next() + + checkpoint = checkpoint_reader.get_checkpoint() + if should_checkpoint and checkpoint is not None: + airbyte_state_message = self._checkpoint_state(checkpoint, state_manager=state_manager) + yield airbyte_state_message + + def read_only_records(self, state: Optional[Mapping[str, Any]] = None) -> Iterable[StreamData]: + """ + Helper method that performs a read on a stream with an optional state and emits records. If the parent stream supports + incremental, this operation does not update the stream's internal state (if it uses the modern state setter/getter) + or emit state messages. 
+ """ + + configured_stream = ConfiguredAirbyteStream( + stream=AirbyteStream( + name=self.name, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], + ), + sync_mode=SyncMode.incremental if state else SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ) + + yield from self.read( + configured_stream=configured_stream, + logger=self.logger, + slice_logger=DebugSliceLogger(), + stream_state=dict(state) if state else {}, # read() expects MutableMapping instead of Mapping which is used more often + state_manager=None, + internal_config=InternalConfig(), + ) + + @abstractmethod + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + """ + This method should be overridden by subclasses to read records based on the inputs + """ + + @lru_cache(maxsize=None) + def get_json_schema(self) -> Mapping[str, Any]: + """ + :return: A dict of the JSON schema representing this stream. + + The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property. + Override as needed. + """ + # TODO show an example of using pydantic to define the JSON schema, or reading an OpenAPI spec + return ResourceSchemaLoader(package_name_from_class(self.__class__)).get_schema(self.name) + + def as_airbyte_stream(self) -> AirbyteStream: + stream = AirbyteStream( + name=self.name, + json_schema=dict(self.get_json_schema()), + supported_sync_modes=[SyncMode.full_refresh], + is_resumable=self.is_resumable, + ) + + if self.namespace: + stream.namespace = self.namespace + + # If we can offer incremental we always should. RFR is always less reliable than incremental which uses a real cursor value + if self.supports_incremental: + stream.source_defined_cursor = self.source_defined_cursor + stream.supported_sync_modes.append(SyncMode.incremental) # type: ignore + stream.default_cursor_field = self._wrapped_cursor_field() + + keys = Stream._wrapped_primary_key(self.primary_key) + if keys and len(keys) > 0: + stream.source_defined_primary_key = keys + + return stream + + @property + def supports_incremental(self) -> bool: + """ + :return: True if this stream supports incrementally reading data + """ + return len(self._wrapped_cursor_field()) > 0 + + @property + def is_resumable(self) -> bool: + """ + :return: True if this stream allows the checkpointing of sync progress and can resume from it on subsequent attempts. + This differs from supports_incremental because certain kinds of streams like those supporting resumable full refresh + can checkpoint progress in between attempts for improved fault tolerance. However, they will start from the beginning + on the next sync job. + """ + if self.supports_incremental: + return True + if self.has_multiple_slices: + # We temporarily gate substream to not support RFR because puts a pretty high burden on connector developers + # to structure stream state in a very specific way. We also can't check for issubclass(HttpSubStream) because + # not all substreams implement the interface and it would be a circular dependency so we use parent as a surrogate + return False + elif hasattr(type(self), "state") and getattr(type(self), "state").fset is not None: + # Modern case where a stream manages state using getter/setter + return True + else: + # Legacy case where the CDK manages state via the get_updated_state() method. 
This is determined by checking if + # the stream's get_updated_state() differs from the Stream class and therefore has been overridden + return type(self).get_updated_state != Stream.get_updated_state + + def _wrapped_cursor_field(self) -> List[str]: + return [self.cursor_field] if isinstance(self.cursor_field, str) else self.cursor_field + + @property + def cursor_field(self) -> Union[str, List[str]]: + """ + Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field. + :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor. + """ + return [] + + @property + def namespace(self) -> Optional[str]: + """ + Override to return the namespace of this stream, e.g. the Postgres schema which this stream will emit records for. + :return: A string containing the name of the namespace. + """ + return None + + @property + def source_defined_cursor(self) -> bool: + """ + Return False if the cursor can be configured by the user. + """ + return True + + @property + def exit_on_rate_limit(self) -> bool: + """Exit on rate limit getter, should return bool value. False if the stream will retry endlessly when rate limited.""" + return self._exit_on_rate_limit + + @exit_on_rate_limit.setter + def exit_on_rate_limit(self, value: bool) -> None: + """Exit on rate limit setter, accept bool value.""" + self._exit_on_rate_limit = value + + @property + @abstractmethod + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + """ + :return: string if single primary key, list of strings if composite primary key, list of list of strings if composite primary key consisting of nested fields. + If the stream has no primary keys, return None. + """ + + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + """ + Override to define the slices for this stream. See the stream slicing section of the docs for more information. + + :param sync_mode: + :param cursor_field: + :param stream_state: + :return: + """ + yield StreamSlice(partition={}, cursor_slice={}) + + @property + def state_checkpoint_interval(self) -> Optional[int]: + """ + Decides how often to checkpoint state (i.e: emit a STATE message). E.g: if this returns a value of 100, then state is persisted after reading + 100 records, then 200, 300, etc.. A good default value is 1000 although your mileage may vary depending on the underlying data source. + + Checkpointing a stream avoids re-reading records in the case a sync is failed or cancelled. + + return None if state should not be checkpointed e.g: because records returned from the underlying data source are not returned in + ascending order with respect to the cursor field. This can happen if the source does not support reading records in ascending order of + created_at date (or whatever the cursor is). In those cases, state must only be saved once the full stream has been read. + """ + return None + + def get_updated_state( + self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any] + ) -> MutableMapping[str, Any]: + """Override to extract state from the latest record. Needed to implement incremental sync. + + Inspects the latest record extracted from the data source and the current state object and return an updated state object. 
+ + For example: if the state object is based on created_at timestamp, and the current state is {'created_at': 10}, and the latest_record is + {'name': 'octavia', 'created_at': 20 } then this method would return {'created_at': 20} to indicate state should be updated to this object. + + :param current_stream_state: The stream's current state object + :param latest_record: The latest record extracted from the stream + :return: An updated state object + """ + return {} + + def get_cursor(self) -> Optional[Cursor]: + """ + A Cursor is an interface that a stream can implement to manage how its internal state is read and updated while + reading records. Historically, Python connectors had no concept of a cursor to manage state. Python streams need + to define a cursor implementation and override this method to manage state through a Cursor. + """ + return self.cursor + + def _get_checkpoint_reader( + self, + logger: logging.Logger, + cursor_field: Optional[List[str]], + sync_mode: SyncMode, + stream_state: MutableMapping[str, Any], + ) -> CheckpointReader: + mappings_or_slices = self.stream_slices( + cursor_field=cursor_field, + sync_mode=sync_mode, # todo: change this interface to no longer rely on sync_mode for behavior + stream_state=stream_state, + ) + + # Because of poor foresight, we wrote the default Stream.stream_slices() method to return [None] which is confusing and + # has now normalized this behavior for connector developers. Now some connectors return [None]. This is objectively + # misleading and a more ideal interface is [{}] to indicate we still want to iterate over one slice, but with no + # specific slice values. None is bad, and now I feel bad that I have to write this hack. + if mappings_or_slices == [None]: + mappings_or_slices = [{}] + + slices_iterable_copy, iterable_for_detecting_format = itertools.tee(mappings_or_slices, 2) + stream_classification = self._classify_stream(mappings_or_slices=iterable_for_detecting_format) + + # Streams that override has_multiple_slices are explicitly indicating that they will iterate over + # multiple partitions. Inspecting slices to automatically apply the correct cursor is only needed as + # a backup. So if this value was already assigned to True by the stream, we don't need to reassign it + self.has_multiple_slices = self.has_multiple_slices or stream_classification.has_multiple_slices + + cursor = self.get_cursor() + if cursor: + cursor.set_initial_state(stream_state=stream_state) + + checkpoint_mode = self._checkpoint_mode + + if cursor and stream_classification.is_legacy_format: + return LegacyCursorBasedCheckpointReader(stream_slices=slices_iterable_copy, cursor=cursor, read_state_from_cursor=True) + elif cursor: + return CursorBasedCheckpointReader( + stream_slices=slices_iterable_copy, + cursor=cursor, + read_state_from_cursor=checkpoint_mode == CheckpointMode.RESUMABLE_FULL_REFRESH, + ) + elif checkpoint_mode == CheckpointMode.RESUMABLE_FULL_REFRESH: + # Resumable full refresh readers rely on the stream state dynamically being updated during pagination and does + # not iterate over a static set of slices. 
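# For example, HttpStream._read_single_page (defined later in this change) checkpoints each page
# by closing the slice with the latest next_page_token, or with {"__ab_full_refresh_sync_complete": True}
# once pagination is exhausted, so a failed or cancelled sync can resume from the last completed page.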
+ return ResumableFullRefreshCheckpointReader(stream_state=stream_state) + elif checkpoint_mode == CheckpointMode.INCREMENTAL: + return IncrementalCheckpointReader(stream_slices=slices_iterable_copy, stream_state=stream_state) + else: + return FullRefreshCheckpointReader(stream_slices=slices_iterable_copy) + + @property + def _checkpoint_mode(self) -> CheckpointMode: + if self.is_resumable and len(self._wrapped_cursor_field()) > 0: + return CheckpointMode.INCREMENTAL + elif self.is_resumable: + return CheckpointMode.RESUMABLE_FULL_REFRESH + else: + return CheckpointMode.FULL_REFRESH + + @staticmethod + def _classify_stream(mappings_or_slices: Iterator[Optional[Union[Mapping[str, Any], StreamSlice]]]) -> StreamClassification: + """ + This is a bit of a crazy solution, but also the only way we can detect certain attributes about the stream since Python + streams do not follow consistent implementation patterns. We care about the following two attributes: + - is_substream: Helps to incrementally release changes since substreams w/ parents are much more complicated. Also + helps de-risk the release of changes that might impact all connectors + - uses_legacy_slice_format: Since the checkpoint reader must manage a complex state object, we opted to have it always + use the structured StreamSlice object. However, this requires backwards compatibility with Python sources that only + support the legacy mapping object + + Both attributes can eventually be deprecated once stream's define this method deleted once substreams have been implemented and + legacy connectors all adhere to the StreamSlice object. + """ + if not mappings_or_slices: + raise ValueError("A stream should always have at least one slice") + try: + next_slice = next(mappings_or_slices) + if isinstance(next_slice, StreamSlice) and next_slice == StreamSlice(partition={}, cursor_slice={}): + is_legacy_format = False + slice_has_value = False + elif next_slice == {}: + is_legacy_format = True + slice_has_value = False + elif isinstance(next_slice, StreamSlice): + is_legacy_format = False + slice_has_value = True + else: + is_legacy_format = True + slice_has_value = True + except StopIteration: + # If the stream has no slices, the format ultimately does not matter since no data will get synced. This is technically + # a valid case because it is up to the stream to define its slicing behavior + return StreamClassification(is_legacy_format=False, has_multiple_slices=False) + + if slice_has_value: + # If the first slice contained a partition value from the result of stream_slices(), this is a substream that might + # have multiple parent records to iterate over + return StreamClassification(is_legacy_format=is_legacy_format, has_multiple_slices=slice_has_value) + + try: + # If stream_slices() returns multiple slices, this is also a substream that can potentially generate empty slices + next(mappings_or_slices) + return StreamClassification(is_legacy_format=is_legacy_format, has_multiple_slices=True) + except StopIteration: + # If the result of stream_slices() only returns a single empty stream slice, then we know this is a regular stream + return StreamClassification(is_legacy_format=is_legacy_format, has_multiple_slices=False) + + def log_stream_sync_configuration(self) -> None: + """ + Logs the configuration of this stream. 
+ """ + self.logger.debug( + f"Syncing stream instance: {self.name}", + extra={ + "primary_key": self.primary_key, + "cursor_field": self.cursor_field, + }, + ) + + @staticmethod + def _wrapped_primary_key(keys: Optional[Union[str, List[str], List[List[str]]]]) -> Optional[List[List[str]]]: + """ + :return: wrap the primary_key property in a list of list of strings required by the Airbyte Stream object. + """ + if not keys: + return None + + if isinstance(keys, str): + return [[keys]] + elif isinstance(keys, list): + wrapped_keys = [] + for component in keys: + if isinstance(component, str): + wrapped_keys.append([component]) + elif isinstance(component, list): + wrapped_keys.append(component) + else: + raise ValueError(f"Element must be either list or str. Got: {type(component)}") + return wrapped_keys + else: + raise ValueError(f"Element must be either list or str. Got: {type(keys)}") + + def _observe_state(self, checkpoint_reader: CheckpointReader, stream_state: Optional[Mapping[str, Any]] = None) -> None: + """ + Convenience method that attempts to read the Stream's state using the recommended way of connector's managing their + own state via state setter/getter. But if we get back an AttributeError, then the legacy Stream.get_updated_state() + method is used as a fallback method. + """ + + # This is an inversion of the original logic that used to try state getter/setters first. As part of the work to + # automatically apply resumable full refresh to all streams, all HttpStream classes implement default state + # getter/setter methods, we should default to only using the incoming stream_state parameter value is {} which + # indicates the stream does not override the default get_updated_state() implementation. When the default method + # is not overridden, then the stream defers to self.state getter + if stream_state: + checkpoint_reader.observe(stream_state) + elif type(self).get_updated_state == Stream.get_updated_state: + # We only default to the state getter/setter if the stream does not use the legacy get_updated_state() method + try: + new_state = self.state # type: ignore # This will always exist on HttpStreams, but may not for Stream + if new_state: + checkpoint_reader.observe(new_state) + except AttributeError: + pass + + def _checkpoint_state( # type: ignore # ignoring typing for ConnectorStateManager because of circular dependencies + self, + stream_state: Mapping[str, Any], + state_manager, + ) -> AirbyteMessage: + # todo: This can be consolidated into one ConnectorStateManager.update_and_create_state_message() method, but I want + # to reduce changes right now and this would span concurrent as well + state_manager.update_state_for_stream(self.name, self.namespace, stream_state) + return state_manager.create_state_message(self.name, self.namespace) + + @property + def configured_json_schema(self) -> Optional[Dict[str, Any]]: + """ + This property is set from the read method. + + :return Optional[Dict]: JSON schema from configured catalog if provided, otherwise None. + """ + return self._configured_json_schema + + @configured_json_schema.setter + def configured_json_schema(self, json_schema: Dict[str, Any]) -> None: + self._configured_json_schema = self._filter_schema_invalid_properties(json_schema) + + def _filter_schema_invalid_properties(self, configured_catalog_json_schema: Dict[str, Any]) -> Dict[str, Any]: + """ + Filters the properties in json_schema that are not present in the stream schema. 
+ Configured Schemas can have very old fields, so we need to housekeeping ourselves. + """ + configured_schema: Any = configured_catalog_json_schema.get("properties", {}) + stream_schema_properties: Any = self.get_json_schema().get("properties", {}) + + configured_keys = configured_schema.keys() + stream_keys = stream_schema_properties.keys() + invalid_properties = configured_keys - stream_keys + if not invalid_properties: + return configured_catalog_json_schema + + self.logger.warning( + f"Stream {self.name}: the following fields are deprecated and cannot be synced. {invalid_properties}. Refresh the connection's source schema to resolve this warning." + ) + + valid_configured_schema_properties_keys = stream_keys & configured_keys + valid_configured_schema_properties = {} + + for configured_schema_property in valid_configured_schema_properties_keys: + valid_configured_schema_properties[configured_schema_property] = stream_schema_properties[configured_schema_property] + + return {**configured_catalog_json_schema, "properties": valid_configured_schema_properties} diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/__init__.py new file mode 100644 index 000000000000..a876406b48b4 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/__init__.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +# Initialize Streams Package +from .http_client import HttpClient +from .http import HttpStream, HttpSubStream +from .exceptions import UserDefinedBackoffException + +__all__ = ["HttpClient", "HttpStream", "HttpSubStream", "UserDefinedBackoffException"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/availability_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/availability_strategy.py new file mode 100644 index 000000000000..4b3dba106c6e --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/availability_strategy.py @@ -0,0 +1,52 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +import typing +from typing import Optional, Tuple + +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + +if typing.TYPE_CHECKING: + from airbyte_cdk.sources import Source + + +class HttpAvailabilityStrategy(AvailabilityStrategy): + def check_availability(self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None) -> Tuple[bool, Optional[str]]: + """ + Check stream availability by attempting to read the first record of the + stream. + + :param stream: stream + :param logger: source logger + :param source: (optional) source + :return: A tuple of (boolean, str). If boolean is true, then the stream + is available, and no str is required. Otherwise, the stream is unavailable + for some reason and the str should describe what went wrong and how to + resolve the unavailability, if possible. + """ + reason: Optional[str] + try: + # Some streams need a stream slice to read records (e.g. if they have a SubstreamPartitionRouter) + # Streams that don't need a stream slice will return `None` as their first stream slice. + stream_slice = self.get_first_stream_slice(stream) + except StopIteration: + # If stream_slices has no `next()` item (Note - this is different from stream_slices returning [None]!) 
+ # This can happen when a substream's `stream_slices` method does a `for record in parent_records: yield ` + # without accounting for the case in which the parent stream is empty. + reason = f"Cannot attempt to connect to stream {stream.name} - no stream slices were found, likely because the parent stream is empty." + return False, reason + except AirbyteTracedException as error: + return False, error.message + + try: + self.get_first_record_for_slice(stream, stream_slice) + return True, None + except StopIteration: + logger.info(f"Successfully connected to stream {stream.name}, but got 0 records.") + return True, None + except AirbyteTracedException as error: + return False, error.message diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/__init__.py new file mode 100644 index 000000000000..40abd4f944a4 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/__init__.py @@ -0,0 +1,22 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from .backoff_strategy import BackoffStrategy +from .default_backoff_strategy import DefaultBackoffStrategy +from .error_handler import ErrorHandler +from .error_message_parser import ErrorMessageParser +from .http_status_error_handler import HttpStatusErrorHandler +from .json_error_message_parser import JsonErrorMessageParser +from .response_models import ResponseAction, ErrorResolution + +__all__ = [ + "BackoffStrategy", + "DefaultBackoffStrategy", + "ErrorHandler", + "ErrorMessageParser", + "HttpStatusErrorHandler", + "JsonErrorMessageParser", + "ResponseAction", + "ErrorResolution" +] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py new file mode 100644 index 000000000000..6ed821791ca4 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py @@ -0,0 +1,28 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from typing import Optional, Union + +import requests + + +class BackoffStrategy(ABC): + @abstractmethod + def backoff_time( + self, + response_or_exception: Optional[Union[requests.Response, requests.RequestException]], + attempt_count: int, + ) -> Optional[float]: + """ + Override this method to dynamically determine backoff time e.g: by reading the X-Retry-After header. + + This method is called only if should_backoff() returns True for the input request. + + :param response_or_exception: The response or exception that caused the backoff. + :param attempt_count: The number of attempts already performed for this request. + :return how long to backoff in seconds. The return value may be a floating point number for subsecond precision. Returning None defers backoff + to the default backoff behavior (e.g using an exponential algorithm). + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py new file mode 100644 index 000000000000..2c3e10ad7a1a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
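# Example sketch: a connector could supply its own BackoffStrategy that honors a server-provided
# Retry-After header when present and otherwise defers to the default exponential backoff by
# returning None. The class name and header handling below are illustrative assumptions, not part
# of the CDK itself.
from typing import Optional, Union

import requests

from airbyte_cdk.sources.streams.http.error_handlers import BackoffStrategy


class RetryAfterBackoffStrategy(BackoffStrategy):  # hypothetical example
    def backoff_time(
        self,
        response_or_exception: Optional[Union[requests.Response, requests.RequestException]],
        attempt_count: int,
    ) -> Optional[float]:
        if isinstance(response_or_exception, requests.Response):
            retry_after = response_or_exception.headers.get("Retry-After")
            if retry_after is not None:
                try:
                    # Honor the server-provided wait time, in seconds.
                    return float(retry_after)
                except ValueError:
                    pass
        # Returning None defers to the default (exponential) backoff behavior.
        return None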
+ + +from typing import Optional, Union + +import requests + +from .backoff_strategy import BackoffStrategy + + +class DefaultBackoffStrategy(BackoffStrategy): + def backoff_time( + self, + response_or_exception: Optional[Union[requests.Response, requests.RequestException]], + attempt_count: int, + ) -> Optional[float]: + return None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py new file mode 100644 index 000000000000..546a910f57c5 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py @@ -0,0 +1,82 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +from typing import Mapping, Type, Union + +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ErrorResolution, ResponseAction +from requests.exceptions import InvalidSchema, InvalidURL, RequestException + +DEFAULT_ERROR_MAPPING: Mapping[Union[int, str, Type[Exception]], ErrorResolution] = { + InvalidSchema: ErrorResolution( + response_action=ResponseAction.FAIL, + failure_type=FailureType.config_error, + error_message="Invalid Protocol Schema: The endpoint that data is being requested from is using an invalid or insecure. Exception: requests.exceptions.InvalidSchema", + ), + InvalidURL: ErrorResolution( + response_action=ResponseAction.FAIL, + failure_type=FailureType.config_error, + error_message="Invalid URL specified: The endpoint that data is being requested from is not a valid URL. Exception: requests.exceptions.InvalidURL", + ), + RequestException: ErrorResolution( + response_action=ResponseAction.RETRY, + failure_type=FailureType.transient_error, + error_message="An exception occurred when making the request. Exception: requests.exceptions.RequestException", + ), + 400: ErrorResolution( + response_action=ResponseAction.FAIL, + failure_type=FailureType.system_error, + error_message="Bad request. Please check your request parameters.", + ), + 401: ErrorResolution( + response_action=ResponseAction.FAIL, + failure_type=FailureType.config_error, + error_message="Unauthorized. Please ensure you are authenticated correctly.", + ), + 403: ErrorResolution( + response_action=ResponseAction.FAIL, + failure_type=FailureType.config_error, + error_message="Forbidden. You don't have permission to access this resource.", + ), + 404: ErrorResolution( + response_action=ResponseAction.FAIL, + failure_type=FailureType.system_error, + error_message="Not found. The requested resource was not found on the server.", + ), + 405: ErrorResolution( + response_action=ResponseAction.FAIL, + failure_type=FailureType.system_error, + error_message="Method not allowed. 
Please check your request method.", + ), + 408: ErrorResolution( + response_action=ResponseAction.RETRY, + failure_type=FailureType.transient_error, + error_message="Request timeout.", + ), + 429: ErrorResolution( + response_action=ResponseAction.RATE_LIMITED, + failure_type=FailureType.transient_error, + error_message="Too many requests.", + ), + 500: ErrorResolution( + response_action=ResponseAction.RETRY, + failure_type=FailureType.transient_error, + error_message="Internal server error.", + ), + 502: ErrorResolution( + response_action=ResponseAction.RETRY, + failure_type=FailureType.transient_error, + error_message="Bad gateway.", + ), + 503: ErrorResolution( + response_action=ResponseAction.RETRY, + failure_type=FailureType.transient_error, + error_message="Service unavailable.", + ), + 504: ErrorResolution( + response_action=ResponseAction.RETRY, + failure_type=FailureType.transient_error, + error_message="Gateway timeout.", + ), +} diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/error_handler.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/error_handler.py new file mode 100644 index 000000000000..f1789cc6fb45 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/error_handler.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from abc import ABC, abstractmethod +from typing import Optional, Union + +import requests + +from .response_models import ErrorResolution + + +class ErrorHandler(ABC): + """ + Abstract base class to determine how to handle a failed HTTP request. + """ + + @property + @abstractmethod + def max_retries(self) -> Optional[int]: + """ + The maximum number of retries to attempt before giving up. + """ + pass + + @property + @abstractmethod + def max_time(self) -> Optional[int]: + """ + The maximum amount of time in seconds to retry before giving up. + """ + pass + + @abstractmethod + def interpret_response(self, response: Optional[Union[requests.Response, Exception]]) -> ErrorResolution: + """ + Interpret the response or exception and return the corresponding response action, failure type, and error message. + + :param response: The HTTP response object or exception raised during the request. + :return: A tuple containing the response action, failure type, and error message. + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py new file mode 100644 index 000000000000..966fe93a12bd --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py @@ -0,0 +1,19 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from typing import Optional + +import requests + + +class ErrorMessageParser(ABC): + @abstractmethod + def parse_response_error_message(self, response: requests.Response) -> Optional[str]: + """ + Parse error message from response. 
+ :param response: response received for the request + :return: error message + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py new file mode 100644 index 000000000000..69adab30d1f6 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py @@ -0,0 +1,98 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +import logging +from datetime import timedelta +from typing import Mapping, Optional, Union + +import requests +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.streams.http.error_handlers.default_error_mapping import DEFAULT_ERROR_MAPPING +from airbyte_cdk.sources.streams.http.error_handlers.error_handler import ErrorHandler +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ErrorResolution, ResponseAction + + +class HttpStatusErrorHandler(ErrorHandler): + def __init__( + self, + logger: logging.Logger, + error_mapping: Optional[Mapping[Union[int, str, type[Exception]], ErrorResolution]] = None, + max_retries: int = 5, + max_time: timedelta = timedelta(seconds=600), + ) -> None: + """ + Initialize the HttpStatusErrorHandler. + + :param error_mapping: Custom error mappings to extend or override the default mappings. + """ + self._logger = logger + self._error_mapping = error_mapping or DEFAULT_ERROR_MAPPING + self._max_retries = max_retries + self._max_time = int(max_time.total_seconds()) + + @property + def max_retries(self) -> Optional[int]: + return self._max_retries + + @property + def max_time(self) -> Optional[int]: + return self._max_time + + def interpret_response(self, response_or_exception: Optional[Union[requests.Response, Exception]] = None) -> ErrorResolution: + """ + Interpret the response and return the corresponding response action, failure type, and error message. + + :param response: The HTTP response object. + :return: A tuple containing the response action, failure type, and error message. 
+ """ + + if isinstance(response_or_exception, Exception): + mapped_error: Optional[ErrorResolution] = self._error_mapping.get(response_or_exception.__class__) + + if mapped_error is not None: + return mapped_error + else: + self._logger.error(f"Unexpected exception in error handler: {response_or_exception}") + return ErrorResolution( + response_action=ResponseAction.RETRY, + failure_type=FailureType.system_error, + error_message=f"Unexpected exception in error handler: {response_or_exception}", + ) + + elif isinstance(response_or_exception, requests.Response): + if response_or_exception.status_code is None: + self._logger.error("Response does not include an HTTP status code.") + return ErrorResolution( + response_action=ResponseAction.RETRY, + failure_type=FailureType.transient_error, + error_message="Response does not include an HTTP status code.", + ) + + if response_or_exception.ok: + return ErrorResolution( + response_action=ResponseAction.SUCCESS, + failure_type=None, + error_message=None, + ) + + error_key = response_or_exception.status_code + + mapped_error = self._error_mapping.get(error_key) + + if mapped_error is not None: + return mapped_error + else: + self._logger.warning(f"Unexpected HTTP Status Code in error handler: '{error_key}'") + return ErrorResolution( + response_action=ResponseAction.RETRY, + failure_type=FailureType.system_error, + error_message=f"Unexpected HTTP Status Code in error handler: {error_key}", + ) + else: + self._logger.error(f"Received unexpected response type: {type(response_or_exception)}") + return ErrorResolution( + response_action=ResponseAction.FAIL, + failure_type=FailureType.system_error, + error_message=f"Received unexpected response type: {type(response_or_exception)}", + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py new file mode 100644 index 000000000000..3ca31ec57131 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py @@ -0,0 +1,51 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Optional + +import requests +from airbyte_cdk.sources.streams.http.error_handlers import ErrorMessageParser +from airbyte_cdk.sources.utils.types import JsonType + + +class JsonErrorMessageParser(ErrorMessageParser): + def _try_get_error(self, value: Optional[JsonType]) -> Optional[str]: + if isinstance(value, str): + return value + elif isinstance(value, list): + errors_in_value = [self._try_get_error(v) for v in value] + return ", ".join(v for v in errors_in_value if v is not None) + elif isinstance(value, dict): + new_value = ( + value.get("message") + or value.get("messages") + or value.get("error") + or value.get("errors") + or value.get("failures") + or value.get("failure") + or value.get("detail") + or value.get("err") + or value.get("error_message") + or value.get("msg") + or value.get("reason") + or value.get("status_message") + ) + return self._try_get_error(new_value) + return None + + def parse_response_error_message(self, response: requests.Response) -> Optional[str]: + """ + Parses the raw response object from a failed request into a user-friendly error message. 
+ + :param response: + :return: A user-friendly message that indicates the cause of the error + """ + try: + body = response.json() + return self._try_get_error(body) + except requests.exceptions.JSONDecodeError: + try: + return response.content.decode("utf-8") + except Exception: + return None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/response_models.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/response_models.py new file mode 100644 index 000000000000..21e20049a6c1 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/response_models.py @@ -0,0 +1,58 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from dataclasses import dataclass +from enum import Enum +from typing import Optional, Union + +import requests +from airbyte_cdk.models import FailureType +from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets +from requests import HTTPError + + +class ResponseAction(Enum): + SUCCESS = "SUCCESS" + RETRY = "RETRY" + FAIL = "FAIL" + IGNORE = "IGNORE" + RATE_LIMITED = "RATE_LIMITED" + + +@dataclass +class ErrorResolution: + response_action: Optional[ResponseAction] = None + failure_type: Optional[FailureType] = None + error_message: Optional[str] = None + + +def _format_exception_error_message(exception: Exception) -> str: + return f"{type(exception).__name__}: {str(exception)}" + + +def _format_response_error_message(response: requests.Response) -> str: + try: + response.raise_for_status() + except HTTPError as exception: + return filter_secrets(f"Response was not ok: `{str(exception)}`. Response content is: {response.text}") + # We purposefully do not add the response.content because the response is "ok" so there might be sensitive information in the payload. + # Feel free the + return f"Unexpected response with HTTP status {response.status_code}" + + +def create_fallback_error_resolution(response_or_exception: Optional[Union[requests.Response, Exception]]) -> ErrorResolution: + if response_or_exception is None: + # We do not expect this case to happen but if it does, it would be good to understand the cause and improve the error message + error_message = "Error handler did not receive a valid response or exception. This is unexpected please contact Airbyte Support" + elif isinstance(response_or_exception, Exception): + error_message = _format_exception_error_message(response_or_exception) + else: + error_message = _format_response_error_message(response_or_exception) + + return ErrorResolution( + response_action=ResponseAction.RETRY, + failure_type=FailureType.system_error, + error_message=error_message, + ) + + +SUCCESS_RESOLUTION = ErrorResolution(response_action=ResponseAction.SUCCESS, failure_type=None, error_message=None) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/exceptions.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/exceptions.py new file mode 100644 index 000000000000..efa44165f8c2 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/exceptions.py @@ -0,0 +1,61 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
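# Example sketch: how a connector might extend the default error mapping above so that 403
# responses are ignored instead of failing the sync, then hand the mapping to an
# HttpStatusErrorHandler. The 403 override is an illustrative assumption; real connectors should
# map status codes to whatever behavior their API requires.
import logging

from airbyte_cdk.models import FailureType
from airbyte_cdk.sources.streams.http.error_handlers import ErrorResolution, HttpStatusErrorHandler, ResponseAction
from airbyte_cdk.sources.streams.http.error_handlers.default_error_mapping import DEFAULT_ERROR_MAPPING

# Start from the CDK defaults and override only the codes that need different handling.
custom_error_mapping = {
    **DEFAULT_ERROR_MAPPING,
    403: ErrorResolution(
        response_action=ResponseAction.IGNORE,
        failure_type=FailureType.config_error,
        error_message="Forbidden. Skipping this resource and continuing the sync.",
    ),
}

error_handler = HttpStatusErrorHandler(logger=logging.getLogger("airbyte"), error_mapping=custom_error_mapping)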
+# + + +from typing import Optional, Union + +import requests + + +class BaseBackoffException(requests.exceptions.HTTPError): + def __init__( + self, + request: requests.PreparedRequest, + response: Optional[Union[requests.Response, Exception]], + error_message: str = "", + ): + + if isinstance(response, requests.Response): + error_message = ( + error_message or f"Request URL: {request.url}, Response Code: {response.status_code}, Response Text: {response.text}" + ) + super().__init__(error_message, request=request, response=response) + else: + error_message = error_message or f"Request URL: {request.url}, Exception: {response}" + super().__init__(error_message, request=request, response=None) + + +class RequestBodyException(Exception): + """ + Raised when there are issues in configuring a request body + """ + + +class UserDefinedBackoffException(BaseBackoffException): + """ + An exception that exposes how long it attempted to backoff + """ + + def __init__( + self, + backoff: Union[int, float], + request: requests.PreparedRequest, + response: Optional[Union[requests.Response, Exception]], + error_message: str = "", + ): + """ + :param backoff: how long to backoff in seconds + :param request: the request that triggered this backoff exception + :param response: the response that triggered the backoff exception + """ + self.backoff = backoff + super().__init__(request=request, response=response, error_message=error_message) + + +class DefaultBackoffException(BaseBackoffException): + pass + + +class RateLimitBackoffException(BaseBackoffException): + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/http.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/http.py new file mode 100644 index 000000000000..6c552ddab5e6 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/http.py @@ -0,0 +1,573 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
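# Example sketch: the smallest useful HttpStream implementation. The endpoint, response shape,
# and pagination scheme below (a "next_cursor" field) are hypothetical and only illustrate which
# members a concrete stream must provide.
from typing import Any, Iterable, Mapping, Optional

import requests

from airbyte_cdk.sources.streams.http import HttpStream


class ExampleEmployeesStream(HttpStream):  # hypothetical stream
    url_base = "https://api.example.com/v1/"
    primary_key = "id"

    def path(self, **kwargs: Any) -> str:
        return "employees"

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        # Assume the API reports the next page via a "next_cursor" field; None ends pagination.
        next_cursor = response.json().get("next_cursor")
        return {"cursor": next_cursor} if next_cursor else None

    def request_params(
        self,
        stream_state: Optional[Mapping[str, Any]],
        stream_slice: Optional[Mapping[str, Any]] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        # Pass the pagination cursor straight through as a query parameter.
        return dict(next_page_token or {})

    def parse_response(self, response: requests.Response, **kwargs: Any) -> Iterable[Mapping[str, Any]]:
        yield from response.json().get("employees", [])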
+# + +import logging +from abc import ABC, abstractmethod +from datetime import timedelta +from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union +from urllib.parse import urljoin + +import requests +from airbyte_cdk.models import AirbyteMessage, FailureType, SyncMode +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.message.repository import InMemoryMessageRepository +from airbyte_cdk.sources.streams.call_rate import APIBudget +from airbyte_cdk.sources.streams.checkpoint.cursor import Cursor +from airbyte_cdk.sources.streams.checkpoint.resumable_full_refresh_cursor import ResumableFullRefreshCursor +from airbyte_cdk.sources.streams.checkpoint.substream_resumable_full_refresh_cursor import SubstreamResumableFullRefreshCursor +from airbyte_cdk.sources.streams.core import CheckpointMixin, Stream, StreamData +from airbyte_cdk.sources.streams.http.error_handlers import BackoffStrategy, ErrorHandler, HttpStatusErrorHandler +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ErrorResolution, ResponseAction +from airbyte_cdk.sources.streams.http.http_client import HttpClient +from airbyte_cdk.sources.types import Record, StreamSlice +from airbyte_cdk.sources.utils.types import JsonType +from deprecated import deprecated +from requests.auth import AuthBase + +# list of all possible HTTP methods which can be used for sending of request bodies +BODY_REQUEST_METHODS = ("GET", "POST", "PUT", "PATCH") + + +class HttpStream(Stream, CheckpointMixin, ABC): + """ + Base abstract class for an Airbyte Stream using the HTTP protocol. Basic building block for users building an Airbyte source for a HTTP API. + """ + + source_defined_cursor = True # Most HTTP streams use a source defined cursor (i.e: the user can't configure it like on a SQL table) + page_size: Optional[int] = None # Use this variable to define page size for API http requests with pagination support + + def __init__(self, authenticator: Optional[AuthBase] = None, api_budget: Optional[APIBudget] = None): + self._exit_on_rate_limit: bool = False + self._http_client = HttpClient( + name=self.name, + logger=self.logger, + error_handler=self.get_error_handler(), + api_budget=api_budget or APIBudget(policies=[]), + authenticator=authenticator, + use_cache=self.use_cache, + backoff_strategy=self.get_backoff_strategy(), + message_repository=InMemoryMessageRepository(), + ) + + # There are three conditions that dictate if RFR should automatically be applied to a stream + # 1. Streams that explicitly initialize their own cursor should defer to it and not automatically apply RFR + # 2. Streams with at least one cursor_field are incremental and thus a superior sync to RFR. + # 3. Streams overriding read_records() do not guarantee that they will call the parent implementation which can perform + # per-page checkpointing so RFR is only supported if a stream use the default `HttpStream.read_records()` method + if not self.cursor and len(self.cursor_field) == 0 and type(self).read_records is HttpStream.read_records: + self.cursor = ResumableFullRefreshCursor() + + @property + def exit_on_rate_limit(self) -> bool: + """ + :return: False if the stream will retry endlessly when rate limited + """ + return self._exit_on_rate_limit + + @exit_on_rate_limit.setter + def exit_on_rate_limit(self, value: bool) -> None: + self._exit_on_rate_limit = value + + @property + def cache_filename(self) -> str: + """ + Override if needed. 
Return the name of cache file + Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only. + """ + return f"{self.name}.sqlite" + + @property + def use_cache(self) -> bool: + """ + Override if needed. If True, all records will be cached. + Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only. + """ + return False + + @property + @abstractmethod + def url_base(self) -> str: + """ + :return: URL base for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/" + """ + + @property + def http_method(self) -> str: + """ + Override if needed. See get_request_data/get_request_json if using POST/PUT/PATCH. + """ + return "GET" + + @property + @deprecated(version="3.0.0", reason="You should set error_handler explicitly in HttpStream.get_error_handler() instead.") + def raise_on_http_errors(self) -> bool: + """ + Override if needed. If set to False, allows opting-out of raising HTTP code exception. + """ + return True + + @property + @deprecated(version="3.0.0", reason="You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead.") + def max_retries(self) -> Union[int, None]: + """ + Override if needed. Specifies maximum amount of retries for backoff policy. Return None for no limit. + """ + return 5 + + @property + @deprecated(version="3.0.0", reason="You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead.") + def max_time(self) -> Union[int, None]: + """ + Override if needed. Specifies maximum total waiting time (in seconds) for backoff policy. Return None for no limit. + """ + return 60 * 10 + + @property + @deprecated(version="3.0.0", reason="You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead.") + def retry_factor(self) -> float: + """ + Override if needed. Specifies factor for backoff policy. + """ + return 5 + + @abstractmethod + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + """ + Override this method to define a pagination strategy. + + The value returned from this method is passed to most other methods in this class. Use it to form a request e.g: set headers or query params. + + :return: The token for the next page from the input response object. Returning None means there are no more pages to read in this response. + """ + + @abstractmethod + def path( + self, + *, + stream_state: Optional[Mapping[str, Any]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> str: + """ + Returns the URL path for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity" + """ + + def request_params( + self, + stream_state: Optional[Mapping[str, Any]], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + """ + Override this method to define the query parameters that should be set on an outgoing HTTP request given the inputs. + + E.g: you might want to define query parameters for paging if next_page_token is not None. + """ + return {} + + def request_headers( + self, + stream_state: Optional[Mapping[str, Any]], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """ + Override to return any non-auth headers. 
Authentication headers will overwrite any overlapping headers returned from this method. + """ + return {} + + def request_body_data( + self, + stream_state: Optional[Mapping[str, Any]], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Union[Mapping[str, Any], str]]: + """ + Override when creating POST/PUT/PATCH requests to populate the body of the request with a non-JSON payload. + + If returns a ready text that it will be sent as is. + If returns a dict that it will be converted to a urlencoded form. + E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2" + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. + """ + return None + + def request_body_json( + self, + stream_state: Optional[Mapping[str, Any]], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Mapping[str, Any]]: + """ + Override when creating POST/PUT/PATCH requests to populate the body of the request with a JSON payload. + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. + """ + return None + + def request_kwargs( + self, + stream_state: Optional[Mapping[str, Any]], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """ + Override to return a mapping of keyword arguments to be used when creating the HTTP request. + Any option listed in https://docs.python-requests.org/en/latest/api/#requests.adapters.BaseAdapter.send for can be returned from + this method. Note that these options do not conflict with request-level options such as headers, request params, etc.. + """ + return {} + + @abstractmethod + def parse_response( + self, + response: requests.Response, + *, + stream_state: Mapping[str, Any], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any]]: + """ + Parses the raw response object into a list of records. + By default, this returns an iterable containing the input. Override to parse differently. + :param response: + :param stream_state: + :param stream_slice: + :param next_page_token: + :return: An iterable containing the parsed response + """ + + def get_backoff_strategy(self) -> Optional[Union[BackoffStrategy, List[BackoffStrategy]]]: + """ + Used to initialize Adapter to avoid breaking changes. + If Stream has a `backoff_time` method implementation, we know this stream uses old (pre-HTTPClient) backoff handlers and thus an adapter is needed. + + Override to provide custom BackoffStrategy + :return Optional[BackoffStrategy]: + """ + if hasattr(self, "backoff_time"): + return HttpStreamAdapterBackoffStrategy(self) + else: + return None + + def get_error_handler(self) -> Optional[ErrorHandler]: + """ + Used to initialize Adapter to avoid breaking changes. + If Stream has a `should_retry` method implementation, we know this stream uses old (pre-HTTPClient) error handlers and thus an adapter is needed. 
+ + Override to provide custom ErrorHandler + :return Optional[ErrorHandler]: + """ + if hasattr(self, "should_retry"): + error_handler = HttpStreamAdapterHttpStatusErrorHandler( + stream=self, logger=logging.getLogger(), max_retries=self.max_retries, max_time=timedelta(seconds=self.max_time or 0) + ) + return error_handler + else: + return None + + @classmethod + def _join_url(cls, url_base: str, path: str) -> str: + return urljoin(url_base, path) + + @classmethod + def parse_response_error_message(cls, response: requests.Response) -> Optional[str]: + """ + Parses the raw response object from a failed request into a user-friendly error message. + By default, this method tries to grab the error message from JSON responses by following common API patterns. Override to parse differently. + + :param response: + :return: A user-friendly message that indicates the cause of the error + """ + + # default logic to grab error from common fields + def _try_get_error(value: Optional[JsonType]) -> Optional[str]: + if isinstance(value, str): + return value + elif isinstance(value, list): + errors_in_value = [_try_get_error(v) for v in value] + return ", ".join(v for v in errors_in_value if v is not None) + elif isinstance(value, dict): + new_value = ( + value.get("message") + or value.get("messages") + or value.get("error") + or value.get("errors") + or value.get("failures") + or value.get("failure") + or value.get("detail") + ) + return _try_get_error(new_value) + return None + + try: + body = response.json() + return _try_get_error(body) + except requests.exceptions.JSONDecodeError: + return None + + def get_error_display_message(self, exception: BaseException) -> Optional[str]: + """ + Retrieves the user-friendly display message that corresponds to an exception. + This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage. + + The default implementation of this method only handles HTTPErrors by passing the response to self.parse_response_error_message(). + The method should be overriden as needed to handle any additional exception types. 
+ + :param exception: The exception that was raised + :return: A user-friendly message that indicates the cause of the error + """ + if isinstance(exception, requests.HTTPError) and exception.response is not None: + return self.parse_response_error_message(exception.response) + return None + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + # A cursor_field indicates this is an incremental stream which offers better checkpointing than RFR enabled via the cursor + if self.cursor_field or not isinstance(self.get_cursor(), ResumableFullRefreshCursor): + yield from self._read_pages( + lambda req, res, state, _slice: self.parse_response(res, stream_slice=_slice, stream_state=state), + stream_slice, + stream_state, + ) + else: + yield from self._read_single_page( + lambda req, res, state, _slice: self.parse_response(res, stream_slice=_slice, stream_state=state), + stream_slice, + stream_state, + ) + + @property + def state(self) -> MutableMapping[str, Any]: + cursor = self.get_cursor() + if cursor: + return cursor.get_stream_state() # type: ignore + return self._state + + @state.setter + def state(self, value: MutableMapping[str, Any]) -> None: + cursor = self.get_cursor() + if cursor: + cursor.set_initial_state(value) + self._state = value + + def get_cursor(self) -> Optional[Cursor]: + # I don't love that this is semi-stateful but not sure what else to do. We don't know exactly what type of cursor to + # instantiate when creating the class. We can make a few assumptions like if there is a cursor_field which implies + # incremental, but we don't know until runtime if this is a substream. 
Ideally, a stream should explicitly define + # its cursor, but because we're trying to automatically apply RFR we're stuck with this logic where we replace the + # cursor at runtime once we detect this is a substream based on self.has_multiple_slices being reassigned + if self.has_multiple_slices and isinstance(self.cursor, ResumableFullRefreshCursor): + self.cursor = SubstreamResumableFullRefreshCursor() + return self.cursor + else: + return self.cursor + + def _read_pages( + self, + records_generator_fn: Callable[ + [requests.PreparedRequest, requests.Response, Mapping[str, Any], Optional[Mapping[str, Any]]], Iterable[StreamData] + ], + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + partition, _, _ = self._extract_slice_fields(stream_slice=stream_slice) + + stream_state = stream_state or {} + pagination_complete = False + next_page_token = None + while not pagination_complete: + request, response = self._fetch_next_page(stream_slice, stream_state, next_page_token) + yield from records_generator_fn(request, response, stream_state, stream_slice) + + next_page_token = self.next_page_token(response) + if not next_page_token: + pagination_complete = True + + cursor = self.get_cursor() + if cursor and isinstance(cursor, SubstreamResumableFullRefreshCursor): + # Substreams checkpoint state by marking an entire parent partition as completed so that on the subsequent attempt + # after a failure, completed parents are skipped and the sync can make progress + cursor.close_slice(StreamSlice(cursor_slice={}, partition=partition)) + + # Always return an empty generator just in case no records were ever yielded + yield from [] + + def _read_single_page( + self, + records_generator_fn: Callable[ + [requests.PreparedRequest, requests.Response, Mapping[str, Any], Optional[Mapping[str, Any]]], Iterable[StreamData] + ], + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + partition, cursor_slice, remaining_slice = self._extract_slice_fields(stream_slice=stream_slice) + stream_state = stream_state or {} + next_page_token = cursor_slice or None + + request, response = self._fetch_next_page(remaining_slice, stream_state, next_page_token) + yield from records_generator_fn(request, response, stream_state, remaining_slice) + + next_page_token = self.next_page_token(response) or {"__ab_full_refresh_sync_complete": True} + + cursor = self.get_cursor() + if cursor: + cursor.close_slice(StreamSlice(cursor_slice=next_page_token, partition=partition)) + + # Always return an empty generator just in case no records were ever yielded + yield from [] + + @staticmethod + def _extract_slice_fields(stream_slice: Optional[Mapping[str, Any]]) -> tuple[Mapping[str, Any], Mapping[str, Any], Mapping[str, Any]]: + if not stream_slice: + return {}, {}, {} + + if isinstance(stream_slice, StreamSlice): + partition = stream_slice.partition + cursor_slice = stream_slice.cursor_slice + remaining = {k: v for k, v in stream_slice.items()} + else: + # RFR streams that implement stream_slices() to generate stream slices in the legacy mapping format are converted into a + # structured stream slice mapping by the LegacyCursorBasedCheckpointReader. 
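# (e.g., hypothetical values: a legacy slice such as {"parent_id": "123"} arrives here with its
# values wrapped under the "partition" key and any checkpointed page token under "cursor_slice".)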
The structured mapping object has separate + # fields for the partition and cursor_slice value + partition = stream_slice.get("partition", {}) + cursor_slice = stream_slice.get("cursor_slice", {}) + remaining = {key: val for key, val in stream_slice.items() if key != "partition" and key != "cursor_slice"} + return partition, cursor_slice, remaining + + def _fetch_next_page( + self, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Tuple[requests.PreparedRequest, requests.Response]: + + request, response = self._http_client.send_request( + http_method=self.http_method, + url=self._join_url( + self.url_base, + self.path(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + ), + request_kwargs=self.request_kwargs(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + headers=self.request_headers(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + params=self.request_params(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + json=self.request_body_json(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + data=self.request_body_data(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + dedupe_query_params=True, + log_formatter=self.get_log_formatter(), + exit_on_rate_limit=self.exit_on_rate_limit, + ) + + return request, response + + def get_log_formatter(self) -> Optional[Callable[[requests.Response], Any]]: + """ + + :return Optional[Callable[[requests.Response], Any]]: Function that will be used in logging inside HttpClient + """ + return None + + +class HttpSubStream(HttpStream, ABC): + def __init__(self, parent: HttpStream, **kwargs: Any): + """ + :param parent: should be the instance of HttpStream class + """ + super().__init__(**kwargs) + self.parent = parent + self.has_multiple_slices = True # Substreams are based on parent records which implies there are multiple slices + + # There are three conditions that dictate if RFR should automatically be applied to a stream + # 1. Streams that explicitly initialize their own cursor should defer to it and not automatically apply RFR + # 2. Streams with at least one cursor_field are incremental and thus a superior sync to RFR. + # 3. Streams overriding read_records() do not guarantee that they will call the parent implementation which can perform + # per-page checkpointing so RFR is only supported if a stream use the default `HttpStream.read_records()` method + if not self.cursor and len(self.cursor_field) == 0 and type(self).read_records is HttpStream.read_records: + self.cursor = SubstreamResumableFullRefreshCursor() + + def stream_slices( + self, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + # read_stateless() assumes the parent is not concurrent. 
This is currently okay since the concurrent CDK does + # not support either substreams or RFR, but something that needs to be considered once we do + for parent_record in self.parent.read_only_records(stream_state): + # Skip non-records (eg AirbyteLogMessage) + if isinstance(parent_record, AirbyteMessage): + if parent_record.type == MessageType.RECORD: + parent_record = parent_record.record.data + else: + continue + elif isinstance(parent_record, Record): + parent_record = parent_record.data + yield {"parent": parent_record} + + +@deprecated(version="3.0.0", reason="You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead.") +class HttpStreamAdapterBackoffStrategy(BackoffStrategy): + def __init__(self, stream: HttpStream): + self.stream = stream + + def backoff_time( + self, + response_or_exception: Optional[Union[requests.Response, requests.RequestException]], + attempt_count: int, + ) -> Optional[float]: + return self.stream.backoff_time(response_or_exception) # type: ignore # noqa # HttpStream.backoff_time has been deprecated + + +@deprecated(version="3.0.0", reason="You should set error_handler explicitly in HttpStream.get_error_handler() instead.") +class HttpStreamAdapterHttpStatusErrorHandler(HttpStatusErrorHandler): + def __init__(self, stream: HttpStream, **kwargs): # type: ignore # noqa + self.stream = stream + super().__init__(**kwargs) + + def interpret_response(self, response_or_exception: Optional[Union[requests.Response, Exception]] = None) -> ErrorResolution: + if isinstance(response_or_exception, Exception): + return super().interpret_response(response_or_exception) + elif isinstance(response_or_exception, requests.Response): + should_retry = self.stream.should_retry(response_or_exception) # type: ignore # noqa + if should_retry: + if response_or_exception.status_code == 429: + return ErrorResolution( + response_action=ResponseAction.RATE_LIMITED, + failure_type=FailureType.transient_error, + error_message=f"Response status code: {response_or_exception.status_code}. Retrying...", # type: ignore[union-attr] + ) + return ErrorResolution( + response_action=ResponseAction.RETRY, + failure_type=FailureType.transient_error, + error_message=f"Response status code: {response_or_exception.status_code}. Retrying...", # type: ignore[union-attr] + ) + else: + if response_or_exception.ok: # type: ignore # noqa + return ErrorResolution( + response_action=ResponseAction.SUCCESS, + failure_type=None, + error_message=None, + ) + if self.stream.raise_on_http_errors: + return ErrorResolution( + response_action=ResponseAction.FAIL, + failure_type=FailureType.transient_error, + error_message=f"Response status code: {response_or_exception.status_code}. Unexpected error. Failed.", # type: ignore[union-attr] + ) + else: + return ErrorResolution( + response_action=ResponseAction.IGNORE, + failure_type=FailureType.transient_error, + error_message=f"Response status code: {response_or_exception.status_code}. 
Ignoring...", # type: ignore[union-attr] + ) + else: + self._logger.error(f"Received unexpected response type: {type(response_or_exception)}") + return ErrorResolution( + response_action=ResponseAction.FAIL, + failure_type=FailureType.system_error, + error_message=f"Received unexpected response type: {type(response_or_exception)}", + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/http_client.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/http_client.py new file mode 100644 index 000000000000..cccbc4b8c01c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/http_client.py @@ -0,0 +1,410 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +import os +import urllib +from pathlib import Path +from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union + +import orjson +import requests +import requests_cache +from airbyte_cdk.models import ( + AirbyteMessageSerializer, + AirbyteStreamStatus, + AirbyteStreamStatusReason, + AirbyteStreamStatusReasonType, + Level, + StreamDescriptor, +) +from airbyte_cdk.sources.http_config import MAX_CONNECTION_POOL_SIZE +from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.streams.call_rate import APIBudget, CachedLimiterSession, LimiterSession +from airbyte_cdk.sources.streams.http.error_handlers import ( + BackoffStrategy, + DefaultBackoffStrategy, + ErrorHandler, + ErrorMessageParser, + ErrorResolution, + HttpStatusErrorHandler, + JsonErrorMessageParser, + ResponseAction, +) +from airbyte_cdk.sources.streams.http.exceptions import ( + DefaultBackoffException, + RateLimitBackoffException, + RequestBodyException, + UserDefinedBackoffException, +) +from airbyte_cdk.sources.streams.http.rate_limiting import ( + http_client_default_backoff_handler, + rate_limit_default_backoff_handler, + user_defined_backoff_handler, +) +from airbyte_cdk.utils.constants import ENV_REQUEST_CACHE_PATH +from airbyte_cdk.utils.stream_status_utils import as_airbyte_message as stream_status_as_airbyte_message +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from requests.auth import AuthBase + +BODY_REQUEST_METHODS = ("GET", "POST", "PUT", "PATCH") + + +class MessageRepresentationAirbyteTracedErrors(AirbyteTracedException): + """ + Before the migration to the HttpClient in low-code, the exception raised was + [ReadException](https://github.com/airbytehq/airbyte/blob/8fdd9818ec16e653ba3dd2b167a74b7c07459861/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/http_requester.py#L566). + This has been moved to a AirbyteTracedException. The printing on this is questionable (AirbyteTracedException string representation + shows the internal_message and not the message). We have already discussed moving the AirbyteTracedException string representation to + `message` but the impact is unclear and hard to quantify so we will do it here only for now. 
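As a rough illustration of the string representation described above (all values here are hypothetical), the exception surfaces `message` and only falls back to `internal_message` when no user-facing message is set:

```python
from airbyte_cdk.sources.streams.http.http_client import MessageRepresentationAirbyteTracedErrors

error = MessageRepresentationAirbyteTracedErrors(
    internal_message="'GET' request to 'https://api.example.com/items' failed with status code '403'",
    message="Access to the API was denied. Check your credentials.",
)
# The __str__ override below prefers the user-facing message over the internal one.
print(error)  # Access to the API was denied. Check your credentials.
```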
+ """ + + def __str__(self) -> str: + if self.message: + return self.message + elif self.internal_message: + return self.internal_message + return "" + + +class HttpClient: + + _DEFAULT_MAX_RETRY: int = 5 + _DEFAULT_MAX_TIME: int = 60 * 10 + + def __init__( + self, + name: str, + logger: logging.Logger, + error_handler: Optional[ErrorHandler] = None, + api_budget: Optional[APIBudget] = None, + session: Optional[Union[requests.Session, requests_cache.CachedSession]] = None, + authenticator: Optional[AuthBase] = None, + use_cache: bool = False, + backoff_strategy: Optional[Union[BackoffStrategy, List[BackoffStrategy]]] = None, + error_message_parser: Optional[ErrorMessageParser] = None, + disable_retries: bool = False, + message_repository: Optional[MessageRepository] = None, + ): + self._name = name + self._api_budget: APIBudget = api_budget or APIBudget(policies=[]) + if session: + self._session = session + else: + self._use_cache = use_cache + self._session = self._request_session() + self._session.mount( + "https://", requests.adapters.HTTPAdapter(pool_connections=MAX_CONNECTION_POOL_SIZE, pool_maxsize=MAX_CONNECTION_POOL_SIZE) + ) + if isinstance(authenticator, AuthBase): + self._session.auth = authenticator + self._logger = logger + self._error_handler = error_handler or HttpStatusErrorHandler(self._logger) + if backoff_strategy is not None: + if isinstance(backoff_strategy, list): + self._backoff_strategies = backoff_strategy + else: + self._backoff_strategies = [backoff_strategy] + else: + self._backoff_strategies = [DefaultBackoffStrategy()] + self._error_message_parser = error_message_parser or JsonErrorMessageParser() + self._request_attempt_count: Dict[requests.PreparedRequest, int] = {} + self._disable_retries = disable_retries + self._message_repository = message_repository + + @property + def cache_filename(self) -> str: + """ + Override if needed. Return the name of cache file + Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only. + """ + return f"{self._name}.sqlite" + + def _request_session(self) -> requests.Session: + """ + Session factory based on use_cache property and call rate limits (api_budget parameter) + :return: instance of request-based session + """ + if self._use_cache: + cache_dir = os.getenv(ENV_REQUEST_CACHE_PATH) + # Use in-memory cache if cache_dir is not set + # This is a non-obvious interface, but it ensures we don't write sql files when running unit tests + if cache_dir: + sqlite_path = str(Path(cache_dir) / self.cache_filename) + else: + sqlite_path = "file::memory:?cache=shared" + return CachedLimiterSession(sqlite_path, backend="sqlite", api_budget=self._api_budget, match_headers=True) # type: ignore # there are no typeshed stubs for requests_cache + else: + return LimiterSession(api_budget=self._api_budget) + + def clear_cache(self) -> None: + """ + Clear cached requests for current session, can be called any time + """ + if isinstance(self._session, requests_cache.CachedSession): + self._session.cache.clear() # type: ignore # cache.clear is not typed + + def _dedupe_query_params(self, url: str, params: Optional[Mapping[str, str]]) -> Mapping[str, str]: + """ + Remove query parameters from params mapping if they are already encoded in the URL. 
+ :param url: URL with + :param params: + :return: + """ + if params is None: + params = {} + query_string = urllib.parse.urlparse(url).query + query_dict = {k: v[0] for k, v in urllib.parse.parse_qs(query_string).items()} + + duplicate_keys_with_same_value = {k for k in query_dict.keys() if str(params.get(k)) == str(query_dict[k])} + return {k: v for k, v in params.items() if k not in duplicate_keys_with_same_value} + + def _create_prepared_request( + self, + http_method: str, + url: str, + dedupe_query_params: bool = False, + headers: Optional[Mapping[str, str]] = None, + params: Optional[Mapping[str, str]] = None, + json: Optional[Mapping[str, Any]] = None, + data: Optional[Union[str, Mapping[str, Any]]] = None, + ) -> requests.PreparedRequest: + if dedupe_query_params: + query_params = self._dedupe_query_params(url, params) + else: + query_params = params or {} + args = {"method": http_method, "url": url, "headers": headers, "params": query_params} + if http_method.upper() in BODY_REQUEST_METHODS: + if json and data: + raise RequestBodyException( + "At the same time only one of the 'request_body_data' and 'request_body_json' functions can return data" + ) + elif json: + args["json"] = json + elif data: + args["data"] = data + prepared_request: requests.PreparedRequest = self._session.prepare_request(requests.Request(**args)) + + return prepared_request + + @property + def _max_retries(self) -> int: + """ + Determines the max retries based on the provided error handler. + """ + max_retries = None + if self._disable_retries: + max_retries = 0 + else: + max_retries = self._error_handler.max_retries + return max_retries if max_retries is not None else self._DEFAULT_MAX_RETRY + + @property + def _max_time(self) -> int: + """ + Determines the max time based on the provided error handler. + """ + return self._error_handler.max_time if self._error_handler.max_time is not None else self._DEFAULT_MAX_TIME + + def _send_with_retry( + self, + request: requests.PreparedRequest, + request_kwargs: Mapping[str, Any], + log_formatter: Optional[Callable[[requests.Response], Any]] = None, + exit_on_rate_limit: Optional[bool] = False, + ) -> requests.Response: + """ + Sends a request with retry logic. + + Args: + request (requests.PreparedRequest): The prepared HTTP request to send. + request_kwargs (Mapping[str, Any]): Additional keyword arguments for the request. + + Returns: + requests.Response: The HTTP response received from the server after retries. 
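The deduplication rule implemented by `_dedupe_query_params` earlier in this file can be sketched standalone (URLs and values are made up): a parameter is dropped only when the same key/value pair is already encoded in the URL.

```python
from urllib.parse import parse_qs, urlparse

def dedupe_query_params(url: str, params: dict) -> dict:
    """Drop params whose key/value pair is already present in the URL's query string."""
    query = {k: v[0] for k, v in parse_qs(urlparse(url).query).items()}
    duplicates = {k for k in query if str(params.get(k)) == str(query[k])}
    return {k: v for k, v in params.items() if k not in duplicates}

assert dedupe_query_params("https://api.example.com/items?page=1", {"page": "1", "limit": "50"}) == {"limit": "50"}
assert dedupe_query_params("https://api.example.com/items?page=1", {"page": "2"}) == {"page": "2"}  # differing value is kept
```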
+ """ + + max_retries = self._max_retries + max_tries = max(0, max_retries) + 1 + max_time = self._max_time + + user_backoff_handler = user_defined_backoff_handler(max_tries=max_tries, max_time=max_time)(self._send) + rate_limit_backoff_handler = rate_limit_default_backoff_handler() + backoff_handler = http_client_default_backoff_handler(max_tries=max_tries, max_time=max_time) + # backoff handlers wrap _send, so it will always return a response + response = backoff_handler(rate_limit_backoff_handler(user_backoff_handler))(request, request_kwargs, log_formatter=log_formatter, exit_on_rate_limit=exit_on_rate_limit) # type: ignore # mypy can't infer that backoff_handler wraps _send + + return response + + def _send( + self, + request: requests.PreparedRequest, + request_kwargs: Mapping[str, Any], + log_formatter: Optional[Callable[[requests.Response], Any]] = None, + exit_on_rate_limit: Optional[bool] = False, + ) -> requests.Response: + + if request not in self._request_attempt_count: + self._request_attempt_count[request] = 1 + else: + self._request_attempt_count[request] += 1 + if hasattr(self._session, "auth") and isinstance(self._session.auth, AuthBase): + self._session.auth(request) + + self._logger.debug( + "Making outbound API request", extra={"headers": request.headers, "url": request.url, "request_body": request.body} + ) + + response: Optional[requests.Response] = None + exc: Optional[requests.RequestException] = None + + try: + response = self._session.send(request, **request_kwargs) + except requests.RequestException as e: + exc = e + + error_resolution: ErrorResolution = self._error_handler.interpret_response(response if response is not None else exc) + + # Evaluation of response.text can be heavy, for example, if streaming a large response + # Do it only in debug mode + if self._logger.isEnabledFor(logging.DEBUG) and response is not None: + if request_kwargs.get("stream"): + self._logger.debug( + "Receiving response, but not logging it as the response is streamed", + extra={"headers": response.headers, "status": response.status_code}, + ) + else: + self._logger.debug( + "Receiving response", extra={"headers": response.headers, "status": response.status_code, "body": response.text} + ) + + # Request/response logging for declarative cdk + if log_formatter is not None and response is not None and self._message_repository is not None: + formatter = log_formatter + self._message_repository.log_message( + Level.DEBUG, + lambda: formatter(response), # type: ignore # log_formatter is always cast to a callable + ) + + self._handle_error_resolution( + response=response, exc=exc, request=request, error_resolution=error_resolution, exit_on_rate_limit=exit_on_rate_limit + ) + + return response # type: ignore # will either return a valid response of type requests.Response or raise an exception + + def _handle_error_resolution( + self, + response: Optional[requests.Response], + exc: Optional[requests.RequestException], + request: requests.PreparedRequest, + error_resolution: ErrorResolution, + exit_on_rate_limit: Optional[bool] = False, + ) -> None: + # Emit stream status RUNNING with the reason RATE_LIMITED to log that the rate limit has been reached + if error_resolution.response_action == ResponseAction.RATE_LIMITED: + # TODO: Update to handle with message repository when concurrent message repository is ready + reasons = [AirbyteStreamStatusReason(type=AirbyteStreamStatusReasonType.RATE_LIMITED)] + message = orjson.dumps( + AirbyteMessageSerializer.dump( + 
stream_status_as_airbyte_message(StreamDescriptor(name=self._name), AirbyteStreamStatus.RUNNING, reasons) + ) + ).decode() + + # Simply printing the stream status is a temporary solution and can cause future issues. Currently, the _send method is + # wrapped with backoff decorators, and we can only emit messages by iterating record_iterator in the abstract source at the + # end of the retry decorator behavior. This approach does not allow us to emit messages in the queue before exiting the + # backoff retry loop. Adding `\n` to the message and ignore 'end' ensure that few messages are printed at the same time. + print(f"{message}\n", end="", flush=True) + + if error_resolution.response_action == ResponseAction.FAIL: + if response is not None: + error_message = f"'{request.method}' request to '{request.url}' failed with status code '{response.status_code}' and error message '{self._error_message_parser.parse_response_error_message(response)}'" + else: + error_message = f"'{request.method}' request to '{request.url}' failed with exception: '{exc}'" + + raise MessageRepresentationAirbyteTracedErrors( + internal_message=error_message, + message=error_resolution.error_message or error_message, + failure_type=error_resolution.failure_type, + ) + + elif error_resolution.response_action == ResponseAction.IGNORE: + if response is not None: + log_message = ( + f"Ignoring response for '{request.method}' request to '{request.url}' with response code '{response.status_code}'" + ) + else: + log_message = f"Ignoring response for '{request.method}' request to '{request.url}' with error '{exc}'" + + self._logger.info(error_resolution.error_message or log_message) + + # TODO: Consider dynamic retry count depending on subsequent error codes + elif error_resolution.response_action == ResponseAction.RETRY or error_resolution.response_action == ResponseAction.RATE_LIMITED: + user_defined_backoff_time = None + for backoff_strategy in self._backoff_strategies: + backoff_time = backoff_strategy.backoff_time( + response_or_exception=response if response is not None else exc, attempt_count=self._request_attempt_count[request] + ) + if backoff_time: + user_defined_backoff_time = backoff_time + break + error_message = ( + error_resolution.error_message + or f"Request to {request.url} failed with failure type {error_resolution.failure_type}, response action {error_resolution.response_action}." 
+ ) + + retry_endlessly = error_resolution.response_action == ResponseAction.RATE_LIMITED and not exit_on_rate_limit + + if user_defined_backoff_time: + raise UserDefinedBackoffException( + backoff=user_defined_backoff_time, + request=request, + response=(response if response is not None else exc), + error_message=error_message, + ) + + elif retry_endlessly: + raise RateLimitBackoffException(request=request, response=response or exc, error_message=error_message) + + raise DefaultBackoffException( + request=request, response=(response if response is not None else exc), error_message=error_message + ) + + elif response: + try: + response.raise_for_status() + except requests.HTTPError as e: + self._logger.error(response.text) + raise e + + @property + def name(self) -> str: + return self._name + + def send_request( + self, + http_method: str, + url: str, + request_kwargs: Mapping[str, Any], + headers: Optional[Mapping[str, str]] = None, + params: Optional[Mapping[str, str]] = None, + json: Optional[Mapping[str, Any]] = None, + data: Optional[Union[str, Mapping[str, Any]]] = None, + dedupe_query_params: bool = False, + log_formatter: Optional[Callable[[requests.Response], Any]] = None, + exit_on_rate_limit: Optional[bool] = False, + ) -> Tuple[requests.PreparedRequest, requests.Response]: + """ + Prepares and sends request and return request and response objects. + """ + + request: requests.PreparedRequest = self._create_prepared_request( + http_method=http_method, url=url, dedupe_query_params=dedupe_query_params, headers=headers, params=params, json=json, data=data + ) + + response: requests.Response = self._send_with_retry( + request=request, request_kwargs=request_kwargs, log_formatter=log_formatter, exit_on_rate_limit=exit_on_rate_limit + ) + + return request, response diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/rate_limiting.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/rate_limiting.py new file mode 100644 index 000000000000..cae3907dbb39 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/rate_limiting.py @@ -0,0 +1,142 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +import sys +import time +from typing import Any, Callable, Mapping, Optional + +import backoff +from requests import PreparedRequest, RequestException, Response, codes, exceptions + +from .exceptions import DefaultBackoffException, RateLimitBackoffException, UserDefinedBackoffException + +TRANSIENT_EXCEPTIONS = ( + DefaultBackoffException, + exceptions.ConnectTimeout, + exceptions.ReadTimeout, + exceptions.ConnectionError, + exceptions.ChunkedEncodingError, +) + +logger = logging.getLogger("airbyte") + + +SendRequestCallableType = Callable[[PreparedRequest, Mapping[str, Any]], Response] + + +def default_backoff_handler( + max_tries: Optional[int], factor: float, max_time: Optional[int] = None, **kwargs: Any +) -> Callable[[SendRequestCallableType], SendRequestCallableType]: + def log_retry_attempt(details: Mapping[str, Any]) -> None: + _, exc, _ = sys.exc_info() + if isinstance(exc, RequestException) and exc.response: + logger.info(f"Status code: {exc.response.status_code!r}, Response Content: {exc.response.content!r}") + logger.info( + f"Caught retryable error '{str(exc)}' after {details['tries']} tries. Waiting {details['wait']} seconds then retrying..." 
+ ) + + def should_give_up(exc: Exception) -> bool: + # If a non-rate-limiting related 4XX error makes it this far, it means it was unexpected and probably consistent, so we shouldn't back off + if isinstance(exc, RequestException): + if exc.response is not None: + give_up: bool = ( + exc.response is not None + and exc.response.status_code != codes.too_many_requests + and 400 <= exc.response.status_code < 500 + ) + if give_up: + logger.info(f"Giving up for returned HTTP status: {exc.response.status_code!r}") + return give_up + # Only RequestExceptions are retryable, so if we get here, it's not retryable + return False + + return backoff.on_exception( # type: ignore # Decorator function returns a function with a different signature than the input function, so mypy can't infer the type of the returned function + backoff.expo, + TRANSIENT_EXCEPTIONS, + jitter=None, + on_backoff=log_retry_attempt, + giveup=should_give_up, + max_tries=max_tries, + max_time=max_time, + factor=factor, + **kwargs, + ) + + +def http_client_default_backoff_handler( + max_tries: Optional[int], max_time: Optional[int] = None, **kwargs: Any +) -> Callable[[SendRequestCallableType], SendRequestCallableType]: + def log_retry_attempt(details: Mapping[str, Any]) -> None: + _, exc, _ = sys.exc_info() + if isinstance(exc, RequestException) and exc.response: + logger.info(f"Status code: {exc.response.status_code!r}, Response Content: {exc.response.content!r}") + logger.info( + f"Caught retryable error '{str(exc)}' after {details['tries']} tries. Waiting {details['wait']} seconds then retrying..." + ) + + def should_give_up(exc: Exception) -> bool: + # If made it here, the ResponseAction was RETRY and therefore should not give up + return False + + return backoff.on_exception( # type: ignore # Decorator function returns a function with a different signature than the input function, so mypy can't infer the type of the returned function + backoff.expo, + TRANSIENT_EXCEPTIONS, + jitter=None, + on_backoff=log_retry_attempt, + giveup=should_give_up, + max_tries=max_tries, + max_time=max_time, + **kwargs, + ) + + +def user_defined_backoff_handler( + max_tries: Optional[int], max_time: Optional[int] = None, **kwargs: Any +) -> Callable[[SendRequestCallableType], SendRequestCallableType]: + def sleep_on_ratelimit(details: Mapping[str, Any]) -> None: + _, exc, _ = sys.exc_info() + if isinstance(exc, UserDefinedBackoffException): + if exc.response: + logger.info(f"Status code: {exc.response.status_code!r}, Response Content: {exc.response.content!r}") + retry_after = exc.backoff + logger.info(f"Retrying. Sleeping for {retry_after} seconds") + time.sleep(retry_after + 1) # extra second to cover any fractions of second + + def log_give_up(details: Mapping[str, Any]) -> None: + _, exc, _ = sys.exc_info() + if isinstance(exc, RequestException): + logger.error(f"Max retry limit reached in {details['elapsed']}s. 
Request: {exc.request}, Response: {exc.response}") + else: + logger.error("Max retry limit reached for unknown request and response") + + return backoff.on_exception( # type: ignore # Decorator function returns a function with a different signature than the input function, so mypy can't infer the type of the returned function + backoff.constant, + UserDefinedBackoffException, + interval=0, # skip waiting, we'll wait in on_backoff handler + on_backoff=sleep_on_ratelimit, + on_giveup=log_give_up, + jitter=None, + max_tries=max_tries, + max_time=max_time, + **kwargs, + ) + + +def rate_limit_default_backoff_handler(**kwargs: Any) -> Callable[[SendRequestCallableType], SendRequestCallableType]: + def log_retry_attempt(details: Mapping[str, Any]) -> None: + _, exc, _ = sys.exc_info() + if isinstance(exc, RequestException) and exc.response: + logger.info(f"Status code: {exc.response.status_code!r}, Response Content: {exc.response.content!r}") + logger.info( + f"Caught retryable error '{str(exc)}' after {details['tries']} tries. Waiting {details['wait']} seconds then retrying..." + ) + + return backoff.on_exception( # type: ignore # Decorator function returns a function with a different signature than the input function, so mypy can't infer the type of the returned function + backoff.expo, + RateLimitBackoffException, + jitter=None, + on_backoff=log_retry_attempt, + **kwargs, + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py new file mode 100644 index 000000000000..307f91f40ced --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py @@ -0,0 +1,14 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from .oauth import Oauth2Authenticator, SingleUseRefreshTokenOauth2Authenticator +from .token import BasicHttpAuthenticator, MultipleTokenAuthenticator, TokenAuthenticator + +__all__ = [ + "Oauth2Authenticator", + "SingleUseRefreshTokenOauth2Authenticator", + "TokenAuthenticator", + "MultipleTokenAuthenticator", + "BasicHttpAuthenticator", +] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py new file mode 100644 index 000000000000..63915f71d651 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py @@ -0,0 +1,258 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from abc import abstractmethod +from json import JSONDecodeError +from typing import Any, List, Mapping, MutableMapping, Optional, Tuple, Union + +import backoff +import pendulum +import requests +from airbyte_cdk.models import FailureType, Level +from airbyte_cdk.sources.http_logger import format_http_message +from airbyte_cdk.sources.message import MessageRepository, NoopMessageRepository +from airbyte_cdk.utils import AirbyteTracedException +from airbyte_cdk.utils.airbyte_secrets_utils import add_to_secrets +from requests.auth import AuthBase + +from ..exceptions import DefaultBackoffException + +logger = logging.getLogger("airbyte") +_NOOP_MESSAGE_REPOSITORY = NoopMessageRepository() + + +class AbstractOauth2Authenticator(AuthBase): + """ + Abstract class for an OAuth authenticators that implements the OAuth token refresh flow. 
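The handlers in `rate_limiting.py` above are thin wrappers around the `backoff` library; a minimal sketch of the same composition pattern (not the CDK's exact wiring, and the URL parameter is only illustrative) looks like this:

```python
import backoff
import requests

# Exponential backoff on connection errors, capped at 3 attempts, mirroring how
# http_client_default_backoff_handler decorates the send callable.
@backoff.on_exception(backoff.expo, requests.exceptions.ConnectionError, max_tries=3, jitter=None)
def fetch(url: str) -> requests.Response:
    return requests.get(url, timeout=30)
```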
The authenticator + is designed to generically perform the refresh flow without regard to how config fields are get/set by + delegating that behavior to the classes implementing the interface. + """ + + _NO_STREAM_NAME = None + + def __init__( + self, + refresh_token_error_status_codes: Tuple[int, ...] = (), + refresh_token_error_key: str = "", + refresh_token_error_values: Tuple[str, ...] = (), + ) -> None: + """ + If all of refresh_token_error_status_codes, refresh_token_error_key, and refresh_token_error_values are set, + then http errors with such params will be wrapped in AirbyteTracedException. + """ + self._refresh_token_error_status_codes = refresh_token_error_status_codes + self._refresh_token_error_key = refresh_token_error_key + self._refresh_token_error_values = refresh_token_error_values + + def __call__(self, request: requests.PreparedRequest) -> requests.PreparedRequest: + """Attach the HTTP headers required to authenticate on the HTTP request""" + request.headers.update(self.get_auth_header()) + return request + + def get_auth_header(self) -> Mapping[str, Any]: + """HTTP header to set on the requests""" + return {"Authorization": f"Bearer {self.get_access_token()}"} + + def get_access_token(self) -> str: + """Returns the access token""" + if self.token_has_expired(): + token, expires_in = self.refresh_access_token() + self.access_token = token + self.set_token_expiry_date(expires_in) + + return self.access_token + + def token_has_expired(self) -> bool: + """Returns True if the token is expired""" + return pendulum.now() > self.get_token_expiry_date() # type: ignore # this is always a bool despite what mypy thinks + + def build_refresh_request_body(self) -> Mapping[str, Any]: + """ + Returns the request body to set on the refresh request + + Override to define additional parameters + """ + payload: MutableMapping[str, Any] = { + "grant_type": self.get_grant_type(), + "client_id": self.get_client_id(), + "client_secret": self.get_client_secret(), + "refresh_token": self.get_refresh_token(), + } + + if self.get_scopes(): + payload["scopes"] = self.get_scopes() + + if self.get_refresh_request_body(): + for key, val in self.get_refresh_request_body().items(): + # We defer to existing oauth constructs over custom configured fields + if key not in payload: + payload[key] = val + + return payload + + def _wrap_refresh_token_exception(self, exception: requests.exceptions.RequestException) -> bool: + try: + if exception.response is not None: + exception_content = exception.response.json() + else: + return False + except JSONDecodeError: + return False + return ( + exception.response.status_code in self._refresh_token_error_status_codes + and exception_content.get(self._refresh_token_error_key) in self._refresh_token_error_values + ) + + @backoff.on_exception( + backoff.expo, + DefaultBackoffException, + on_backoff=lambda details: logger.info( + f"Caught retryable error after {details['tries']} tries. Waiting {details['wait']} seconds then retrying..." + ), + max_time=300, + ) + def _get_refresh_access_token_response(self) -> Any: + try: + response = requests.request(method="POST", url=self.get_token_refresh_endpoint(), data=self.build_refresh_request_body()) + if response.ok: + response_json = response.json() + # Add the access token to the list of secrets so it is replaced before logging the response + # An argument could be made to remove the prevous access key from the list of secrets, but unmasking values seems like a security incident waiting to happen... 
+                access_key = response_json.get(self.get_access_token_name())
+                if not access_key:
+                    raise Exception(f"Token refresh API response was missing access token {self.get_access_token_name()}")
+                add_to_secrets(access_key)
+                self._log_response(response)
+                return response_json
+            else:
+                # log the response even if the request failed for troubleshooting purposes
+                self._log_response(response)
+                response.raise_for_status()
+        except requests.exceptions.RequestException as e:
+            if e.response is not None:
+                if e.response.status_code == 429 or e.response.status_code >= 500:
+                    raise DefaultBackoffException(request=e.response.request, response=e.response)
+            if self._wrap_refresh_token_exception(e):
+                message = "Refresh token is invalid or expired. Please re-authenticate from Sources//Settings."
+                raise AirbyteTracedException(internal_message=message, message=message, failure_type=FailureType.config_error)
+            raise
+        except Exception as e:
+            raise Exception(f"Error while refreshing access token: {e}") from e
+
+    def refresh_access_token(self) -> Tuple[str, Union[str, int]]:
+        """
+        Returns the access token and its expiry (either a lifespan in seconds or an expiration timestamp, depending on the API)
+
+        :return: a tuple of (access_token, token_lifespan)
+        """
+        response_json = self._get_refresh_access_token_response()
+
+        return response_json[self.get_access_token_name()], response_json[self.get_expires_in_name()]
+
+    def _parse_token_expiration_date(self, value: Union[str, int]) -> pendulum.DateTime:
+        """
+        Return the expiration datetime of the access token
+
+        :return: expiration datetime
+        """
+
+        if self.token_expiry_is_time_of_expiration:
+            if not self.token_expiry_date_format:
+                raise ValueError(
+                    f"Invalid token expiry date format {self.token_expiry_date_format}; a string representing the format is required."
+                )
+            return pendulum.from_format(str(value), self.token_expiry_date_format)
+        else:
+            return pendulum.now().add(seconds=int(float(value)))
+
+    @property
+    def token_expiry_is_time_of_expiration(self) -> bool:
+        """
+        Indicates that the token expiry value is returned as the date until which the token is valid, rather than the number of seconds it remains valid.
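A small sketch (values are illustrative) of the two interpretations handled by `_parse_token_expiration_date` above:

```python
import pendulum

# When token_expiry_is_time_of_expiration is True, the raw value is an absolute timestamp
# parsed with token_expiry_date_format; otherwise it is a number of seconds from now.
absolute = pendulum.from_format("2024-06-30 12:00:00", "YYYY-MM-DD HH:mm:ss")
relative = pendulum.now().add(seconds=int(float("3600")))
```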
+ """ + + return False + + @property + def token_expiry_date_format(self) -> Optional[str]: + """ + Format of the datetime; exists it if expires_in is returned as the expiration datetime instead of seconds until it expires + """ + + return None + + @abstractmethod + def get_token_refresh_endpoint(self) -> str: + """Returns the endpoint to refresh the access token""" + + @abstractmethod + def get_client_id(self) -> str: + """The client id to authenticate""" + + @abstractmethod + def get_client_secret(self) -> str: + """The client secret to authenticate""" + + @abstractmethod + def get_refresh_token(self) -> Optional[str]: + """The token used to refresh the access token when it expires""" + + @abstractmethod + def get_scopes(self) -> List[str]: + """List of requested scopes""" + + @abstractmethod + def get_token_expiry_date(self) -> pendulum.DateTime: + """Expiration date of the access token""" + + @abstractmethod + def set_token_expiry_date(self, value: Union[str, int]) -> None: + """Setter for access token expiration date""" + + @abstractmethod + def get_access_token_name(self) -> str: + """Field to extract access token from in the response""" + + @abstractmethod + def get_expires_in_name(self) -> str: + """Returns the expires_in field name""" + + @abstractmethod + def get_refresh_request_body(self) -> Mapping[str, Any]: + """Returns the request body to set on the refresh request""" + + @abstractmethod + def get_grant_type(self) -> str: + """Returns grant_type specified for requesting access_token""" + + @property + @abstractmethod + def access_token(self) -> str: + """Returns the access token""" + + @access_token.setter + @abstractmethod + def access_token(self, value: str) -> str: + """Setter for the access token""" + + @property + def _message_repository(self) -> Optional[MessageRepository]: + """ + The implementation can define a message_repository if it wants debugging logs for HTTP requests + """ + return _NOOP_MESSAGE_REPOSITORY + + def _log_response(self, response: requests.Response) -> None: + if self._message_repository: + self._message_repository.log_message( + Level.DEBUG, + lambda: format_http_message( + response, + "Refresh token", + "Obtains access token", + self._NO_STREAM_NAME, + is_auxiliary=True, + ), + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py new file mode 100644 index 000000000000..db59600db8f0 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py @@ -0,0 +1,33 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from abc import abstractmethod +from typing import Any, Mapping + +from requests.auth import AuthBase + + +class AbstractHeaderAuthenticator(AuthBase): + """Abstract class for an header-based authenticators that add a header to outgoing HTTP requests.""" + + def __call__(self, request): + """Attach the HTTP headers required to authenticate on the HTTP request""" + request.headers.update(self.get_auth_header()) + return request + + def get_auth_header(self) -> Mapping[str, Any]: + """The header to set on outgoing HTTP requests""" + if self.auth_header: + return {self.auth_header: self.token} + return {} + + @property + @abstractmethod + def auth_header(self) -> str: + """HTTP header to set on the requests""" + + @property + @abstractmethod + def token(self) -> str: + """The header value to set on outgoing HTTP requests""" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py new file mode 100644 index 000000000000..1728f4099797 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py @@ -0,0 +1,258 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Any, List, Mapping, Optional, Sequence, Tuple, Union + +import dpath +import pendulum +from airbyte_cdk.config_observation import create_connector_config_control_message, emit_configuration_as_airbyte_control_message +from airbyte_cdk.sources.message import MessageRepository, NoopMessageRepository +from airbyte_cdk.sources.streams.http.requests_native_auth.abstract_oauth import AbstractOauth2Authenticator + + +class Oauth2Authenticator(AbstractOauth2Authenticator): + """ + Generates OAuth2.0 access tokens from an OAuth2.0 refresh token and client credentials. + The generated access token is attached to each request via the Authorization header. + If a connector_config is provided any mutation of it's value in the scope of this class will emit AirbyteControlConnectorConfigMessage. + """ + + def __init__( + self, + token_refresh_endpoint: str, + client_id: str, + client_secret: str, + refresh_token: str, + scopes: List[str] = None, + token_expiry_date: pendulum.DateTime = None, + token_expiry_date_format: str = None, + access_token_name: str = "access_token", + expires_in_name: str = "expires_in", + refresh_request_body: Mapping[str, Any] = None, + grant_type: str = "refresh_token", + token_expiry_is_time_of_expiration: bool = False, + refresh_token_error_status_codes: Tuple[int, ...] = (), + refresh_token_error_key: str = "", + refresh_token_error_values: Tuple[str, ...] 
= (), + ): + self._token_refresh_endpoint = token_refresh_endpoint + self._client_secret = client_secret + self._client_id = client_id + self._refresh_token = refresh_token + self._scopes = scopes + self._access_token_name = access_token_name + self._expires_in_name = expires_in_name + self._refresh_request_body = refresh_request_body + self._grant_type = grant_type + + self._token_expiry_date = token_expiry_date or pendulum.now().subtract(days=1) + self._token_expiry_date_format = token_expiry_date_format + self._token_expiry_is_time_of_expiration = token_expiry_is_time_of_expiration + self._access_token = None + super().__init__(refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values) + + def get_token_refresh_endpoint(self) -> str: + return self._token_refresh_endpoint + + def get_client_id(self) -> str: + return self._client_id + + def get_client_secret(self) -> str: + return self._client_secret + + def get_refresh_token(self) -> str: + return self._refresh_token + + def get_access_token_name(self) -> str: + return self._access_token_name + + def get_scopes(self) -> [str]: + return self._scopes + + def get_expires_in_name(self) -> str: + return self._expires_in_name + + def get_refresh_request_body(self) -> Mapping[str, Any]: + return self._refresh_request_body + + def get_grant_type(self) -> str: + return self._grant_type + + def get_token_expiry_date(self) -> pendulum.DateTime: + return self._token_expiry_date + + def set_token_expiry_date(self, value: Union[str, int]): + self._token_expiry_date = self._parse_token_expiration_date(value) + + @property + def token_expiry_is_time_of_expiration(self) -> bool: + return self._token_expiry_is_time_of_expiration + + @property + def token_expiry_date_format(self) -> Optional[str]: + return self._token_expiry_date_format + + @property + def access_token(self) -> str: + return self._access_token + + @access_token.setter + def access_token(self, value: str): + self._access_token = value + + +class SingleUseRefreshTokenOauth2Authenticator(Oauth2Authenticator): + """ + Authenticator that should be used for API implementing single use refresh tokens: + when refreshing access token some API returns a new refresh token that needs to used in the next refresh flow. + This authenticator updates the configuration with new refresh token by emitting Airbyte control message from an observed mutation. + By default, this authenticator expects a connector config with a "credentials" field with the following nested fields: client_id, + client_secret, refresh_token. This behavior can be changed by defining custom config path (using dpath paths) in client_id_config_path, + client_secret_config_path, refresh_token_config_path constructor arguments. 
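As the class docstring above notes, credentials are located with dpath paths; a short sketch (config values are made up) of how the default paths are read and written back:

```python
import dpath

config = {"credentials": {"client_id": "id", "client_secret": "secret", "refresh_token": "old-token"}}

assert dpath.get(config, ("credentials", "refresh_token")) == "old-token"
dpath.new(config, ("credentials", "refresh_token"), "new-token")  # rotate the single-use token in place
assert config["credentials"]["refresh_token"] == "new-token"
```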
+ """ + + def __init__( + self, + connector_config: Mapping[str, Any], + token_refresh_endpoint: str, + scopes: List[str] = None, + access_token_name: str = "access_token", + expires_in_name: str = "expires_in", + refresh_token_name: str = "refresh_token", + refresh_request_body: Mapping[str, Any] = None, + grant_type: str = "refresh_token", + client_id: Optional[str] = None, + client_secret: Optional[str] = None, + access_token_config_path: Sequence[str] = ("credentials", "access_token"), + refresh_token_config_path: Sequence[str] = ("credentials", "refresh_token"), + token_expiry_date_config_path: Sequence[str] = ("credentials", "token_expiry_date"), + token_expiry_date_format: Optional[str] = None, + message_repository: MessageRepository = NoopMessageRepository(), + token_expiry_is_time_of_expiration: bool = False, + refresh_token_error_status_codes: Tuple[int, ...] = (), + refresh_token_error_key: str = "", + refresh_token_error_values: Tuple[str, ...] = (), + ): + """ + Args: + connector_config (Mapping[str, Any]): The full connector configuration + token_refresh_endpoint (str): Full URL to the token refresh endpoint + scopes (List[str], optional): List of OAuth scopes to pass in the refresh token request body. Defaults to None. + access_token_name (str, optional): Name of the access token field, used to parse the refresh token response. Defaults to "access_token". + expires_in_name (str, optional): Name of the name of the field that characterizes when the current access token will expire, used to parse the refresh token response. Defaults to "expires_in". + refresh_token_name (str, optional): Name of the name of the refresh token field, used to parse the refresh token response. Defaults to "refresh_token". + refresh_request_body (Mapping[str, Any], optional): Custom key value pair that will be added to the refresh token request body. Defaults to None. + grant_type (str, optional): OAuth grant type. Defaults to "refresh_token". + client_id (Optional[str]): The client id to authenticate. If not specified, defaults to credentials.client_id in the config object. + client_secret (Optional[str]): The client secret to authenticate. If not specified, defaults to credentials.client_secret in the config object. + access_token_config_path (Sequence[str]): Dpath to the access_token field in the connector configuration. Defaults to ("credentials", "access_token"). + refresh_token_config_path (Sequence[str]): Dpath to the refresh_token field in the connector configuration. Defaults to ("credentials", "refresh_token"). + token_expiry_date_config_path (Sequence[str]): Dpath to the token_expiry_date field in the connector configuration. Defaults to ("credentials", "token_expiry_date"). + token_expiry_date_format (Optional[str]): Date format of the token expiry date field (set by expires_in_name). If not specified the token expiry date is interpreted as number of seconds until expiration. 
+ token_expiry_is_time_of_expiration bool: set True it if expires_in is returned as time of expiration instead of the number seconds until expiration + message_repository (MessageRepository): the message repository used to emit logs on HTTP requests and control message on config update + """ + self._client_id = client_id if client_id is not None else dpath.get(connector_config, ("credentials", "client_id")) + self._client_secret = client_secret if client_secret is not None else dpath.get(connector_config, ("credentials", "client_secret")) + self._access_token_config_path = access_token_config_path + self._refresh_token_config_path = refresh_token_config_path + self._token_expiry_date_config_path = token_expiry_date_config_path + self._token_expiry_date_format = token_expiry_date_format + self._refresh_token_name = refresh_token_name + self._connector_config = connector_config + self.__message_repository = message_repository + super().__init__( + token_refresh_endpoint, + self.get_client_id(), + self.get_client_secret(), + self.get_refresh_token(), + scopes=scopes, + token_expiry_date=self.get_token_expiry_date(), + access_token_name=access_token_name, + expires_in_name=expires_in_name, + refresh_request_body=refresh_request_body, + grant_type=grant_type, + token_expiry_date_format=token_expiry_date_format, + token_expiry_is_time_of_expiration=token_expiry_is_time_of_expiration, + refresh_token_error_status_codes=refresh_token_error_status_codes, + refresh_token_error_key=refresh_token_error_key, + refresh_token_error_values=refresh_token_error_values, + ) + + def get_refresh_token_name(self) -> str: + return self._refresh_token_name + + def get_client_id(self) -> str: + return self._client_id + + def get_client_secret(self) -> str: + return self._client_secret + + @property + def access_token(self) -> str: + return dpath.get(self._connector_config, self._access_token_config_path, default="") + + @access_token.setter + def access_token(self, new_access_token: str): + dpath.new(self._connector_config, self._access_token_config_path, new_access_token) + + def get_refresh_token(self) -> str: + return dpath.get(self._connector_config, self._refresh_token_config_path, default="") + + def set_refresh_token(self, new_refresh_token: str): + dpath.new(self._connector_config, self._refresh_token_config_path, new_refresh_token) + + def get_token_expiry_date(self) -> pendulum.DateTime: + expiry_date = dpath.get(self._connector_config, self._token_expiry_date_config_path, default="") + return pendulum.now().subtract(days=1) if expiry_date == "" else pendulum.parse(expiry_date) + + def set_token_expiry_date(self, new_token_expiry_date): + dpath.new(self._connector_config, self._token_expiry_date_config_path, str(new_token_expiry_date)) + + def token_has_expired(self) -> bool: + """Returns True if the token is expired""" + return pendulum.now("UTC") > self.get_token_expiry_date() + + @staticmethod + def get_new_token_expiry_date(access_token_expires_in: str, token_expiry_date_format: str = None) -> pendulum.DateTime: + if token_expiry_date_format: + return pendulum.from_format(access_token_expires_in, token_expiry_date_format) + else: + return pendulum.now("UTC").add(seconds=int(access_token_expires_in)) + + def get_access_token(self) -> str: + """Retrieve new access and refresh token if the access token has expired. + The new refresh token is persisted with the set_refresh_token function + Returns: + str: The current access_token, updated if it was previously expired. 
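Putting the pieces together, a hedged usage sketch of the single-use refresh token flow (the endpoint and credential values are placeholders):

```python
from airbyte_cdk.sources.streams.http.requests_native_auth import SingleUseRefreshTokenOauth2Authenticator

connector_config = {
    "credentials": {
        "client_id": "my-client-id",
        "client_secret": "my-client-secret",
        "refresh_token": "my-refresh-token",
    }
}
authenticator = SingleUseRefreshTokenOauth2Authenticator(
    connector_config=connector_config,
    token_refresh_endpoint="https://auth.example.com/oauth/token",
)
# On first use, the access token is refreshed, the rotated refresh token is written back into
# connector_config, and a control message is emitted so the platform can persist the new value.
# headers = authenticator.get_auth_header()
```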
+ """ + if self.token_has_expired(): + new_access_token, access_token_expires_in, new_refresh_token = self.refresh_access_token() + new_token_expiry_date = self.get_new_token_expiry_date(access_token_expires_in, self._token_expiry_date_format) + self.access_token = new_access_token + self.set_refresh_token(new_refresh_token) + self.set_token_expiry_date(new_token_expiry_date) + # FIXME emit_configuration_as_airbyte_control_message as been deprecated in favor of package airbyte_cdk.sources.message + # Usually, a class shouldn't care about the implementation details but to keep backward compatibility where we print the + # message directly in the console, this is needed + if not isinstance(self._message_repository, NoopMessageRepository): + self._message_repository.emit_message(create_connector_config_control_message(self._connector_config)) + else: + emit_configuration_as_airbyte_control_message(self._connector_config) + return self.access_token + + def refresh_access_token(self) -> Tuple[str, str, str]: + response_json = self._get_refresh_access_token_response() + return ( + response_json[self.get_access_token_name()], + response_json[self.get_expires_in_name()], + response_json[self.get_refresh_token_name()], + ) + + @property + def _message_repository(self) -> MessageRepository: + """ + Overriding AbstractOauth2Authenticator._message_repository to allow for HTTP request logs + """ + return self.__message_repository diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/token.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/token.py new file mode 100644 index 000000000000..becfe8108f24 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/token.py @@ -0,0 +1,73 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import base64 +from itertools import cycle +from typing import List + +from airbyte_cdk.sources.streams.http.requests_native_auth.abstract_token import AbstractHeaderAuthenticator + + +class MultipleTokenAuthenticator(AbstractHeaderAuthenticator): + """ + Builds auth header, based on the list of tokens provided. + Auth header is changed per each `get_auth_header` call, using each token in cycle. + The token is attached to each request via the `auth_header` header. + """ + + @property + def auth_header(self) -> str: + return self._auth_header + + @property + def token(self) -> str: + return f"{self._auth_method} {next(self._tokens_iter)}" + + def __init__(self, tokens: List[str], auth_method: str = "Bearer", auth_header: str = "Authorization"): + self._auth_method = auth_method + self._auth_header = auth_header + self._tokens = tokens + self._tokens_iter = cycle(self._tokens) + + +class TokenAuthenticator(AbstractHeaderAuthenticator): + """ + Builds auth header, based on the token provided. + The token is attached to each request via the `auth_header` header. 
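The header-based authenticators in `token.py` can be exercised directly; a quick sketch with placeholder tokens:

```python
from airbyte_cdk.sources.streams.http.requests_native_auth import MultipleTokenAuthenticator, TokenAuthenticator

assert TokenAuthenticator(token="token-a").get_auth_header() == {"Authorization": "Bearer token-a"}

# MultipleTokenAuthenticator cycles through the provided tokens, one per get_auth_header() call.
rotating = MultipleTokenAuthenticator(tokens=["token-a", "token-b"])
assert rotating.get_auth_header() == {"Authorization": "Bearer token-a"}
assert rotating.get_auth_header() == {"Authorization": "Bearer token-b"}
```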
+ """ + + @property + def auth_header(self) -> str: + return self._auth_header + + @property + def token(self) -> str: + return f"{self._auth_method} {self._token}" + + def __init__(self, token: str, auth_method: str = "Bearer", auth_header: str = "Authorization"): + self._auth_header = auth_header + self._auth_method = auth_method + self._token = token + + +class BasicHttpAuthenticator(AbstractHeaderAuthenticator): + """ + Builds auth based off the basic authentication scheme as defined by RFC 7617, which transmits credentials as USER ID/password pairs, encoded using bas64 + https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication#basic_authentication_scheme + """ + + @property + def auth_header(self) -> str: + return self._auth_header + + @property + def token(self) -> str: + return f"{self._auth_method} {self._token}" + + def __init__(self, username: str, password: str = "", auth_method: str = "Basic", auth_header: str = "Authorization"): + auth_string = f"{username}:{password}".encode("utf8") + b64_encoded = base64.b64encode(auth_string).decode("utf8") + self._auth_header = auth_header + self._auth_method = auth_method + self._token = b64_encoded diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/utils/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/utils/__init__.py new file mode 100644 index 000000000000..c941b3045795 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/utils/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/airbyte_cdk/sources/types.py b/airbyte-cdk/python/airbyte_cdk/sources/types.py new file mode 100644 index 000000000000..6659c8dd767c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/types.py @@ -0,0 +1,137 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from __future__ import annotations + +from typing import Any, ItemsView, Iterator, KeysView, List, Mapping, Optional, ValuesView + +# A FieldPointer designates a path to a field inside a mapping. For example, retrieving ["k1", "k1.2"] in the object {"k1" :{"k1.2": +# "hello"}] returns "hello" +FieldPointer = List[str] +Config = Mapping[str, Any] +ConnectionDefinition = Mapping[str, Any] +StreamState = Mapping[str, Any] + + +class Record(Mapping[str, Any]): + def __init__(self, data: Mapping[str, Any], associated_slice: Optional[StreamSlice]): + self._data = data + self._associated_slice = associated_slice + + @property + def data(self) -> Mapping[str, Any]: + return self._data + + @property + def associated_slice(self) -> Optional[StreamSlice]: + return self._associated_slice + + def __repr__(self) -> str: + return repr(self._data) + + def __getitem__(self, key: str) -> Any: + return self._data[key] + + def __len__(self) -> int: + return len(self._data) + + def __iter__(self) -> Any: + return iter(self._data) + + def __contains__(self, item: object) -> bool: + return item in self._data + + def __eq__(self, other: object) -> bool: + if isinstance(other, Record): + # noinspection PyProtectedMember + return self._data == other._data + return False + + def __ne__(self, other: object) -> bool: + return not self.__eq__(other) + + +class StreamSlice(Mapping[str, Any]): + def __init__( + self, *, partition: Mapping[str, Any], cursor_slice: Mapping[str, Any], extra_fields: Optional[Mapping[str, Any]] = None + ) -> None: + """ + :param partition: The partition keys representing a unique partition in the stream. 
+ :param cursor_slice: The incremental cursor slice keys, such as dates or pagination tokens. + :param extra_fields: Additional fields that should not be part of the partition but passed along, such as metadata from the parent stream. + """ + self._partition = partition + self._cursor_slice = cursor_slice + self._extra_fields = extra_fields or {} + + # Ensure that partition keys do not overlap with cursor slice keys + if partition.keys() & cursor_slice.keys(): + raise ValueError("Keys for partition and incremental sync cursor should not overlap") + + self._stream_slice = dict(partition) | dict(cursor_slice) + + @property + def partition(self) -> Mapping[str, Any]: + """Returns the partition portion of the stream slice.""" + p = self._partition + while isinstance(p, StreamSlice): + p = p.partition + return p + + @property + def cursor_slice(self) -> Mapping[str, Any]: + """Returns the cursor slice portion of the stream slice.""" + c = self._cursor_slice + while isinstance(c, StreamSlice): + c = c.cursor_slice + return c + + @property + def extra_fields(self) -> Mapping[str, Any]: + """Returns the extra fields that are not part of the partition.""" + return self._extra_fields + + def __repr__(self) -> str: + return repr(self._stream_slice) + + def __setitem__(self, key: str, value: Any) -> None: + raise ValueError("StreamSlice is immutable") + + def __getitem__(self, key: str) -> Any: + return self._stream_slice[key] + + def __len__(self) -> int: + return len(self._stream_slice) + + def __iter__(self) -> Iterator[str]: + return iter(self._stream_slice) + + def __contains__(self, item: Any) -> bool: + return item in self._stream_slice + + def keys(self) -> KeysView[str]: + return self._stream_slice.keys() + + def items(self) -> ItemsView[str, Any]: + return self._stream_slice.items() + + def values(self) -> ValuesView[Any]: + return self._stream_slice.values() + + def get(self, key: str, default: Any = None) -> Optional[Any]: + return self._stream_slice.get(key, default) + + def __eq__(self, other: Any) -> bool: + if isinstance(other, dict): + return self._stream_slice == other + if isinstance(other, StreamSlice): + # noinspection PyProtectedMember + return self._partition == other._partition and self._cursor_slice == other._cursor_slice + return False + + def __ne__(self, other: Any) -> bool: + return not self.__eq__(other) + + def __json_serializable__(self) -> Any: + return self._stream_slice diff --git a/airbyte-cdk/python/airbyte_cdk/sources/utils/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/utils/__init__.py new file mode 100644 index 000000000000..b609a6c7a540 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/utils/__init__.py @@ -0,0 +1,7 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +# Initialize Utils Package + +__all__ = ["record_helper"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/utils/casing.py b/airbyte-cdk/python/airbyte_cdk/sources/utils/casing.py new file mode 100644 index 000000000000..806e077ae00c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/utils/casing.py @@ -0,0 +1,12 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
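A brief sketch (field names are illustrative) of the `StreamSlice` and `Record` containers defined in `types.py` above: partition and cursor keys are merged into a single read-only mapping.

```python
from airbyte_cdk.sources.types import Record, StreamSlice

stream_slice = StreamSlice(partition={"parent_id": "123"}, cursor_slice={"start": "2024-01-01", "end": "2024-01-31"})
assert stream_slice["parent_id"] == "123"
assert stream_slice.cursor_slice == {"start": "2024-01-01", "end": "2024-01-31"}
assert stream_slice == {"parent_id": "123", "start": "2024-01-01", "end": "2024-01-31"}

record = Record(data={"id": 1, "parent_id": "123"}, associated_slice=stream_slice)
assert record["id"] == 1 and record.associated_slice is stream_slice
```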
+# + + +import re + + +# https://stackoverflow.com/a/1176023 +def camel_to_snake(s: str) -> str: + s = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", s) + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s).lower() diff --git a/airbyte-cdk/python/airbyte_cdk/sources/utils/record_helper.py b/airbyte-cdk/python/airbyte_cdk/sources/utils/record_helper.py new file mode 100644 index 000000000000..98cefd1a8d40 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/utils/record_helper.py @@ -0,0 +1,44 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import time +from collections.abc import Mapping as ABCMapping +from typing import Any, Mapping, Optional + +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteRecordMessage, AirbyteTraceMessage +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.models.file_transfer_record_message import AirbyteFileTransferRecordMessage +from airbyte_cdk.sources.streams.core import StreamData +from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer + + +def stream_data_to_airbyte_message( + stream_name: str, + data_or_message: StreamData, + transformer: TypeTransformer = TypeTransformer(TransformConfig.NoTransform), + schema: Optional[Mapping[str, Any]] = None, + is_file_transfer_message: bool = False, +) -> AirbyteMessage: + if schema is None: + schema = {} + + match data_or_message: + case ABCMapping(): + data = dict(data_or_message) + now_millis = time.time_ns() // 1_000_000 + # Transform object fields according to config. Most likely you will + # need it to normalize values against json schema. By default no action + # taken unless configured. See + # docs/connector-development/cdk-python/schemas.md for details. + transformer.transform(data, schema) # type: ignore + if is_file_transfer_message: + message = AirbyteFileTransferRecordMessage(stream=stream_name, file=data, emitted_at=now_millis, data={}) + else: + message = AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=now_millis) + return AirbyteMessage(type=MessageType.RECORD, record=message) + case AirbyteTraceMessage(): + return AirbyteMessage(type=MessageType.TRACE, trace=data_or_message) + case AirbyteLogMessage(): + return AirbyteMessage(type=MessageType.LOG, log=data_or_message) + case _: + raise ValueError(f"Unexpected type for data_or_message: {type(data_or_message)}: {data_or_message}") diff --git a/airbyte-cdk/python/airbyte_cdk/sources/utils/schema_helpers.py b/airbyte-cdk/python/airbyte_cdk/sources/utils/schema_helpers.py new file mode 100644 index 000000000000..7eef091aa02b --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/utils/schema_helpers.py @@ -0,0 +1,223 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +import importlib +import json +import os +import pkgutil +from typing import Any, ClassVar, Dict, List, Mapping, MutableMapping, Optional, Tuple + +import jsonref +from airbyte_cdk.models import ConnectorSpecification, FailureType +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from jsonschema import RefResolver, validate +from jsonschema.exceptions import ValidationError +from pydantic.v1 import BaseModel, Field + + +class JsonFileLoader: + """ + Custom json file loader to resolve references to resources located in "shared" directory. 
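The `camel_to_snake` helper above is a two-pass regex substitution; for instance:

```python
from airbyte_cdk.sources.utils.casing import camel_to_snake

assert camel_to_snake("HttpRequestError") == "http_request_error"
assert camel_to_snake("parentID2") == "parent_id2"
```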
+ We need this for compatability with existing schemas cause all of them have references + pointing to shared_schema.json file instead of shared/shared_schema.json + """ + + def __init__(self, uri_base: str, shared: str): + self.shared = shared + self.uri_base = uri_base + + def __call__(self, uri: str) -> Dict[str, Any]: + uri = uri.replace(self.uri_base, f"{self.uri_base}/{self.shared}/") + with open(uri) as f: + data = json.load(f) + if isinstance(data, dict): + return data + else: + raise ValueError(f"Expected to read a dictionary from {uri}. Got: {data}") + + +def resolve_ref_links(obj: Any) -> Any: + """ + Scan resolved schema and convert jsonref.JsonRef object to JSON serializable dict. + + :param obj - jsonschema object with ref field resolved. + :return JSON serializable object with references without external dependencies. + """ + if isinstance(obj, jsonref.JsonRef): + obj = resolve_ref_links(obj.__subject__) + # Omit existing definitions for external resource since + # we dont need it anymore. + if isinstance(obj, dict): + obj.pop("definitions", None) + return obj + else: + raise ValueError(f"Expected obj to be a dict. Got {obj}") + elif isinstance(obj, dict): + return {k: resolve_ref_links(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [resolve_ref_links(item) for item in obj] + else: + return obj + + +def _expand_refs(schema: Any, ref_resolver: Optional[RefResolver] = None) -> None: + """Internal function to iterate over schema and replace all occurrences of $ref with their definitions. Recursive. + + :param schema: schema that will be patched + :param ref_resolver: resolver to get definition from $ref, if None pass it will be instantiated + """ + ref_resolver = ref_resolver or RefResolver.from_schema(schema) + + if isinstance(schema, MutableMapping): + if "$ref" in schema: + ref_url = schema.pop("$ref") + _, definition = ref_resolver.resolve(ref_url) + _expand_refs(definition, ref_resolver=ref_resolver) # expand refs in definitions as well + schema.update(definition) + else: + for key, value in schema.items(): + _expand_refs(value, ref_resolver=ref_resolver) + elif isinstance(schema, List): + for value in schema: + _expand_refs(value, ref_resolver=ref_resolver) + + +def expand_refs(schema: Any) -> None: + """Iterate over schema and replace all occurrences of $ref with their definitions. + + :param schema: schema that will be patched + """ + _expand_refs(schema) + schema.pop("definitions", None) # remove definitions created by $ref + + +def rename_key(schema: Any, old_key: str, new_key: str) -> None: + """Iterate over nested dictionary and replace one key with another. Used to replace anyOf with oneOf. Recursive." + + :param schema: schema that will be patched + :param old_key: name of the key to replace + :param new_key: new name of the key + """ + if not isinstance(schema, MutableMapping): + return + + for key, value in schema.items(): + rename_key(value, old_key, new_key) + if old_key in schema: + schema[new_key] = schema.pop(old_key) + + +class ResourceSchemaLoader: + """JSONSchema loader from package resources""" + + def __init__(self, package_name: str): + self.package_name = package_name + + def get_schema(self, name: str) -> dict[str, Any]: + """ + This method retrieves a JSON schema from the schemas/ folder. + + + The expected file structure is to have all top-level schemas (corresponding to streams) in the "schemas/" folder, with any shared $refs + living inside the "schemas/shared/" folder. 
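A small, self-contained sketch of what `expand_refs` does to a schema with an internal `$ref` (the schema itself is illustrative):

```python
schema = {
    "type": "object",
    "properties": {"status": {"$ref": "#/definitions/status_enum"}},
    "definitions": {"status_enum": {"type": "string", "enum": ["active", "inactive"]}},
}

expand_refs(schema)  # mutates the schema in place

assert schema == {
    "type": "object",
    "properties": {"status": {"type": "string", "enum": ["active", "inactive"]}},
}  # the $ref is inlined and the now-unused "definitions" block is dropped
```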
For example: + + schemas/shared/.json + schemas/.json # contains a $ref to shared_definition + schemas/.json # contains a $ref to shared_definition + """ + + schema_filename = f"schemas/{name}.json" + raw_file = pkgutil.get_data(self.package_name, schema_filename) + if not raw_file: + raise IOError(f"Cannot find file {schema_filename}") + try: + raw_schema = json.loads(raw_file) + except ValueError as err: + raise RuntimeError(f"Invalid JSON file format for file {schema_filename}") from err + + return self._resolve_schema_references(raw_schema) + + def _resolve_schema_references(self, raw_schema: dict[str, Any]) -> dict[str, Any]: + """ + Resolve links to external references and move it to local "definitions" map. + + :param raw_schema jsonschema to lookup for external links. + :return JSON serializable object with references without external dependencies. + """ + + package = importlib.import_module(self.package_name) + if package.__file__: + base = os.path.dirname(package.__file__) + "/" + else: + raise ValueError(f"Package {package} does not have a valid __file__ field") + resolved = jsonref.JsonRef.replace_refs(raw_schema, loader=JsonFileLoader(base, "schemas/shared"), base_uri=base) + resolved = resolve_ref_links(resolved) + if isinstance(resolved, dict): + return resolved + else: + raise ValueError(f"Expected resolved to be a dict. Got {resolved}") + + +def check_config_against_spec_or_exit(config: Mapping[str, Any], spec: ConnectorSpecification) -> None: + """ + Check config object against spec. In case of spec is invalid, throws + an exception with validation error description. + + :param config - config loaded from file specified over command line + :param spec - spec object generated by connector + """ + spec_schema = spec.connectionSpecification + try: + validate(instance=config, schema=spec_schema) + except ValidationError as validation_error: + raise AirbyteTracedException( + message="Config validation error: " + validation_error.message, + internal_message=validation_error.message, + failure_type=FailureType.config_error, + ) from None # required to prevent logging config secrets from the ValidationError's stacktrace + + +class InternalConfig(BaseModel): + KEYWORDS: ClassVar[set[str]] = {"_limit", "_page_size"} + limit: int = Field(None, alias="_limit") + page_size: int = Field(None, alias="_page_size") + + def dict(self, *args: Any, **kwargs: Any) -> dict[str, Any]: + kwargs["by_alias"] = True + kwargs["exclude_unset"] = True + return super().dict(*args, **kwargs) # type: ignore[no-any-return] + + def is_limit_reached(self, records_counter: int) -> bool: + """ + Check if record count reached limit set by internal config. + :param records_counter - number of records already red + :return True if limit reached, False otherwise + """ + if self.limit: + if records_counter >= self.limit: + return True + return False + + +def split_config(config: Mapping[str, Any]) -> Tuple[dict[str, Any], InternalConfig]: + """ + Break config map object into 2 instances: first is a dict with user defined + configuration and second is internal config that contains private keys for + acceptance test configuration. + + :param + config - Dict object that has been loaded from config file. + + :return tuple of user defined config dict with filtered out internal + parameters and connector acceptance test internal config object. 
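To illustrate `check_config_against_spec_or_exit`, a minimal sketch with a hand-rolled spec; the config field names are illustrative, and the names used are the ones defined or imported in this module.

```python
spec = ConnectorSpecification(
    connectionSpecification={
        "type": "object",
        "required": ["api_key"],
        "properties": {"api_key": {"type": "string"}},
    }
)

check_config_against_spec_or_exit({"api_key": "abc123"}, spec)  # passes silently

# check_config_against_spec_or_exit({}, spec)
# -> raises AirbyteTracedException with failure_type=config_error (missing required "api_key")
```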
+ """ + main_config = {} + internal_config = {} + for k, v in config.items(): + if k in InternalConfig.KEYWORDS: + internal_config[k] = v + else: + main_config[k] = v + return main_config, InternalConfig.parse_obj(internal_config) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/utils/slice_logger.py b/airbyte-cdk/python/airbyte_cdk/sources/utils/slice_logger.py new file mode 100644 index 000000000000..6981cdde88fa --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/utils/slice_logger.py @@ -0,0 +1,54 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json +import logging +from abc import ABC, abstractmethod +from typing import Any, Mapping, Optional + +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level +from airbyte_cdk.models import Type as MessageType + + +class SliceLogger(ABC): + """ + SliceLogger is an interface that allows us to log slices of data in a uniform way. + It is responsible for determining whether or not a slice should be logged and for creating the log message. + """ + + SLICE_LOG_PREFIX = "slice:" + + def create_slice_log_message(self, _slice: Optional[Mapping[str, Any]]) -> AirbyteMessage: + """ + Mapping is an interface that can be implemented in various ways. However, json.dumps will just do a `str()` if + the slice is a class implementing Mapping. Therefore, we want to cast this as a dict before passing this to json.dump + """ + printable_slice = dict(_slice) if _slice else _slice + return AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage(level=Level.INFO, message=f"{SliceLogger.SLICE_LOG_PREFIX}{json.dumps(printable_slice, default=str)}"), + ) + + @abstractmethod + def should_log_slice_message(self, logger: logging.Logger) -> bool: + """ + + :param logger: + :return: + """ + + +class DebugSliceLogger(SliceLogger): + def should_log_slice_message(self, logger: logging.Logger) -> bool: + """ + + :param logger: + :return: + """ + return logger.isEnabledFor(logging.DEBUG) + + +class AlwaysLogSliceLogger(SliceLogger): + def should_log_slice_message(self, logger: logging.Logger) -> bool: + return True diff --git a/airbyte-cdk/python/airbyte_cdk/sources/utils/transform.py b/airbyte-cdk/python/airbyte_cdk/sources/utils/transform.py new file mode 100644 index 000000000000..b15ff11db84a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/utils/transform.py @@ -0,0 +1,196 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from distutils.util import strtobool +from enum import Flag, auto +from typing import Any, Callable, Dict, Mapping, Optional + +from jsonschema import Draft7Validator, ValidationError, validators + +json_to_python_simple = {"string": str, "number": float, "integer": int, "boolean": bool, "null": type(None)} +json_to_python = {**json_to_python_simple, **{"object": dict, "array": list}} +python_to_json = {v: k for k, v in json_to_python.items()} + +logger = logging.getLogger("airbyte") + + +class TransformConfig(Flag): + """ + TypeTransformer class config. Configs can be combined using bitwise or operator e.g. + ``` + TransformConfig.DefaultSchemaNormalization | TransformConfig.CustomSchemaNormalization + ``` + """ + + # No action taken, default behaviour. Cannot be combined with any other options. + NoTransform = auto() + # Applies default type casting with default_convert method which converts + # values by applying simple type casting to specified jsonschema type. 
+ DefaultSchemaNormalization = auto() + # Allow registering custom type transformation callback. Can be combined + # with DefaultSchemaNormalization. In this case default type casting would + # be applied before custom one. + CustomSchemaNormalization = auto() + + +class TypeTransformer: + """ + Class for transforming object before output. + """ + + _custom_normalizer: Optional[Callable[[Any, Dict[str, Any]], Any]] = None + + def __init__(self, config: TransformConfig): + """ + Initialize TypeTransformer instance. + :param config Transform config that would be applied to object + """ + if TransformConfig.NoTransform in config and config != TransformConfig.NoTransform: + raise Exception("NoTransform option cannot be combined with other flags.") + self._config = config + all_validators = { + key: self.__get_normalizer(key, orig_validator) + for key, orig_validator in Draft7Validator.VALIDATORS.items() + # Do not validate field we do not transform for maximum performance. + if key in ["type", "array", "$ref", "properties", "items"] + } + self._normalizer = validators.create(meta_schema=Draft7Validator.META_SCHEMA, validators=all_validators) + + def registerCustomTransform(self, normalization_callback: Callable[[Any, Dict[str, Any]], Any]) -> Callable: + """ + Register custom normalization callback. + :param normalization_callback function to be used for value + normalization. Takes original value and part type schema. Should return + normalized value. See docs/connector-development/cdk-python/schemas.md + for details. + :return Same callbeck, this is usefull for using registerCustomTransform function as decorator. + """ + if TransformConfig.CustomSchemaNormalization not in self._config: + raise Exception("Please set TransformConfig.CustomSchemaNormalization config before registering custom normalizer") + self._custom_normalizer = normalization_callback + return normalization_callback + + def __normalize(self, original_item: Any, subschema: Dict[str, Any]) -> Any: + """ + Applies different transform function to object's field according to config. + :param original_item original value of field. + :param subschema part of the jsonschema containing field type/format data. + :return Final field value. + """ + if TransformConfig.DefaultSchemaNormalization in self._config: + original_item = self.default_convert(original_item, subschema) + + if self._custom_normalizer: + original_item = self._custom_normalizer(original_item, subschema) + return original_item + + @staticmethod + def default_convert(original_item: Any, subschema: Dict[str, Any]) -> Any: + """ + Default transform function that is used when TransformConfig.DefaultSchemaNormalization flag set. + :param original_item original value of field. + :param subschema part of the jsonschema containing field type/format data. + :return transformed field value. + """ + target_type = subschema.get("type", []) + if original_item is None and "null" in target_type: + return None + if isinstance(target_type, list): + # jsonschema type could either be a single string or array of type + # strings. In case if there is some disambigous and more than one + # type (except null) do not do any conversion and return original + # value. If type array has one type and null i.e. {"type": + # ["integer", "null"]}, convert value to specified type. 
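A hedged sketch of combining the two normalization flags and registering a custom callback, as described in the `TransformConfig` and `registerCustomTransform` docstrings; the callback logic is illustrative only.

```python
transformer = TypeTransformer(
    TransformConfig.DefaultSchemaNormalization | TransformConfig.CustomSchemaNormalization
)

@transformer.registerCustomTransform
def custom_transform(original_value, field_schema):
    # Runs after the default type casting; field_schema is the sub-schema for this field.
    if field_schema.get("format") == "date-time" and isinstance(original_value, str):
        return original_value.replace(" ", "T")
    return original_value
```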
+ target_type = [t for t in target_type if t != "null"] + if len(target_type) != 1: + return original_item + target_type = target_type[0] + try: + if target_type == "string": + return str(original_item) + elif target_type == "number": + return float(original_item) + elif target_type == "integer": + return int(original_item) + elif target_type == "boolean": + if isinstance(original_item, str): + return strtobool(original_item) == 1 + return bool(original_item) + elif target_type == "array": + item_types = set(subschema.get("items", {}).get("type", set())) + if item_types.issubset(json_to_python_simple) and type(original_item) in json_to_python_simple.values(): + return [original_item] + except (ValueError, TypeError): + return original_item + return original_item + + def __get_normalizer(self, schema_key: str, original_validator: Callable): + """ + Traverse through object fields using native jsonschema validator and apply normalization function. + :param schema_key related json schema key that currently being validated/normalized. + :original_validator: native jsonschema validator callback. + """ + + def normalizator(validator_instance: Callable, property_value: Any, instance: Any, schema: Dict[str, Any]): + """ + Jsonschema validator callable it uses for validating instance. We + override default Draft7Validator to perform value transformation + before validation take place. We do not take any action except + logging warn if object does not conform to json schema, just using + jsonschema algorithm to traverse through object fields. + Look + https://python-jsonschema.readthedocs.io/en/stable/creating/?highlight=validators.create#jsonschema.validators.create + validators parameter for detailed description. + : + """ + + def resolve(subschema): + if "$ref" in subschema: + _, resolved = validator_instance.resolver.resolve(subschema["$ref"]) + return resolved + return subschema + + # Transform object and array values before running json schema type checking for each element. + # Recursively normalize every value of the "instance" sub-object, + # if "instance" is an incorrect type - skip recursive normalization of "instance" + if schema_key == "properties" and isinstance(instance, dict): + for k, subschema in property_value.items(): + if k in instance: + subschema = resolve(subschema) + instance[k] = self.__normalize(instance[k], subschema) + # Recursively normalize every item of the "instance" sub-array, + # if "instance" is an incorrect type - skip recursive normalization of "instance" + elif schema_key == "items" and isinstance(instance, list): + subschema = resolve(property_value) + for index, item in enumerate(instance): + instance[index] = self.__normalize(item, subschema) + + # Running native jsonschema traverse algorithm after field normalization is done. + yield from original_validator(validator_instance, property_value, instance, schema) + + return normalizator + + def transform(self, record: Dict[str, Any], schema: Mapping[str, Any]): + """ + Normalize and validate according to config. + :param record: record instance for normalization/transformation. All modification are done by modifying existent object. + :param schema: object's jsonschema for normalization. + """ + if TransformConfig.NoTransform in self._config: + return + normalizer = self._normalizer(schema) + for e in normalizer.iter_errors(record): + """ + just calling normalizer.validate() would throw an exception on + first validation occurences and stop processing rest of schema. 
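A few illustrative calls to `default_convert`, showing the casting rules above, including the single-type-plus-null case and the scalar-to-array wrapping:

```python
assert TypeTransformer.default_convert("42", {"type": ["integer", "null"]}) == 42
assert TypeTransformer.default_convert(7, {"type": "string"}) == "7"
assert TypeTransformer.default_convert("true", {"type": "boolean"}) is True
assert TypeTransformer.default_convert(None, {"type": ["string", "null"]}) is None
assert TypeTransformer.default_convert(5, {"type": "array", "items": {"type": ["integer"]}}) == [5]
# Ambiguous unions (more than one non-null type) are returned unchanged:
assert TypeTransformer.default_convert("42", {"type": ["integer", "string"]}) == "42"
```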
+ """ + logger.warning(self.get_error_message(e)) + + def get_error_message(self, e: ValidationError) -> str: + instance_json_type = python_to_json[type(e.instance)] + key_path = "." + ".".join(map(str, e.path)) + return ( + f"Failed to transform value {repr(e.instance)} of type '{instance_json_type}' to '{e.validator_value}', key path: '{key_path}'" + ) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/utils/types.py b/airbyte-cdk/python/airbyte_cdk/sources/utils/types.py new file mode 100644 index 000000000000..9dc5e253bf29 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/utils/types.py @@ -0,0 +1,7 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Union + +JsonType = Union[dict[str, "JsonType"], list["JsonType"], str, int, float, bool, None] diff --git a/airbyte-cdk/python/airbyte_cdk/sql/__init__.py b/airbyte-cdk/python/airbyte_cdk/sql/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/airbyte_cdk/sql/_util/__init__.py b/airbyte-cdk/python/airbyte_cdk/sql/_util/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/airbyte_cdk/sql/_util/hashing.py b/airbyte-cdk/python/airbyte_cdk/sql/_util/hashing.py new file mode 100644 index 000000000000..781305c48a14 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sql/_util/hashing.py @@ -0,0 +1,34 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""Hashing utils for Airbyte.""" + +from __future__ import annotations + +import hashlib +from collections.abc import Mapping + +HASH_SEED = "Airbyte:" +"""Additional seed for randomizing one-way hashed strings.""" + + +def one_way_hash( + obj: Mapping[str, str] | list[str] | object, + /, +) -> str: + """Return a one-way hash of the given string. + + To ensure a unique domain of hashes, we prepend a seed to the string before hashing. + """ + string_to_hash: str + if isinstance(obj, Mapping): + # Recursively sort and convert nested dictionaries to tuples of key-value pairs + string_to_hash = str(sorted((k, one_way_hash(v)) for k, v in obj.items())) + + elif isinstance(obj, list): + # Recursively hash elements of the list + string_to_hash = str([one_way_hash(item) for item in obj]) + + else: + # Convert the object to a string + string_to_hash = str(obj) + + return hashlib.sha256((HASH_SEED + str(string_to_hash)).encode()).hexdigest() diff --git a/airbyte-cdk/python/airbyte_cdk/sql/_util/name_normalizers.py b/airbyte-cdk/python/airbyte_cdk/sql/_util/name_normalizers.py new file mode 100644 index 000000000000..9311432d7387 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sql/_util/name_normalizers.py @@ -0,0 +1,92 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""Name normalizer classes.""" + +from __future__ import annotations + +import abc +import functools +import re +from typing import TYPE_CHECKING + +from airbyte_cdk.sql import exceptions as exc + +if TYPE_CHECKING: + from collections.abc import Iterable + + +class NameNormalizerBase(abc.ABC): + """Abstract base class for name normalizers.""" + + @staticmethod + @abc.abstractmethod + def normalize(name: str) -> str: + """Return the normalized name.""" + ... 
+ + @classmethod + def normalize_set(cls, str_iter: Iterable[str]) -> set[str]: + """Converts string iterable to a set of lower case strings.""" + return {cls.normalize(s) for s in str_iter} + + @classmethod + def normalize_list(cls, str_iter: Iterable[str]) -> list[str]: + """Converts string iterable to a list of lower case strings.""" + return [cls.normalize(s) for s in str_iter] + + @classmethod + def check_matched(cls, name1: str, name2: str) -> bool: + """Return True if the two names match after each is normalized.""" + return cls.normalize(name1) == cls.normalize(name2) + + @classmethod + def check_normalized(cls, name: str) -> bool: + """Return True if the name is already normalized.""" + return cls.normalize(name) == name + + +class LowerCaseNormalizer(NameNormalizerBase): + """A name normalizer that converts names to lower case.""" + + @staticmethod + @functools.cache + def normalize(name: str) -> str: + """Return the normalized name. + + - All non-alphanumeric characters are replaced with underscores. + - Any names that start with a numeric ("1", "2", "123", "1b" etc.) are prefixed + with and underscore ("_1", "_2", "_123", "_1b" etc.) + + Examples: + - "Hello World!" -> "hello_world" + - "Hello, World!" -> "hello__world" + - "Hello - World" -> "hello___world" + - "___Hello, World___" -> "___hello__world___" + - "Average Sales (%)" -> "average_sales____" + - "Average Sales (#)" -> "average_sales____" + - "+1" -> "_1" + - "-1" -> "_1" + """ + result = name + + # Replace all non-alphanumeric characters with underscores. + result = re.sub("[^A-Za-z0-9]", "_", result.lower()) + + # Check if name starts with a number and prepend "_" if it does. + if result and result[0].isdigit(): + # Most databases do not allow identifiers to start with a number. + result = f"_{result}" + + if not result.replace("_", ""): + raise exc.AirbyteNameNormalizationError( + message="Name cannot be empty after normalization.", + raw_name=name, + normalization_result=result, + ) + + return result + + +__all__ = [ + "NameNormalizerBase", + "LowerCaseNormalizer", +] diff --git a/airbyte-cdk/python/airbyte_cdk/sql/constants.py b/airbyte-cdk/python/airbyte_cdk/sql/constants.py new file mode 100644 index 000000000000..2f7de7817ac6 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sql/constants.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Constants shared across the Airbyte codebase.""" + +from __future__ import annotations + +DEBUG_MODE = False # Set to True to enable additional debug logging. + +AB_EXTRACTED_AT_COLUMN = "_airbyte_extracted_at" +"""A column that stores the timestamp when the record was extracted.""" + +AB_META_COLUMN = "_airbyte_meta" +"""A column that stores metadata about the record.""" + +AB_RAW_ID_COLUMN = "_airbyte_raw_id" +"""A column that stores a unique identifier for each row in the source data. + +Note: The interpretation of this column is slightly different from in Airbyte Dv2 destinations. +In Airbyte Dv2 destinations, this column points to a row in a separate 'raw' table. In Airbyte, +this column is simply used as a unique identifier for each record as it is received. + +Airbyte uses ULIDs for this column, which are identifiers that can be sorted by time +received. This allows us to determine the debug the order of records as they are received, even if +the source provides records that are tied or received out of order from the perspective of their +`emitted_at` (`_airbyte_extracted_at`) timestamps. 
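Illustrative calls mirroring the `LowerCaseNormalizer` behavior described in the `normalize` docstring:

```python
assert LowerCaseNormalizer.normalize("Average Sales (%)") == "average_sales____"
assert LowerCaseNormalizer.normalize("2024 report") == "_2024_report"   # leading digit gets "_"
assert LowerCaseNormalizer.check_matched("First-Name", "first_name") is True
assert LowerCaseNormalizer.normalize_set(["ID", "Id", "id"]) == {"id"}

# LowerCaseNormalizer.normalize("!!!")
# -> raises AirbyteNameNormalizationError (name is empty after normalization)
```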
+""" + +AB_INTERNAL_COLUMNS = { + AB_RAW_ID_COLUMN, + AB_EXTRACTED_AT_COLUMN, + AB_META_COLUMN, +} +"""A set of internal columns that are reserved for Airbyte's internal use.""" diff --git a/airbyte-cdk/python/airbyte_cdk/sql/exceptions.py b/airbyte-cdk/python/airbyte_cdk/sql/exceptions.py new file mode 100644 index 000000000000..0192d829ac50 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sql/exceptions.py @@ -0,0 +1,222 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + +"""All exceptions used in Airbyte. + +This design is modeled after structlog's exceptions, in that we bias towards auto-generated +property prints rather than sentence-like string concatenation. + +E.g. Instead of this: + +> `Subprocess failed with exit code '1'` + +We do this: + +> `Subprocess failed. (exit_code=1)` + +The benefit of this approach is that we can easily support structured logging, and we can +easily add new properties to exceptions without having to update all the places where they +are raised. We can also support any arbitrary number of properties in exceptions, without spending +time on building sentence-like string constructions with optional inputs. + + +In addition, the following principles are applied for exception class design: + +- All exceptions inherit from a common base class. +- All exceptions have a message attribute. +- The first line of the docstring is used as the default message. +- The default message can be overridden by explicitly setting the message attribute. +- Exceptions may optionally have a guidance attribute. +- Exceptions may optionally have a help_url attribute. +- Rendering is automatically handled by the base class. +- Any helpful context not defined by the exception class can be passed in the `context` dict arg. +- Within reason, avoid sending PII to the exception constructor. +- Exceptions are dataclasses, so they can be instantiated with keyword arguments. +- Use the 'from' syntax to chain exceptions when it is helpful to do so. + E.g. `raise AirbyteConnectorNotFoundError(...) from FileNotFoundError(connector_path)` +- Any exception that adds a new property should also be decorated as `@dataclass`. +""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass +from pathlib import Path +from textwrap import indent +from typing import Any + +NEW_ISSUE_URL = "https://github.com/airbytehq/airbyte/issues/new/choose" +DOCS_URL_BASE = "https://https://docs.airbyte.com/" +DOCS_URL = f"{DOCS_URL_BASE}/airbyte.html" + +VERTICAL_SEPARATOR = "\n" + "-" * 60 + + +# Base error class + + +@dataclass +class AirbyteError(Exception): + """Base class for exceptions in Airbyte.""" + + guidance: str | None = None + help_url: str | None = None + log_text: str | list[str] | None = None + log_file: Path | None = None + context: dict[str, Any] | None = None + message: str | None = None + original_exception: Exception | None = None + + def get_message(self) -> str: + """Return the best description for the exception. + + We resolve the following in order: + 1. The message sent to the exception constructor (if provided). + 2. The first line of the class's docstring. 
+ """ + if self.message: + return self.message + + return self.__doc__.split("\n")[0] if self.__doc__ else "" + + def __str__(self) -> str: + """Return a string representation of the exception.""" + special_properties = [ + "message", + "guidance", + "help_url", + "log_text", + "context", + "log_file", + "original_exception", + ] + display_properties = { + k: v for k, v in self.__dict__.items() if k not in special_properties and not k.startswith("_") and v is not None + } + display_properties.update(self.context or {}) + context_str = "\n ".join(f"{str(k).replace('_', ' ').title()}: {v!r}" for k, v in display_properties.items()) + exception_str = ( + f"{self.get_message()} ({self.__class__.__name__})" + VERTICAL_SEPARATOR + f"\n{self.__class__.__name__}: {self.get_message()}" + ) + + if self.guidance: + exception_str += f"\n {self.guidance}" + + if self.help_url: + exception_str += f"\n More info: {self.help_url}" + + if context_str: + exception_str += "\n " + context_str + + if self.log_file: + exception_str += f"\n Log file: {self.log_file.absolute()!s}" + + if self.log_text: + if isinstance(self.log_text, list): + self.log_text = "\n".join(self.log_text) + + exception_str += f"\n Log output: \n {indent(self.log_text, ' ')}" + + if self.original_exception: + exception_str += VERTICAL_SEPARATOR + f"\nCaused by: {self.original_exception!s}" + + return exception_str + + def __repr__(self) -> str: + """Return a string representation of the exception.""" + class_name = self.__class__.__name__ + properties_str = ", ".join(f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")) + return f"{class_name}({properties_str})" + + def safe_logging_dict(self) -> dict[str, Any]: + """Return a dictionary of the exception's properties which is safe for logging. + + We avoid any properties which could potentially contain PII. + """ + result = { + # The class name is safe to log: + "class": self.__class__.__name__, + # We discourage interpolated strings in 'message' so that this should never contain PII: + "message": self.get_message(), + } + safe_attrs = ["connector_name", "stream_name", "violation", "exit_code"] + for attr in safe_attrs: + if hasattr(self, attr): + result[attr] = getattr(self, attr) + + return result + + +# Airbyte Internal Errors (these are probably bugs) + + +@dataclass +class AirbyteInternalError(AirbyteError): + """An internal error occurred in Airbyte.""" + + guidance = "Please consider reporting this error to the Airbyte team." + help_url = NEW_ISSUE_URL + + +# Airbyte Input Errors (replaces ValueError for user input) + + +@dataclass +class AirbyteInputError(AirbyteError, ValueError): + """The input provided to Airbyte did not match expected validation rules. + + This inherits from ValueError so that it can be used as a drop-in replacement for + ValueError in the Airbyte API. + """ + + guidance = "Please check the provided value and try again." + help_url = DOCS_URL + input_value: str | None = None + + +# Normalization Errors + + +@dataclass +class AirbyteNameNormalizationError(AirbyteError, ValueError): + """Error occurred while normalizing a table or column name.""" + + guidance = "Please consider renaming the source object if possible, or " "raise an issue in GitHub if not." 
+ help_url = NEW_ISSUE_URL + + raw_name: str | None = None + normalization_result: str | None = None + + +@dataclass +class AirbyteConnectorError(AirbyteError): + """Error when running the connector.""" + + connector_name: str | None = None + + def __post_init__(self) -> None: + """Set the log file path for the connector.""" + self.log_file = self._get_log_file() + if not self.guidance and self.log_file: + self.guidance = "Please review the log file for more information." + + def _get_log_file(self) -> Path | None: + """Return the log file path for the connector.""" + if self.connector_name: + logger = logging.getLogger(f"airbyte.{self.connector_name}") + + log_paths: list[Path] = [ + Path(handler.baseFilename).absolute() for handler in logger.handlers if isinstance(handler, logging.FileHandler) + ] + + if log_paths: + return log_paths[0] + + return None + + +@dataclass +class AirbyteStreamNotFoundError(AirbyteConnectorError): + """Connector stream not found.""" + + stream_name: str | None = None + available_streams: list[str] | None = None diff --git a/airbyte-cdk/python/airbyte_cdk/sql/secrets.py b/airbyte-cdk/python/airbyte_cdk/sql/secrets.py new file mode 100644 index 000000000000..aaf7641aa934 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sql/secrets.py @@ -0,0 +1,120 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Base classes and methods for working with secrets in Airbyte.""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, Any + +from airbyte_cdk.sql import exceptions as exc +from pydantic_core import CoreSchema, core_schema + +if TYPE_CHECKING: + from pydantic import GetCoreSchemaHandler, GetJsonSchemaHandler, ValidationInfo + from pydantic.json_schema import JsonSchemaValue + + +class SecretString(str): + """A string that represents a secret. + + This class is used to mark a string as a secret. When a secret is printed, it + will be masked to prevent accidental exposure of sensitive information when debugging + or when printing containing objects like dictionaries. + + To create a secret string, simply instantiate the class with any string value: + + ```python + secret = SecretString("my_secret_password") + ``` + + """ + + __slots__ = () + + def __repr__(self) -> str: + """Override the representation of the secret string to return a masked value. + + The secret string is always masked with `****` to prevent accidental exposure, unless + explicitly converted to a string. For instance, printing a config dictionary that contains + a secret will automatically mask the secret value instead of printing it in plain text. + + However, if you explicitly convert the cast the secret as a string, such as when used + in an f-string, the secret will be exposed. This is the desired behavior to allow + secrets to be used in a controlled manner. + """ + return "" + + def is_empty(self) -> bool: + """Check if the secret is an empty string.""" + return len(self) == 0 + + def is_json(self) -> bool: + """Check if the secret string is a valid JSON string.""" + try: + json.loads(self) + except (json.JSONDecodeError, Exception): + return False + + return True + + def __bool__(self) -> bool: + """Override the boolean value of the secret string. + + Always returns `True` without inspecting contents. 
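A brief sketch of `SecretString` behavior as described in its docstrings:

```python
secret = SecretString("super-secret-token")

print(repr(secret))        # masked representation; the raw value is never shown
print(f"token={secret}")   # explicit string interpolation does expose the value

assert bool(SecretString("")) is True            # truthiness never inspects the contents
assert SecretString('{"key": "value"}').is_json() is True
assert SecretString("not json").is_json() is False
```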
+ """ + return True + + def parse_json(self) -> Any: + """Parse the secret string as JSON.""" + try: + return json.loads(self) + except json.JSONDecodeError as ex: + raise exc.AirbyteInputError( + message="Failed to parse secret as JSON.", + context={ + "Message": ex.msg, + "Position": ex.pos, + "SecretString_Length": len(self), # Debug secret blank or an unexpected format. + }, + ) from None + + # Pydantic compatibility + + @classmethod + def validate( + cls, + v: Any, # noqa: ANN401 # Must allow `Any` to match Pydantic signature + info: ValidationInfo, + ) -> SecretString: + """Validate the input value is valid as a secret string.""" + _ = info # Unused + if not isinstance(v, str): + raise exc.AirbyteInputError( + message="A valid `str` or `SecretString` object is required.", + ) + return cls(v) + + @classmethod + def __get_pydantic_core_schema__( # noqa: PLW3201 # Pydantic dunder + cls, + source_type: Any, # noqa: ANN401 # Must allow `Any` to match Pydantic signature + handler: GetCoreSchemaHandler, + ) -> CoreSchema: + """Return a modified core schema for the secret string.""" + return core_schema.with_info_after_validator_function(function=cls.validate, schema=handler(str), field_name=handler.field_name) + + @classmethod + def __get_pydantic_json_schema__( # noqa: PLW3201 # Pydantic dunder method + cls, _core_schema: core_schema.CoreSchema, handler: GetJsonSchemaHandler + ) -> JsonSchemaValue: + """Return a modified JSON schema for the secret string. + + - `writeOnly=True` is the official way to prevent secrets from being exposed inadvertently. + - `Format=password` is a popular and readable convention to indicate the field is sensitive. + """ + _ = _core_schema, handler # Unused + return { + "type": "string", + "format": "password", + "writeOnly": True, + } diff --git a/airbyte-cdk/python/airbyte_cdk/sql/shared/__init__.py b/airbyte-cdk/python/airbyte_cdk/sql/shared/__init__.py new file mode 100644 index 000000000000..d9156b9d0d76 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sql/shared/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Module for future CDK components. + +Components here are planned to move to the CDK. + +TODO!: Add GitHub link here before merging. +""" + +from __future__ import annotations + +from airbyte_cdk.sql.shared.sql_processor import SqlProcessorBase + +__all__ = [ + "SqlProcessorBase", +] diff --git a/airbyte-cdk/python/airbyte_cdk/sql/shared/catalog_providers.py b/airbyte-cdk/python/airbyte_cdk/sql/shared/catalog_providers.py new file mode 100644 index 000000000000..8d139c9c9cd2 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sql/shared/catalog_providers.py @@ -0,0 +1,136 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""Catalog provider implementation. + +A catalog provider wraps a configured catalog and configured streams. This class is responsible for +providing information about the catalog and streams. A catalog provider can also be updated with new +streams as they are discovered, providing a thin layer of abstraction over the configured catalog. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, cast, final + +from airbyte_cdk.models import ConfiguredAirbyteCatalog +from airbyte_cdk.sql import exceptions as exc +from airbyte_cdk.sql._util.name_normalizers import LowerCaseNormalizer + +if TYPE_CHECKING: + from airbyte_cdk.models import ConfiguredAirbyteStream + + +class CatalogProvider: + """A catalog provider wraps a configured catalog and configured streams. + + This class is responsible for providing information about the catalog and streams. + + Note: + - The catalog provider is not responsible for managing the catalog or streams but it may + be updated with new streams as they are discovered. + """ + + def __init__( + self, + configured_catalog: ConfiguredAirbyteCatalog, + ) -> None: + """Initialize the catalog manager with a catalog object reference. + + Since the catalog is passed by reference, the catalog manager may be updated with new + streams as they are discovered. + """ + self._catalog: ConfiguredAirbyteCatalog = self.validate_catalog(configured_catalog) + + @staticmethod + def validate_catalog(catalog: ConfiguredAirbyteCatalog) -> Any: + """Validate the catalog to ensure it is valid. + + This requires ensuring that `generationId` and `minGenerationId` are both set. If + not, both values will be set to `1`. + """ + for stream in catalog.streams: + if stream.generation_id is None: + stream.generation_id = 1 + if stream.minimum_generation_id is None: + stream.minimum_generation_id = 1 + if stream.sync_id is None: + stream.sync_id = 1 # This should ideally increment monotonically with each sync. + + return catalog + + @property + def configured_catalog(self) -> ConfiguredAirbyteCatalog: + """Return the configured catalog.""" + return self._catalog + + @property + def stream_names(self) -> list[str]: + """Return the names of the streams in the catalog.""" + return list({stream.stream.name for stream in self.configured_catalog.streams}) + + def get_configured_stream_info( + self, + stream_name: str, + ) -> ConfiguredAirbyteStream: + """Return the column definitions for the given stream.""" + if not self.configured_catalog: + raise exc.AirbyteInternalError( + message="Cannot get stream JSON schema without a catalog.", + ) + + matching_streams: list[ConfiguredAirbyteStream] = [ + stream for stream in self.configured_catalog.streams if stream.stream.name == stream_name + ] + if not matching_streams: + raise exc.AirbyteStreamNotFoundError( + stream_name=stream_name, + context={ + "available_streams": [stream.stream.name for stream in self.configured_catalog.streams], + }, + ) + + if len(matching_streams) > 1: + raise exc.AirbyteInternalError( + message="Multiple streams found with same name.", + context={ + "stream_name": stream_name, + }, + ) + + return matching_streams[0] + + @final + def get_stream_json_schema( + self, + stream_name: str, + ) -> dict[str, Any]: + """Return the column definitions for the given stream.""" + return cast(dict[str, Any], self.get_configured_stream_info(stream_name).stream.json_schema) + + def get_stream_properties( + self, + stream_name: str, + ) -> dict[str, dict[str, Any]]: + """Return the names of the top-level properties for the given stream.""" + return cast(dict[str, Any], self.get_stream_json_schema(stream_name)["properties"]) + + def get_primary_keys( + self, + stream_name: str, + ) -> list[str]: + """Return the primary keys for the given stream.""" + pks = self.get_configured_stream_info(stream_name).primary_key + if not pks: + return [] + + 
normalized_pks: list[list[str]] = [[LowerCaseNormalizer.normalize(c) for c in pk] for pk in pks] + + for pk_nodes in normalized_pks: + if len(pk_nodes) != 1: + raise exc.AirbyteError( + message=("Nested primary keys are not supported. " "Each PK column should have exactly one node. "), + context={ + "stream_name": stream_name, + "primary_key_nodes": pk_nodes, + }, + ) + + return [pk_nodes[0] for pk_nodes in normalized_pks] diff --git a/airbyte-cdk/python/airbyte_cdk/sql/shared/sql_processor.py b/airbyte-cdk/python/airbyte_cdk/sql/shared/sql_processor.py new file mode 100644 index 000000000000..52a8e52dee3a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sql/shared/sql_processor.py @@ -0,0 +1,754 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""The base SQL Cache implementation.""" + +from __future__ import annotations + +import abc +from collections import defaultdict +from contextlib import contextmanager +from functools import cached_property +from pathlib import Path +from typing import TYPE_CHECKING, Any, final + +import pandas as pd +import sqlalchemy +import ulid +from airbyte_cdk.sql import exceptions as exc +from airbyte_cdk.sql._util.hashing import one_way_hash +from airbyte_cdk.sql._util.name_normalizers import LowerCaseNormalizer +from airbyte_cdk.sql.constants import AB_EXTRACTED_AT_COLUMN, AB_META_COLUMN, AB_RAW_ID_COLUMN, DEBUG_MODE +from airbyte_cdk.sql.secrets import SecretString +from airbyte_cdk.sql.types import SQLTypeConverter +from airbyte_protocol_dataclasses.models import AirbyteStateMessage +from pandas import Index +from pydantic import BaseModel, Field +from sqlalchemy import Column, Table, and_, create_engine, insert, null, select, text, update +from sqlalchemy.exc import ProgrammingError, SQLAlchemyError + +if TYPE_CHECKING: + from collections.abc import Generator + + from airbyte_cdk.sql.shared.catalog_providers import CatalogProvider + from sqlalchemy.engine import Connection, Engine + from sqlalchemy.engine.cursor import CursorResult + from sqlalchemy.engine.reflection import Inspector + from sqlalchemy.sql.base import Executable + from sqlalchemy.sql.elements import TextClause + from sqlalchemy.sql.type_api import TypeEngine + + +class SQLRuntimeError(Exception): + """Raised when an SQL operation fails.""" + + +class SqlConfig(BaseModel, abc.ABC): + """Common configuration for SQL connections.""" + + schema_name: str = Field(default="airbyte_raw") + """The name of the schema to write to.""" + + table_prefix: str | None = "" + """A prefix to add to created table names.""" + + @abc.abstractmethod + def get_sql_alchemy_url(self) -> SecretString: + """Returns a SQL Alchemy URL.""" + ... + + @abc.abstractmethod + def get_database_name(self) -> str: + """Return the name of the database.""" + ... + + @property + def config_hash(self) -> str | None: + """Return a unique one-way hash of the configuration. + + The generic implementation uses the SQL Alchemy URL, schema name, and table prefix. Some + inputs may be redundant with the SQL Alchemy URL, but this does not hurt the hash + uniqueness. + + In most cases, subclasses do not need to override this method. 
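As an illustration of how `SqlConfig` is meant to be subclassed, a hypothetical Postgres-style config; the class name and connection details are made up and are not part of this PR.

```python
class ExamplePostgresConfig(SqlConfig):
    """Hypothetical SQL config for a local Postgres database (illustrative only)."""

    host: str = "localhost"
    port: int = 5432
    database: str = "airbyte"
    username: str = "postgres"
    password: str = ""

    def get_sql_alchemy_url(self) -> SecretString:
        return SecretString(
            f"postgresql+psycopg2://{self.username}:{self.password}"
            f"@{self.host}:{self.port}/{self.database}"
        )

    def get_database_name(self) -> str:
        return self.database
```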
+ """ + return one_way_hash( + SecretString( + ":".join( + [ + str(self.get_sql_alchemy_url()), + self.schema_name or "", + self.table_prefix or "", + ] + ) + ) + ) + + def get_sql_engine(self) -> Engine: + """Return a new SQL engine to use.""" + return create_engine( + url=self.get_sql_alchemy_url(), + echo=DEBUG_MODE, + execution_options={ + "schema_translate_map": {None: self.schema_name}, + }, + future=True, + ) + + def get_vendor_client(self) -> object: + """Return the vendor-specific client object. + + This is used for vendor-specific operations. + + Raises `NotImplementedError` if a custom vendor client is not defined. + """ + raise NotImplementedError(f"The type '{type(self).__name__}' does not define a custom client.") + + +class SqlProcessorBase(abc.ABC): + """A base class to be used for SQL Caches.""" + + type_converter_class: type[SQLTypeConverter] = SQLTypeConverter + """The type converter class to use for converting JSON schema types to SQL types.""" + + normalizer = LowerCaseNormalizer + """The name normalizer to user for table and column name normalization.""" + + supports_merge_insert = False + """True if the database supports the MERGE INTO syntax.""" + + def __init__( + self, + *, + sql_config: SqlConfig, + catalog_provider: CatalogProvider, + ) -> None: + """Create a new SQL processor.""" + self._sql_config: SqlConfig = sql_config + self._catalog_provider: CatalogProvider | None = catalog_provider + + self._pending_state_messages: dict[str, list[AirbyteStateMessage]] = defaultdict(list, {}) + self._finalized_state_messages: dict[ + str, + list[AirbyteStateMessage], + ] = defaultdict(list, {}) + + self._setup() + self.type_converter = self.type_converter_class() + self._cached_table_definitions: dict[str, sqlalchemy.Table] = {} + + self._known_schemas_list: list[str] = [] + self._ensure_schema_exists() + + @property + def catalog_provider( + self, + ) -> CatalogProvider: + """Return the catalog manager. + + Subclasses should set this property to a valid catalog manager instance if one + is not explicitly passed to the constructor. + + Raises: + AirbyteInternalError: If the catalog manager is not set. + """ + if not self._catalog_provider: + raise exc.AirbyteInternalError( + message="Catalog manager should exist but does not.", + ) + + return self._catalog_provider + + def _setup(self) -> None: # noqa: B027 # Intentionally empty, not abstract + """Create the database. + + By default this is a no-op but subclasses can override this method to prepare + any necessary resources. + """ + pass + + def _do_checkpoint( # noqa: B027 # Intentionally empty, not abstract + self, + connection: Connection | None = None, + ) -> None: + """Checkpoint the given connection. + + If the WAL log needs to be, it will be flushed. + + For most SQL databases, this is a no-op. However, it exists so that + subclasses can override this method to perform a checkpoint operation. 
+ """ + pass + + # Public interface: + + @property + def sql_config(self) -> SqlConfig: + """Return the SQL configuration.""" + return self._sql_config + + def get_sql_alchemy_url(self) -> SecretString: + """Return the SQLAlchemy URL to use.""" + return self.sql_config.get_sql_alchemy_url() + + @final + @cached_property + def database_name(self) -> str: + """Return the name of the database.""" + return self.sql_config.get_database_name() + + @final + def get_sql_engine(self) -> Engine: + """Return a new SQL engine to use.""" + return self.sql_config.get_sql_engine() + + @contextmanager + def get_sql_connection(self) -> Generator[sqlalchemy.engine.Connection, None, None]: + """A context manager which returns a new SQL connection for running queries. + + If the connection needs to close, it will be closed automatically. + """ + with self.get_sql_engine().begin() as connection: + self._init_connection_settings(connection) + yield connection + + connection.close() + del connection + + def get_sql_table_name( + self, + stream_name: str, + ) -> str: + """Return the name of the SQL table for the given stream.""" + table_prefix = self.sql_config.table_prefix + return self.normalizer.normalize( + f"{table_prefix}{stream_name}", + ) + + @final + def get_sql_table( + self, + stream_name: str, + ) -> sqlalchemy.Table: + """Return the main table object for the stream.""" + return self._get_table_by_name( + self.get_sql_table_name(stream_name), + ) + + # Protected members (non-public interface): + + def _init_connection_settings(self, connection: Connection) -> None: # noqa: B027 # Intentionally empty, not abstract + """This is called automatically whenever a new connection is created. + + By default this is a no-op. Subclasses can use this to set connection settings, such as + timezone, case-sensitivity settings, and other session-level variables. + """ + pass + + def _invalidate_table_cache( + self, + table_name: str, + ) -> None: + """Invalidate the the named table cache. + + This should be called whenever the table schema is known to have changed. + """ + if table_name in self._cached_table_definitions: + del self._cached_table_definitions[table_name] + + def _get_table_by_name( + self, + table_name: str, + *, + force_refresh: bool = False, + shallow_okay: bool = False, + ) -> sqlalchemy.Table: + """Return a table object from a table name. + + If 'shallow_okay' is True, the table will be returned without requiring properties to + be read from the database. + + To prevent unnecessary round-trips to the database, the table is cached after the first + query. To ignore the cache and force a refresh, set 'force_refresh' to True. + """ + if force_refresh and shallow_okay: + raise exc.AirbyteInternalError(message="Cannot force refresh and use shallow query at the same time.") + + if force_refresh and table_name in self._cached_table_definitions: + self._invalidate_table_cache(table_name) + + if table_name not in self._cached_table_definitions: + if shallow_okay: + # Return a shallow instance, without column declarations. Do not cache + # the table definition in this case. 
+ return sqlalchemy.Table( + table_name, + sqlalchemy.MetaData(schema=self.sql_config.schema_name), + ) + + self._cached_table_definitions[table_name] = sqlalchemy.Table( + table_name, + sqlalchemy.MetaData(schema=self.sql_config.schema_name), + autoload_with=self.get_sql_engine(), + ) + + return self._cached_table_definitions[table_name] + + def _ensure_schema_exists( + self, + ) -> None: + schema_name = self.normalizer.normalize(self.sql_config.schema_name) + known_schemas_list = self.normalizer.normalize_list(self._known_schemas_list) + if known_schemas_list and schema_name in known_schemas_list: + return # Already exists + + schemas_list = self.normalizer.normalize_list(self._get_schemas_list()) + if schema_name in schemas_list: + return + + sql = f"CREATE SCHEMA IF NOT EXISTS {schema_name}" + + try: + self._execute_sql(sql) + except Exception as ex: + # Ignore schema exists errors. + if "already exists" not in str(ex): + raise + + if DEBUG_MODE: + found_schemas = schemas_list + assert schema_name in found_schemas, f"Schema {schema_name} was not created. Found: {found_schemas}" + + def _quote_identifier(self, identifier: str) -> str: + """Return the given identifier, quoted.""" + return f'"{identifier}"' + + @final + def _get_temp_table_name( + self, + stream_name: str, + batch_id: str | None = None, # ULID of the batch + ) -> str: + """Return a new (unique) temporary table name.""" + if not batch_id: + batch_id = str(ulid.ULID()) + + # Use the first 6 and last 3 characters of the ULID. This gives great uniqueness while + # limiting the table name suffix to 10 characters, including the underscore. + suffix = f"{batch_id[:6]}{batch_id[-3:]}" if len(batch_id) > 9 else batch_id # noqa: PLR2004 # Allow magic int value + + # Note: The normalizer may truncate the table name if the database has a name length limit. + # For instance, the Postgres normalizer will enforce a 63-character limit on table names. + return self.normalizer.normalize(f"{stream_name}_{suffix}") + + def _fully_qualified( + self, + table_name: str, + ) -> str: + """Return the fully qualified name of the given table.""" + return f"{self.sql_config.schema_name}.{self._quote_identifier(table_name)}" + + @final + def _create_table_for_loading( + self, + /, + stream_name: str, + batch_id: str | None, + ) -> str: + """Create a new table for loading data.""" + temp_table_name = self._get_temp_table_name(stream_name, batch_id) + column_definition_str = ",\n ".join( + f"{self._quote_identifier(column_name)} {sql_type}" + for column_name, sql_type in self._get_sql_column_definitions(stream_name).items() + ) + self._create_table(temp_table_name, column_definition_str) + + return temp_table_name + + def _get_tables_list( + self, + ) -> list[str]: + """Return a list of all tables in the database.""" + with self.get_sql_connection() as conn: + inspector: Inspector = sqlalchemy.inspect(conn) + return inspector.get_table_names(schema=self.sql_config.schema_name) # type: ignore + + def _get_schemas_list( + self, + database_name: str | None = None, + *, + force_refresh: bool = False, + ) -> list[str]: + """Return a list of all tables in the database.""" + if not force_refresh and self._known_schemas_list: + return self._known_schemas_list + + inspector: Inspector = sqlalchemy.inspect(self.get_sql_engine()) + database_name = database_name or self.database_name + found_schemas = inspector.get_schema_names() + self._known_schemas_list = [ + found_schema.split(".")[-1].strip('"') + for found_schema in found_schemas + if "." 
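For reference, the naming helpers above behave roughly as follows, assuming the default `LowerCaseNormalizer` and the default `airbyte_raw` schema; the batch id is illustrative.

```python
processor._get_temp_table_name("Users", batch_id="01J9ZX2M8Q3T")
# -> "users_01j9zxq3t"   (first 6 + last 3 characters of the batch id, lower-cased)

processor._fully_qualified("users")
# -> 'airbyte_raw."users"'
```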
not in found_schema or (found_schema.split(".")[0].lower().strip('"') == database_name.lower()) + ] + return self._known_schemas_list + + def _ensure_final_table_exists( + self, + stream_name: str, + *, + create_if_missing: bool = True, + ) -> str: + """Create the final table if it doesn't already exist. + + Return the table name. + """ + table_name = self.get_sql_table_name(stream_name) + did_exist = self._table_exists(table_name) + if not did_exist and create_if_missing: + column_definition_str = ",\n ".join( + f"{self._quote_identifier(column_name)} {sql_type}" + for column_name, sql_type in self._get_sql_column_definitions( + stream_name, + ).items() + ) + self._create_table(table_name, column_definition_str) + + return table_name + + def _ensure_compatible_table_schema( + self, + stream_name: str, + table_name: str, + ) -> None: + """Return true if the given table is compatible with the stream's schema. + + Raises an exception if the table schema is not compatible with the schema of the + input stream. + """ + # TODO: Expand this to check for column types and sizes. + # https://github.com/airbytehq/Airbyte/issues/321 + self._add_missing_columns_to_table( + stream_name=stream_name, + table_name=table_name, + ) + + @final + def _create_table( + self, + table_name: str, + column_definition_str: str, + primary_keys: list[str] | None = None, + ) -> None: + if primary_keys: + pk_str = ", ".join(primary_keys) + column_definition_str += f",\n PRIMARY KEY ({pk_str})" + + cmd = f""" + CREATE TABLE {self._fully_qualified(table_name)} ( + {column_definition_str} + ) + """ + _ = self._execute_sql(cmd) + + @final + def _get_sql_column_definitions( + self, + stream_name: str, + ) -> dict[str, sqlalchemy.types.TypeEngine[Any]]: + """Return the column definitions for the given stream.""" + columns: dict[str, sqlalchemy.types.TypeEngine[Any]] = {} + properties = self.catalog_provider.get_stream_properties(stream_name) + for property_name, json_schema_property_def in properties.items(): + clean_prop_name = self.normalizer.normalize(property_name) + columns[clean_prop_name] = self.type_converter.to_sql_type( + json_schema_property_def, + ) + + columns[AB_RAW_ID_COLUMN] = self.type_converter_class.get_string_type() + columns[AB_EXTRACTED_AT_COLUMN] = sqlalchemy.TIMESTAMP() + columns[AB_META_COLUMN] = self.type_converter_class.get_json_type() + + return columns + + def _execute_sql(self, sql: str | TextClause | Executable) -> CursorResult[Any]: + """Execute the given SQL statement.""" + if isinstance(sql, str): + sql = text(sql) + + with self.get_sql_connection() as conn: + try: + result = conn.execute(sql) + except ( + ProgrammingError, + SQLAlchemyError, + ) as ex: + msg = f"Error when executing SQL:\n{sql}\n{type(ex).__name__}{ex!s}" + raise SQLRuntimeError(msg) from None # from ex + + return result + + def _drop_temp_table( + self, + table_name: str, + *, + if_exists: bool = True, + ) -> None: + """Drop the given table.""" + exists_str = "IF EXISTS" if if_exists else "" + self._execute_sql(f"DROP TABLE {exists_str} {self._fully_qualified(table_name)}") + + def _write_files_to_new_table( + self, + files: list[Path], + stream_name: str, + batch_id: str, + ) -> str: + """Write a file(s) to a new table. + + This is a generic implementation, which can be overridden by subclasses + to improve performance. 
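To make the DDL path concrete, a sketch of what the column helpers produce for a simple stream; the exact SQL types depend on the configured `SQLTypeConverter`, so they are indicative only (the `AB_*` constants come from `airbyte_cdk.sql.constants`).

```python
columns = processor._get_sql_column_definitions("users")

# Every stream table gets its normalized user-defined properties plus three Airbyte columns:
assert AB_RAW_ID_COLUMN in columns          # "_airbyte_raw_id"       -> string type
assert AB_EXTRACTED_AT_COLUMN in columns    # "_airbyte_extracted_at" -> TIMESTAMP
assert AB_META_COLUMN in columns            # "_airbyte_meta"         -> JSON type

processor._ensure_final_table_exists("users")   # creates the final table if it is missing
```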
+ """ + temp_table_name = self._create_table_for_loading(stream_name, batch_id) + for file_path in files: + dataframe = pd.read_json(file_path, lines=True) + + sql_column_definitions: dict[str, TypeEngine[Any]] = self._get_sql_column_definitions(stream_name) + + # Remove fields that are not in the schema + for col_name in dataframe.columns: + if col_name not in sql_column_definitions: + dataframe = dataframe.drop(columns=col_name) + + # Pandas will auto-create the table if it doesn't exist, which we don't want. + if not self._table_exists(temp_table_name): + raise exc.AirbyteInternalError( + message="Table does not exist after creation.", + context={ + "temp_table_name": temp_table_name, + }, + ) + + # Normalize all column names to lower case. + dataframe.columns = Index([self.normalizer.normalize(col) for col in dataframe.columns]) + + # Write the data to the table. + dataframe.to_sql( + temp_table_name, + self.get_sql_alchemy_url(), + schema=self.sql_config.schema_name, + if_exists="append", + index=False, + dtype=sql_column_definitions, # type: ignore[arg-type] + ) + return temp_table_name + + def _add_column_to_table( + self, + table: Table, + column_name: str, + column_type: sqlalchemy.types.TypeEngine[Any], + ) -> None: + """Add a column to the given table.""" + self._execute_sql( + text(f"ALTER TABLE {self._fully_qualified(table.name)} " f"ADD COLUMN {column_name} {column_type}"), + ) + + def _add_missing_columns_to_table( + self, + stream_name: str, + table_name: str, + ) -> None: + """Add missing columns to the table. + + This is a no-op if all columns are already present. + """ + columns = self._get_sql_column_definitions(stream_name) + # First check without forcing a refresh of the cache (faster). If nothing is missing, + # then we're done. + table = self._get_table_by_name( + table_name, + force_refresh=False, + ) + missing_columns: bool = any(column_name not in table.columns for column_name in columns) + + if missing_columns: + # If we found missing columns, refresh the cache and then take action on anything + # that's still confirmed missing. + columns_added = False + table = self._get_table_by_name( + table_name, + force_refresh=True, + ) + for column_name, column_type in columns.items(): + if column_name not in table.columns: + self._add_column_to_table(table, column_name, column_type) + columns_added = True + + if columns_added: + # We've added columns, so invalidate the cache. + self._invalidate_table_cache(table_name) + + def _append_temp_table_to_final_table( + self, + temp_table_name: str, + final_table_name: str, + stream_name: str, + ) -> None: + nl = "\n" + columns = [self._quote_identifier(c) for c in self._get_sql_column_definitions(stream_name)] + self._execute_sql( + f""" + INSERT INTO {self._fully_qualified(final_table_name)} ( + {f',{nl} '.join(columns)} + ) + SELECT + {f',{nl} '.join(columns)} + FROM {self._fully_qualified(temp_table_name)} + """, + ) + + def _swap_temp_table_with_final_table( + self, + stream_name: str, + temp_table_name: str, + final_table_name: str, + ) -> None: + """Merge the temp table into the main one. + + This implementation requires MERGE support in the SQL DB. + Databases that do not support this syntax can override this method. 
+ """ + if final_table_name is None: + raise exc.AirbyteInternalError(message="Arg 'final_table_name' cannot be None.") + if temp_table_name is None: + raise exc.AirbyteInternalError(message="Arg 'temp_table_name' cannot be None.") + + _ = stream_name + deletion_name = f"{final_table_name}_deleteme" + commands = "\n".join( + [ + f"ALTER TABLE {self._fully_qualified(final_table_name)} RENAME " f"TO {deletion_name};", + f"ALTER TABLE {self._fully_qualified(temp_table_name)} RENAME " f"TO {final_table_name};", + f"DROP TABLE {self._fully_qualified(deletion_name)};", + ] + ) + self._execute_sql(commands) + + def _merge_temp_table_to_final_table( + self, + stream_name: str, + temp_table_name: str, + final_table_name: str, + ) -> None: + """Merge the temp table into the main one. + + This implementation requires MERGE support in the SQL DB. + Databases that do not support this syntax can override this method. + """ + nl = "\n" + columns = {self._quote_identifier(c) for c in self._get_sql_column_definitions(stream_name)} + pk_columns = {self._quote_identifier(c) for c in self.catalog_provider.get_primary_keys(stream_name)} + non_pk_columns = columns - pk_columns + join_clause = f"{nl} AND ".join(f"tmp.{pk_col} = final.{pk_col}" for pk_col in pk_columns) + set_clause = f"{nl} , ".join(f"{col} = tmp.{col}" for col in non_pk_columns) + self._execute_sql( + f""" + MERGE INTO {self._fully_qualified(final_table_name)} final + USING ( + SELECT * + FROM {self._fully_qualified(temp_table_name)} + ) AS tmp + ON {join_clause} + WHEN MATCHED THEN UPDATE + SET + {set_clause} + WHEN NOT MATCHED THEN INSERT + ( + {f',{nl} '.join(columns)} + ) + VALUES ( + tmp.{f',{nl} tmp.'.join(columns)} + ); + """, + ) + + def _get_column_by_name(self, table: str | Table, column_name: str) -> Column[Any]: + """Return the column object for the given column name. + + This method is case-insensitive. + """ + if isinstance(table, str): + table = self._get_table_by_name(table) + try: + # Try to get the column in a case-insensitive manner + return next(col for col in table.c if col.name.lower() == column_name.lower()) + except StopIteration: + raise exc.AirbyteInternalError( + message="Could not find matching column.", + context={ + "table": table, + "column_name": column_name, + }, + ) from None + + def _emulated_merge_temp_table_to_final_table( + self, + stream_name: str, + temp_table_name: str, + final_table_name: str, + ) -> None: + """Emulate the merge operation using a series of SQL commands. + + This is a fallback implementation for databases that do not support MERGE. 
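Since the emulated UPDATE-then-INSERT merge described here is the fallback for engines without MERGE support, a subclass for such an engine might simply route merges to it. A minimal sketch, with `SqlProcessorBase` as an assumed parent class name (not defined in this change):

```python
# A sketch only: `SqlProcessorBase` is an assumed parent class name for illustration.
class NoMergeSqlProcessor(SqlProcessorBase):
    def _merge_temp_table_to_final_table(
        self,
        stream_name: str,
        temp_table_name: str,
        final_table_name: str,
    ) -> None:
        """Delegate to the emulated UPDATE-then-INSERT merge for engines without MERGE."""
        self._emulated_merge_temp_table_to_final_table(
            stream_name=stream_name,
            temp_table_name=temp_table_name,
            final_table_name=final_table_name,
        )
```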
+ """ + final_table = self._get_table_by_name(final_table_name) + temp_table = self._get_table_by_name(temp_table_name) + pk_columns = self.catalog_provider.get_primary_keys(stream_name) + + columns_to_update: set[str] = self._get_sql_column_definitions(stream_name=stream_name).keys() - set(pk_columns) + + # Create a dictionary mapping columns in users_final to users_stage for updating + update_values = { + self._get_column_by_name(final_table, column): (self._get_column_by_name(temp_table, column)) for column in columns_to_update + } + + # Craft the WHERE clause for composite primary keys + join_conditions = [ + self._get_column_by_name(final_table, pk_column) == self._get_column_by_name(temp_table, pk_column) for pk_column in pk_columns + ] + join_clause = and_(*join_conditions) + + # Craft the UPDATE statement + update_stmt = update(final_table).values(update_values).where(join_clause) + + # Define a join between temp_table and final_table + joined_table = temp_table.outerjoin(final_table, join_clause) + + # Define a condition that checks for records in temp_table that do not have a corresponding + # record in final_table + where_not_exists_clause = self._get_column_by_name(final_table, pk_columns[0]) == null() + + # Select records from temp_table that are not in final_table + select_new_records_stmt = select(temp_table).select_from(joined_table).where(where_not_exists_clause) + + # Craft the INSERT statement using the select statement + insert_new_records_stmt = insert(final_table).from_select( + names=[column.name for column in temp_table.columns], select=select_new_records_stmt + ) + + if DEBUG_MODE: + print(str(update_stmt)) + print(str(insert_new_records_stmt)) + + with self.get_sql_connection() as conn: + conn.execute(update_stmt) + conn.execute(insert_new_records_stmt) + + def _table_exists( + self, + table_name: str, + ) -> bool: + """Return true if the given table exists. + + Subclasses may override this method to provide a more efficient implementation. + """ + return table_name in self._get_tables_list() diff --git a/airbyte-cdk/python/airbyte_cdk/sql/types.py b/airbyte-cdk/python/airbyte_cdk/sql/types.py new file mode 100644 index 000000000000..bb6fa1cb76df --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sql/types.py @@ -0,0 +1,160 @@ +# noqa: A005 # Allow shadowing the built-in 'types' module +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + +"""Type conversion methods for SQL Caches.""" + +from __future__ import annotations + +from typing import Any, cast + +import sqlalchemy + +# Compare to documentation here: https://docs.airbyte.com/understanding-airbyte/supported-data-types +CONVERSION_MAP = { + "string": sqlalchemy.types.VARCHAR, + "integer": sqlalchemy.types.BIGINT, + "number": sqlalchemy.types.DECIMAL(38, 9), + "boolean": sqlalchemy.types.BOOLEAN, + "date": sqlalchemy.types.DATE, + "timestamp_with_timezone": sqlalchemy.types.TIMESTAMP, + "timestamp_without_timezone": sqlalchemy.types.TIMESTAMP, + "time_with_timezone": sqlalchemy.types.TIME, + "time_without_timezone": sqlalchemy.types.TIME, + # Technically 'object' and 'array' as JSON Schema types, not airbyte types. + # We include them here for completeness. 
+ "object": sqlalchemy.types.JSON, + "array": sqlalchemy.types.JSON, + "vector_array": sqlalchemy.types.ARRAY, +} + + +class SQLTypeConversionError(Exception): + """An exception to be raised when a type conversion fails.""" + + +def _get_airbyte_type( # noqa: PLR0911 # Too many return statements + json_schema_property_def: dict[str, str | dict[str, Any] | list[Any]], +) -> tuple[str, str | None]: + """Get the airbyte type and subtype from a JSON schema property definition. + + Subtype is only used for array types. Otherwise, subtype will return None. + """ + airbyte_type = cast(str, json_schema_property_def.get("airbyte_type", None)) + if airbyte_type: + return airbyte_type, None + + json_schema_type = json_schema_property_def.get("type", None) + json_schema_format = json_schema_property_def.get("format", None) + + # if json_schema_type is an array of two strings with one of them being null, pick the other one + # this strategy is often used by connectors to indicate a field might not be set all the time + if isinstance(json_schema_type, list): + non_null_types = [t for t in json_schema_type if t != "null"] + if len(non_null_types) == 1: + json_schema_type = non_null_types[0] + + if json_schema_type == "string": + if json_schema_format == "date": + return "date", None + + if json_schema_format == "date-time": + return "timestamp_with_timezone", None + + if json_schema_format == "time": + return "time_without_timezone", None + + if isinstance(json_schema_type, str) and json_schema_type in { + "string", + "number", + "boolean", + "integer", + }: + return json_schema_type, None + + if json_schema_type == "object": + return "object", None + + if json_schema_type == "array": + items_def = json_schema_property_def.get("items", None) + if isinstance(items_def, dict): + try: + subtype, _ = _get_airbyte_type(items_def) + except SQLTypeConversionError: + # We have enough information, so we can ignore parsing errors on subtype. 
+ subtype = None + + return "array", subtype + + return "array", None + + if json_schema_type == "vector_array": + return "vector_array", "Float" + + err_msg = f"Could not determine airbyte type from JSON schema type: {json_schema_property_def}" + raise SQLTypeConversionError(err_msg) + + +class SQLTypeConverter: + """A base class to perform type conversions.""" + + def __init__( + self, + conversion_map: dict[str, Any] | None = None, + ) -> None: + """Initialize the type converter.""" + self.conversion_map = conversion_map or CONVERSION_MAP + + @classmethod + def get_string_type(cls) -> sqlalchemy.types.TypeEngine[str]: + """Get the type to use for string data.""" + return sqlalchemy.types.VARCHAR() + + @classmethod + def get_failover_type(cls) -> sqlalchemy.types.TypeEngine[str]: + """Get the 'last resort' type to use if no other type is found.""" + return cls.get_string_type() + + @classmethod + def get_json_type(cls) -> sqlalchemy.types.TypeEngine[Any]: + """Get the type to use for nested JSON data.""" + return sqlalchemy.types.JSON() + + def to_sql_type( # noqa: PLR0911 # Too many return statements + self, + json_schema_property_def: dict[str, str | dict[str, Any] | list[Any]], + ) -> Any: + """Convert a value to a SQL type.""" + try: + airbyte_type, _ = _get_airbyte_type(json_schema_property_def) + # to-do - is there a better way to check the following + if airbyte_type == "vector_array": + return sqlalchemy.types.ARRAY(sqlalchemy.types.Float()) + sql_type = self.conversion_map[airbyte_type] + except SQLTypeConversionError: + print(f"Could not determine airbyte type from JSON schema: {json_schema_property_def}") + except KeyError: + print(f"Could not find SQL type for airbyte type: {airbyte_type}") + else: + # No exceptions were raised, so we can return the SQL type. + if isinstance(sql_type, type): + # This is a class. Call its constructor. + sql_type = sql_type() + + return sql_type + + json_schema_type = json_schema_property_def.get("type", None) + json_schema_format = json_schema_property_def.get("format", None) + + if json_schema_type == "string" and json_schema_format == "date": + return sqlalchemy.types.DATE() + + if json_schema_type == "string" and json_schema_format == "date-time": + return sqlalchemy.types.TIMESTAMP() + + if json_schema_type == "array": + return sqlalchemy.types.JSON() + + if json_schema_type == "object": + return sqlalchemy.types.JSON() + + return self.get_failover_type() diff --git a/airbyte-cdk/python/airbyte_cdk/test/__init__.py b/airbyte-cdk/python/airbyte_cdk/test/__init__.py new file mode 100644 index 000000000000..6d3fabb5a354 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/test/__init__.py @@ -0,0 +1,7 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +""" +This package is provided as tooling to help sources test their implementation. It is not expected to be used as production code. +""" diff --git a/airbyte-cdk/python/airbyte_cdk/test/catalog_builder.py b/airbyte-cdk/python/airbyte_cdk/test/catalog_builder.py new file mode 100644 index 000000000000..235be7c579b6 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/test/catalog_builder.py @@ -0,0 +1,70 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
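To make the fallback behaviour of `to_sql_type` above concrete, here is a small usage sketch; the property definitions are illustrative, and the import path follows the `airbyte_cdk/sql/types.py` module added in this change:

```python
import sqlalchemy

from airbyte_cdk.sql.types import SQLTypeConverter

converter = SQLTypeConverter()

# Known JSON-schema/Airbyte types resolve through CONVERSION_MAP.
assert isinstance(converter.to_sql_type({"type": "string", "format": "date-time"}), sqlalchemy.types.TIMESTAMP)
assert isinstance(converter.to_sql_type({"type": ["null", "integer"]}), sqlalchemy.types.BIGINT)

# Unknown types are not fatal: a warning is printed and the failover (string) type is returned.
assert isinstance(converter.to_sql_type({"type": "not-a-real-type"}), sqlalchemy.types.VARCHAR)
```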
+ +from typing import Any, Dict, List, Union, overload + +from airbyte_cdk.models import ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, ConfiguredAirbyteStreamSerializer, SyncMode + + +class ConfiguredAirbyteStreamBuilder: + def __init__(self) -> None: + self._stream: Dict[str, Any] = { + "stream": { + "name": "any name", + "json_schema": {}, + "supported_sync_modes": ["full_refresh", "incremental"], + "source_defined_primary_key": [["id"]], + }, + "primary_key": [["id"]], + "sync_mode": "full_refresh", + "destination_sync_mode": "overwrite", + } + + def with_name(self, name: str) -> "ConfiguredAirbyteStreamBuilder": + self._stream["stream"]["name"] = name # type: ignore # we assume that self._stream["stream"] is a Dict[str, Any] + return self + + def with_sync_mode(self, sync_mode: SyncMode) -> "ConfiguredAirbyteStreamBuilder": + self._stream["sync_mode"] = sync_mode.name + return self + + def with_primary_key(self, pk: List[List[str]]) -> "ConfiguredAirbyteStreamBuilder": + self._stream["primary_key"] = pk + self._stream["stream"]["source_defined_primary_key"] = pk # type: ignore # we assume that self._stream["stream"] is a Dict[str, Any] + return self + + def with_json_schema(self, json_schema: Dict[str, Any]) -> "ConfiguredAirbyteStreamBuilder": + self._stream["stream"]["json_schema"] = json_schema + return self + + def build(self) -> ConfiguredAirbyteStream: + return ConfiguredAirbyteStreamSerializer.load(self._stream) + + +class CatalogBuilder: + def __init__(self) -> None: + self._streams: List[ConfiguredAirbyteStreamBuilder] = [] + + @overload + def with_stream(self, name: ConfiguredAirbyteStreamBuilder) -> "CatalogBuilder": + ... + + @overload + def with_stream(self, name: str, sync_mode: SyncMode) -> "CatalogBuilder": + ... + + def with_stream(self, name: Union[str, ConfiguredAirbyteStreamBuilder], sync_mode: Union[SyncMode, None] = None) -> "CatalogBuilder": + # As we are introducing a fully fledge ConfiguredAirbyteStreamBuilder, we would like to deprecate the previous interface + # with_stream(str, SyncMode) + + # to avoid a breaking change, `name` needs to stay in the API but this can be either a name or a builder + name_or_builder = name + builder = ( + name_or_builder + if isinstance(name_or_builder, ConfiguredAirbyteStreamBuilder) + else ConfiguredAirbyteStreamBuilder().with_name(name_or_builder).with_sync_mode(sync_mode) + ) + self._streams.append(builder) + return self + + def build(self) -> ConfiguredAirbyteCatalog: + return ConfiguredAirbyteCatalog(streams=list(map(lambda builder: builder.build(), self._streams))) diff --git a/airbyte-cdk/python/airbyte_cdk/test/entrypoint_wrapper.py b/airbyte-cdk/python/airbyte_cdk/test/entrypoint_wrapper.py new file mode 100644 index 000000000000..9cc74ec2669b --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/test/entrypoint_wrapper.py @@ -0,0 +1,225 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + +""" +The AirbyteEntrypoint is important because it is a service layer that orchestrate how we execute commands from the +[common interface](https://docs.airbyte.com/understanding-airbyte/airbyte-protocol#common-interface) through the source Python +implementation. There is some logic about which message we send to the platform and when which is relevant for integration testing. Other +than that, there are integrations point that are annoying to integrate with using Python code: +* Sources communicate with the platform using stdout. 
The implication is that the source could just print every message instead of + returning things to source. or to using the message repository. WARNING: As part of integration testing, we will not support + messages that are simply printed. The reason is that capturing stdout relies on overriding sys.stdout (see + https://docs.python.org/3/library/contextlib.html#contextlib.redirect_stdout) which clashes with how pytest captures logs and brings + considerations for multithreaded applications. If code you work with uses `print` statements, please migrate to + source.message_repository to emit those messages +* The entrypoint interface relies on file being written on the file system +""" + +import json +import logging +import re +import tempfile +import traceback +from io import StringIO +from pathlib import Path +from typing import Any, List, Mapping, Optional, Union + +from airbyte_cdk.entrypoint import AirbyteEntrypoint +from airbyte_cdk.exception_handler import assemble_uncaught_exception +from airbyte_cdk.logger import AirbyteLogFormatter +from airbyte_cdk.models import ( + AirbyteLogMessage, + AirbyteMessage, + AirbyteMessageSerializer, + AirbyteStateMessage, + AirbyteStateMessageSerializer, + AirbyteStreamStatus, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteCatalogSerializer, + Level, + TraceType, + Type, +) +from airbyte_cdk.sources import Source +from orjson import orjson +from pydantic import ValidationError as V2ValidationError +from serpyco_rs import SchemaValidationError + + +class EntrypointOutput: + def __init__(self, messages: List[str], uncaught_exception: Optional[BaseException] = None): + try: + self._messages = [self._parse_message(message) for message in messages] + except V2ValidationError as exception: + raise ValueError("All messages are expected to be AirbyteMessage") from exception + + if uncaught_exception: + self._messages.append(assemble_uncaught_exception(type(uncaught_exception), uncaught_exception).as_airbyte_message()) + + @staticmethod + def _parse_message(message: str) -> AirbyteMessage: + try: + return AirbyteMessageSerializer.load(orjson.loads(message)) # type: ignore[no-any-return] # Serializer.load() always returns AirbyteMessage + except (orjson.JSONDecodeError, SchemaValidationError): + # The platform assumes that logs that are not of AirbyteMessage format are log messages + return AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=Level.INFO, message=message)) + + @property + def records_and_state_messages(self) -> List[AirbyteMessage]: + return self._get_message_by_types([Type.RECORD, Type.STATE]) + + @property + def records(self) -> List[AirbyteMessage]: + return self._get_message_by_types([Type.RECORD]) + + @property + def state_messages(self) -> List[AirbyteMessage]: + return self._get_message_by_types([Type.STATE]) + + @property + def most_recent_state(self) -> Any: + state_messages = self._get_message_by_types([Type.STATE]) + if not state_messages: + raise ValueError("Can't provide most recent state as there are no state messages") + return state_messages[-1].state.stream # type: ignore[union-attr] # state has `stream` + + @property + def logs(self) -> List[AirbyteMessage]: + return self._get_message_by_types([Type.LOG]) + + @property + def trace_messages(self) -> List[AirbyteMessage]: + return self._get_message_by_types([Type.TRACE]) + + @property + def analytics_messages(self) -> List[AirbyteMessage]: + return self._get_trace_message_by_trace_type(TraceType.ANALYTICS) + + @property + def errors(self) -> List[AirbyteMessage]: + return 
self._get_trace_message_by_trace_type(TraceType.ERROR) + + @property + def catalog(self) -> AirbyteMessage: + catalog = self._get_message_by_types([Type.CATALOG]) + if len(catalog) != 1: + raise ValueError(f"Expected exactly one catalog but got {len(catalog)}") + return catalog[0] + + def get_stream_statuses(self, stream_name: str) -> List[AirbyteStreamStatus]: + status_messages = map( + lambda message: message.trace.stream_status.status, # type: ignore + filter( + lambda message: message.trace.stream_status.stream_descriptor.name == stream_name, # type: ignore # callable; trace has `stream_status` + self._get_trace_message_by_trace_type(TraceType.STREAM_STATUS), + ), + ) + return list(status_messages) + + def _get_message_by_types(self, message_types: List[Type]) -> List[AirbyteMessage]: + return [message for message in self._messages if message.type in message_types] + + def _get_trace_message_by_trace_type(self, trace_type: TraceType) -> List[AirbyteMessage]: + return [message for message in self._get_message_by_types([Type.TRACE]) if message.trace.type == trace_type] # type: ignore[union-attr] # trace has `type` + + def is_in_logs(self, pattern: str) -> bool: + """Check if any log message case-insensitive matches the pattern.""" + return any(re.search(pattern, entry.log.message, flags=re.IGNORECASE) for entry in self.logs) # type: ignore[union-attr] # log has `message` + + def is_not_in_logs(self, pattern: str) -> bool: + """Check if no log message matches the case-insensitive pattern.""" + return not self.is_in_logs(pattern) + + +def _run_command(source: Source, args: List[str], expecting_exception: bool = False) -> EntrypointOutput: + log_capture_buffer = StringIO() + stream_handler = logging.StreamHandler(log_capture_buffer) + stream_handler.setLevel(logging.INFO) + stream_handler.setFormatter(AirbyteLogFormatter()) + parent_logger = logging.getLogger("") + parent_logger.addHandler(stream_handler) + + parsed_args = AirbyteEntrypoint.parse_args(args) + + source_entrypoint = AirbyteEntrypoint(source) + messages = [] + uncaught_exception = None + try: + for message in source_entrypoint.run(parsed_args): + messages.append(message) + except Exception as exception: + if not expecting_exception: + print("Printing unexpected error from entrypoint_wrapper") + print("".join(traceback.format_exception(None, exception, exception.__traceback__))) + uncaught_exception = exception + + captured_logs = log_capture_buffer.getvalue().split("\n")[:-1] + + parent_logger.removeHandler(stream_handler) + + return EntrypointOutput(messages + captured_logs, uncaught_exception) + + +def discover( + source: Source, + config: Mapping[str, Any], + expecting_exception: bool = False, +) -> EntrypointOutput: + """ + config must be json serializable + :param expecting_exception: By default if there is an uncaught exception, the exception will be printed out. 
If this is expected, please + provide expecting_exception=True so that the test output logs are cleaner + """ + + with tempfile.TemporaryDirectory() as tmp_directory: + tmp_directory_path = Path(tmp_directory) + config_file = make_file(tmp_directory_path / "config.json", config) + + return _run_command(source, ["discover", "--config", config_file, "--debug"], expecting_exception) + + +def read( + source: Source, + config: Mapping[str, Any], + catalog: ConfiguredAirbyteCatalog, + state: Optional[List[AirbyteStateMessage]] = None, + expecting_exception: bool = False, +) -> EntrypointOutput: + """ + config and state must be json serializable + + :param expecting_exception: By default if there is an uncaught exception, the exception will be printed out. If this is expected, please + provide expecting_exception=True so that the test output logs are cleaner + """ + with tempfile.TemporaryDirectory() as tmp_directory: + tmp_directory_path = Path(tmp_directory) + config_file = make_file(tmp_directory_path / "config.json", config) + catalog_file = make_file( + tmp_directory_path / "catalog.json", orjson.dumps(ConfiguredAirbyteCatalogSerializer.dump(catalog)).decode() + ) + args = [ + "read", + "--config", + config_file, + "--catalog", + catalog_file, + ] + if state is not None: + args.extend( + [ + "--state", + make_file( + tmp_directory_path / "state.json", + f"[{','.join([orjson.dumps(AirbyteStateMessageSerializer.dump(stream_state)).decode() for stream_state in state])}]", + ), + ] + ) + + return _run_command(source, args, expecting_exception) + + +def make_file(path: Path, file_contents: Optional[Union[str, Mapping[str, Any], List[Mapping[str, Any]]]]) -> str: + if isinstance(file_contents, str): + path.write_text(file_contents) + else: + path.write_text(json.dumps(file_contents)) + return str(path) diff --git a/airbyte-cdk/python/airbyte_cdk/test/mock_http/__init__.py b/airbyte-cdk/python/airbyte_cdk/test/mock_http/__init__.py new file mode 100644 index 000000000000..88b28b0225e8 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/test/mock_http/__init__.py @@ -0,0 +1,6 @@ +from airbyte_cdk.test.mock_http.matcher import HttpRequestMatcher +from airbyte_cdk.test.mock_http.request import HttpRequest +from airbyte_cdk.test.mock_http.response import HttpResponse +from airbyte_cdk.test.mock_http.mocker import HttpMocker + +__all__ = ["HttpMocker", "HttpRequest", "HttpRequestMatcher", "HttpResponse"] diff --git a/airbyte-cdk/python/airbyte_cdk/test/mock_http/matcher.py b/airbyte-cdk/python/airbyte_cdk/test/mock_http/matcher.py new file mode 100644 index 000000000000..d07cec3ec8b2 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/test/mock_http/matcher.py @@ -0,0 +1,41 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
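As an illustration of how the `read` helper above is typically combined with `CatalogBuilder` in a connector test; the `SourceExample` class and the config values are hypothetical:

```python
from airbyte_cdk.models import SyncMode
from airbyte_cdk.test.catalog_builder import CatalogBuilder
from airbyte_cdk.test.entrypoint_wrapper import read


def test_full_refresh_read() -> None:
    source = SourceExample()  # hypothetical connector under test
    config = {"api_key": "test-key"}  # illustrative config
    catalog = CatalogBuilder().with_stream("customers", SyncMode.full_refresh).build()

    output = read(source, config, catalog)

    # EntrypointOutput exposes the emitted messages grouped by type.
    assert len(output.records) > 0
    assert not output.errors
```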
+from typing import Any + +from airbyte_cdk.test.mock_http.request import HttpRequest + + +class HttpRequestMatcher: + def __init__(self, request: HttpRequest, minimum_number_of_expected_match: int): + self._request_to_match = request + self._minimum_number_of_expected_match = minimum_number_of_expected_match + self._actual_number_of_matches = 0 + + def matches(self, request: HttpRequest) -> bool: + hit = request.matches(self._request_to_match) + if hit: + self._actual_number_of_matches += 1 + return hit + + def has_expected_match_count(self) -> bool: + return self._actual_number_of_matches >= self._minimum_number_of_expected_match + + @property + def actual_number_of_matches(self) -> int: + return self._actual_number_of_matches + + @property + def request(self) -> HttpRequest: + return self._request_to_match + + def __str__(self) -> str: + return ( + f"HttpRequestMatcher(" + f"request_to_match={self._request_to_match}, " + f"minimum_number_of_expected_match={self._minimum_number_of_expected_match}, " + f"actual_number_of_matches={self._actual_number_of_matches})" + ) + + def __eq__(self, other: Any) -> bool: + if isinstance(other, HttpRequestMatcher): + return self._request_to_match == other._request_to_match + return False diff --git a/airbyte-cdk/python/airbyte_cdk/test/mock_http/mocker.py b/airbyte-cdk/python/airbyte_cdk/test/mock_http/mocker.py new file mode 100644 index 000000000000..4ac690dc5275 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/test/mock_http/mocker.py @@ -0,0 +1,139 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + +import contextlib +import functools +from enum import Enum +from types import TracebackType +from typing import Callable, List, Optional, Union + +import requests_mock +from airbyte_cdk.test.mock_http import HttpRequest, HttpRequestMatcher, HttpResponse + + +class SupportedHttpMethods(str, Enum): + GET = "get" + PATCH = "patch" + POST = "post" + DELETE = "delete" + + +class HttpMocker(contextlib.ContextDecorator): + """ + WARNING 1: This implementation only works if the lib used to perform HTTP requests is `requests`. + + WARNING 2: Given multiple requests that are not mutually exclusive, the request will match the first one. This can happen in scenarios + where the same request is added twice (in which case there will always be an exception because we will never match the second + request) or in a case like this: + ``` + http_mocker.get(HttpRequest(_A_URL, headers={"less_granular": "1", "more_granular": "2"}), <...>) + http_mocker.get(HttpRequest(_A_URL, headers={"less_granular": "1"}), <...>) + requests.get(_A_URL, headers={"less_granular": "1", "more_granular": "2"}) + ``` + In the example above, the matcher would match the second mock as requests_mock iterate over the matcher in reverse order (see + https://github.com/jamielennox/requests-mock/blob/c06f124a33f56e9f03840518e19669ba41b93202/requests_mock/adapter.py#L246) even + though the request sent is a better match for the first `http_mocker.get`. 
+ """ + + def __init__(self) -> None: + self._mocker = requests_mock.Mocker() + self._matchers: List[HttpRequestMatcher] = [] + + def __enter__(self) -> "HttpMocker": + self._mocker.__enter__() + return self + + def __exit__(self, exc_type: Optional[BaseException], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]) -> None: + self._mocker.__exit__(exc_type, exc_val, exc_tb) + + def _validate_all_matchers_called(self) -> None: + for matcher in self._matchers: + if not matcher.has_expected_match_count(): + raise ValueError(f"Invalid number of matches for `{matcher}`") + + def _mock_request_method( + self, method: SupportedHttpMethods, request: HttpRequest, responses: Union[HttpResponse, List[HttpResponse]] + ) -> None: + if isinstance(responses, HttpResponse): + responses = [responses] + + matcher = HttpRequestMatcher(request, len(responses)) + if matcher in self._matchers: + raise ValueError(f"Request {matcher.request} already mocked") + self._matchers.append(matcher) + + getattr(self._mocker, method)( + requests_mock.ANY, + additional_matcher=self._matches_wrapper(matcher), + response_list=[ + {"text": response.body, "status_code": response.status_code, "headers": response.headers} for response in responses + ], + ) + + def get(self, request: HttpRequest, responses: Union[HttpResponse, List[HttpResponse]]) -> None: + self._mock_request_method(SupportedHttpMethods.GET, request, responses) + + def patch(self, request: HttpRequest, responses: Union[HttpResponse, List[HttpResponse]]) -> None: + self._mock_request_method(SupportedHttpMethods.PATCH, request, responses) + + def post(self, request: HttpRequest, responses: Union[HttpResponse, List[HttpResponse]]) -> None: + self._mock_request_method(SupportedHttpMethods.POST, request, responses) + + def delete(self, request: HttpRequest, responses: Union[HttpResponse, List[HttpResponse]]) -> None: + self._mock_request_method(SupportedHttpMethods.DELETE, request, responses) + + @staticmethod + def _matches_wrapper(matcher: HttpRequestMatcher) -> Callable[[requests_mock.request._RequestObjectProxy], bool]: + def matches(requests_mock_request: requests_mock.request._RequestObjectProxy) -> bool: + # query_params are provided as part of `requests_mock_request.url` + http_request = HttpRequest( + requests_mock_request.url, query_params={}, headers=requests_mock_request.headers, body=requests_mock_request.body + ) + return matcher.matches(http_request) + + return matches + + def assert_number_of_calls(self, request: HttpRequest, number_of_calls: int) -> None: + corresponding_matchers = list(filter(lambda matcher: matcher.request == request, self._matchers)) + if len(corresponding_matchers) != 1: + raise ValueError(f"Was expecting only one matcher to match the request but got `{corresponding_matchers}`") + + assert corresponding_matchers[0].actual_number_of_matches == number_of_calls + + # trying to type that using callables provides the error `incompatible with return type "_F" in supertype "ContextDecorator"` + def __call__(self, f): # type: ignore + @functools.wraps(f) + def wrapper(*args, **kwargs): # type: ignore # this is a very generic wrapper that does not need to be typed + with self: + assertion_error = None + + kwargs["http_mocker"] = self + try: + result = f(*args, **kwargs) + except requests_mock.NoMockAddress as no_mock_exception: + matchers_as_string = "\n\t".join(map(lambda matcher: str(matcher.request), self._matchers)) + raise ValueError( + f"No matcher matches {no_mock_exception.args[0]} with headers 
`{no_mock_exception.request.headers}` " + f"and body `{no_mock_exception.request.body}`. " + f"Matchers currently configured are:\n\t{matchers_as_string}." + ) from no_mock_exception + except AssertionError as test_assertion: + assertion_error = test_assertion + + # We validate the matchers before raising the assertion error because we want to show the tester if an HTTP request wasn't + # mocked correctly + try: + self._validate_all_matchers_called() + except ValueError as http_mocker_exception: + # This seems useless as it catches ValueError and raises ValueError but without this, the prevailing error message in + # the output is the function call that failed the assertion, whereas raising `ValueError(http_mocker_exception)` + # like we do here provides additional context for the exception. + raise ValueError(http_mocker_exception) from None + if assertion_error: + raise assertion_error + return result + + return wrapper + + def clear_all_matchers(self) -> None: + """Clears all stored matchers by resetting the _matchers list to an empty state.""" + self._matchers = [] diff --git a/airbyte-cdk/python/airbyte_cdk/test/mock_http/request.py b/airbyte-cdk/python/airbyte_cdk/test/mock_http/request.py new file mode 100644 index 000000000000..756be23edd06 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/test/mock_http/request.py @@ -0,0 +1,97 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + +import json +from typing import Any, List, Mapping, Optional, Union +from urllib.parse import parse_qs, urlencode, urlparse + +ANY_QUERY_PARAMS = "any query_parameters" + + +def _is_subdict(small: Mapping[str, str], big: Mapping[str, str]) -> bool: + return dict(big, **small) == big + + +class HttpRequest: + def __init__( + self, + url: str, + query_params: Optional[Union[str, Mapping[str, Union[str, List[str]]]]] = None, + headers: Optional[Mapping[str, str]] = None, + body: Optional[Union[str, bytes, Mapping[str, Any]]] = None, + ) -> None: + self._parsed_url = urlparse(url) + self._query_params = query_params + if not self._parsed_url.query and query_params: + self._parsed_url = urlparse(f"{url}?{self._encode_qs(query_params)}") + elif self._parsed_url.query and query_params: + raise ValueError("If query params are provided as part of the url, `query_params` should be empty") + + self._headers = headers or {} + self._body = body + + @staticmethod + def _encode_qs(query_params: Union[str, Mapping[str, Union[str, List[str]]]]) -> str: + if isinstance(query_params, str): + return query_params + return urlencode(query_params, doseq=True) + + def matches(self, other: Any) -> bool: + """ + If the body of any request is a Mapping, we compare as Mappings which means that the order is not important. 
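For completeness, a short sketch of `HttpMocker` used as a decorator, which is the flow the wrapper above implements; the URL and payload are placeholders, and only requests made through the `requests` library are intercepted:

```python
import requests

from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest, HttpResponse


@HttpMocker()
def test_list_users(http_mocker: HttpMocker) -> None:
    # The decorator injects `http_mocker` and verifies every mocked request was hit.
    http_mocker.get(
        HttpRequest("https://api.example.com/v1/users", query_params={"page": "1"}),
        HttpResponse(body='{"users": [{"id": 1}]}', status_code=200),
    )

    response = requests.get("https://api.example.com/v1/users", params={"page": "1"})

    assert response.json() == {"users": [{"id": 1}]}
```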
+ If the body is a string, encoding ISO-8859-1 will be assumed + Headers only need to be a subset of `other` in order to match + """ + if isinstance(other, HttpRequest): + # if `other` is a mapping, we match as an object and formatting is not considers + if isinstance(self._body, Mapping) or isinstance(other._body, Mapping): + body_match = self._to_mapping(self._body) == self._to_mapping(other._body) + else: + body_match = self._to_bytes(self._body) == self._to_bytes(other._body) + + return ( + self._parsed_url.scheme == other._parsed_url.scheme + and self._parsed_url.hostname == other._parsed_url.hostname + and self._parsed_url.path == other._parsed_url.path + and ( + ANY_QUERY_PARAMS in (self._query_params, other._query_params) + or parse_qs(self._parsed_url.query) == parse_qs(other._parsed_url.query) + ) + and _is_subdict(other._headers, self._headers) + and body_match + ) + return False + + @staticmethod + def _to_mapping(body: Optional[Union[str, bytes, Mapping[str, Any]]]) -> Optional[Mapping[str, Any]]: + if isinstance(body, Mapping): + return body + elif isinstance(body, bytes): + return json.loads(body.decode()) # type: ignore # assumes return type of Mapping[str, Any] + elif isinstance(body, str): + return json.loads(body) # type: ignore # assumes return type of Mapping[str, Any] + return None + + @staticmethod + def _to_bytes(body: Optional[Union[str, bytes]]) -> bytes: + if isinstance(body, bytes): + return body + elif isinstance(body, str): + # `ISO-8859-1` is the default encoding used by requests + return body.encode("ISO-8859-1") + return b"" + + def __str__(self) -> str: + return f"{self._parsed_url} with headers {self._headers} and body {self._body!r})" + + def __repr__(self) -> str: + return f"HttpRequest(request={self._parsed_url}, headers={self._headers}, body={self._body!r})" + + def __eq__(self, other: Any) -> bool: + if isinstance(other, HttpRequest): + return ( + self._parsed_url == other._parsed_url + and self._query_params == other._query_params + and self._headers == other._headers + and self._body == other._body + ) + return False diff --git a/airbyte-cdk/python/airbyte_cdk/test/mock_http/response.py b/airbyte-cdk/python/airbyte_cdk/test/mock_http/response.py new file mode 100644 index 000000000000..8d5dc4c308da --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/test/mock_http/response.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + +from types import MappingProxyType +from typing import Mapping + + +class HttpResponse: + def __init__(self, body: str, status_code: int = 200, headers: Mapping[str, str] = MappingProxyType({})): + self._body = body + self._status_code = status_code + self._headers = headers + + @property + def body(self) -> str: + return self._body + + @property + def status_code(self) -> int: + return self._status_code + + @property + def headers(self) -> Mapping[str, str]: + return self._headers diff --git a/airbyte-cdk/python/airbyte_cdk/test/mock_http/response_builder.py b/airbyte-cdk/python/airbyte_cdk/test/mock_http/response_builder.py new file mode 100644 index 000000000000..27bb5125d396 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/test/mock_http/response_builder.py @@ -0,0 +1,207 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
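A brief illustration of the matching rules described in `HttpRequest.matches` above: query parameters may be supplied in the URL or via `query_params`, and the mocked request's headers only need to be a subset of the actual request's headers (URLs are placeholders):

```python
from airbyte_cdk.test.mock_http import HttpRequest

mocked = HttpRequest(
    "https://api.example.com/v1/users",
    query_params={"page": "1"},
    headers={"Authorization": "Bearer token"},
)
actual = HttpRequest(
    "https://api.example.com/v1/users?page=1",
    headers={"Authorization": "Bearer token", "User-Agent": "python-requests"},
)

# Equivalent query strings and a header subset are enough for a match.
assert actual.matches(mocked)
```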
+ +import functools +import json +from abc import ABC, abstractmethod +from pathlib import Path as FilePath +from typing import Any, Dict, List, Optional, Union + +from airbyte_cdk.test.mock_http import HttpResponse +from airbyte_cdk.test.utils.data import get_unit_test_folder + + +def _extract(path: List[str], response_template: Dict[str, Any]) -> Any: + return functools.reduce(lambda a, b: a[b], path, response_template) + + +def _replace_value(dictionary: Dict[str, Any], path: List[str], value: Any) -> None: + current = dictionary + for key in path[:-1]: + current = current[key] + current[path[-1]] = value + + +def _write(dictionary: Dict[str, Any], path: List[str], value: Any) -> None: + current = dictionary + for key in path[:-1]: + current = current.setdefault(key, {}) + current[path[-1]] = value + + +class Path(ABC): + @abstractmethod + def write(self, template: Dict[str, Any], value: Any) -> None: + pass + + @abstractmethod + def update(self, template: Dict[str, Any], value: Any) -> None: + pass + + def extract(self, template: Dict[str, Any]) -> Any: + pass + + +class FieldPath(Path): + def __init__(self, field: str): + self._path = [field] + + def write(self, template: Dict[str, Any], value: Any) -> None: + _write(template, self._path, value) + + def update(self, template: Dict[str, Any], value: Any) -> None: + _replace_value(template, self._path, value) + + def extract(self, template: Dict[str, Any]) -> Any: + return _extract(self._path, template) + + def __str__(self) -> str: + return f"FieldPath(field={self._path[0]})" + + +class NestedPath(Path): + def __init__(self, path: List[str]): + self._path = path + + def write(self, template: Dict[str, Any], value: Any) -> None: + _write(template, self._path, value) + + def update(self, template: Dict[str, Any], value: Any) -> None: + _replace_value(template, self._path, value) + + def extract(self, template: Dict[str, Any]) -> Any: + return _extract(self._path, template) + + def __str__(self) -> str: + return f"NestedPath(path={self._path})" + + +class PaginationStrategy(ABC): + @abstractmethod + def update(self, response: Dict[str, Any]) -> None: + pass + + +class FieldUpdatePaginationStrategy(PaginationStrategy): + def __init__(self, path: Path, value: Any): + self._path = path + self._value = value + + def update(self, response: Dict[str, Any]) -> None: + self._path.update(response, self._value) + + +class RecordBuilder: + def __init__(self, template: Dict[str, Any], id_path: Optional[Path], cursor_path: Optional[Union[FieldPath, NestedPath]]): + self._record = template + self._id_path = id_path + self._cursor_path = cursor_path + + self._validate_template() + + def _validate_template(self) -> None: + paths_to_validate = [ + ("_id_path", self._id_path), + ("_cursor_path", self._cursor_path), + ] + for field_name, field_path in paths_to_validate: + self._validate_field(field_name, field_path) + + def _validate_field(self, field_name: str, path: Optional[Path]) -> None: + try: + if path and not path.extract(self._record): + raise ValueError(f"{field_name} `{path}` was provided but it is not part of the template `{self._record}`") + except (IndexError, KeyError) as exception: + raise ValueError(f"{field_name} `{path}` was provided but it is not part of the template `{self._record}`") from exception + + def with_id(self, identifier: Any) -> "RecordBuilder": + self._set_field("id", self._id_path, identifier) + return self + + def with_cursor(self, cursor_value: Any) -> "RecordBuilder": + self._set_field("cursor", self._cursor_path, 
cursor_value) + return self + + def with_field(self, path: Path, value: Any) -> "RecordBuilder": + path.write(self._record, value) + return self + + def _set_field(self, field_name: str, path: Optional[Path], value: Any) -> None: + if not path: + raise ValueError( + f"{field_name}_path was not provided and hence, the record {field_name} can't be modified. Please provide `id_field` while " + f"instantiating RecordBuilder to leverage this capability" + ) + path.update(self._record, value) + + def build(self) -> Dict[str, Any]: + return self._record + + +class HttpResponseBuilder: + def __init__( + self, template: Dict[str, Any], records_path: Union[FieldPath, NestedPath], pagination_strategy: Optional[PaginationStrategy] + ): + self._response = template + self._records: List[RecordBuilder] = [] + self._records_path = records_path + self._pagination_strategy = pagination_strategy + self._status_code = 200 + + def with_record(self, record: RecordBuilder) -> "HttpResponseBuilder": + self._records.append(record) + return self + + def with_pagination(self) -> "HttpResponseBuilder": + if not self._pagination_strategy: + raise ValueError( + "`pagination_strategy` was not provided and hence, fields related to the pagination can't be modified. Please provide " + "`pagination_strategy` while instantiating ResponseBuilder to leverage this capability" + ) + self._pagination_strategy.update(self._response) + return self + + def with_status_code(self, status_code: int) -> "HttpResponseBuilder": + self._status_code = status_code + return self + + def build(self) -> HttpResponse: + self._records_path.update(self._response, [record.build() for record in self._records]) + return HttpResponse(json.dumps(self._response), self._status_code) + + +def _get_unit_test_folder(execution_folder: str) -> FilePath: + # FIXME: This function should be removed after the next CDK release to avoid breaking amazon-seller-partner test code. + return get_unit_test_folder(execution_folder) # type: ignore # get_unit_test_folder is known to return a FilePath + + +def find_template(resource: str, execution_folder: str) -> Dict[str, Any]: + response_template_filepath = str(get_unit_test_folder(execution_folder) / "resource" / "http" / "response" / f"{resource}.json") + with open(response_template_filepath, "r") as template_file: + return json.load(template_file) # type: ignore # we assume the dev correctly set up the resource file + + +def create_record_builder( + response_template: Dict[str, Any], + records_path: Union[FieldPath, NestedPath], + record_id_path: Optional[Path] = None, + record_cursor_path: Optional[Union[FieldPath, NestedPath]] = None, +) -> RecordBuilder: + """ + This will use the first record define at `records_path` as a template for the records. If more records are defined, they will be ignored + """ + try: + record_template = records_path.extract(response_template)[0] + if not record_template: + raise ValueError( + f"Could not extract any record from template at path `{records_path}`. " + f"Please fix the template to provide a record sample or fix `records_path`." 
+ ) + return RecordBuilder(record_template, record_id_path, record_cursor_path) + except (IndexError, KeyError): + raise ValueError(f"Error while extracting records at path `{records_path}` from response template `{response_template}`") + + +def create_response_builder( + response_template: Dict[str, Any], records_path: Union[FieldPath, NestedPath], pagination_strategy: Optional[PaginationStrategy] = None +) -> HttpResponseBuilder: + return HttpResponseBuilder(response_template, records_path, pagination_strategy) diff --git a/airbyte-cdk/python/airbyte_cdk/test/state_builder.py b/airbyte-cdk/python/airbyte_cdk/test/state_builder.py new file mode 100644 index 000000000000..50b5dbe5f793 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/test/state_builder.py @@ -0,0 +1,25 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + +from typing import Any, List + +from airbyte_cdk.models import AirbyteStateBlob, AirbyteStateMessage, AirbyteStateType, AirbyteStreamState, StreamDescriptor + + +class StateBuilder: + def __init__(self) -> None: + self._state: List[AirbyteStateMessage] = [] + + def with_stream_state(self, stream_name: str, state: Any) -> "StateBuilder": + self._state.append( + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_state=state if isinstance(state, AirbyteStateBlob) else AirbyteStateBlob(state), + stream_descriptor=StreamDescriptor(**{"name": stream_name}), + ), + ) + ) + return self + + def build(self) -> List[AirbyteStateMessage]: + return self._state diff --git a/airbyte-cdk/python/airbyte_cdk/test/utils/__init__.py b/airbyte-cdk/python/airbyte_cdk/test/utils/__init__.py new file mode 100644 index 000000000000..7f66676b8716 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/test/utils/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. diff --git a/airbyte-cdk/python/airbyte_cdk/test/utils/data.py b/airbyte-cdk/python/airbyte_cdk/test/utils/data.py new file mode 100644 index 000000000000..a4d4fef6d2a5 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/test/utils/data.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from pydantic import FilePath + + +def get_unit_test_folder(execution_folder: str) -> FilePath: + path = FilePath(execution_folder) + while path.name != "unit_tests": + if path.name == path.root or path.name == path.drive: + raise ValueError(f"Could not find `unit_tests` folder as a parent of {execution_folder}") + path = path.parent + return path + + +def read_resource_file_contents(resource: str, test_location: str) -> str: + """Read the contents of a test data file from the test resource folder.""" + file_path = str(get_unit_test_folder(test_location) / "resource" / "http" / "response" / f"{resource}") + with open(file_path) as f: + response = f.read() + return response diff --git a/airbyte-cdk/python/airbyte_cdk/test/utils/http_mocking.py b/airbyte-cdk/python/airbyte_cdk/test/utils/http_mocking.py new file mode 100644 index 000000000000..0cdd8f4cef1b --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/test/utils/http_mocking.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
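Putting the builders above together, a hedged sketch of preparing a mocked response from a JSON template; it assumes a `users.json` template exists under the test's `unit_tests/resource/http/response/` folder, that records are nested under a top-level `data` field, and that each record has an `id` field:

```python
from airbyte_cdk.test.mock_http.response_builder import (
    FieldPath,
    create_record_builder,
    create_response_builder,
    find_template,
)

template = find_template("users", __file__)  # loads unit_tests/resource/http/response/users.json

record = create_record_builder(template, FieldPath("data"), record_id_path=FieldPath("id"))
response = (
    create_response_builder(template, FieldPath("data"))
    .with_record(record.with_id("user-1"))
    .with_status_code(200)
    .build()
)

assert response.status_code == 200
```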
+ +import re +from typing import Any, Mapping + +from requests_mock import Mocker + + +def register_mock_responses(mocker: Mocker, http_calls: list[Mapping[str, Mapping[str, Any]]]) -> None: + """Register a list of HTTP request-response pairs.""" + for call in http_calls: + request, response = call["request"], call["response"] + matcher = re.compile(request["url"]) if request["is_regex"] else request["url"] + mocker.register_uri(request["method"], matcher, **response) diff --git a/airbyte-cdk/python/airbyte_cdk/test/utils/reading.py b/airbyte-cdk/python/airbyte_cdk/test/utils/reading.py new file mode 100644 index 000000000000..2d89cb870984 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/test/utils/reading.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from typing import Any, List, Mapping, Optional + +from airbyte_cdk import AbstractSource +from airbyte_cdk.models import AirbyteStateMessage, ConfiguredAirbyteCatalog, SyncMode +from airbyte_cdk.test.catalog_builder import CatalogBuilder +from airbyte_cdk.test.entrypoint_wrapper import EntrypointOutput, read + + +def catalog(stream_name: str, sync_mode: SyncMode) -> ConfiguredAirbyteCatalog: + """Create a catalog with a single stream.""" + return CatalogBuilder().with_stream(stream_name, sync_mode).build() + + +def read_records( + source: AbstractSource, + config: Mapping[str, Any], + stream_name: str, + sync_mode: SyncMode, + state: Optional[List[AirbyteStateMessage]] = None, + expecting_exception: bool = False, +) -> EntrypointOutput: + """Read records from a stream.""" + _catalog = catalog(stream_name, sync_mode) + return read(source, config, _catalog, state, expecting_exception) diff --git a/airbyte-cdk/python/airbyte_cdk/utils/__init__.py b/airbyte-cdk/python/airbyte_cdk/utils/__init__.py new file mode 100644 index 000000000000..70b1375b0e83 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/__init__.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from .is_cloud_environment import is_cloud_environment +from .schema_inferrer import SchemaInferrer +from .traced_exception import AirbyteTracedException +from .print_buffer import PrintBuffer + +__all__ = ["AirbyteTracedException", "SchemaInferrer", "is_cloud_environment", "PrintBuffer"] diff --git a/airbyte-cdk/python/airbyte_cdk/utils/airbyte_secrets_utils.py b/airbyte-cdk/python/airbyte_cdk/utils/airbyte_secrets_utils.py new file mode 100644 index 000000000000..5afd305f38ed --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/airbyte_secrets_utils.py @@ -0,0 +1,78 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
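A small usage sketch for `register_mock_responses` above, showing the expected shape of the `http_calls` entries (the URL and payload are placeholders):

```python
import requests
import requests_mock

from airbyte_cdk.test.utils.http_mocking import register_mock_responses

with requests_mock.Mocker() as mocker:
    register_mock_responses(
        mocker,
        [
            {
                "request": {"method": "GET", "url": "https://api.example.com/items", "is_regex": False},
                # Everything under "response" is passed through to `register_uri`.
                "response": {"status_code": 200, "json": {"items": []}},
            }
        ],
    )
    assert requests.get("https://api.example.com/items").json() == {"items": []}
```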
+# + +from typing import Any, List, Mapping + +import dpath + + +def get_secret_paths(spec: Mapping[str, Any]) -> List[List[str]]: + paths = [] + + def traverse_schema(schema_item: Any, path: List[str]) -> None: + """ + schema_item can be any property or value in the originally input jsonschema, depending on how far down the recursion stack we go + path is the path to that schema item in the original input + for example if we have the input {'password': {'type': 'string', 'airbyte_secret': True}} then the arguments will evolve + as follows: + schema_item=, path=[] + schema_item={'type': 'string', 'airbyte_secret': True}, path=['password'] + schema_item='string', path=['password', 'type'] + schema_item=True, path=['password', 'airbyte_secret'] + """ + if isinstance(schema_item, dict): + for k, v in schema_item.items(): + traverse_schema(v, [*path, k]) + elif isinstance(schema_item, list): + for i in schema_item: + traverse_schema(i, path) + else: + if path[-1] == "airbyte_secret" and schema_item is True: + filtered_path = [p for p in path[:-1] if p not in ["properties", "oneOf"]] + paths.append(filtered_path) + + traverse_schema(spec, []) + return paths + + +def get_secrets(connection_specification: Mapping[str, Any], config: Mapping[str, Any]) -> List[Any]: + """ + Get a list of secret values from the source config based on the source specification + :type connection_specification: the connection_specification field of an AirbyteSpecification i.e the JSONSchema definition + """ + secret_paths = get_secret_paths(connection_specification.get("properties", {})) + result = [] + for path in secret_paths: + try: + result.append(dpath.get(config, path)) + except KeyError: + # Since we try to get paths to all known secrets in the spec, in the case of oneOfs, some secret fields may not be present + # In that case, a KeyError is thrown. This is expected behavior. + pass + return result + + +__SECRETS_FROM_CONFIG: List[str] = [] + + +def update_secrets(secrets: List[str]) -> None: + """Update the list of secrets to be replaced""" + global __SECRETS_FROM_CONFIG + __SECRETS_FROM_CONFIG = secrets + + +def add_to_secrets(secret: str) -> None: + """Add to the list of secrets to be replaced""" + global __SECRETS_FROM_CONFIG + __SECRETS_FROM_CONFIG.append(secret) + + +def filter_secrets(string: str) -> str: + """Filter secrets from a string by replacing them with ****""" + # TODO this should perform a maximal match for each secret. if "x" and "xk" are both secret values, and this method is called twice on + # the input "xk", then depending on call order it might only obfuscate "*k". This is a bug. + for secret in __SECRETS_FROM_CONFIG: + if secret: + string = string.replace(str(secret), "****") + return string diff --git a/airbyte-cdk/python/airbyte_cdk/utils/analytics_message.py b/airbyte-cdk/python/airbyte_cdk/utils/analytics_message.py new file mode 100644 index 000000000000..54c3e984f93c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/analytics_message.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
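A minimal illustration of how the secret helpers above fit together, extracting secret values from a config based on the spec and masking them in log output (the spec and config are toy values):

```python
from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets, get_secrets, update_secrets

spec = {"properties": {"api_key": {"type": "string", "airbyte_secret": True}}}
config = {"api_key": "s3cr3t"}

secrets = get_secrets(spec, config)
assert secrets == ["s3cr3t"]

# Once registered, secret values are masked wherever they appear in strings.
update_secrets(secrets)
assert filter_secrets("calling the API with key s3cr3t") == "calling the API with key ****"
```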
+ +import time +from typing import Any, Optional + +from airbyte_cdk.models import AirbyteAnalyticsTraceMessage, AirbyteMessage, AirbyteTraceMessage, TraceType, Type + + +def create_analytics_message(type: str, value: Optional[Any]) -> AirbyteMessage: + return AirbyteMessage( + type=Type.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.ANALYTICS, + emitted_at=time.time() * 1000, + analytics=AirbyteAnalyticsTraceMessage(type=type, value=str(value) if value is not None else None), + ), + ) diff --git a/airbyte-cdk/python/airbyte_cdk/utils/constants.py b/airbyte-cdk/python/airbyte_cdk/utils/constants.py new file mode 100644 index 000000000000..1d6345cbd8f4 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/constants.py @@ -0,0 +1,5 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +ENV_REQUEST_CACHE_PATH = "REQUEST_CACHE_PATH" diff --git a/airbyte-cdk/python/airbyte_cdk/utils/datetime_format_inferrer.py b/airbyte-cdk/python/airbyte_cdk/utils/datetime_format_inferrer.py new file mode 100644 index 000000000000..cd423db9c201 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/datetime_format_inferrer.py @@ -0,0 +1,91 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Any, Dict, Optional + +from airbyte_cdk.models import AirbyteRecordMessage +from airbyte_cdk.sources.declarative.datetime.datetime_parser import DatetimeParser + + +class DatetimeFormatInferrer: + """ + This class is used to detect toplevel fields in records that might be datetime values, along with the used format. + """ + + def __init__(self) -> None: + self._parser = DatetimeParser() + self._datetime_candidates: Optional[Dict[str, str]] = None + self._formats = [ + "%Y-%m-%d", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%dT%H:%M:%SZ", + "%Y-%m-%dT%H:%M:%S.%fZ", + "%Y-%m-%d %H:%M:%S.%f%z", + "%Y-%m-%dT%H:%M:%S.%f%z", + "%s", + "%ms", + "%d/%m/%Y %H:%M", + "%Y-%m", + "%d-%m-%Y", + ] + self._timestamp_heuristic_ranges = [range(1_000_000_000, 2_000_000_000), range(1_000_000_000_000, 2_000_000_000_000)] + + def _can_be_datetime(self, value: Any) -> bool: + """Checks if the value can be a datetime. + This is the case if the value is a string or an integer between 1_000_000_000 and 2_000_000_000 for seconds + or between 1_000_000_000_000 and 2_000_000_000_000 for milliseconds. 
+ This is separate from the format check for performance reasons""" + if isinstance(value, (str, int)): + try: + value_as_int = int(value) + for timestamp_range in self._timestamp_heuristic_ranges: + if value_as_int in timestamp_range: + return True + except ValueError: + # given that it's not parsable as an int, it can represent a datetime with one of the self._formats + return True + return False + + def _matches_format(self, value: Any, format: str) -> bool: + """Checks if the value matches the format""" + try: + self._parser.parse(value, format) + return True + except ValueError: + return False + + def _initialize(self, record: AirbyteRecordMessage) -> None: + """Initializes the internal state of the class""" + self._datetime_candidates = {} + for field_name, field_value in record.data.items(): + if not self._can_be_datetime(field_value): + continue + for format in self._formats: + if self._matches_format(field_value, format): + self._datetime_candidates[field_name] = format + break + + def _validate(self, record: AirbyteRecordMessage) -> None: + """Validates that the record is consistent with the inferred datetime formats""" + if self._datetime_candidates: + for candidate_field_name in list(self._datetime_candidates.keys()): + candidate_field_format = self._datetime_candidates[candidate_field_name] + current_value = record.data.get(candidate_field_name, None) + if ( + current_value is None + or not self._can_be_datetime(current_value) + or not self._matches_format(current_value, candidate_field_format) + ): + self._datetime_candidates.pop(candidate_field_name) + + def accumulate(self, record: AirbyteRecordMessage) -> None: + """Analyzes the record and updates the internal state of candidate datetime fields""" + self._initialize(record) if self._datetime_candidates is None else self._validate(record) + + def get_inferred_datetime_formats(self) -> Dict[str, str]: + """ + Returns the list of candidate datetime fields - the keys are the field names and the values are the inferred datetime formats. + For these fields the format was consistent across all visited records. + """ + return self._datetime_candidates or {} diff --git a/airbyte-cdk/python/airbyte_cdk/utils/event_timing.py b/airbyte-cdk/python/airbyte_cdk/utils/event_timing.py new file mode 100644 index 000000000000..447543ec0b23 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/event_timing.py @@ -0,0 +1,85 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import datetime +import logging +import time +from contextlib import contextmanager +from dataclasses import dataclass, field +from typing import Optional + +logger = logging.getLogger("airbyte") + + +class EventTimer: + """Simple nanosecond resolution event timer for debugging, initially intended to be used to record streams execution + time for a source. + Event nesting follows a LIFO pattern, so finish will apply to the last started event. + """ + + def __init__(self, name): + self.name = name + self.events = {} + self.count = 0 + self.stack = [] + + def start_event(self, name): + """ + Start a new event and push it to the stack. + """ + self.events[name] = Event(name=name) + self.count += 1 + self.stack.insert(0, self.events[name]) + + def finish_event(self): + """ + Finish the current event and pop it from the stack. 
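To make the accumulate/validate flow of `DatetimeFormatInferrer` above concrete, a small sketch with illustrative records; fields that never match a candidate format are never tracked, and tracked candidates that stop matching are dropped:

```python
from airbyte_cdk.models import AirbyteRecordMessage
from airbyte_cdk.utils.datetime_format_inferrer import DatetimeFormatInferrer

inferrer = DatetimeFormatInferrer()
inferrer.accumulate(AirbyteRecordMessage(stream="users", data={"created_at": "2023-01-01T00:00:00Z", "name": "a"}, emitted_at=0))
inferrer.accumulate(AirbyteRecordMessage(stream="users", data={"created_at": "2023-02-01T12:30:00Z", "name": "b"}, emitted_at=0))

# `name` never matched any format, while `created_at` stayed consistent across records.
assert inferrer.get_inferred_datetime_formats() == {"created_at": "%Y-%m-%dT%H:%M:%SZ"}
```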
+ """ + + if self.stack: + event = self.stack.pop(0) + event.finish() + else: + logger.warning(f"{self.name} finish_event called without start_event") + + def report(self, order_by="name"): + """ + :param order_by: 'name' or 'duration' + """ + if order_by == "name": + events = sorted(self.events.values(), key=lambda event: event.name) + elif order_by == "duration": + events = sorted(self.events.values(), key=lambda event: event.duration) + text = f"{self.name} runtimes:\n" + text += "\n".join(str(event) for event in events) + return text + + +@dataclass +class Event: + name: str + start: float = field(default_factory=time.perf_counter_ns) + end: Optional[float] = field(default=None) + + @property + def duration(self) -> float: + """Returns the elapsed time in seconds or positive infinity if event was never finished""" + if self.end: + return (self.end - self.start) / 1e9 + return float("+inf") + + def __str__(self): + return f"{self.name} {datetime.timedelta(seconds=self.duration)}" + + def finish(self): + self.end = time.perf_counter_ns() + + +@contextmanager +def create_timer(name): + """ + Creates a new EventTimer as a context manager to improve code readability. + """ + a_timer = EventTimer(name) + yield a_timer diff --git a/airbyte-cdk/python/airbyte_cdk/utils/is_cloud_environment.py b/airbyte-cdk/python/airbyte_cdk/utils/is_cloud_environment.py new file mode 100644 index 000000000000..25b1eee87fad --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/is_cloud_environment.py @@ -0,0 +1,18 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import os + +CLOUD_DEPLOYMENT_MODE = "cloud" + + +def is_cloud_environment() -> bool: + """ + Returns True if the connector is running in a cloud environment, False otherwise. + + The function checks the value of the DEPLOYMENT_MODE environment variable which is set by the platform. + This function can be used to determine whether stricter security measures should be applied. + """ + deployment_mode = os.environ.get("DEPLOYMENT_MODE", "") + return deployment_mode.casefold() == CLOUD_DEPLOYMENT_MODE diff --git a/airbyte-cdk/python/airbyte_cdk/utils/mapping_helpers.py b/airbyte-cdk/python/airbyte_cdk/utils/mapping_helpers.py new file mode 100644 index 000000000000..ae5e898f667d --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/mapping_helpers.py @@ -0,0 +1,43 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +from typing import Any, List, Mapping, Optional, Set, Union + + +def combine_mappings(mappings: List[Optional[Union[Mapping[str, Any], str]]]) -> Union[Mapping[str, Any], str]: + """ + Combine multiple mappings into a single mapping. If any of the mappings are a string, return + that string. 
Raise errors in the following cases: + * If there are duplicate keys across mappings + * If there are multiple string mappings + * If there are multiple mappings containing keys and one of them is a string + """ + all_keys: List[Set[str]] = [] + for part in mappings: + if part is None: + continue + keys = set(part.keys()) if not isinstance(part, str) else set() + all_keys.append(keys) + + string_options = sum(isinstance(mapping, str) for mapping in mappings) + # If more than one mapping is a string, raise a ValueError + if string_options > 1: + raise ValueError("Cannot combine multiple string options") + + if string_options == 1 and sum(len(keys) for keys in all_keys) > 0: + raise ValueError("Cannot combine multiple options if one is a string") + + # If any mapping is a string, return it + for mapping in mappings: + if isinstance(mapping, str): + return mapping + + # If there are duplicate keys across mappings, raise a ValueError + intersection = set().union(*all_keys) + if len(intersection) < sum(len(keys) for keys in all_keys): + raise ValueError(f"Duplicate keys found: {intersection}") + + # Return the combined mappings + return {key: value for mapping in mappings if mapping for key, value in mapping.items()} # type: ignore # mapping can't be string here diff --git a/airbyte-cdk/python/airbyte_cdk/utils/message_utils.py b/airbyte-cdk/python/airbyte_cdk/utils/message_utils.py new file mode 100644 index 000000000000..a862d4696495 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/message_utils.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from airbyte_cdk.models import AirbyteMessage, Type +from airbyte_cdk.sources.connector_state_manager import HashableStreamDescriptor + + +def get_stream_descriptor(message: AirbyteMessage) -> HashableStreamDescriptor: + match message.type: + case Type.RECORD: + return HashableStreamDescriptor(name=message.record.stream, namespace=message.record.namespace) # type: ignore[union-attr] # record has `stream` and `namespace` + case Type.STATE: + if not message.state.stream or not message.state.stream.stream_descriptor: # type: ignore[union-attr] # state has `stream` + raise ValueError("State message was not in per-stream state format, which is required for record counts.") + return HashableStreamDescriptor( + name=message.state.stream.stream_descriptor.name, namespace=message.state.stream.stream_descriptor.namespace # type: ignore[union-attr] # state has `stream` + ) + case _: + raise NotImplementedError(f"get_stream_descriptor is not implemented for message type '{message.type}'.") diff --git a/airbyte-cdk/python/airbyte_cdk/utils/oneof_option_config.py b/airbyte-cdk/python/airbyte_cdk/utils/oneof_option_config.py new file mode 100644 index 000000000000..17ebf0511beb --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/oneof_option_config.py @@ -0,0 +1,33 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Any, Dict + + +class OneOfOptionConfig: + """ + Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers. + + Inherit from this class in the nested Config class in a model and set title and description (these show up in the UI) and discriminator (this is making sure it's marked as required in the schema). + + Usage: + + ```python + class OptionModel(BaseModel): + mode: Literal["option_a"] = Field("option_a", const=True) + option_a_field: str = Field(...) 
+ + class Config(OneOfOptionConfig): + title = "Option A" + description = "Option A description" + discriminator = "mode" + ``` + """ + + @staticmethod + def schema_extra(schema: Dict[str, Any], model: Any) -> None: + if hasattr(model.Config, "description"): + schema["description"] = model.Config.description + if hasattr(model.Config, "discriminator"): + schema.setdefault("required", []).append(model.Config.discriminator) diff --git a/airbyte-cdk/python/airbyte_cdk/utils/print_buffer.py b/airbyte-cdk/python/airbyte_cdk/utils/print_buffer.py new file mode 100644 index 000000000000..51ca2a84b0fe --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/print_buffer.py @@ -0,0 +1,70 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +import sys +import time +from io import StringIO +from threading import RLock +from types import TracebackType +from typing import Optional + + +class PrintBuffer: + """ + A class to buffer print statements and flush them at a specified interval. + + The PrintBuffer class is designed to capture and buffer output that would + normally be printed to the standard output (stdout). This can be useful for + scenarios where you want to minimize the number of I/O operations by grouping + multiple print statements together and flushing them as a single operation. + + Attributes: + buffer (StringIO): A buffer to store the messages before flushing. + flush_interval (float): The time interval (in seconds) after which the buffer is flushed. + last_flush_time (float): The last time the buffer was flushed. + lock (RLock): A reentrant lock to ensure thread-safe operations. + + Methods: + write(message: str) -> None: + Writes a message to the buffer and flushes if the interval has passed. + + flush() -> None: + Flushes the buffer content to the standard output. + + __enter__() -> "PrintBuffer": + Enters the runtime context related to this object, redirecting stdout and stderr. + + __exit__(exc_type, exc_val, exc_tb) -> None: + Exits the runtime context and restores the original stdout and stderr. + """ + + def __init__(self, flush_interval: float = 0.1): + self.buffer = StringIO() + self.flush_interval = flush_interval + self.last_flush_time = time.monotonic() + self.lock = RLock() + + def write(self, message: str) -> None: + with self.lock: + self.buffer.write(message) + current_time = time.monotonic() + if (current_time - self.last_flush_time) >= self.flush_interval: + self.flush() + self.last_flush_time = current_time + + def flush(self) -> None: + with self.lock: + combined_message = self.buffer.getvalue() + sys.__stdout__.write(combined_message) # type: ignore[union-attr] + self.buffer = StringIO() + + def __enter__(self) -> "PrintBuffer": + self.old_stdout, self.old_stderr = sys.stdout, sys.stderr + # Used to disable buffering during the pytest session, because it is not compatible with capsys + if "pytest" not in str(type(sys.stdout)).lower(): + sys.stdout = self + sys.stderr = self + return self + + def __exit__(self, exc_type: Optional[BaseException], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]) -> None: + self.flush() + sys.stdout, sys.stderr = self.old_stdout, self.old_stderr diff --git a/airbyte-cdk/python/airbyte_cdk/utils/schema_inferrer.py b/airbyte-cdk/python/airbyte_cdk/utils/schema_inferrer.py new file mode 100644 index 000000000000..fd749850900b --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/schema_inferrer.py @@ -0,0 +1,248 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from collections import defaultdict +from typing import Any, Dict, List, Mapping, Optional + +from airbyte_cdk.models import AirbyteRecordMessage +from genson import SchemaBuilder, SchemaNode +from genson.schema.strategies.object import Object +from genson.schema.strategies.scalar import Number + +# schema keywords +_TYPE = "type" +_NULL_TYPE = "null" +_OBJECT_TYPE = "object" +_ANY_OF = "anyOf" +_ITEMS = "items" +_PROPERTIES = "properties" +_REQUIRED = "required" + + +class NoRequiredObj(Object): + """ + This class has Object behaviour, but it does not generate "required[]" fields + every time it parses object. So we don't add unnecessary extra field. + + The logic is that even reading all the data from a source, it does not mean that there can be another record added with those fields as + optional. Hence, we make everything nullable. + """ + + def to_schema(self) -> Mapping[str, Any]: + schema: Dict[str, Any] = super(NoRequiredObj, self).to_schema() + schema.pop("required", None) + return schema + + +class IntegerToNumber(Number): + """ + This class has the regular Number behaviour, but it will never emit an integer type. + """ + + def __init__(self, node_class: SchemaNode): + super().__init__(node_class) + self._type = "number" + + +class NoRequiredSchemaBuilder(SchemaBuilder): + EXTRA_STRATEGIES = (NoRequiredObj, IntegerToNumber) + + +# This type is inferred from the genson lib, but there is no alias provided for it - creating it here for type safety +InferredSchema = Dict[str, Any] + + +class SchemaValidationException(Exception): + @classmethod + def merge_exceptions(cls, exceptions: List["SchemaValidationException"]) -> "SchemaValidationException": + # We assume the schema is the same for all SchemaValidationException + return SchemaValidationException(exceptions[0].schema, [x for exception in exceptions for x in exception._validation_errors]) + + def __init__(self, schema: InferredSchema, validation_errors: List[Exception]): + self._schema = schema + self._validation_errors = validation_errors + + @property + def schema(self) -> InferredSchema: + return self._schema + + @property + def validation_errors(self) -> List[str]: + return list(map(lambda error: str(error), self._validation_errors)) + + +class SchemaInferrer: + """ + This class is used to infer a JSON schema which fits all the records passed into it + throughout its lifecycle via the accumulate method. + + Instances of this class are stateful, meaning they build their inferred schemas + from every record passed into the accumulate method. 
+ + """ + + stream_to_builder: Dict[str, SchemaBuilder] + + def __init__(self, pk: Optional[List[List[str]]] = None, cursor_field: Optional[List[List[str]]] = None) -> None: + self.stream_to_builder = defaultdict(NoRequiredSchemaBuilder) + self._pk = [] if pk is None else pk + self._cursor_field = [] if cursor_field is None else cursor_field + + def accumulate(self, record: AirbyteRecordMessage) -> None: + """Uses the input record to add to the inferred schemas maintained by this object""" + self.stream_to_builder[record.stream].add_object(record.data) + + def _null_type_in_any_of(self, node: InferredSchema) -> bool: + if _ANY_OF in node: + return {_TYPE: _NULL_TYPE} in node[_ANY_OF] + else: + return False + + def _remove_type_from_any_of(self, node: InferredSchema) -> None: + if _ANY_OF in node: + node.pop(_TYPE, None) + + def _clean_any_of(self, node: InferredSchema) -> None: + if len(node[_ANY_OF]) == 2 and self._null_type_in_any_of(node): + real_type = node[_ANY_OF][1] if node[_ANY_OF][0][_TYPE] == _NULL_TYPE else node[_ANY_OF][0] + node.update(real_type) + node[_TYPE] = [node[_TYPE], _NULL_TYPE] + node.pop(_ANY_OF) + # populate `type` for `anyOf` if it's not present to pass all other checks + elif len(node[_ANY_OF]) == 2 and not self._null_type_in_any_of(node): + node[_TYPE] = [_NULL_TYPE] + + def _clean_properties(self, node: InferredSchema) -> None: + for key, value in list(node[_PROPERTIES].items()): + if isinstance(value, dict) and value.get(_TYPE, None) == _NULL_TYPE: + node[_PROPERTIES].pop(key) + else: + self._clean(value) + + def _ensure_null_type_on_top(self, node: InferredSchema) -> None: + if isinstance(node[_TYPE], list): + if _NULL_TYPE in node[_TYPE]: + # we want to make sure null is always at the end as it makes schemas more readable + node[_TYPE].remove(_NULL_TYPE) + node[_TYPE].append(_NULL_TYPE) + else: + node[_TYPE] = [node[_TYPE], _NULL_TYPE] + + def _clean(self, node: InferredSchema) -> InferredSchema: + """ + Recursively cleans up a produced schema: + - remove anyOf if one of them is just a null value + - remove properties of type "null" + """ + + if isinstance(node, dict): + if _ANY_OF in node: + self._clean_any_of(node) + + if _PROPERTIES in node and isinstance(node[_PROPERTIES], dict): + self._clean_properties(node) + + if _ITEMS in node: + self._clean(node[_ITEMS]) + + # this check needs to follow the "anyOf" cleaning as it might populate `type` + self._ensure_null_type_on_top(node) + + # remove added `type: ["null"]` for `anyOf` nested node + self._remove_type_from_any_of(node) + + return node + + def _add_required_properties(self, node: InferredSchema) -> InferredSchema: + """ + This method takes properties that should be marked as required (self._pk and self._cursor_field) and travel the schema to mark every + node as required. + """ + # Removing nullable for the root as when we call `_clean`, we make everything nullable + node[_TYPE] = _OBJECT_TYPE + + exceptions = [] + for field in [x for x in [self._pk, self._cursor_field] if x]: + try: + self._add_fields_as_required(node, field) + except SchemaValidationException as exception: + exceptions.append(exception) + + if exceptions: + raise SchemaValidationException.merge_exceptions(exceptions) + + return node + + def _add_fields_as_required(self, node: InferredSchema, composite_key: List[List[str]]) -> None: + """ + Take a list of nested keys (this list represents a composite key) and travel the schema to mark every node as required. 
+ """ + errors: List[Exception] = [] + + for path in composite_key: + try: + self._add_field_as_required(node, path) + except ValueError as exception: + errors.append(exception) + + if errors: + raise SchemaValidationException(node, errors) + + def _add_field_as_required(self, node: InferredSchema, path: List[str], traveled_path: Optional[List[str]] = None) -> None: + """ + Take a nested key and travel the schema to mark every node as required. + """ + self._remove_null_from_type(node) + if self._is_leaf(path): + return + + if not traveled_path: + traveled_path = [] + + if _PROPERTIES not in node: + # This validation is only relevant when `traveled_path` is empty + raise ValueError( + f"Path {traveled_path} does not refer to an object but is `{node}` and hence {path} can't be marked as required." + ) + + next_node = path[0] + if next_node not in node[_PROPERTIES]: + raise ValueError(f"Path {traveled_path} does not have field `{next_node}` in the schema and hence can't be marked as required.") + + if _TYPE not in node: + # We do not expect this case to happen but we added a specific error message just in case + raise ValueError( + f"Unknown schema error: {traveled_path} is expected to have a type but did not. Schema inferrence is probably broken" + ) + + if node[_TYPE] not in [_OBJECT_TYPE, [_NULL_TYPE, _OBJECT_TYPE], [_OBJECT_TYPE, _NULL_TYPE]]: + raise ValueError(f"Path {traveled_path} is expected to be an object but was of type `{node['properties'][next_node]['type']}`") + + if _REQUIRED not in node or not node[_REQUIRED]: + node[_REQUIRED] = [next_node] + elif next_node not in node[_REQUIRED]: + node[_REQUIRED].append(next_node) + + traveled_path.append(next_node) + self._add_field_as_required(node[_PROPERTIES][next_node], path[1:], traveled_path) + + def _is_leaf(self, path: List[str]) -> bool: + return len(path) == 0 + + def _remove_null_from_type(self, node: InferredSchema) -> None: + if isinstance(node[_TYPE], list): + if _NULL_TYPE in node[_TYPE]: + node[_TYPE].remove(_NULL_TYPE) + if len(node[_TYPE]) == 1: + node[_TYPE] = node[_TYPE][0] + + def get_stream_schema(self, stream_name: str) -> Optional[InferredSchema]: + """ + Returns the inferred JSON schema for the specified stream. Might be `None` if there were no records for the given stream name. + """ + return ( + self._add_required_properties(self._clean(self.stream_to_builder[stream_name].to_schema())) + if stream_name in self.stream_to_builder + else None + ) diff --git a/airbyte-cdk/python/airbyte_cdk/utils/spec_schema_transformations.py b/airbyte-cdk/python/airbyte_cdk/utils/spec_schema_transformations.py new file mode 100644 index 000000000000..2a772d50b6c3 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/spec_schema_transformations.py @@ -0,0 +1,23 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json +import re + +from jsonschema import RefResolver + + +def resolve_refs(schema: dict) -> dict: + """ + For spec schemas generated using Pydantic models, the resulting JSON schema can contain refs between object + relationships. 
+ """ + json_schema_ref_resolver = RefResolver.from_schema(schema) + str_schema = json.dumps(schema) + for ref_block in re.findall(r'{"\$ref": "#\/definitions\/.+?(?="})"}', str_schema): + ref = json.loads(ref_block)["$ref"] + str_schema = str_schema.replace(ref_block, json.dumps(json_schema_ref_resolver.resolve(ref)[1])) + pyschema: dict = json.loads(str_schema) + del pyschema["definitions"] + return pyschema diff --git a/airbyte-cdk/python/airbyte_cdk/utils/stream_status_utils.py b/airbyte-cdk/python/airbyte_cdk/utils/stream_status_utils.py new file mode 100644 index 000000000000..49c07f49cd97 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/stream_status_utils.py @@ -0,0 +1,43 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +from datetime import datetime +from typing import List, Optional, Union + +from airbyte_cdk.models import ( + AirbyteMessage, + AirbyteStream, + AirbyteStreamStatus, + AirbyteStreamStatusReason, + AirbyteStreamStatusTraceMessage, + AirbyteTraceMessage, + StreamDescriptor, + TraceType, +) +from airbyte_cdk.models import Type as MessageType + + +def as_airbyte_message( + stream: Union[AirbyteStream, StreamDescriptor], + current_status: AirbyteStreamStatus, + reasons: Optional[List[AirbyteStreamStatusReason]] = None, +) -> AirbyteMessage: + """ + Builds an AirbyteStreamStatusTraceMessage for the provided stream + """ + + now_millis = datetime.now().timestamp() * 1000.0 + + trace_message = AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + emitted_at=now_millis, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name=stream.name, namespace=stream.namespace), + status=current_status, + reasons=reasons, + ), + ) + + return AirbyteMessage(type=MessageType.TRACE, trace=trace_message) diff --git a/airbyte-cdk/python/airbyte_cdk/utils/traced_exception.py b/airbyte-cdk/python/airbyte_cdk/utils/traced_exception.py new file mode 100644 index 000000000000..bd96ea398146 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/traced_exception.py @@ -0,0 +1,116 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# +import time +import traceback +from typing import Optional + +from airbyte_cdk.models import ( + AirbyteConnectionStatus, + AirbyteErrorTraceMessage, + AirbyteMessage, + AirbyteMessageSerializer, + AirbyteTraceMessage, + FailureType, + Status, + StreamDescriptor, + TraceType, +) +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets +from orjson import orjson + + +class AirbyteTracedException(Exception): + """ + An exception that should be emitted as an AirbyteTraceMessage + """ + + def __init__( + self, + internal_message: Optional[str] = None, + message: Optional[str] = None, + failure_type: FailureType = FailureType.system_error, + exception: Optional[BaseException] = None, + stream_descriptor: Optional[StreamDescriptor] = None, + ): + """ + :param internal_message: the internal error that caused the failure + :param message: a user-friendly message that indicates the cause of the error + :param failure_type: the type of error + :param exception: the exception that caused the error, from which the stack trace should be retrieved + :param stream_descriptor: describe the stream from which the exception comes from + """ + self.internal_message = internal_message + self.message = message + self.failure_type = failure_type + self._exception = exception + self._stream_descriptor = stream_descriptor + super().__init__(internal_message) + + def as_airbyte_message(self, stream_descriptor: Optional[StreamDescriptor] = None) -> AirbyteMessage: + """ + Builds an AirbyteTraceMessage from the exception + + :param stream_descriptor is deprecated, please use the stream_description in `__init__ or `from_exception`. If many + stream_descriptors are defined, the one from `as_airbyte_message` will be discarded. + """ + now_millis = time.time_ns() // 1_000_000 + + trace_exc = self._exception or self + stack_trace_str = "".join(traceback.TracebackException.from_exception(trace_exc).format()) + + trace_message = AirbyteTraceMessage( + type=TraceType.ERROR, + emitted_at=now_millis, + error=AirbyteErrorTraceMessage( + message=self.message or "Something went wrong in the connector. See the logs for more details.", + internal_message=self.internal_message, + failure_type=self.failure_type, + stack_trace=stack_trace_str, + stream_descriptor=self._stream_descriptor if self._stream_descriptor is not None else stream_descriptor, + ), + ) + + return AirbyteMessage(type=MessageType.TRACE, trace=trace_message) + + def as_connection_status_message(self) -> Optional[AirbyteMessage]: + if self.failure_type == FailureType.config_error: + return AirbyteMessage( + type=MessageType.CONNECTION_STATUS, connectionStatus=AirbyteConnectionStatus(status=Status.FAILED, message=self.message) + ) + return None + + def emit_message(self) -> None: + """ + Prints the exception as an AirbyteTraceMessage. + Note that this will be called automatically on uncaught exceptions when using the airbyte_cdk entrypoint. 
+ """ + message = orjson.dumps(AirbyteMessageSerializer.dump(self.as_airbyte_message())).decode() + filtered_message = filter_secrets(message) + print(filtered_message) + + @classmethod + def from_exception(cls, exc: BaseException, stream_descriptor: Optional[StreamDescriptor] = None, *args, **kwargs) -> "AirbyteTracedException": # type: ignore # ignoring because of args and kwargs + """ + Helper to create an AirbyteTracedException from an existing exception + :param exc: the exception that caused the error + :param stream_descriptor: describe the stream from which the exception comes from + """ + return cls(internal_message=str(exc), exception=exc, stream_descriptor=stream_descriptor, *args, **kwargs) # type: ignore # ignoring because of args and kwargs + + def as_sanitized_airbyte_message(self, stream_descriptor: Optional[StreamDescriptor] = None) -> AirbyteMessage: + """ + Builds an AirbyteTraceMessage from the exception and sanitizes any secrets from the message body + + :param stream_descriptor is deprecated, please use the stream_description in `__init__ or `from_exception`. If many + stream_descriptors are defined, the one from `as_sanitized_airbyte_message` will be discarded. + """ + error_message = self.as_airbyte_message(stream_descriptor=stream_descriptor) + if error_message.trace.error.message: # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage + error_message.trace.error.message = filter_secrets(error_message.trace.error.message) # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage + if error_message.trace.error.internal_message: # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage + error_message.trace.error.internal_message = filter_secrets(error_message.trace.error.internal_message) # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage + if error_message.trace.error.stack_trace: # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage + error_message.trace.error.stack_trace = filter_secrets(error_message.trace.error.stack_trace) # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage + return error_message diff --git a/airbyte-cdk/python/bin/generate-component-manifest-dagger.sh b/airbyte-cdk/python/bin/generate-component-manifest-dagger.sh new file mode 100755 index 000000000000..f920ff727e49 --- /dev/null +++ b/airbyte-cdk/python/bin/generate-component-manifest-dagger.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +# We need to run this script in a docker container because we need to use a +# specific version of datamodel-codegen that generates pydantic v1 models (correctly). +# The newer datamodel-codegen's "pydantic v1" models are different than those v1 models +# generated by the older version of datamodel-codegen. + +set -e + +pip install dagger-io==0.13.3 +python bin/generate_component_manifest_files.py diff --git a/airbyte-cdk/python/bin/generate_component_manifest_files.py b/airbyte-cdk/python/bin/generate_component_manifest_files.py new file mode 100755 index 000000000000..152486f9fb6b --- /dev/null +++ b/airbyte-cdk/python/bin/generate_component_manifest_files.py @@ -0,0 +1,76 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ +import sys +from glob import glob +from pathlib import Path + +import anyio +import dagger + +PYTHON_IMAGE = "python:3.10" +LOCAL_YAML_DIR_PATH = "airbyte_cdk/sources/declarative" +LOCAL_OUTPUT_DIR_PATH = "airbyte_cdk/sources/declarative/models" + + +PIP_DEPENDENCIES = [ + "datamodel_code_generator==0.11.19", +] + + +def get_all_yaml_files_without_ext() -> list[str]: + return [Path(f).stem for f in glob(f"{LOCAL_YAML_DIR_PATH}/*.yaml")] + + +def generate_init_module_content() -> str: + header = "# generated by bin/generate_component_manifest_files.py\n" + for module_name in get_all_yaml_files_without_ext(): + header += f"from .{module_name} import *\n" + return header + + +async def post_process_codegen(codegen_container: dagger.Container): + codegen_container = codegen_container.with_exec(["mkdir", "/generated_post_processed"], use_entrypoint=True) + for generated_file in await codegen_container.directory("/generated").entries(): + if generated_file.endswith(".py"): + original_content = await codegen_container.file(f"/generated/{generated_file}").contents() + # the space before _parameters is intentional to avoid replacing things like `request_parameters:` with `requestparameters:` + post_processed_content = original_content.replace(" _parameters:", " parameters:").replace("from pydantic", "from pydantic.v1") + codegen_container = codegen_container.with_new_file( + f"/generated_post_processed/{generated_file}", contents=post_processed_content + ) + return codegen_container + + +async def main(): + init_module_content = generate_init_module_content() + + async with dagger.Connection(dagger.Config(log_output=sys.stderr)) as dagger_client: + + codegen_container = ( + dagger_client.container() + .from_(PYTHON_IMAGE) + .with_exec(["mkdir", "/generated"], use_entrypoint=True) + .with_exec(["pip", "install", " ".join(PIP_DEPENDENCIES)], use_entrypoint=True) + .with_mounted_directory("/yaml", dagger_client.host().directory(LOCAL_YAML_DIR_PATH, include=["*.yaml"])) + .with_new_file("/generated/__init__.py", contents=init_module_content) + ) + for yaml_file in get_all_yaml_files_without_ext(): + codegen_container = codegen_container.with_exec( + [ + "datamodel-codegen", + "--input", + f"/yaml/{yaml_file}.yaml", + "--output", + f"/generated/{yaml_file}.py", + "--disable-timestamp", + "--enum-field-as-literal", + "one", + "--set-default-enum-member", + ], + use_entrypoint=True, + ) + + await ((await post_process_codegen(codegen_container)).directory("/generated_post_processed").export(LOCAL_OUTPUT_DIR_PATH)) + + +anyio.run(main) diff --git a/airbyte-cdk/python/bin/run-mypy-on-modified-files.sh b/airbyte-cdk/python/bin/run-mypy-on-modified-files.sh new file mode 100755 index 000000000000..96f757be1b7d --- /dev/null +++ b/airbyte-cdk/python/bin/run-mypy-on-modified-files.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env sh + +set -e + +# Ensure script always runs from the project directory. +cd "$(dirname "${0}")/.." || exit 1 + +# TODO change this to include unit_tests as well once it's in a good state +{ + git diff --name-only --diff-filter=d --relative ':(exclude)unit_tests' + git diff --name-only --diff-filter=d --staged --relative ':(exclude)unit_tests' + git diff --name-only --diff-filter=d master... 
--relative ':(exclude)unit_tests' +} | grep -E '\.py$' | sort | uniq | xargs mypy --config-file mypy.ini --install-types --non-interactive diff --git a/airbyte-cdk/python/cdk-migrations.md b/airbyte-cdk/python/cdk-migrations.md new file mode 100644 index 000000000000..d94726263ff9 --- /dev/null +++ b/airbyte-cdk/python/cdk-migrations.md @@ -0,0 +1,290 @@ +# CDK Migration Guide + +## Upgrading to 6.x.x + +Version 6.x.x of the CDK introduces concurrent processing of low-code incremental streams. This is breaking because non-manifest only connectors must update their self-managed `run.py` and `source.py` files. This section is intended to clarify how to upgrade a low-code connector to use the Concurrent CDK to sync incremental streams. + +> [!NOTE] +> This version introduces parallel processing of only incremental streams. +> It does not include the parallel processing of substreams that rely on a parent stream +> It also does not include processing of full-refresh streams in parallel. + +Low-code incremental streams that match any of the following criteria are not supported by concurrent as of this version: +- Uses a custom implementation of the `DatetimeBasedCursor` component +- The `DatetimeBasedCursor` defines a `step` which will partition a stream's request into time intervals AND a + `AddedField` / `HttpRequester` / `RecordFilter` that relies on interpolation of the `stream_state` value. See below + for the complete list + +In order to enable concurrency for a low-code connector, the following changes must be made: +- In the connector's `source.py`, change the method signature to accept catalog, config, and state parameters. Change the invocation of `super()` to pass in those new parameters + +```python3 +class SourceName(YamlDeclarativeSource): + def __init__(self, catalog: Optional[ConfiguredAirbyteCatalog], config: Optional[Mapping[str, Any]], state: TState, **kwargs): + super().__init__(catalog=catalog, config=config, state=state, **{"path_to_yaml": "manifest.yaml"}) +``` +- In the connector's `run.py`, update it to pass variables + +```python3 +def _get_source(args: List[str]): + catalog_path = AirbyteEntrypoint.extract_catalog(args) + config_path = AirbyteEntrypoint.extract_config(args) + state_path = AirbyteEntrypoint.extract_state(args) + try: + return SourceName( + SourceName.read_catalog(catalog_path) if catalog_path else None, + SourceName.read_config(config_path) if config_path else None, + SourceName.read_state(state_path) if state_path else None, + ) + except Exception as error: + print( + orjson.dumps( + AirbyteMessageSerializer.dump( + AirbyteMessage( + type=Type.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.ERROR, + emitted_at=int(datetime.now().timestamp() * 1000), + error=AirbyteErrorTraceMessage( + message=f"Error starting the sync. This could be due to an invalid configuration or catalog. Please contact Support for assistance. 
Error: {error}", + stack_trace=traceback.format_exc(), + ), + ), + ) + ) + ).decode() + ) + return None + + +def run(): + _args = sys.argv[1:] + source = _get_source(_args) + if source: + launch(source, _args) +``` + +- Add the `ConcurrencyLevel` component to the connector's `manifest.yaml` file + +```yaml +concurrency_level: + type: ConcurrencyLevel + default_concurrency: "{{ config['num_workers'] or 10 }}" + max_concurrency: 20 +``` + +### Connectors that have streams that cannot be processed concurrently + +Connectors that have streams that use `stream_state` during interpolation and must be run synchronously until they are fixed or updated: +- Http Requester + - `source-insightly`: Uses an DatetimeBasedCursor with a step interval and the HttpRequester has request_parameters relying on `stream_state`. This should be replaced by `step_interval` + - `source-intercom`: Uses a custom `incremental_sync` component and `stream_state` used as part of the HttpRequester request_body_json. However, because this processed on a single slice, `stream_interval` can be used +- Record Filter + - `source-chargebee`: Uses a custom `incremental_sync` component and `stream_state` in the RecordFilter condition. However, because this processed on a single slice, `stream_interval` can be used + - `source-intercom`: Uses a custom `incremental_sync` component and `stream_state` used as part of the RecordFilter condition. However, because this processed on a single slice, `stream_interval` can be used + - `source-railz`: Uses a custom `incremental_sync` component and `stream_state` used as part of the RecordFilter condition. This also uses multiple one month time intervals and is not currently compatible for concurrent + - `source-tiktok-marketing`: Contains DatetimeBasedCursor with a step interval and relies on a CustomRecordFilter with a condition relying on `stream_state`. This should be replaced by `stream_interval` +- `AddFields`: No connectors use `stream_state` when performing an additive transformation for a record + +To enable concurrency on these streams, `stream_state` should be removed from the interpolated value and replaced +by a thread safe interpolation context like `stream_interval` or `stream_partition`. + +### Upgrading manifest-only sources to process incremental streams concurrently + +All manifest-only sources are run using the `source-declarative-manifest` which serves as the base image with the common code and flows for connectors that only define a `manifest.yaml` file. + +Within this package, to enable concurrent processing: +- Modify `airbyte-cdk` package in `pyproject.toml` to the current version +- In `run.py`, parse all entrypoint arguments into the respective config, catalog, and state objects +- In `run.py`, modify the flow that instantiates a `ManifestDeclarativeSource` from the `__injected_declarative_manifest` to instantiate a `ConcurrentDeclarativeSource` +- In `run.py` modify the `SourceLocalYaml` class to accept config, catalog, and state. And use that in the `YamlDeclarativeSource.__init__`. This should look similar to the migration of sources that are not manifest-only + +## Upgrading to 5.0.0 + +Version 5.0.0 of the CDK updates the `airbyte_cdk.models` dependency to replace Pydantic v2 models with Python `dataclasses`. It also +updates the `airbyte-protocol-models` dependency to a version that uses dataclasses models. 
+ +The changes to Airbyte CDK itself are backwards-compatible, but some changes are required if the connector: +- uses the `airbyte_protocol` models directly, or `airbyte_cdk.models`, which points to `airbyte_protocol` models +- uses third-party libraries, such as `pandas`, to read data from sources, which output non-native Python objects that cannot be serialized by the [orjson](https://github.com/ijl/orjson) library. + +> [!NOTE] +> All Serializers have omit_none=True parameter that is applied recursively. Thus, all None values are excluded from output. +> This is expected behaviour and does not break anything in protocol. + +### Updating direct usage of Pydantic based Airbyte Protocol Models + +- If the connector uses Pydantic based Airbyte Protocol Models, the code will need to be updated to reflect the changes `pydantic`. +- It is recommended to import protocol classes not directly by `import airbyte_protocol` statement, but from `airbyte_cdk.models` package. +- It is also recommended to use *-`Serializer` from `airbyte_cdk.models` to manipulate the data or convert to/from JSON. + These are based on the [serpyco-rs](https://pypi.org/project/serpyco-rs/) library. +- These classes have a `dump` method that converts the model to a dictionary and a `load` method that converts a dictionary to a model. +- The recommended serialization strategy is to pass the dictionary to the `orjson` library when serializing as a JSON string. + +E.g. + +```python3 +import orjson + +from airbyte_cdk.models import AirbyteMessage, AirbyteMessageSerializer + +# Before (pydantic model message serialization) +AirbyteMessage().model_dump_json() + +# After (dataclass model serialization) +orjson.dumps(AirbyteMessageSerializer.dump(AirbyteMessage())).decode() +``` + +### Updating third-party libraries + +For example, if `pandas` outputs data from the source, which has date-time `pandas.Timestamp` object in +it, [Orjson supported Types](https://github.com/ijl/orjson?tab=readme-ov-file#types), these fields should be transformed to native JSON +objects. + +```python3 +# Before +yield from df.to_dict(orient="records") + +# After - Option 1 +yield orjson.loads(df.to_json(orient="records", date_format="iso", date_unit="us")) + +``` + + +## Upgrading to 4.5.0 + +In this release, we are no longer supporting the legacy state format in favor of the current per-stream state +format which has been running in production for over 2 years. The impacts to connectors should be minimal, but for +the small number of connectors that instantiate their own `ConnectorStateManager`, the fix to upgrade to the latest +version of the CDK is to stop passing the `stream_instance_map` parameter to the `ConnectorStateManager` constructor. + +## Upgrading to 4.1.0 +We are unifying the `BackoffStrategy` interface as it currently differs from the Python CDK package to the declarative one. The different is that the interface will require the attempt_count to be passed. + +Main impact: This change is mostly internal but we spotted a couple of tests that expect `backoff_time` to not have the `attempt_count` parameter so these tests would fail ([example](https://github.com/airbytehq/airbyte/blob/c9f45a0b85735f58102fcd78385f6f673e731aa6/airbyte-integrations/connectors/source-github/unit_tests/test_stream.py#L99)). + +This change should not impact the following classes even though they have a different interface as they accept `kwargs` and `attempt_count` is currently passed as a keyword argument within the CDK. 
However, once there is a CDK change where `backoff_time` is called not as a keyword argument, they will fail: +* Zendesk Support: ZendeskSupportBackoffStrategy (this one will be updated shortly after as it is used for CI to validate CDK changes) +* Klaviyo: KlaviyoBackoffStrategy (the logic has been generified so we will remove this custom component shortly after this update) +* GitHub: GithubStreamABCBackoffStrategy and ContributorActivityBackoffStrategy +* Airtable: AirtableBackoffStrategy +* Slack: SlackBackoffStrategy + +This change should not impact `WaitUntilMidnightBackoffStrategy` from source-gnews as well but it is interesting to note that its interface is also wrong as it considers the first parameter as a `requests.Response` instead of a `Optional[Union[requests.Response, requests.RequestException]]`. + +## Upgrading to 4.0.0 + +Updated the codebase to utilize new Python syntax features. As a result, support for Python 3.9 has been dropped. The minimum required Python version is now 3.10. + +## Upgrading to 3.0.0 +Version 3.0.0 of the CDK updates the `HTTPStream` class by reusing the `HTTPClient` under the hood. + +- `backoff_time` and `should_retry` methods are removed from HttpStream +- `HttpStreamAdapterHttpStatusErrorHandler` and `HttpStreamAdapterBackoffStrategy` adapters are marked as `deprecated` +- `raise_on_http_errors`, `max_retries`, `max_time`, `retry_factor` are marked as `deprecated` + +Exceptions from the `requests` library should no longer be raised when calling `read_records`. +Therefore, catching exceptions should be updated, and error messages might change. +See [Migration of Source Zendesk Support](https://github.com/airbytehq/airbyte/pull/41032/commits/4d3a247f36b9826dcea4b98d30fc19802b03d014) as an example. + +### Migration of `should_retry` method +In case the connector uses custom logic for backoff based on the response from the server, a new method `get_error_handler` should be implemented. +This method should return instance of [`ErrorHandler`](https://github.com/airbytehq/airbyte/blob/master/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/error_handler.py). + +### Migration of `backoff_time` method +In case the connector uses custom logic for backoff time calculation, a new method `get_backoff_strategy` should be implemented. +This method should return instance(s) of [`BackoffStrategy`](https://github.com/airbytehq/airbyte/blob/master/airbyte-cdk/python/airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py). + +## Upgrading to 2.0.0 +Version 2.0.0 of the CDK updates the `pydantic` dependency to from Pydantic v1 to Pydantic v2. It also +updates the `airbyte-protocol-models` dependency to a version that uses Pydantic V2 models. + +The changes to Airbyte CDK itself are backwards-compatible, but some changes are required if the connector: +- uses Pydantic directly, e.g. for its own custom models, or +- uses the `airbyte_protocol` models directly, or `airbyte_cdk.models`, which points to `airbyte_protocol` models, or +- customizes HashableStreamDescriptor, which inherits from a protocol model and has therefore been updated to use Pydantic V2 models. + +Some test assertions may also need updating due to changes to default serialization of the protocol models. + +### Updating direct usage of Pydantic + +If the connector uses pydantic, the code will need to be updated to reflect the change `pydantic` dependency version. 
+The Pydantic [migration guide](https://docs.pydantic.dev/latest/migration/) is a great resource for any questions that +might arise around upgrade behavior. + +#### Using Pydantic V1 models with Pydantic V2 +The easiest way to update the code to be compatible without major changes is to update the import statements from +`from pydantic` to `from pydantic.v1`, as Pydantic has kept the v1 module for backwards compatibility. + +Some potential gotchas: + - `ValidationError` must be imported from `pydantic.v1.error_wrappers` instead of `pydantic.v1` + - `ModelMetaclass` must be imported from `pydantic.v1.main` instead of `pydantic.v1` + - `resolve_annotations` must be imported from `pydantic.v1.typing` instead of `pydantic.v1` + +#### Upgrading to Pydantic V2 +To upgrade all the way to V2 proper, Pydantic also offers a [migration tool](https://docs.pydantic.dev/latest/migration/#code-transformation-tool) +to automatically update the code to be compatible with Pydantic V2. + +#### Updating assertions +It's possible that a connector might make assertions against protocol models without actually +importing them - for example when testing methods which return `AirbyteStateBlob` or `AnyUrl`. + +To resolve this, either compare directly to a model, or `dict()` or `str()` your model accordingly, depending +on if you care most about the serialized output or the model (for a method which returns a model, option 1 is +preferred). For example: + +```python +# Before +assert stream_read.slices[1].state[0].stream.stream_state == {"a_timestamp": 123} + +# After - Option 1 +from airbyte_cdk.models import AirbyteStateBlob +assert stream_read.slices[1].state[0].stream.stream_state == AirbyteStateBlob(a_timestamp=123) + +# After - Option 2 +assert stream_read.slices[1].state[0].stream.stream_state.dict() == {"a_timestamp": 123} +``` + + +## Upgrading to 1.0.0 +Starting from 1.0.0, CDK classes and functions should be imported directly from `airbyte_cdk` (example: `from airbyte_cdk import HttpStream`). Lower-level `__init__` files are not considered stable, and will be modified without introducing a major release. + +Introducing breaking changes to a class or function exported from the top level `__init__.py` will require a major version bump and a migration note to help developer upgrade. + +Note that the following packages are not part of the top level init because they require extras dependencies, but are still considered stable: +- `destination.vector_db_based` +- `source.file_based` + +The `test` package is not included in the top level init either. The `test` package is still evolving and isn't considered stable. + + +A few classes were deleted from the Airbyte CDK in version 1.0.0: +- AirbyteLogger +- AirbyteSpec +- Authenticators in the `sources.streams.http.auth` module + + + +### Migrating off AirbyteLogger +No connectors should still be using `AirbyteLogger` directly, but the class is still used in some interfaces. The only required change is to update the type annotation from `AirbyteLogger` to `logging.Logger`. For example: + +``` +def check_connection(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> Tuple[bool, any]: +``` + +to + +``` +def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, any]: +``` +Don't forget to also update the imports. You can delete `from airbyte_cdk import AirbyteLogger` and replace it with `import logging`. 
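
Putting the pieces together, a minimal sketch of an updated method might look like the following (the connector class and the `api_key` config field are hypothetical; only the logger handling reflects the required change):

```python
import logging
from typing import Any, Mapping, Tuple


class MyCustomSource:  # hypothetical connector class, shown only to illustrate the logger swap
    def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Any]:
        # logging.Logger is passed in by the CDK and used via the standard logging calls
        logger.info("Checking connection")
        if not config.get("api_key"):  # "api_key" is a made-up config field for this sketch
            return False, "api_key is required"
        return True, None
```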
+ +### Migrating off AirbyteSpec +AirbyteSpec isn't used by any connectors in the repository, and I don't expect any custom connectors to use the class either. This should be a no-op. + +### Migrating off Authenticators +Replace usage of authenticators in the `airbyte_cdk.sources.streams.http.auth` module with their sister classes in the `airbyte_cdk.sources.streams.http.requests_native_auth` module. + +If any of your streams reference `self.authenticator`, you'll also need to update these references to `self._session.auth` as the authenticator is embedded in the session object. + +Here is a [pull request that can serve as an example](https://github.com/airbytehq/airbyte/pull/38065/files). diff --git a/airbyte-cdk/python/docs/.gitignore b/airbyte-cdk/python/docs/.gitignore new file mode 100644 index 000000000000..86d4c2dd380e --- /dev/null +++ b/airbyte-cdk/python/docs/.gitignore @@ -0,0 +1 @@ +generated diff --git a/airbyte-cdk/python/docs/CONTRIBUTING.md b/airbyte-cdk/python/docs/CONTRIBUTING.md new file mode 100644 index 000000000000..4b417ada9a9f --- /dev/null +++ b/airbyte-cdk/python/docs/CONTRIBUTING.md @@ -0,0 +1,32 @@ +# Contributing to the Python CDK + +Learn how you can become a contributor to the Airbyte Python CDK. + +## Development + +- Make sure [Poetry is installed](https://python-poetry.org/docs/#). +- Run `poetry install` +- For examples, check out the `examples` folder. They can be run via `poetry run python examples/` +- Unit tests and type checks can be run via `poetry run pytest` + +## Documentation + +Documentation auto-gen code lives in the `/docs` folder. Based on the doc strings of public methods, we generate API documentation using [pdoc](https://pdoc.dev). + +To generate the documentation, run: + +```console +poe docs-generate +``` + +Or to build and open the docs preview in one step: + +```console +poe docs-preview +``` + +or `poetry run poe docs-preview` if you don't have [Poe](https://poethepoet.natn.io/index.html) installed yet. + +The `docs-generate` Poe task is mapped to the `run()` function of `docs/generate.py`. + +Documentation pages will be generated in the `docs/generated` folder. The `test_docs.py` test in pytest will automatically update generated content. This updates must be manually committed before docs tests will pass. diff --git a/airbyte-cdk/python/docs/generate.py b/airbyte-cdk/python/docs/generate.py new file mode 100644 index 000000000000..f5467f670dab --- /dev/null +++ b/airbyte-cdk/python/docs/generate.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""Generate docs for all public modules in the Airbyte CDK and save them to docs/generated. + +Usage: + poetry run python docs/generate.py + +Or with Poe-the-Poet: + poe docs-generate + poe docs-preview + +""" + +from __future__ import annotations + +import os +import pathlib +import shutil +from typing import cast + +import pdoc + + +def run() -> None: + """Generate docs for all public modules in the Airbyte CDK and save them to docs/generated.""" + + public_modules = [ + "airbyte_cdk", + ] + + # Walk all subdirectories and add them to the `public_modules` list + # if they do not begin with a "_" character. + for parent_dir, dirs, files in os.walk(pathlib.Path("airbyte_cdk")): + for dir_name in dirs: + if "/." 
in parent_dir or "/_" in parent_dir: + continue + + if dir_name.startswith((".", "_")): + continue + + print(f"Found module dir: {parent_dir + '|' + dir_name}") + + # Check if the directory name does not begin with a "_" + module = (parent_dir + "." + dir_name).replace("/", ".") + if "._" not in module and not module.startswith("_"): + public_modules.append(module) + + for file_name in files: + if not file_name.endswith(".py"): + continue + if file_name in ["py.typed"]: + continue + if file_name.startswith((".", "_")): + continue + + print(f"Found module file: {'|'.join([parent_dir, file_name])}") + module = cast(str, ".".join([parent_dir, file_name])).replace("/", ".").removesuffix(".py") + public_modules.append(module) + + # recursively delete the docs/generated folder if it exists + if pathlib.Path("docs/generated").exists(): + shutil.rmtree("docs/generated") + + pdoc.render.configure( + template_directory="docs", + show_source=True, + search=True, + logo="https://docs.airbyte.com/img/logo-dark.png", + favicon="https://docs.airbyte.com/img/favicon.png", + mermaid=True, + docformat="google", + ) + nl = "\n" + print(f"Generating docs for public modules: {nl.join(public_modules)}") + pdoc.pdoc( + *set(public_modules), + output_directory=pathlib.Path("docs/generated"), + ) + + +if __name__ == "__main__": + run() diff --git a/airbyte-cdk/python/mypy.ini b/airbyte-cdk/python/mypy.ini new file mode 100644 index 000000000000..f51c0846fb8f --- /dev/null +++ b/airbyte-cdk/python/mypy.ini @@ -0,0 +1,26 @@ +# Global options: + +[mypy] +warn_unused_configs = True +warn_redundant_casts = True +ignore_missing_imports = True +strict_equality = True +check_untyped_defs = True +disallow_untyped_decorators = False +disallow_any_generics = True +disallow_untyped_calls = True +disallow_incomplete_defs = True +disallow_untyped_defs = True +warn_return_any = True + +# Only alert on the files we want to check +follow_imports = silent + +# Allow re-exporting types for airbyte-protocol +no_implicit_reexport = False + +[tool.mypy] +plugins = ["pydantic.mypy", "pendulum", "pytest-mypy-plugins"] + +[mypy-airbyte_cdk.models] +ignore_errors = True diff --git a/airbyte-cdk/python/poetry.lock b/airbyte-cdk/python/poetry.lock new file mode 100644 index 000000000000..869632ddcb66 --- /dev/null +++ b/airbyte-cdk/python/poetry.lock @@ -0,0 +1,5223 @@ +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. 
+ +[[package]] +name = "aiohappyeyeballs" +version = "2.4.3" +description = "Happy Eyeballs for asyncio" +optional = true +python-versions = ">=3.8" +files = [ + {file = "aiohappyeyeballs-2.4.3-py3-none-any.whl", hash = "sha256:8a7a83727b2756f394ab2895ea0765a0a8c475e3c71e98d43d76f22b4b435572"}, + {file = "aiohappyeyeballs-2.4.3.tar.gz", hash = "sha256:75cf88a15106a5002a8eb1dab212525c00d1f4c0fa96e551c9fbe6f09a621586"}, +] + +[[package]] +name = "aiohttp" +version = "3.10.10" +description = "Async http client/server framework (asyncio)" +optional = true +python-versions = ">=3.8" +files = [ + {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:be7443669ae9c016b71f402e43208e13ddf00912f47f623ee5994e12fc7d4b3f"}, + {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7b06b7843929e41a94ea09eb1ce3927865387e3e23ebe108e0d0d09b08d25be9"}, + {file = "aiohttp-3.10.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:333cf6cf8e65f6a1e06e9eb3e643a0c515bb850d470902274239fea02033e9a8"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:274cfa632350225ce3fdeb318c23b4a10ec25c0e2c880eff951a3842cf358ac1"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9e5e4a85bdb56d224f412d9c98ae4cbd032cc4f3161818f692cd81766eee65a"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b606353da03edcc71130b52388d25f9a30a126e04caef1fd637e31683033abd"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab5a5a0c7a7991d90446a198689c0535be89bbd6b410a1f9a66688f0880ec026"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:578a4b875af3e0daaf1ac6fa983d93e0bbfec3ead753b6d6f33d467100cdc67b"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8105fd8a890df77b76dd3054cddf01a879fc13e8af576805d667e0fa0224c35d"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3bcd391d083f636c06a68715e69467963d1f9600f85ef556ea82e9ef25f043f7"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fbc6264158392bad9df19537e872d476f7c57adf718944cc1e4495cbabf38e2a"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e48d5021a84d341bcaf95c8460b152cfbad770d28e5fe14a768988c461b821bc"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2609e9ab08474702cc67b7702dbb8a80e392c54613ebe80db7e8dbdb79837c68"}, + {file = "aiohttp-3.10.10-cp310-cp310-win32.whl", hash = "sha256:84afcdea18eda514c25bc68b9af2a2b1adea7c08899175a51fe7c4fb6d551257"}, + {file = "aiohttp-3.10.10-cp310-cp310-win_amd64.whl", hash = "sha256:9c72109213eb9d3874f7ac8c0c5fa90e072d678e117d9061c06e30c85b4cf0e6"}, + {file = "aiohttp-3.10.10-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c30a0eafc89d28e7f959281b58198a9fa5e99405f716c0289b7892ca345fe45f"}, + {file = "aiohttp-3.10.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:258c5dd01afc10015866114e210fb7365f0d02d9d059c3c3415382ab633fcbcb"}, + {file = "aiohttp-3.10.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:15ecd889a709b0080f02721255b3f80bb261c2293d3c748151274dfea93ac871"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:f3935f82f6f4a3820270842e90456ebad3af15810cf65932bd24da4463bc0a4c"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:413251f6fcf552a33c981c4709a6bba37b12710982fec8e558ae944bfb2abd38"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1720b4f14c78a3089562b8875b53e36b51c97c51adc53325a69b79b4b48ebcb"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:679abe5d3858b33c2cf74faec299fda60ea9de62916e8b67e625d65bf069a3b7"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:79019094f87c9fb44f8d769e41dbb664d6e8fcfd62f665ccce36762deaa0e911"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fe2fb38c2ed905a2582948e2de560675e9dfbee94c6d5ccdb1301c6d0a5bf092"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a3f00003de6eba42d6e94fabb4125600d6e484846dbf90ea8e48a800430cc142"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1bbb122c557a16fafc10354b9d99ebf2f2808a660d78202f10ba9d50786384b9"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:30ca7c3b94708a9d7ae76ff281b2f47d8eaf2579cd05971b5dc681db8caac6e1"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:df9270660711670e68803107d55c2b5949c2e0f2e4896da176e1ecfc068b974a"}, + {file = "aiohttp-3.10.10-cp311-cp311-win32.whl", hash = "sha256:aafc8ee9b742ce75044ae9a4d3e60e3d918d15a4c2e08a6c3c3e38fa59b92d94"}, + {file = "aiohttp-3.10.10-cp311-cp311-win_amd64.whl", hash = "sha256:362f641f9071e5f3ee6f8e7d37d5ed0d95aae656adf4ef578313ee585b585959"}, + {file = "aiohttp-3.10.10-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9294bbb581f92770e6ed5c19559e1e99255e4ca604a22c5c6397b2f9dd3ee42c"}, + {file = "aiohttp-3.10.10-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a8fa23fe62c436ccf23ff930149c047f060c7126eae3ccea005f0483f27b2e28"}, + {file = "aiohttp-3.10.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c6a5b8c7926ba5d8545c7dd22961a107526562da31a7a32fa2456baf040939f"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:007ec22fbc573e5eb2fb7dec4198ef8f6bf2fe4ce20020798b2eb5d0abda6138"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9627cc1a10c8c409b5822a92d57a77f383b554463d1884008e051c32ab1b3742"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:50edbcad60d8f0e3eccc68da67f37268b5144ecc34d59f27a02f9611c1d4eec7"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a45d85cf20b5e0d0aa5a8dca27cce8eddef3292bc29d72dcad1641f4ed50aa16"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0b00807e2605f16e1e198f33a53ce3c4523114059b0c09c337209ae55e3823a8"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f2d4324a98062be0525d16f768a03e0bbb3b9fe301ceee99611dc9a7953124e6"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:438cd072f75bb6612f2aca29f8bd7cdf6e35e8f160bc312e49fbecab77c99e3a"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = 
"sha256:baa42524a82f75303f714108fea528ccacf0386af429b69fff141ffef1c534f9"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:a7d8d14fe962153fc681f6366bdec33d4356f98a3e3567782aac1b6e0e40109a"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c1277cd707c465cd09572a774559a3cc7c7a28802eb3a2a9472588f062097205"}, + {file = "aiohttp-3.10.10-cp312-cp312-win32.whl", hash = "sha256:59bb3c54aa420521dc4ce3cc2c3fe2ad82adf7b09403fa1f48ae45c0cbde6628"}, + {file = "aiohttp-3.10.10-cp312-cp312-win_amd64.whl", hash = "sha256:0e1b370d8007c4ae31ee6db7f9a2fe801a42b146cec80a86766e7ad5c4a259cf"}, + {file = "aiohttp-3.10.10-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ad7593bb24b2ab09e65e8a1d385606f0f47c65b5a2ae6c551db67d6653e78c28"}, + {file = "aiohttp-3.10.10-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1eb89d3d29adaf533588f209768a9c02e44e4baf832b08118749c5fad191781d"}, + {file = "aiohttp-3.10.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3fe407bf93533a6fa82dece0e74dbcaaf5d684e5a51862887f9eaebe6372cd79"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50aed5155f819873d23520919e16703fc8925e509abbb1a1491b0087d1cd969e"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f05e9727ce409358baa615dbeb9b969db94324a79b5a5cea45d39bdb01d82e6"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dffb610a30d643983aeb185ce134f97f290f8935f0abccdd32c77bed9388b42"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa6658732517ddabe22c9036479eabce6036655ba87a0224c612e1ae6af2087e"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:741a46d58677d8c733175d7e5aa618d277cd9d880301a380fd296975a9cdd7bc"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e00e3505cd80440f6c98c6d69269dcc2a119f86ad0a9fd70bccc59504bebd68a"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ffe595f10566f8276b76dc3a11ae4bb7eba1aac8ddd75811736a15b0d5311414"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdfcf6443637c148c4e1a20c48c566aa694fa5e288d34b20fcdc58507882fed3"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d183cf9c797a5291e8301790ed6d053480ed94070637bfaad914dd38b0981f67"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:77abf6665ae54000b98b3c742bc6ea1d1fb31c394bcabf8b5d2c1ac3ebfe7f3b"}, + {file = "aiohttp-3.10.10-cp313-cp313-win32.whl", hash = "sha256:4470c73c12cd9109db8277287d11f9dd98f77fc54155fc71a7738a83ffcc8ea8"}, + {file = "aiohttp-3.10.10-cp313-cp313-win_amd64.whl", hash = "sha256:486f7aabfa292719a2753c016cc3a8f8172965cabb3ea2e7f7436c7f5a22a151"}, + {file = "aiohttp-3.10.10-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:1b66ccafef7336a1e1f0e389901f60c1d920102315a56df85e49552308fc0486"}, + {file = "aiohttp-3.10.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:acd48d5b80ee80f9432a165c0ac8cbf9253eaddb6113269a5e18699b33958dbb"}, + {file = "aiohttp-3.10.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3455522392fb15ff549d92fbf4b73b559d5e43dc522588f7eb3e54c3f38beee7"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:45c3b868724137f713a38376fef8120c166d1eadd50da1855c112fe97954aed8"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:da1dee8948d2137bb51fbb8a53cce6b1bcc86003c6b42565f008438b806cccd8"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c5ce2ce7c997e1971b7184ee37deb6ea9922ef5163c6ee5aa3c274b05f9e12fa"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28529e08fde6f12eba8677f5a8608500ed33c086f974de68cc65ab218713a59d"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f7db54c7914cc99d901d93a34704833568d86c20925b2762f9fa779f9cd2e70f"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:03a42ac7895406220124c88911ebee31ba8b2d24c98507f4a8bf826b2937c7f2"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:7e338c0523d024fad378b376a79faff37fafb3c001872a618cde1d322400a572"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:038f514fe39e235e9fef6717fbf944057bfa24f9b3db9ee551a7ecf584b5b480"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:64f6c17757251e2b8d885d728b6433d9d970573586a78b78ba8929b0f41d045a"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:93429602396f3383a797a2a70e5f1de5df8e35535d7806c9f91df06f297e109b"}, + {file = "aiohttp-3.10.10-cp38-cp38-win32.whl", hash = "sha256:c823bc3971c44ab93e611ab1a46b1eafeae474c0c844aff4b7474287b75fe49c"}, + {file = "aiohttp-3.10.10-cp38-cp38-win_amd64.whl", hash = "sha256:54ca74df1be3c7ca1cf7f4c971c79c2daf48d9aa65dea1a662ae18926f5bc8ce"}, + {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:01948b1d570f83ee7bbf5a60ea2375a89dfb09fd419170e7f5af029510033d24"}, + {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9fc1500fd2a952c5c8e3b29aaf7e3cc6e27e9cfc0a8819b3bce48cc1b849e4cc"}, + {file = "aiohttp-3.10.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f614ab0c76397661b90b6851a030004dac502e48260ea10f2441abd2207fbcc7"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00819de9e45d42584bed046314c40ea7e9aea95411b38971082cad449392b08c"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05646ebe6b94cc93407b3bf34b9eb26c20722384d068eb7339de802154d61bc5"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:998f3bd3cfc95e9424a6acd7840cbdd39e45bc09ef87533c006f94ac47296090"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9010c31cd6fa59438da4e58a7f19e4753f7f264300cd152e7f90d4602449762"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ea7ffc6d6d6f8a11e6f40091a1040995cdff02cfc9ba4c2f30a516cb2633554"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ef9c33cc5cbca35808f6c74be11eb7f5f6b14d2311be84a15b594bd3e58b5527"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ce0cdc074d540265bfeb31336e678b4e37316849d13b308607efa527e981f5c2"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:597a079284b7ee65ee102bc3a6ea226a37d2b96d0418cc9047490f231dc09fe8"}, + {file = 
"aiohttp-3.10.10-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:7789050d9e5d0c309c706953e5e8876e38662d57d45f936902e176d19f1c58ab"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e7f8b04d83483577fd9200461b057c9f14ced334dcb053090cea1da9c8321a91"}, + {file = "aiohttp-3.10.10-cp39-cp39-win32.whl", hash = "sha256:c02a30b904282777d872266b87b20ed8cc0d1501855e27f831320f471d54d983"}, + {file = "aiohttp-3.10.10-cp39-cp39-win_amd64.whl", hash = "sha256:edfe3341033a6b53a5c522c802deb2079eee5cbfbb0af032a55064bd65c73a23"}, + {file = "aiohttp-3.10.10.tar.gz", hash = "sha256:0631dd7c9f0822cc61c88586ca76d5b5ada26538097d0f1df510b082bad3411a"}, +] + +[package.dependencies] +aiohappyeyeballs = ">=2.3.0" +aiosignal = ">=1.1.2" +async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} +attrs = ">=17.3.0" +frozenlist = ">=1.1.1" +multidict = ">=4.5,<7.0" +yarl = ">=1.12.0,<2.0" + +[package.extras] +speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] + +[[package]] +name = "aiosignal" +version = "1.3.1" +description = "aiosignal: a list of registered asynchronous callbacks" +optional = true +python-versions = ">=3.7" +files = [ + {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, + {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, +] + +[package.dependencies] +frozenlist = ">=1.1.0" + +[[package]] +name = "airbyte-protocol-models-dataclasses" +version = "0.13.1" +description = "Declares the Airbyte Protocol using Python Dataclasses. Dataclasses in Python have less performance overhead compared to Pydantic models, making them a more efficient choice for scenarios where speed and memory usage are critical" +optional = false +python-versions = ">=3.8" +files = [ + {file = "airbyte_protocol_models_dataclasses-0.13.1-py3-none-any.whl", hash = "sha256:20a734b7b1c3479a643777830db6a2e0a34428f33d16abcfd320552576fabe5a"}, + {file = "airbyte_protocol_models_dataclasses-0.13.1.tar.gz", hash = "sha256:ec6a0fb6b16267bde910f52279445d06c8e1a3e4ed82ac2937b405ab280449d5"}, +] + +[[package]] +name = "alabaster" +version = "0.7.16" +description = "A light, configurable Sphinx theme" +optional = true +python-versions = ">=3.9" +files = [ + {file = "alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92"}, + {file = "alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65"}, +] + +[[package]] +name = "annotated-types" +version = "0.7.0" +description = "Reusable constraint types to use with typing.Annotated" +optional = false +python-versions = ">=3.8" +files = [ + {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, + {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, +] + +[[package]] +name = "anyio" +version = "4.6.2.post1" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +optional = true +python-versions = ">=3.9" +files = [ + {file = "anyio-4.6.2.post1-py3-none-any.whl", hash = "sha256:6d170c36fba3bdd840c73d3868c1e777e33676a69c3a72cf0a0d5d6d8009b61d"}, + {file = "anyio-4.6.2.post1.tar.gz", hash = "sha256:4c8bc31ccdb51c7f7bd251f51c609e038d63e34219b44aa86e47576389880b4c"}, +] + +[package.dependencies] +exceptiongroup = 
{version = ">=1.0.2", markers = "python_version < \"3.11\""} +idna = ">=2.8" +sniffio = ">=1.1" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} + +[package.extras] +doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "truststore (>=0.9.1)", "uvloop (>=0.21.0b1)"] +trio = ["trio (>=0.26.1)"] + +[[package]] +name = "async-timeout" +version = "4.0.3" +description = "Timeout context manager for asyncio programs" +optional = true +python-versions = ">=3.7" +files = [ + {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, + {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, +] + +[[package]] +name = "asyncio" +version = "3.4.3" +description = "reference implementation of PEP 3156" +optional = false +python-versions = "*" +files = [ + {file = "asyncio-3.4.3-cp33-none-win32.whl", hash = "sha256:b62c9157d36187eca799c378e572c969f0da87cd5fc42ca372d92cdb06e7e1de"}, + {file = "asyncio-3.4.3-cp33-none-win_amd64.whl", hash = "sha256:c46a87b48213d7464f22d9a497b9eef8c1928b68320a2fa94240f969f6fec08c"}, + {file = "asyncio-3.4.3-py3-none-any.whl", hash = "sha256:c4d18b22701821de07bd6aea8b53d21449ec0ec5680645e5317062ea21817d2d"}, + {file = "asyncio-3.4.3.tar.gz", hash = "sha256:83360ff8bc97980e4ff25c964c7bd3923d333d177aa4f7fb736b019f26c7cb41"}, +] + +[[package]] +name = "attributes-doc" +version = "0.4.0" +description = "PEP 224 implementation" +optional = false +python-versions = ">=3.8" +files = [ + {file = "attributes-doc-0.4.0.tar.gz", hash = "sha256:b1576c94a714e9fc2c65c47cf10d0c8e1a5f7c4f5ae7f69006be108d95cbfbfb"}, + {file = "attributes_doc-0.4.0-py2.py3-none-any.whl", hash = "sha256:4c3007d9e58f3a6cb4b9c614c4d4ce2d92161581f28e594ddd8241cc3a113bdd"}, +] + +[[package]] +name = "attrs" +version = "24.2.0" +description = "Classes Without Boilerplate" +optional = false +python-versions = ">=3.7" +files = [ + {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, + {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, +] + +[package.extras] +benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] + +[[package]] +name = "avro" +version = "1.11.3" +description = "Avro is a serialization and RPC framework." 
+optional = true +python-versions = ">=3.6" +files = [ + {file = "avro-1.11.3.tar.gz", hash = "sha256:3393bb5139f9cf0791d205756ce1e39a5b58586af5b153d6a3b5a199610e9d17"}, +] + +[package.extras] +snappy = ["python-snappy"] +zstandard = ["zstandard"] + +[[package]] +name = "babel" +version = "2.16.0" +description = "Internationalization utilities" +optional = true +python-versions = ">=3.8" +files = [ + {file = "babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b"}, + {file = "babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316"}, +] + +[package.extras] +dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] + +[[package]] +name = "backoff" +version = "2.2.1" +description = "Function decoration for backoff and retry" +optional = false +python-versions = ">=3.7,<4.0" +files = [ + {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, + {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, +] + +[[package]] +name = "beautifulsoup4" +version = "4.12.3" +description = "Screen-scraping library" +optional = true +python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, + {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, +] + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + +[[package]] +name = "bracex" +version = "2.5.post1" +description = "Bash style brace expander." +optional = false +python-versions = ">=3.8" +files = [ + {file = "bracex-2.5.post1-py3-none-any.whl", hash = "sha256:13e5732fec27828d6af308628285ad358047cec36801598368cb28bc631dbaf6"}, + {file = "bracex-2.5.post1.tar.gz", hash = "sha256:12c50952415bfa773d2d9ccb8e79651b8cdb1f31a42f6091b804f6ba2b4a66b6"}, +] + +[[package]] +name = "cachetools" +version = "5.5.0" +description = "Extensible memoizing collections and decorators" +optional = false +python-versions = ">=3.7" +files = [ + {file = "cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292"}, + {file = "cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a"}, +] + +[[package]] +name = "cattrs" +version = "24.1.2" +description = "Composable complex class support for attrs and dataclasses." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "cattrs-24.1.2-py3-none-any.whl", hash = "sha256:67c7495b760168d931a10233f979b28dc04daf853b30752246f4f8471c6d68d0"}, + {file = "cattrs-24.1.2.tar.gz", hash = "sha256:8028cfe1ff5382df59dd36474a86e02d817b06eaf8af84555441bac915d2ef85"}, +] + +[package.dependencies] +attrs = ">=23.1.0" +exceptiongroup = {version = ">=1.1.1", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=4.1.0,<4.6.3 || >4.6.3", markers = "python_version < \"3.11\""} + +[package.extras] +bson = ["pymongo (>=4.4.0)"] +cbor2 = ["cbor2 (>=5.4.6)"] +msgpack = ["msgpack (>=1.0.5)"] +msgspec = ["msgspec (>=0.18.5)"] +orjson = ["orjson (>=3.9.2)"] +pyyaml = ["pyyaml (>=6.0)"] +tomlkit = ["tomlkit (>=0.11.8)"] +ujson = ["ujson (>=5.7.0)"] + +[[package]] +name = "certifi" +version = "2024.8.30" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, + {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, +] + +[[package]] +name = "cffi" +version = "1.17.1" +description = "Foreign Function Interface for Python calling C code." +optional = false +python-versions = ">=3.8" +files = [ + {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, + {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be"}, + {file = "cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c"}, + {file = "cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf"}, + {file 
= "cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b"}, + {file = "cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655"}, + {file = "cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8"}, + {file = "cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65"}, + {file = "cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"}, + {file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"}, + {file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"}, + {file = "cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1"}, + {file = "cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8"}, + {file = "cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e"}, + {file = "cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7"}, + {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, + {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, +] + +[package.dependencies] +pycparser = "*" + +[[package]] +name = "chardet" +version = "5.2.0" +description = "Universal encoding detector for Python 3" +optional = true +python-versions = ">=3.7" +files = [ + {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, + {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.0" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
+optional = false +python-versions = ">=3.7.0" +files = [ + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea"}, 
+ {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b"}, + {file = 
"charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-win32.whl", hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = "sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858"}, + 
{file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-win32.whl", hash = "sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"}, + {file = "charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"}, + {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"}, +] + +[[package]] +name = "click" +version = "8.1.7" +description = "Composable command line interface toolkit" +optional = true +python-versions = ">=3.7" +files = [ + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = 
"sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "cohere" +version = "4.21" +description = "" +optional = true +python-versions = ">=3.7,<4.0" +files = [ + {file = "cohere-4.21-py3-none-any.whl", hash = "sha256:5eb81db62e78b3156e734421cc3e657054f9d9f1d68b9f38cf48fe3a8ae40dbc"}, + {file = "cohere-4.21.tar.gz", hash = "sha256:f611438f409dfc5d5a0a153a585349f5a80b169c7102b5994d9999ecf8440866"}, +] + +[package.dependencies] +aiohttp = ">=3.0,<4.0" +backoff = ">=2.0,<3.0" +fastavro = {version = "1.8.2", markers = "python_version >= \"3.8\""} +importlib_metadata = ">=6.0,<7.0" +requests = ">=2.25.0,<3.0.0" +urllib3 = ">=1.26,<3" + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "contourpy" +version = "1.3.0" +description = "Python library for calculating contours of 2D quadrilateral grids" +optional = true +python-versions = ">=3.9" +files = [ + {file = "contourpy-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:880ea32e5c774634f9fcd46504bf9f080a41ad855f4fef54f5380f5133d343c7"}, + {file = "contourpy-1.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:76c905ef940a4474a6289c71d53122a4f77766eef23c03cd57016ce19d0f7b42"}, + {file = "contourpy-1.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92f8557cbb07415a4d6fa191f20fd9d2d9eb9c0b61d1b2f52a8926e43c6e9af7"}, + {file = "contourpy-1.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36f965570cff02b874773c49bfe85562b47030805d7d8360748f3eca570f4cab"}, + {file = "contourpy-1.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cacd81e2d4b6f89c9f8a5b69b86490152ff39afc58a95af002a398273e5ce589"}, + {file = "contourpy-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69375194457ad0fad3a839b9e29aa0b0ed53bb54db1bfb6c3ae43d111c31ce41"}, + {file = "contourpy-1.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7a52040312b1a858b5e31ef28c2e865376a386c60c0e248370bbea2d3f3b760d"}, + {file = "contourpy-1.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3faeb2998e4fcb256542e8a926d08da08977f7f5e62cf733f3c211c2a5586223"}, + {file = "contourpy-1.3.0-cp310-cp310-win32.whl", hash = "sha256:36e0cff201bcb17a0a8ecc7f454fe078437fa6bda730e695a92f2d9932bd507f"}, + {file = "contourpy-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:87ddffef1dbe5e669b5c2440b643d3fdd8622a348fe1983fad7a0f0ccb1cd67b"}, + {file = "contourpy-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0fa4c02abe6c446ba70d96ece336e621efa4aecae43eaa9b030ae5fb92b309ad"}, + {file = "contourpy-1.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:834e0cfe17ba12f79963861e0f908556b2cedd52e1f75e6578801febcc6a9f49"}, + {file = "contourpy-1.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dbc4c3217eee163fa3984fd1567632b48d6dfd29216da3ded3d7b844a8014a66"}, + {file = "contourpy-1.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", 
hash = "sha256:4865cd1d419e0c7a7bf6de1777b185eebdc51470800a9f42b9e9decf17762081"}, + {file = "contourpy-1.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:303c252947ab4b14c08afeb52375b26781ccd6a5ccd81abcdfc1fafd14cf93c1"}, + {file = "contourpy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637f674226be46f6ba372fd29d9523dd977a291f66ab2a74fbeb5530bb3f445d"}, + {file = "contourpy-1.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:76a896b2f195b57db25d6b44e7e03f221d32fe318d03ede41f8b4d9ba1bff53c"}, + {file = "contourpy-1.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e1fd23e9d01591bab45546c089ae89d926917a66dceb3abcf01f6105d927e2cb"}, + {file = "contourpy-1.3.0-cp311-cp311-win32.whl", hash = "sha256:d402880b84df3bec6eab53cd0cf802cae6a2ef9537e70cf75e91618a3801c20c"}, + {file = "contourpy-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:6cb6cc968059db9c62cb35fbf70248f40994dfcd7aa10444bbf8b3faeb7c2d67"}, + {file = "contourpy-1.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:570ef7cf892f0afbe5b2ee410c507ce12e15a5fa91017a0009f79f7d93a1268f"}, + {file = "contourpy-1.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:da84c537cb8b97d153e9fb208c221c45605f73147bd4cadd23bdae915042aad6"}, + {file = "contourpy-1.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0be4d8425bfa755e0fd76ee1e019636ccc7c29f77a7c86b4328a9eb6a26d0639"}, + {file = "contourpy-1.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9c0da700bf58f6e0b65312d0a5e695179a71d0163957fa381bb3c1f72972537c"}, + {file = "contourpy-1.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eb8b141bb00fa977d9122636b16aa67d37fd40a3d8b52dd837e536d64b9a4d06"}, + {file = "contourpy-1.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3634b5385c6716c258d0419c46d05c8aa7dc8cb70326c9a4fb66b69ad2b52e09"}, + {file = "contourpy-1.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0dce35502151b6bd35027ac39ba6e5a44be13a68f55735c3612c568cac3805fd"}, + {file = "contourpy-1.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:aea348f053c645100612b333adc5983d87be69acdc6d77d3169c090d3b01dc35"}, + {file = "contourpy-1.3.0-cp312-cp312-win32.whl", hash = "sha256:90f73a5116ad1ba7174341ef3ea5c3150ddf20b024b98fb0c3b29034752c8aeb"}, + {file = "contourpy-1.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:b11b39aea6be6764f84360fce6c82211a9db32a7c7de8fa6dd5397cf1d079c3b"}, + {file = "contourpy-1.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3e1c7fa44aaae40a2247e2e8e0627f4bea3dd257014764aa644f319a5f8600e3"}, + {file = "contourpy-1.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:364174c2a76057feef647c802652f00953b575723062560498dc7930fc9b1cb7"}, + {file = "contourpy-1.3.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32b238b3b3b649e09ce9aaf51f0c261d38644bdfa35cbaf7b263457850957a84"}, + {file = "contourpy-1.3.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d51fca85f9f7ad0b65b4b9fe800406d0d77017d7270d31ec3fb1cc07358fdea0"}, + {file = "contourpy-1.3.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:732896af21716b29ab3e988d4ce14bc5133733b85956316fb0c56355f398099b"}, + {file = "contourpy-1.3.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d73f659398a0904e125280836ae6f88ba9b178b2fed6884f3b1f95b989d2c8da"}, + {file = 
"contourpy-1.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c6c7c2408b7048082932cf4e641fa3b8ca848259212f51c8c59c45aa7ac18f14"}, + {file = "contourpy-1.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f317576606de89da6b7e0861cf6061f6146ead3528acabff9236458a6ba467f8"}, + {file = "contourpy-1.3.0-cp313-cp313-win32.whl", hash = "sha256:31cd3a85dbdf1fc002280c65caa7e2b5f65e4a973fcdf70dd2fdcb9868069294"}, + {file = "contourpy-1.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:4553c421929ec95fb07b3aaca0fae668b2eb5a5203d1217ca7c34c063c53d087"}, + {file = "contourpy-1.3.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:345af746d7766821d05d72cb8f3845dfd08dd137101a2cb9b24de277d716def8"}, + {file = "contourpy-1.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3bb3808858a9dc68f6f03d319acd5f1b8a337e6cdda197f02f4b8ff67ad2057b"}, + {file = "contourpy-1.3.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:420d39daa61aab1221567b42eecb01112908b2cab7f1b4106a52caaec8d36973"}, + {file = "contourpy-1.3.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4d63ee447261e963af02642ffcb864e5a2ee4cbfd78080657a9880b8b1868e18"}, + {file = "contourpy-1.3.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:167d6c890815e1dac9536dca00828b445d5d0df4d6a8c6adb4a7ec3166812fa8"}, + {file = "contourpy-1.3.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:710a26b3dc80c0e4febf04555de66f5fd17e9cf7170a7b08000601a10570bda6"}, + {file = "contourpy-1.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:75ee7cb1a14c617f34a51d11fa7524173e56551646828353c4af859c56b766e2"}, + {file = "contourpy-1.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:33c92cdae89ec5135d036e7218e69b0bb2851206077251f04a6c4e0e21f03927"}, + {file = "contourpy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a11077e395f67ffc2c44ec2418cfebed032cd6da3022a94fc227b6faf8e2acb8"}, + {file = "contourpy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e8134301d7e204c88ed7ab50028ba06c683000040ede1d617298611f9dc6240c"}, + {file = "contourpy-1.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e12968fdfd5bb45ffdf6192a590bd8ddd3ba9e58360b29683c6bb71a7b41edca"}, + {file = "contourpy-1.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fd2a0fc506eccaaa7595b7e1418951f213cf8255be2600f1ea1b61e46a60c55f"}, + {file = "contourpy-1.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4cfb5c62ce023dfc410d6059c936dcf96442ba40814aefbfa575425a3a7f19dc"}, + {file = "contourpy-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68a32389b06b82c2fdd68276148d7b9275b5f5cf13e5417e4252f6d1a34f72a2"}, + {file = "contourpy-1.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:94e848a6b83da10898cbf1311a815f770acc9b6a3f2d646f330d57eb4e87592e"}, + {file = "contourpy-1.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d78ab28a03c854a873787a0a42254a0ccb3cb133c672f645c9f9c8f3ae9d0800"}, + {file = "contourpy-1.3.0-cp39-cp39-win32.whl", hash = "sha256:81cb5ed4952aae6014bc9d0421dec7c5835c9c8c31cdf51910b708f548cf58e5"}, + {file = "contourpy-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:14e262f67bd7e6eb6880bc564dcda30b15e351a594657e55b7eec94b6ef72843"}, + {file = "contourpy-1.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fe41b41505a5a33aeaed2a613dccaeaa74e0e3ead6dd6fd3a118fb471644fd6c"}, + {file = 
"contourpy-1.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eca7e17a65f72a5133bdbec9ecf22401c62bcf4821361ef7811faee695799779"}, + {file = "contourpy-1.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1ec4dc6bf570f5b22ed0d7efba0dfa9c5b9e0431aeea7581aa217542d9e809a4"}, + {file = "contourpy-1.3.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:00ccd0dbaad6d804ab259820fa7cb0b8036bda0686ef844d24125d8287178ce0"}, + {file = "contourpy-1.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ca947601224119117f7c19c9cdf6b3ab54c5726ef1d906aa4a69dfb6dd58102"}, + {file = "contourpy-1.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6ec93afeb848a0845a18989da3beca3eec2c0f852322efe21af1931147d12cb"}, + {file = "contourpy-1.3.0.tar.gz", hash = "sha256:7ffa0db17717a8ffb127efd0c95a4362d996b892c2904db72428d5b52e1938a4"}, +] + +[package.dependencies] +numpy = ">=1.23" + +[package.extras] +bokeh = ["bokeh", "selenium"] +docs = ["furo", "sphinx (>=7.2)", "sphinx-copybutton"] +mypy = ["contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.11.1)", "types-Pillow"] +test = ["Pillow", "contourpy[test-no-images]", "matplotlib"] +test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"] + +[[package]] +name = "coverage" +version = "7.6.4" +description = "Code coverage measurement for Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "coverage-7.6.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5f8ae553cba74085db385d489c7a792ad66f7f9ba2ee85bfa508aeb84cf0ba07"}, + {file = "coverage-7.6.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8165b796df0bd42e10527a3f493c592ba494f16ef3c8b531288e3d0d72c1f6f0"}, + {file = "coverage-7.6.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7c8b95bf47db6d19096a5e052ffca0a05f335bc63cef281a6e8fe864d450a72"}, + {file = "coverage-7.6.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ed9281d1b52628e81393f5eaee24a45cbd64965f41857559c2b7ff19385df51"}, + {file = "coverage-7.6.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0809082ee480bb8f7416507538243c8863ac74fd8a5d2485c46f0f7499f2b491"}, + {file = "coverage-7.6.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d541423cdd416b78626b55f123412fcf979d22a2c39fce251b350de38c15c15b"}, + {file = "coverage-7.6.4-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:58809e238a8a12a625c70450b48e8767cff9eb67c62e6154a642b21ddf79baea"}, + {file = "coverage-7.6.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c9b8e184898ed014884ca84c70562b4a82cbc63b044d366fedc68bc2b2f3394a"}, + {file = "coverage-7.6.4-cp310-cp310-win32.whl", hash = "sha256:6bd818b7ea14bc6e1f06e241e8234508b21edf1b242d49831831a9450e2f35fa"}, + {file = "coverage-7.6.4-cp310-cp310-win_amd64.whl", hash = "sha256:06babbb8f4e74b063dbaeb74ad68dfce9186c595a15f11f5d5683f748fa1d172"}, + {file = "coverage-7.6.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:73d2b73584446e66ee633eaad1a56aad577c077f46c35ca3283cd687b7715b0b"}, + {file = "coverage-7.6.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:51b44306032045b383a7a8a2c13878de375117946d68dcb54308111f39775a25"}, + {file = "coverage-7.6.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b3fb02fe73bed561fa12d279a417b432e5b50fe03e8d663d61b3d5990f29546"}, + {file = 
"coverage-7.6.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ed8fe9189d2beb6edc14d3ad19800626e1d9f2d975e436f84e19efb7fa19469b"}, + {file = "coverage-7.6.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b369ead6527d025a0fe7bd3864e46dbee3aa8f652d48df6174f8d0bac9e26e0e"}, + {file = "coverage-7.6.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ade3ca1e5f0ff46b678b66201f7ff477e8fa11fb537f3b55c3f0568fbfe6e718"}, + {file = "coverage-7.6.4-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:27fb4a050aaf18772db513091c9c13f6cb94ed40eacdef8dad8411d92d9992db"}, + {file = "coverage-7.6.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4f704f0998911abf728a7783799444fcbbe8261c4a6c166f667937ae6a8aa522"}, + {file = "coverage-7.6.4-cp311-cp311-win32.whl", hash = "sha256:29155cd511ee058e260db648b6182c419422a0d2e9a4fa44501898cf918866cf"}, + {file = "coverage-7.6.4-cp311-cp311-win_amd64.whl", hash = "sha256:8902dd6a30173d4ef09954bfcb24b5d7b5190cf14a43170e386979651e09ba19"}, + {file = "coverage-7.6.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:12394842a3a8affa3ba62b0d4ab7e9e210c5e366fbac3e8b2a68636fb19892c2"}, + {file = "coverage-7.6.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2b6b4c83d8e8ea79f27ab80778c19bc037759aea298da4b56621f4474ffeb117"}, + {file = "coverage-7.6.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d5b8007f81b88696d06f7df0cb9af0d3b835fe0c8dbf489bad70b45f0e45613"}, + {file = "coverage-7.6.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b57b768feb866f44eeed9f46975f3d6406380275c5ddfe22f531a2bf187eda27"}, + {file = "coverage-7.6.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5915fcdec0e54ee229926868e9b08586376cae1f5faa9bbaf8faf3561b393d52"}, + {file = "coverage-7.6.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0b58c672d14f16ed92a48db984612f5ce3836ae7d72cdd161001cc54512571f2"}, + {file = "coverage-7.6.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:2fdef0d83a2d08d69b1f2210a93c416d54e14d9eb398f6ab2f0a209433db19e1"}, + {file = "coverage-7.6.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8cf717ee42012be8c0cb205dbbf18ffa9003c4cbf4ad078db47b95e10748eec5"}, + {file = "coverage-7.6.4-cp312-cp312-win32.whl", hash = "sha256:7bb92c539a624cf86296dd0c68cd5cc286c9eef2d0c3b8b192b604ce9de20a17"}, + {file = "coverage-7.6.4-cp312-cp312-win_amd64.whl", hash = "sha256:1032e178b76a4e2b5b32e19d0fd0abbce4b58e77a1ca695820d10e491fa32b08"}, + {file = "coverage-7.6.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:023bf8ee3ec6d35af9c1c6ccc1d18fa69afa1cb29eaac57cb064dbb262a517f9"}, + {file = "coverage-7.6.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b0ac3d42cb51c4b12df9c5f0dd2f13a4f24f01943627120ec4d293c9181219ba"}, + {file = "coverage-7.6.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8fe4984b431f8621ca53d9380901f62bfb54ff759a1348cd140490ada7b693c"}, + {file = "coverage-7.6.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5fbd612f8a091954a0c8dd4c0b571b973487277d26476f8480bfa4b2a65b5d06"}, + {file = "coverage-7.6.4-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:dacbc52de979f2823a819571f2e3a350a7e36b8cb7484cdb1e289bceaf35305f"}, + {file = "coverage-7.6.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:dab4d16dfef34b185032580e2f2f89253d302facba093d5fa9dbe04f569c4f4b"}, + {file = "coverage-7.6.4-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:862264b12ebb65ad8d863d51f17758b1684560b66ab02770d4f0baf2ff75da21"}, + {file = "coverage-7.6.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5beb1ee382ad32afe424097de57134175fea3faf847b9af002cc7895be4e2a5a"}, + {file = "coverage-7.6.4-cp313-cp313-win32.whl", hash = "sha256:bf20494da9653f6410213424f5f8ad0ed885e01f7e8e59811f572bdb20b8972e"}, + {file = "coverage-7.6.4-cp313-cp313-win_amd64.whl", hash = "sha256:182e6cd5c040cec0a1c8d415a87b67ed01193ed9ad458ee427741c7d8513d963"}, + {file = "coverage-7.6.4-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a181e99301a0ae128493a24cfe5cfb5b488c4e0bf2f8702091473d033494d04f"}, + {file = "coverage-7.6.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:df57bdbeffe694e7842092c5e2e0bc80fff7f43379d465f932ef36f027179806"}, + {file = "coverage-7.6.4-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bcd1069e710600e8e4cf27f65c90c7843fa8edfb4520fb0ccb88894cad08b11"}, + {file = "coverage-7.6.4-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99b41d18e6b2a48ba949418db48159d7a2e81c5cc290fc934b7d2380515bd0e3"}, + {file = "coverage-7.6.4-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6b1e54712ba3474f34b7ef7a41e65bd9037ad47916ccb1cc78769bae324c01a"}, + {file = "coverage-7.6.4-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:53d202fd109416ce011578f321460795abfe10bb901b883cafd9b3ef851bacfc"}, + {file = "coverage-7.6.4-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:c48167910a8f644671de9f2083a23630fbf7a1cb70ce939440cd3328e0919f70"}, + {file = "coverage-7.6.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cc8ff50b50ce532de2fa7a7daae9dd12f0a699bfcd47f20945364e5c31799fef"}, + {file = "coverage-7.6.4-cp313-cp313t-win32.whl", hash = "sha256:b8d3a03d9bfcaf5b0141d07a88456bb6a4c3ce55c080712fec8418ef3610230e"}, + {file = "coverage-7.6.4-cp313-cp313t-win_amd64.whl", hash = "sha256:f3ddf056d3ebcf6ce47bdaf56142af51bb7fad09e4af310241e9db7a3a8022e1"}, + {file = "coverage-7.6.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9cb7fa111d21a6b55cbf633039f7bc2749e74932e3aa7cb7333f675a58a58bf3"}, + {file = "coverage-7.6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11a223a14e91a4693d2d0755c7a043db43d96a7450b4f356d506c2562c48642c"}, + {file = "coverage-7.6.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a413a096c4cbac202433c850ee43fa326d2e871b24554da8327b01632673a076"}, + {file = "coverage-7.6.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00a1d69c112ff5149cabe60d2e2ee948752c975d95f1e1096742e6077affd376"}, + {file = "coverage-7.6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f76846299ba5c54d12c91d776d9605ae33f8ae2b9d1d3c3703cf2db1a67f2c0"}, + {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fe439416eb6380de434886b00c859304338f8b19f6f54811984f3420a2e03858"}, + {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:0294ca37f1ba500667b1aef631e48d875ced93ad5e06fa665a3295bdd1d95111"}, + {file = 
"coverage-7.6.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6f01ba56b1c0e9d149f9ac85a2f999724895229eb36bd997b61e62999e9b0901"}, + {file = "coverage-7.6.4-cp39-cp39-win32.whl", hash = "sha256:bc66f0bf1d7730a17430a50163bb264ba9ded56739112368ba985ddaa9c3bd09"}, + {file = "coverage-7.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:c481b47f6b5845064c65a7bc78bc0860e635a9b055af0df46fdf1c58cebf8e8f"}, + {file = "coverage-7.6.4-pp39.pp310-none-any.whl", hash = "sha256:3c65d37f3a9ebb703e710befdc489a38683a5b152242664b973a7b7b22348a4e"}, + {file = "coverage-7.6.4.tar.gz", hash = "sha256:29fc0f17b1d3fea332f8001d4558f8214af7f1d87a345f3a133c901d60347c73"}, +] + +[package.dependencies] +tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} + +[package.extras] +toml = ["tomli"] + +[[package]] +name = "cramjam" +version = "2.9.0" +description = "Thin Python bindings to de/compression algorithms in Rust" +optional = true +python-versions = ">=3.8" +files = [ + {file = "cramjam-2.9.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:eb16d995e454b0155b166f6e6da7df4ac812d44e0f3b6dc0f344a934609fd5bc"}, + {file = "cramjam-2.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cb1e86bfea656b51f2e75f2cedb17fc08b552d105b814d19b595294ecbe94d8d"}, + {file = "cramjam-2.9.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:4bd76b654275736fd4f55521981b73751c34dacf70a1dbce96e454a39d43201f"}, + {file = "cramjam-2.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21569f19d5848606b85ac0dde0dc3639319d26fed8522c7103515df875bcb300"}, + {file = "cramjam-2.9.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b8f8b1117b4e697d39950ecab01700ce0aef66541e4478eb4d7b3ade8703347b"}, + {file = "cramjam-2.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c3464d0042a03e8ef38a2b774ef23163cf3c0cdc41b8dfbf7c4aadf93e40b459"}, + {file = "cramjam-2.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0711c776750e243ae347d6609c975f0ff4be9ae65b2764d29e4bbdad8e574c3a"}, + {file = "cramjam-2.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00d96f798bc980b29f8e1c3ed7d554050e05d4cde23d1633ffed4cd63110024a"}, + {file = "cramjam-2.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:fc49b6575e3cb15da3180c5a3926ec81db33b109e48530708da76614b306904b"}, + {file = "cramjam-2.9.0-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:c4fa6c23e56d48df18f534af921ec936c812743a8972ecdd5e5ff47b464fea00"}, + {file = "cramjam-2.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b4b8d8160685c11ffb4e8e6daaab79cb351a1c54ceec41cc18a0a62c89309fe0"}, + {file = "cramjam-2.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0ed6362cb6c964f8d0c6e7f790e8961b9242cd3acd87c56169ca14d642653707"}, + {file = "cramjam-2.9.0-cp310-none-win32.whl", hash = "sha256:fe9af350dfbdc7ed4c93a8016a8ad7b5492fc116e7197cad7cbce99b434d3fe1"}, + {file = "cramjam-2.9.0-cp310-none-win_amd64.whl", hash = "sha256:37054c73704a3183b60869e7fec1614648752c31d89f44de1ffe1f01ad4d20d5"}, + {file = "cramjam-2.9.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:170a50407f9400073621cc1d5f3200ca3ad9de3000831e3e86f5561ca8048a08"}, + {file = "cramjam-2.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:912c94781c8ff318a4d3f3306f8d94d41ae5aa7b9760c4bb0476b01142084845"}, + {file = "cramjam-2.9.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = 
"sha256:df089639983a03070be6eabc60317aa1ffbf2c5409023b57a5fc2e4975163bc4"}, + {file = "cramjam-2.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ca28a8f6ab5fca35f163fd7d7a970880ce4fc1a0bead1249ecdaa96ec9ac1f4"}, + {file = "cramjam-2.9.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:abd8bf9a94e3866215ac181a7dbcfa1ddbedca4f8048494a79934febe88537df"}, + {file = "cramjam-2.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7de19a382bcab93cd4d028d51f6f581920a3b79659a384775188135b7fc64f15"}, + {file = "cramjam-2.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a4156fcefa1dfaa65d35ff82c252d1e32be12820f26d04748be6cd3b461cf85f"}, + {file = "cramjam-2.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4a3104022129d7463100dfaf12efd398ebfa4b7e4e50832ccc596754f7c26df"}, + {file = "cramjam-2.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6ebee5f5d7e2b9277895ea4fd94646b72075fe9cfc0e8f4770b65c9e72b1fec1"}, + {file = "cramjam-2.9.0-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:8e33ebe4d709b21bc15e7ddf485ac6b30d7fdc7ed7c3c65130654c007f50c183"}, + {file = "cramjam-2.9.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d5a39118008bb9f2fba36a0ceea6c41fbd0b55d2647b043ba51a868e5f6de92"}, + {file = "cramjam-2.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7f6ef35eba883927af2678b561cc4407e0b3b0d58a251c863bec4b3d8258cc2f"}, + {file = "cramjam-2.9.0-cp311-none-win32.whl", hash = "sha256:b21e55b5cfdaff96eae1f323ae9a0d36e86852cdf62fe23b60a2481d2fed5571"}, + {file = "cramjam-2.9.0-cp311-none-win_amd64.whl", hash = "sha256:9f685fe4e49b2f3e233548e3397b3f9189d71a265718ec631d13eca3d5718ddb"}, + {file = "cramjam-2.9.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:34578e4c1518b10dad5e0ba40c721e529ef13e7742a528843b40e1f20dd6078c"}, + {file = "cramjam-2.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1d5b5512dc61ea78f32e021e88a5fd5b46a821409479e6657d33614fc9e45677"}, + {file = "cramjam-2.9.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0b4f1b5e33915ed591c0c19b8c3bbdd7aa0f6a9bfe2b7246b475d497bda15f18"}, + {file = "cramjam-2.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad301801afa0eecdacabf353a2802df5e6770f9bfb0a559d6c069813d83cfd42"}, + {file = "cramjam-2.9.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:399baf80fea574e3870f233e12e6a12f02c53b054e13d792348b272b0614370a"}, + {file = "cramjam-2.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3121e2fbec58907fa70636adaeaf30c27614c867e08a7a5bd2887b33786ff790"}, + {file = "cramjam-2.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd04205b2a87087ffc2257c3ad33f11daabc053956f64ac1ec7bae299cac3f2f"}, + {file = "cramjam-2.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddb9c4db36188a8f08c2303100a83100f26a8572803ae35eadff359bebd3d204"}, + {file = "cramjam-2.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ef553d4080368006817c1a935ed619c71987cf10417a32386acc00c5418a2934"}, + {file = "cramjam-2.9.0-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:9862ca8ead80857ecfb9b07f02f577733261e981346f31585fe118975eabb738"}, + {file = "cramjam-2.9.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:4714e1ea0c3329368b83fe5ad6e831d5ca11fb794ca7cf491622eb6b2d420d2f"}, + {file = 
"cramjam-2.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1b4ca30c9f27e3b88bc082d4637e7648f93da5cb69a2dbe0c0300bc51353c820"}, + {file = "cramjam-2.9.0-cp312-none-win32.whl", hash = "sha256:0ed2fef010d1caca9ea63814e9cb5b1d47d907b80302b8cc0b3a1e116ea241e2"}, + {file = "cramjam-2.9.0-cp312-none-win_amd64.whl", hash = "sha256:bd26d71939de5dcf169d479fbc7fcfed21e6675bab33e7f7e9f8405f19711c71"}, + {file = "cramjam-2.9.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:dd70ea5d7b2c5e479e04ac3a00d8bc3deca146d2b5dbfbe3d7b42ed136e19de4"}, + {file = "cramjam-2.9.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0b1410e68c464666473a89cade17483b94bb4639d9161c440ee54ee1e0eca583"}, + {file = "cramjam-2.9.0-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:b0078727fe8c28ef1695e5d04aae5c41ac697eb087cba387c6a02b825f9071c0"}, + {file = "cramjam-2.9.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a63c4e63319bf7dfc3ab46c06afb76d3d9cc1c94369b609dde480e5cc78e4de"}, + {file = "cramjam-2.9.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:47d7253b5a10c201cc65aecfb517dfa1c0b5831b2524ac32dd2964fceafc0dc4"}, + {file = "cramjam-2.9.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05970fb640f236767003e62c256a085754536169bac863f4a3502ecb59cbf197"}, + {file = "cramjam-2.9.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0b062d261fa3fac00146cf801896c8cfafe1e41332eb047aa0a36558299daa6"}, + {file = "cramjam-2.9.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:017b7066f18b7b676068f51b1dbdecc02d76d9af10092252b22dcbd03a78ed33"}, + {file = "cramjam-2.9.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:9de33ef3bc006c11fbad1dc8b15341dcc78430df2c5ce1e790dfb729b11ab593"}, + {file = "cramjam-2.9.0-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:b99efaf81be8e381de1cde6574e2c89030ed53994e73b0e75b62d6e232f491c5"}, + {file = "cramjam-2.9.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:36426e3f1920f6aa4c644d007bf9cfad06dd9f1a30cd0a921d72b010492d8447"}, + {file = "cramjam-2.9.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ea9bcaff298f5d35ef67346d474fca388c5cf6d4edab1d06b84868800f88bd36"}, + {file = "cramjam-2.9.0-cp313-none-win32.whl", hash = "sha256:c48da60a5eb481b412e5e462b81ad307fb2203178a2840a743f0a7c5fc1718c9"}, + {file = "cramjam-2.9.0-cp313-none-win_amd64.whl", hash = "sha256:97a6311bd32f301ff1b922bc9de62ace3d9fd845e20efc0f71b4d0239a45b8d2"}, + {file = "cramjam-2.9.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:78e7349f945a83bc48855fb042873092a69b155a088b8c11942eb76418b32705"}, + {file = "cramjam-2.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:65a097ea765dd4ef2fb868b5b0959d7c93a64c250b2c52f462898c823ae4b950"}, + {file = "cramjam-2.9.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:35cad507eb02c775e6c5444312f98b28dd8bf122425677ae199484996e838673"}, + {file = "cramjam-2.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8982925d179b940efa860513a31b839bb06343501077cca3e67f7a2f7360d355"}, + {file = "cramjam-2.9.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ba7e2d33e1d092dffd0a3ff4bd1b86177594aa3c2901fd478e78e1fb2aee8ed3"}, + {file = "cramjam-2.9.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:904be92e3bc25e78343ee52aa0fd5fba3a31d11d474e8af4623a9d00baa84bc2"}, + {file = 
"cramjam-2.9.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9221297c547d702e1431e96705fce26c6a87df34a681a6b97fe63b536d09c1d8"}, + {file = "cramjam-2.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e98a18c22a85f321091cc8db6694af1d713a369c2d60ec611c10ccfe24ab103a"}, + {file = "cramjam-2.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e248510f8e2dbc71fa99f86238c9023365dbe1a4520eb40e33d73416527349f2"}, + {file = "cramjam-2.9.0-cp38-cp38-musllinux_1_1_armv7l.whl", hash = "sha256:dc07376aa33b6004ea372ac9b0ba0ed3455aa2fc4e18727414142ecb46b176b8"}, + {file = "cramjam-2.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e94021c541eb2a199b5a2ffae0ea84fb8b99863dab99a5b154b00bc7a44b5c48"}, + {file = "cramjam-2.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4adbf4366f8dc29b7c5c731c800cf633be76c9911e928daeb606827d6ae7c599"}, + {file = "cramjam-2.9.0-cp38-none-win32.whl", hash = "sha256:ca880f555c8db40942acc8a50722c33e229b6be90e598acc1a201f36487b917d"}, + {file = "cramjam-2.9.0-cp38-none-win_amd64.whl", hash = "sha256:ab17a429a92db90bf40115efb97d10e71b94b0dcacf30cf724552df2794a58fb"}, + {file = "cramjam-2.9.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ed7fd7bc2b86ec3161fe0cc49f5f392e6efa55c91a95397d5047820c38117660"}, + {file = "cramjam-2.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a0f654c739a6bc4a69a2aaf31463328a208757ed780ff886234532f78e06a864"}, + {file = "cramjam-2.9.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:cd4d4ab9deb5846af0ac6cf1fa139cfa40291ad14d073efa8b8e20c8d1aa90bd"}, + {file = "cramjam-2.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bafc32f01d4ab64f83fdbc29bc5bd25a920b59c751c12e06e6f4b1e379be7600"}, + {file = "cramjam-2.9.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0fb5ea631dbf998f667766a9e485e757817d66ed559916ba553a0ec2f902d788"}, + {file = "cramjam-2.9.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c902e56e60c48f5f15e55257aaa1c2678323df5f18a1b839e8d05cac1107576c"}, + {file = "cramjam-2.9.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:441d3875cdffe5df9294b93ef570058837732dd727cd9d18efa0f089f1c2687a"}, + {file = "cramjam-2.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed486e57a79ccc7aebaa2ec12517d891fdc5d2fde16915e3db705b8a47570981"}, + {file = "cramjam-2.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:013cb872205641c6e5269f530ed40aaaa5640d84e0d8f33b89f5a1bf7f655527"}, + {file = "cramjam-2.9.0-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:a41b4b10a381be1d42a1a7dd07b8c3faccd3d12c7e98e973a6ec558fd040a607"}, + {file = "cramjam-2.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:598eac1713ddbe69c3b30dcc890d69b206ce08903fc3aed58149aae87c61973a"}, + {file = "cramjam-2.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:72e9ebc27c557706a3c9964c1d1b4522857760dbd60c105a4f5421f3b66e31a2"}, + {file = "cramjam-2.9.0-cp39-none-win32.whl", hash = "sha256:dbbd6fba677e1cbc9d6bd4ebbe3e8b3667d0295f1731489db2a971c95f0ceca0"}, + {file = "cramjam-2.9.0-cp39-none-win_amd64.whl", hash = "sha256:7f33a83969fa94ee8e0c1f0aef8eb303ead3e9142338dc543abeb7e1a28734ab"}, + {file = "cramjam-2.9.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:132db7d3346ea21ba44e7ee23ec73bd6fa9eb1e77133ca6dfe1f7449a69999af"}, + {file = "cramjam-2.9.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = 
"sha256:2addf801c88bead21256ccd87dc97cffead03758c4a4947fad8e454f4abfda0a"}, + {file = "cramjam-2.9.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:24afad3ba62774abbb150dc25aab21b047ab999c4143c7a8d96577848baf7af6"}, + {file = "cramjam-2.9.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:604c16052cf29d0c796927ed7e107f65429d2036c82c9a8009bd453c94e5e4f0"}, + {file = "cramjam-2.9.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:65bded20fd2cef17b22246c336ddd67fac842341ee311042b4a70e65dc745aa7"}, + {file = "cramjam-2.9.0.tar.gz", hash = "sha256:f103e648aa3ebe9b8e2c1a3a92719288d8f3f41007c319ad298cdce2d0c28641"}, +] + +[package.extras] +dev = ["black (==22.3.0)", "hypothesis", "numpy", "pytest (>=5.30)", "pytest-benchmark", "pytest-xdist"] + +[[package]] +name = "cryptography" +version = "42.0.8" +description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." +optional = false +python-versions = ">=3.7" +files = [ + {file = "cryptography-42.0.8-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:81d8a521705787afe7a18d5bfb47ea9d9cc068206270aad0b96a725022e18d2e"}, + {file = "cryptography-42.0.8-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:961e61cefdcb06e0c6d7e3a1b22ebe8b996eb2bf50614e89384be54c48c6b63d"}, + {file = "cryptography-42.0.8-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3ec3672626e1b9e55afd0df6d774ff0e953452886e06e0f1eb7eb0c832e8902"}, + {file = "cryptography-42.0.8-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e599b53fd95357d92304510fb7bda8523ed1f79ca98dce2f43c115950aa78801"}, + {file = "cryptography-42.0.8-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:5226d5d21ab681f432a9c1cf8b658c0cb02533eece706b155e5fbd8a0cdd3949"}, + {file = "cryptography-42.0.8-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:6b7c4f03ce01afd3b76cf69a5455caa9cfa3de8c8f493e0d3ab7d20611c8dae9"}, + {file = "cryptography-42.0.8-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:2346b911eb349ab547076f47f2e035fc8ff2c02380a7cbbf8d87114fa0f1c583"}, + {file = "cryptography-42.0.8-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:ad803773e9df0b92e0a817d22fd8a3675493f690b96130a5e24f1b8fabbea9c7"}, + {file = "cryptography-42.0.8-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:2f66d9cd9147ee495a8374a45ca445819f8929a3efcd2e3df6428e46c3cbb10b"}, + {file = "cryptography-42.0.8-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d45b940883a03e19e944456a558b67a41160e367a719833c53de6911cabba2b7"}, + {file = "cryptography-42.0.8-cp37-abi3-win32.whl", hash = "sha256:a0c5b2b0585b6af82d7e385f55a8bc568abff8923af147ee3c07bd8b42cda8b2"}, + {file = "cryptography-42.0.8-cp37-abi3-win_amd64.whl", hash = "sha256:57080dee41209e556a9a4ce60d229244f7a66ef52750f813bfbe18959770cfba"}, + {file = "cryptography-42.0.8-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:dea567d1b0e8bc5764b9443858b673b734100c2871dc93163f58c46a97a83d28"}, + {file = "cryptography-42.0.8-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4783183f7cb757b73b2ae9aed6599b96338eb957233c58ca8f49a49cc32fd5e"}, + {file = "cryptography-42.0.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0608251135d0e03111152e41f0cc2392d1e74e35703960d4190b2e0f4ca9c70"}, + {file = "cryptography-42.0.8-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:dc0fdf6787f37b1c6b08e6dfc892d9d068b5bdb671198c72072828b80bd5fe4c"}, + {file = 
"cryptography-42.0.8-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9c0c1716c8447ee7dbf08d6db2e5c41c688544c61074b54fc4564196f55c25a7"}, + {file = "cryptography-42.0.8-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:fff12c88a672ab9c9c1cf7b0c80e3ad9e2ebd9d828d955c126be4fd3e5578c9e"}, + {file = "cryptography-42.0.8-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:cafb92b2bc622cd1aa6a1dce4b93307792633f4c5fe1f46c6b97cf67073ec961"}, + {file = "cryptography-42.0.8-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:31f721658a29331f895a5a54e7e82075554ccfb8b163a18719d342f5ffe5ecb1"}, + {file = "cryptography-42.0.8-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b297f90c5723d04bcc8265fc2a0f86d4ea2e0f7ab4b6994459548d3a6b992a14"}, + {file = "cryptography-42.0.8-cp39-abi3-win32.whl", hash = "sha256:2f88d197e66c65be5e42cd72e5c18afbfae3f741742070e3019ac8f4ac57262c"}, + {file = "cryptography-42.0.8-cp39-abi3-win_amd64.whl", hash = "sha256:fa76fbb7596cc5839320000cdd5d0955313696d9511debab7ee7278fc8b5c84a"}, + {file = "cryptography-42.0.8-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:ba4f0a211697362e89ad822e667d8d340b4d8d55fae72cdd619389fb5912eefe"}, + {file = "cryptography-42.0.8-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:81884c4d096c272f00aeb1f11cf62ccd39763581645b0812e99a91505fa48e0c"}, + {file = "cryptography-42.0.8-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c9bb2ae11bfbab395bdd072985abde58ea9860ed84e59dbc0463a5d0159f5b71"}, + {file = "cryptography-42.0.8-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7016f837e15b0a1c119d27ecd89b3515f01f90a8615ed5e9427e30d9cdbfed3d"}, + {file = "cryptography-42.0.8-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5a94eccb2a81a309806027e1670a358b99b8fe8bfe9f8d329f27d72c094dde8c"}, + {file = "cryptography-42.0.8-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:dec9b018df185f08483f294cae6ccac29e7a6e0678996587363dc352dc65c842"}, + {file = "cryptography-42.0.8-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:343728aac38decfdeecf55ecab3264b015be68fc2816ca800db649607aeee648"}, + {file = "cryptography-42.0.8-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:013629ae70b40af70c9a7a5db40abe5d9054e6f4380e50ce769947b73bf3caad"}, + {file = "cryptography-42.0.8.tar.gz", hash = "sha256:8d09d05439ce7baa8e9e95b07ec5b6c886f548deb7e0f69ef25f64b3bce842f2"}, +] + +[package.dependencies] +cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} + +[package.extras] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] +docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"] +nox = ["nox"] +pep8test = ["check-sdist", "click", "mypy", "ruff"] +sdist = ["build"] +ssh = ["bcrypt (>=3.1.5)"] +test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test-randomorder = ["pytest-randomly"] + +[[package]] +name = "cycler" +version = "0.12.1" +description = "Composable style cycles" +optional = true +python-versions = ">=3.8" +files = [ + {file = "cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30"}, + {file = "cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c"}, +] + +[package.extras] +docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] +tests = ["pytest", "pytest-cov", "pytest-xdist"] + +[[package]] +name = "dataclasses-json" +version = "0.6.7" +description = "Easily 
serialize dataclasses to and from JSON." +optional = true +python-versions = "<4.0,>=3.7" +files = [ + {file = "dataclasses_json-0.6.7-py3-none-any.whl", hash = "sha256:0dbf33f26c8d5305befd61b39d2b3414e8a407bedc2834dea9b8d642666fb40a"}, + {file = "dataclasses_json-0.6.7.tar.gz", hash = "sha256:b6b3e528266ea45b9535223bc53ca645f5208833c29229e847b3f26a1cc55fc0"}, +] + +[package.dependencies] +marshmallow = ">=3.18.0,<4.0.0" +typing-inspect = ">=0.4.0,<1" + +[[package]] +name = "deprecated" +version = "1.2.14" +description = "Python @deprecated decorator to deprecate old python classes, functions or methods." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "Deprecated-1.2.14-py2.py3-none-any.whl", hash = "sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c"}, + {file = "Deprecated-1.2.14.tar.gz", hash = "sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3"}, +] + +[package.dependencies] +wrapt = ">=1.10,<2" + +[package.extras] +dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] + +[[package]] +name = "docutils" +version = "0.17.1" +description = "Docutils -- Python Documentation Utilities" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "docutils-0.17.1-py2.py3-none-any.whl", hash = "sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"}, + {file = "docutils-0.17.1.tar.gz", hash = "sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125"}, +] + +[[package]] +name = "dpath" +version = "2.2.0" +description = "Filesystem-like pathing and searching for dictionaries" +optional = false +python-versions = ">=3.7" +files = [ + {file = "dpath-2.2.0-py3-none-any.whl", hash = "sha256:b330a375ded0a0d2ed404440f6c6a715deae5313af40bbb01c8a41d891900576"}, + {file = "dpath-2.2.0.tar.gz", hash = "sha256:34f7e630dc55ea3f219e555726f5da4b4b25f2200319c8e6902c394258dd6a3e"}, +] + +[[package]] +name = "emoji" +version = "2.14.0" +description = "Emoji for Python" +optional = true +python-versions = ">=3.7" +files = [ + {file = "emoji-2.14.0-py3-none-any.whl", hash = "sha256:fcc936bf374b1aec67dda5303ae99710ba88cc9cdce2d1a71c5f2204e6d78799"}, + {file = "emoji-2.14.0.tar.gz", hash = "sha256:f68ac28915a2221667cddb3e6c589303c3c6954c6c5af6fefaec7f9bdf72fdca"}, +] + +[package.extras] +dev = ["coverage", "pytest (>=7.4.4)"] + +[[package]] +name = "et-xmlfile" +version = "2.0.0" +description = "An implementation of lxml.xmlfile for the standard library" +optional = true +python-versions = ">=3.8" +files = [ + {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"}, + {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.2" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "fastavro" +version = "1.8.2" +description = "Fast read/write of AVRO files" +optional = true +python-versions = ">=3.8" +files = [ + {file = 
"fastavro-1.8.2-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:0e08964b2e9a455d831f2557402a683d4c4d45206f2ab9ade7c69d3dc14e0e58"}, + {file = "fastavro-1.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:401a70b1e5c7161420c6019e0c8afa88f7c8a373468591f5ec37639a903c2509"}, + {file = "fastavro-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eef1ed3eaa4240c05698d02d8d0c010b9a03780eda37b492da6cd4c9d37e04ec"}, + {file = "fastavro-1.8.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:543185a672ff6306beb329b57a7b8a3a2dd1eb21a5ccc530150623d58d48bb98"}, + {file = "fastavro-1.8.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ffbf8bae1edb50fe7beeffc3afa8e684686550c2e5d31bf01c25cfa213f581e1"}, + {file = "fastavro-1.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:bb545eb9d876bc7b785e27e98e7720ada7eee7d7a1729798d2ed51517f13500a"}, + {file = "fastavro-1.8.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2b837d3038c651046252bc92c1b9899bf21c7927a148a1ff89599c36c2a331ca"}, + {file = "fastavro-1.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d3510e96c0a47e4e914bd1a29c954eb662bfa24849ad92e597cb97cc79f21af7"}, + {file = "fastavro-1.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccc0e74f2c2ab357f39bb73d67fcdb6dc10e23fdbbd399326139f72ec0fb99a3"}, + {file = "fastavro-1.8.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:add51c70d0ab1175601c75cd687bbe9d16ae312cd8899b907aafe0d79ee2bc1d"}, + {file = "fastavro-1.8.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d9e2662f57e6453e9a2c9fb4f54b2a9e62e3e46f5a412ac00558112336d23883"}, + {file = "fastavro-1.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:fea75cf53a93c56dd56e68abce8d314ef877b27451c870cd7ede7582d34c08a7"}, + {file = "fastavro-1.8.2-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:f489020bb8664c2737c03457ad5dbd490579ddab6f0a7b5c17fecfe982715a89"}, + {file = "fastavro-1.8.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a547625c138efd5e61300119241041906ee8cb426fc7aa789900f87af7ed330d"}, + {file = "fastavro-1.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53beb458f30c9ad4aa7bff4a42243ff990ffb713b6ce0cd9b360cbc3d648fe52"}, + {file = "fastavro-1.8.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:7b1b2cbd2dd851452306beed0ab9bdaeeab1cc8ad46f84b47cd81eeaff6dd6b8"}, + {file = "fastavro-1.8.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d29e9baee0b2f37ecd09bde3b487cf900431fd548c85be3e4fe1b9a0b2a917f1"}, + {file = "fastavro-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:66e132c710663230292bc63e2cb79cf95b16ccb94a5fc99bb63694b24e312fc5"}, + {file = "fastavro-1.8.2-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:38aca63ce604039bcdf2edd14912d00287bdbf8b76f9aa42b28e6ca0bf950092"}, + {file = "fastavro-1.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9787835f6449ee94713e7993a700432fce3763024791ffa8a58dc91ef9d1f950"}, + {file = "fastavro-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:536cb448bc83811056be02749fd9df37a69621678f02597d272970a769e9b40c"}, + {file = "fastavro-1.8.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e9d5027cf7d9968f8f819958b41bfedb933323ea6d6a0485eefacaa1afd91f54"}, + {file = "fastavro-1.8.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:792adfc0c80c7f1109e0ab4b0decef20691fdf0a45091d397a0563872eb56d42"}, + {file = 
"fastavro-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:650b22766259f7dd7519dfa4e4658f0e233c319efa130b9cf0c36a500e09cc57"}, + {file = "fastavro-1.8.2.tar.gz", hash = "sha256:ab9d9226d4b66b6b3d0661a57cd45259b0868fed1c0cd4fac95249b9e0973320"}, +] + +[package.extras] +codecs = ["lz4", "python-snappy", "zstandard"] +lz4 = ["lz4"] +snappy = ["python-snappy"] +zstandard = ["zstandard"] + +[[package]] +name = "filetype" +version = "1.2.0" +description = "Infer file type and MIME type of any file/buffer. No external dependencies." +optional = true +python-versions = "*" +files = [ + {file = "filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25"}, + {file = "filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb"}, +] + +[[package]] +name = "flake8" +version = "6.1.0" +description = "the modular source code checker: pep8 pyflakes and co" +optional = false +python-versions = ">=3.8.1" +files = [ + {file = "flake8-6.1.0-py2.py3-none-any.whl", hash = "sha256:ffdfce58ea94c6580c77888a86506937f9a1a227dfcd15f245d694ae20a6b6e5"}, + {file = "flake8-6.1.0.tar.gz", hash = "sha256:d5b3857f07c030bdb5bf41c7f53799571d75c4491748a3adcd47de929e34cd23"}, +] + +[package.dependencies] +mccabe = ">=0.7.0,<0.8.0" +pycodestyle = ">=2.11.0,<2.12.0" +pyflakes = ">=3.1.0,<3.2.0" + +[[package]] +name = "fonttools" +version = "4.54.1" +description = "Tools to manipulate font files" +optional = true +python-versions = ">=3.8" +files = [ + {file = "fonttools-4.54.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7ed7ee041ff7b34cc62f07545e55e1468808691dddfd315d51dd82a6b37ddef2"}, + {file = "fonttools-4.54.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:41bb0b250c8132b2fcac148e2e9198e62ff06f3cc472065dff839327945c5882"}, + {file = "fonttools-4.54.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7965af9b67dd546e52afcf2e38641b5be956d68c425bef2158e95af11d229f10"}, + {file = "fonttools-4.54.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:278913a168f90d53378c20c23b80f4e599dca62fbffae4cc620c8eed476b723e"}, + {file = "fonttools-4.54.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0e88e3018ac809b9662615072dcd6b84dca4c2d991c6d66e1970a112503bba7e"}, + {file = "fonttools-4.54.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:4aa4817f0031206e637d1e685251ac61be64d1adef111060df84fdcbc6ab6c44"}, + {file = "fonttools-4.54.1-cp310-cp310-win32.whl", hash = "sha256:7e3b7d44e18c085fd8c16dcc6f1ad6c61b71ff463636fcb13df7b1b818bd0c02"}, + {file = "fonttools-4.54.1-cp310-cp310-win_amd64.whl", hash = "sha256:dd9cc95b8d6e27d01e1e1f1fae8559ef3c02c76317da650a19047f249acd519d"}, + {file = "fonttools-4.54.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5419771b64248484299fa77689d4f3aeed643ea6630b2ea750eeab219588ba20"}, + {file = "fonttools-4.54.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:301540e89cf4ce89d462eb23a89464fef50915255ece765d10eee8b2bf9d75b2"}, + {file = "fonttools-4.54.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76ae5091547e74e7efecc3cbf8e75200bc92daaeb88e5433c5e3e95ea8ce5aa7"}, + {file = "fonttools-4.54.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82834962b3d7c5ca98cb56001c33cf20eb110ecf442725dc5fdf36d16ed1ab07"}, + {file = "fonttools-4.54.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d26732ae002cc3d2ecab04897bb02ae3f11f06dd7575d1df46acd2f7c012a8d8"}, + 
{file = "fonttools-4.54.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:58974b4987b2a71ee08ade1e7f47f410c367cdfc5a94fabd599c88165f56213a"}, + {file = "fonttools-4.54.1-cp311-cp311-win32.whl", hash = "sha256:ab774fa225238986218a463f3fe151e04d8c25d7de09df7f0f5fce27b1243dbc"}, + {file = "fonttools-4.54.1-cp311-cp311-win_amd64.whl", hash = "sha256:07e005dc454eee1cc60105d6a29593459a06321c21897f769a281ff2d08939f6"}, + {file = "fonttools-4.54.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:54471032f7cb5fca694b5f1a0aaeba4af6e10ae989df408e0216f7fd6cdc405d"}, + {file = "fonttools-4.54.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8fa92cb248e573daab8d032919623cc309c005086d743afb014c836636166f08"}, + {file = "fonttools-4.54.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a911591200114969befa7f2cb74ac148bce5a91df5645443371aba6d222e263"}, + {file = "fonttools-4.54.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93d458c8a6a354dc8b48fc78d66d2a8a90b941f7fec30e94c7ad9982b1fa6bab"}, + {file = "fonttools-4.54.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5eb2474a7c5be8a5331146758debb2669bf5635c021aee00fd7c353558fc659d"}, + {file = "fonttools-4.54.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c9c563351ddc230725c4bdf7d9e1e92cbe6ae8553942bd1fb2b2ff0884e8b714"}, + {file = "fonttools-4.54.1-cp312-cp312-win32.whl", hash = "sha256:fdb062893fd6d47b527d39346e0c5578b7957dcea6d6a3b6794569370013d9ac"}, + {file = "fonttools-4.54.1-cp312-cp312-win_amd64.whl", hash = "sha256:e4564cf40cebcb53f3dc825e85910bf54835e8a8b6880d59e5159f0f325e637e"}, + {file = "fonttools-4.54.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6e37561751b017cf5c40fce0d90fd9e8274716de327ec4ffb0df957160be3bff"}, + {file = "fonttools-4.54.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:357cacb988a18aace66e5e55fe1247f2ee706e01debc4b1a20d77400354cddeb"}, + {file = "fonttools-4.54.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8e953cc0bddc2beaf3a3c3b5dd9ab7554677da72dfaf46951e193c9653e515a"}, + {file = "fonttools-4.54.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:58d29b9a294573d8319f16f2f79e42428ba9b6480442fa1836e4eb89c4d9d61c"}, + {file = "fonttools-4.54.1-cp313-cp313-win32.whl", hash = "sha256:9ef1b167e22709b46bf8168368b7b5d3efeaaa746c6d39661c1b4405b6352e58"}, + {file = "fonttools-4.54.1-cp313-cp313-win_amd64.whl", hash = "sha256:262705b1663f18c04250bd1242b0515d3bbae177bee7752be67c979b7d47f43d"}, + {file = "fonttools-4.54.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ed2f80ca07025551636c555dec2b755dd005e2ea8fbeb99fc5cdff319b70b23b"}, + {file = "fonttools-4.54.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9dc080e5a1c3b2656caff2ac2633d009b3a9ff7b5e93d0452f40cd76d3da3b3c"}, + {file = "fonttools-4.54.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d152d1be65652fc65e695e5619e0aa0982295a95a9b29b52b85775243c06556"}, + {file = "fonttools-4.54.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8583e563df41fdecef31b793b4dd3af8a9caa03397be648945ad32717a92885b"}, + {file = "fonttools-4.54.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:0d1d353ef198c422515a3e974a1e8d5b304cd54a4c2eebcae708e37cd9eeffb1"}, + {file = "fonttools-4.54.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:fda582236fee135d4daeca056c8c88ec5f6f6d88a004a79b84a02547c8f57386"}, + 
{file = "fonttools-4.54.1-cp38-cp38-win32.whl", hash = "sha256:e7d82b9e56716ed32574ee106cabca80992e6bbdcf25a88d97d21f73a0aae664"}, + {file = "fonttools-4.54.1-cp38-cp38-win_amd64.whl", hash = "sha256:ada215fd079e23e060157aab12eba0d66704316547f334eee9ff26f8c0d7b8ab"}, + {file = "fonttools-4.54.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f5b8a096e649768c2f4233f947cf9737f8dbf8728b90e2771e2497c6e3d21d13"}, + {file = "fonttools-4.54.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4e10d2e0a12e18f4e2dd031e1bf7c3d7017be5c8dbe524d07706179f355c5dac"}, + {file = "fonttools-4.54.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31c32d7d4b0958600eac75eaf524b7b7cb68d3a8c196635252b7a2c30d80e986"}, + {file = "fonttools-4.54.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c39287f5c8f4a0c5a55daf9eaf9ccd223ea59eed3f6d467133cc727d7b943a55"}, + {file = "fonttools-4.54.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a7a310c6e0471602fe3bf8efaf193d396ea561486aeaa7adc1f132e02d30c4b9"}, + {file = "fonttools-4.54.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d3b659d1029946f4ff9b6183984578041b520ce0f8fb7078bb37ec7445806b33"}, + {file = "fonttools-4.54.1-cp39-cp39-win32.whl", hash = "sha256:e96bc94c8cda58f577277d4a71f51c8e2129b8b36fd05adece6320dd3d57de8a"}, + {file = "fonttools-4.54.1-cp39-cp39-win_amd64.whl", hash = "sha256:e8a4b261c1ef91e7188a30571be6ad98d1c6d9fa2427244c545e2fa0a2494dd7"}, + {file = "fonttools-4.54.1-py3-none-any.whl", hash = "sha256:37cddd62d83dc4f72f7c3f3c2bcf2697e89a30efb152079896544a93907733bd"}, + {file = "fonttools-4.54.1.tar.gz", hash = "sha256:957f669d4922f92c171ba01bef7f29410668db09f6c02111e22b2bce446f3285"}, +] + +[package.extras] +all = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "fs (>=2.2.0,<3)", "lxml (>=4.0)", "lz4 (>=1.7.4.2)", "matplotlib", "munkres", "pycairo", "scipy", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.23.0)", "unicodedata2 (>=15.1.0)", "xattr", "zopfli (>=0.1.4)"] +graphite = ["lz4 (>=1.7.4.2)"] +interpolatable = ["munkres", "pycairo", "scipy"] +lxml = ["lxml (>=4.0)"] +pathops = ["skia-pathops (>=0.5.0)"] +plot = ["matplotlib"] +repacker = ["uharfbuzz (>=0.23.0)"] +symfont = ["sympy"] +type1 = ["xattr"] +ufo = ["fs (>=2.2.0,<3)"] +unicode = ["unicodedata2 (>=15.1.0)"] +woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] + +[[package]] +name = "freezegun" +version = "1.5.1" +description = "Let your Python tests travel through time" +optional = false +python-versions = ">=3.7" +files = [ + {file = "freezegun-1.5.1-py3-none-any.whl", hash = "sha256:bf111d7138a8abe55ab48a71755673dbaa4ab87f4cff5634a4442dfec34c15f1"}, + {file = "freezegun-1.5.1.tar.gz", hash = "sha256:b29dedfcda6d5e8e083ce71b2b542753ad48cfec44037b3fc79702e2980a89e9"}, +] + +[package.dependencies] +python-dateutil = ">=2.7" + +[[package]] +name = "frozenlist" +version = "1.5.0" +description = "A list-like structure which implements collections.abc.MutableSequence" +optional = true +python-versions = ">=3.8" +files = [ + {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a"}, + {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb"}, + {file = "frozenlist-1.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:15538c0cbf0e4fa11d1e3a71f823524b0c46299aed6e10ebb4c2089abd8c3bec"}, + {file = 
"frozenlist-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e79225373c317ff1e35f210dd5f1344ff31066ba8067c307ab60254cd3a78ad5"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9272fa73ca71266702c4c3e2d4a28553ea03418e591e377a03b8e3659d94fa76"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:498524025a5b8ba81695761d78c8dd7382ac0b052f34e66939c42df860b8ff17"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:92b5278ed9d50fe610185ecd23c55d8b307d75ca18e94c0e7de328089ac5dcba"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f3c8c1dacd037df16e85227bac13cca58c30da836c6f936ba1df0c05d046d8d"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2ac49a9bedb996086057b75bf93538240538c6d9b38e57c82d51f75a73409d2"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e66cc454f97053b79c2ab09c17fbe3c825ea6b4de20baf1be28919460dd7877f"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:5a3ba5f9a0dfed20337d3e966dc359784c9f96503674c2faf015f7fe8e96798c"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6321899477db90bdeb9299ac3627a6a53c7399c8cd58d25da094007402b039ab"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:76e4753701248476e6286f2ef492af900ea67d9706a0155335a40ea21bf3b2f5"}, + {file = "frozenlist-1.5.0-cp310-cp310-win32.whl", hash = "sha256:977701c081c0241d0955c9586ffdd9ce44f7a7795df39b9151cd9a6fd0ce4cfb"}, + {file = "frozenlist-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:189f03b53e64144f90990d29a27ec4f7997d91ed3d01b51fa39d2dbe77540fd4"}, + {file = "frozenlist-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fd74520371c3c4175142d02a976aee0b4cb4a7cc912a60586ffd8d5929979b30"}, + {file = "frozenlist-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2f3f7a0fbc219fb4455264cae4d9f01ad41ae6ee8524500f381de64ffaa077d5"}, + {file = "frozenlist-1.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f47c9c9028f55a04ac254346e92977bf0f166c483c74b4232bee19a6697e4778"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0996c66760924da6e88922756d99b47512a71cfd45215f3570bf1e0b694c206a"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2fe128eb4edeabe11896cb6af88fca5346059f6c8d807e3b910069f39157869"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a8ea951bbb6cacd492e3948b8da8c502a3f814f5d20935aae74b5df2b19cf3d"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de537c11e4aa01d37db0d403b57bd6f0546e71a82347a97c6a9f0dcc532b3a45"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c2623347b933fcb9095841f1cc5d4ff0b278addd743e0e966cb3d460278840d"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cee6798eaf8b1416ef6909b06f7dc04b60755206bddc599f52232606e18179d3"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = 
"sha256:f5f9da7f5dbc00a604fe74aa02ae7c98bcede8a3b8b9666f9f86fc13993bc71a"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:90646abbc7a5d5c7c19461d2e3eeb76eb0b204919e6ece342feb6032c9325ae9"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:bdac3c7d9b705d253b2ce370fde941836a5f8b3c5c2b8fd70940a3ea3af7f4f2"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03d33c2ddbc1816237a67f66336616416e2bbb6beb306e5f890f2eb22b959cdf"}, + {file = "frozenlist-1.5.0-cp311-cp311-win32.whl", hash = "sha256:237f6b23ee0f44066219dae14c70ae38a63f0440ce6750f868ee08775073f942"}, + {file = "frozenlist-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:0cc974cc93d32c42e7b0f6cf242a6bd941c57c61b618e78b6c0a96cb72788c1d"}, + {file = "frozenlist-1.5.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:31115ba75889723431aa9a4e77d5f398f5cf976eea3bdf61749731f62d4a4a21"}, + {file = "frozenlist-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7437601c4d89d070eac8323f121fcf25f88674627505334654fd027b091db09d"}, + {file = "frozenlist-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7948140d9f8ece1745be806f2bfdf390127cf1a763b925c4a805c603df5e697e"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feeb64bc9bcc6b45c6311c9e9b99406660a9c05ca8a5b30d14a78555088b0b3a"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:683173d371daad49cffb8309779e886e59c2f369430ad28fe715f66d08d4ab1a"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7d57d8f702221405a9d9b40f9da8ac2e4a1a8b5285aac6100f3393675f0a85ee"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30c72000fbcc35b129cb09956836c7d7abf78ab5416595e4857d1cae8d6251a6"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000a77d6034fbad9b6bb880f7ec073027908f1b40254b5d6f26210d2dab1240e"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5d7f5a50342475962eb18b740f3beecc685a15b52c91f7d975257e13e029eca9"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:87f724d055eb4785d9be84e9ebf0f24e392ddfad00b3fe036e43f489fafc9039"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:6e9080bb2fb195a046e5177f10d9d82b8a204c0736a97a153c2466127de87784"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b93d7aaa36c966fa42efcaf716e6b3900438632a626fb09c049f6a2f09fc631"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:52ef692a4bc60a6dd57f507429636c2af8b6046db8b31b18dac02cbc8f507f7f"}, + {file = "frozenlist-1.5.0-cp312-cp312-win32.whl", hash = "sha256:29d94c256679247b33a3dc96cce0f93cbc69c23bf75ff715919332fdbb6a32b8"}, + {file = "frozenlist-1.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:8969190d709e7c48ea386db202d708eb94bdb29207a1f269bab1196ce0dcca1f"}, + {file = "frozenlist-1.5.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7a1a048f9215c90973402e26c01d1cff8a209e1f1b53f72b95c13db61b00f953"}, + {file = "frozenlist-1.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dd47a5181ce5fcb463b5d9e17ecfdb02b678cca31280639255ce9d0e5aa67af0"}, + {file = 
"frozenlist-1.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1431d60b36d15cda188ea222033eec8e0eab488f39a272461f2e6d9e1a8e63c2"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6482a5851f5d72767fbd0e507e80737f9c8646ae7fd303def99bfe813f76cf7f"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44c49271a937625619e862baacbd037a7ef86dd1ee215afc298a417ff3270608"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:12f78f98c2f1c2429d42e6a485f433722b0061d5c0b0139efa64f396efb5886b"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce3aa154c452d2467487765e3adc730a8c153af77ad84096bc19ce19a2400840"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b7dc0c4338e6b8b091e8faf0db3168a37101943e687f373dce00959583f7439"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:45e0896250900b5aa25180f9aec243e84e92ac84bd4a74d9ad4138ef3f5c97de"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:561eb1c9579d495fddb6da8959fd2a1fca2c6d060d4113f5844b433fc02f2641"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:df6e2f325bfee1f49f81aaac97d2aa757c7646534a06f8f577ce184afe2f0a9e"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:140228863501b44b809fb39ec56b5d4071f4d0aa6d216c19cbb08b8c5a7eadb9"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7707a25d6a77f5d27ea7dc7d1fc608aa0a478193823f88511ef5e6b8a48f9d03"}, + {file = "frozenlist-1.5.0-cp313-cp313-win32.whl", hash = "sha256:31a9ac2b38ab9b5a8933b693db4939764ad3f299fcaa931a3e605bc3460e693c"}, + {file = "frozenlist-1.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:11aabdd62b8b9c4b84081a3c246506d1cddd2dd93ff0ad53ede5defec7886b28"}, + {file = "frozenlist-1.5.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:dd94994fc91a6177bfaafd7d9fd951bc8689b0a98168aa26b5f543868548d3ca"}, + {file = "frozenlist-1.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2d0da8bbec082bf6bf18345b180958775363588678f64998c2b7609e34719b10"}, + {file = "frozenlist-1.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:73f2e31ea8dd7df61a359b731716018c2be196e5bb3b74ddba107f694fbd7604"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:828afae9f17e6de596825cf4228ff28fbdf6065974e5ac1410cecc22f699d2b3"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1577515d35ed5649d52ab4319db757bb881ce3b2b796d7283e6634d99ace307"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2150cc6305a2c2ab33299453e2968611dacb970d2283a14955923062c8d00b10"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a72b7a6e3cd2725eff67cd64c8f13335ee18fc3c7befc05aed043d24c7b9ccb9"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c16d2fa63e0800723139137d667e1056bee1a1cf7965153d2d104b62855e9b99"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = 
"sha256:17dcc32fc7bda7ce5875435003220a457bcfa34ab7924a49a1c19f55b6ee185c"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:97160e245ea33d8609cd2b8fd997c850b56db147a304a262abc2b3be021a9171"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f1e6540b7fa044eee0bb5111ada694cf3dc15f2b0347ca125ee9ca984d5e9e6e"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:91d6c171862df0a6c61479d9724f22efb6109111017c87567cfeb7b5d1449fdf"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c1fac3e2ace2eb1052e9f7c7db480818371134410e1f5c55d65e8f3ac6d1407e"}, + {file = "frozenlist-1.5.0-cp38-cp38-win32.whl", hash = "sha256:b97f7b575ab4a8af9b7bc1d2ef7f29d3afee2226bd03ca3875c16451ad5a7723"}, + {file = "frozenlist-1.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:374ca2dabdccad8e2a76d40b1d037f5bd16824933bf7bcea3e59c891fd4a0923"}, + {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9bbcdfaf4af7ce002694a4e10a0159d5a8d20056a12b05b45cea944a4953f972"}, + {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1893f948bf6681733aaccf36c5232c231e3b5166d607c5fa77773611df6dc336"}, + {file = "frozenlist-1.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2b5e23253bb709ef57a8e95e6ae48daa9ac5f265637529e4ce6b003a37b2621f"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f253985bb515ecd89629db13cb58d702035ecd8cfbca7d7a7e29a0e6d39af5f"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04a5c6babd5e8fb7d3c871dc8b321166b80e41b637c31a995ed844a6139942b6"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9fe0f1c29ba24ba6ff6abf688cb0b7cf1efab6b6aa6adc55441773c252f7411"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:226d72559fa19babe2ccd920273e767c96a49b9d3d38badd7c91a0fdeda8ea08"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15b731db116ab3aedec558573c1a5eec78822b32292fe4f2f0345b7f697745c2"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:366d8f93e3edfe5a918c874702f78faac300209a4d5bf38352b2c1bdc07a766d"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1b96af8c582b94d381a1c1f51ffaedeb77c821c690ea5f01da3d70a487dd0a9b"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:c03eff4a41bd4e38415cbed054bbaff4a075b093e2394b6915dca34a40d1e38b"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:50cf5e7ee9b98f22bdecbabf3800ae78ddcc26e4a435515fc72d97903e8488e0"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1e76bfbc72353269c44e0bc2cfe171900fbf7f722ad74c9a7b638052afe6a00c"}, + {file = "frozenlist-1.5.0-cp39-cp39-win32.whl", hash = "sha256:666534d15ba8f0fda3f53969117383d5dc021266b3c1a42c9ec4855e4b58b9d3"}, + {file = "frozenlist-1.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:5c28f4b5dbef8a0d8aad0d4de24d1e9e981728628afaf4ea0792f5d0939372f0"}, + {file = "frozenlist-1.5.0-py3-none-any.whl", hash = "sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3"}, + {file = "frozenlist-1.5.0.tar.gz", hash = 
"sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817"}, +] + +[[package]] +name = "genson" +version = "1.2.2" +description = "GenSON is a powerful, user-friendly JSON Schema generator." +optional = false +python-versions = "*" +files = [ + {file = "genson-1.2.2.tar.gz", hash = "sha256:8caf69aa10af7aee0e1a1351d1d06801f4696e005f06cedef438635384346a16"}, +] + +[[package]] +name = "greenlet" +version = "3.1.1" +description = "Lightweight in-process concurrent programming" +optional = true +python-versions = ">=3.7" +files = [ + {file = "greenlet-3.1.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:0bbae94a29c9e5c7e4a2b7f0aae5c17e8e90acbfd3bf6270eeba60c39fce3563"}, + {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fde093fb93f35ca72a556cf72c92ea3ebfda3d79fc35bb19fbe685853869a83"}, + {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36b89d13c49216cadb828db8dfa6ce86bbbc476a82d3a6c397f0efae0525bdd0"}, + {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94b6150a85e1b33b40b1464a3f9988dcc5251d6ed06842abff82e42632fac120"}, + {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93147c513fac16385d1036b7e5b102c7fbbdb163d556b791f0f11eada7ba65dc"}, + {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da7a9bff22ce038e19bf62c4dd1ec8391062878710ded0a845bcf47cc0200617"}, + {file = "greenlet-3.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b2795058c23988728eec1f36a4e5e4ebad22f8320c85f3587b539b9ac84128d7"}, + {file = "greenlet-3.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ed10eac5830befbdd0c32f83e8aa6288361597550ba669b04c48f0f9a2c843c6"}, + {file = "greenlet-3.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:77c386de38a60d1dfb8e55b8c1101d68c79dfdd25c7095d51fec2dd800892b80"}, + {file = "greenlet-3.1.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:e4d333e558953648ca09d64f13e6d8f0523fa705f51cae3f03b5983489958c70"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09fc016b73c94e98e29af67ab7b9a879c307c6731a2c9da0db5a7d9b7edd1159"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d5e975ca70269d66d17dd995dafc06f1b06e8cb1ec1e9ed54c1d1e4a7c4cf26e"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2813dc3de8c1ee3f924e4d4227999285fd335d1bcc0d2be6dc3f1f6a318ec1"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e347b3bfcf985a05e8c0b7d462ba6f15b1ee1c909e2dcad795e49e91b152c383"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e8f8c9cb53cdac7ba9793c276acd90168f416b9ce36799b9b885790f8ad6c0a"}, + {file = "greenlet-3.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62ee94988d6b4722ce0028644418d93a52429e977d742ca2ccbe1c4f4a792511"}, + {file = "greenlet-3.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1776fd7f989fc6b8d8c8cb8da1f6b82c5814957264d1f6cf818d475ec2bf6395"}, + {file = "greenlet-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:48ca08c771c268a768087b408658e216133aecd835c0ded47ce955381105ba39"}, + {file = "greenlet-3.1.1-cp312-cp312-macosx_11_0_universal2.whl", hash = 
"sha256:4afe7ea89de619adc868e087b4d2359282058479d7cfb94970adf4b55284574d"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f406b22b7c9a9b4f8aa9d2ab13d6ae0ac3e85c9a809bd590ad53fed2bf70dc79"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c3a701fe5a9695b238503ce5bbe8218e03c3bcccf7e204e455e7462d770268aa"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2846930c65b47d70b9d178e89c7e1a69c95c1f68ea5aa0a58646b7a96df12441"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99cfaa2110534e2cf3ba31a7abcac9d328d1d9f1b95beede58294a60348fba36"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1443279c19fca463fc33e65ef2a935a5b09bb90f978beab37729e1c3c6c25fe9"}, + {file = "greenlet-3.1.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b7cede291382a78f7bb5f04a529cb18e068dd29e0fb27376074b6d0317bf4dd0"}, + {file = "greenlet-3.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:23f20bb60ae298d7d8656c6ec6db134bca379ecefadb0b19ce6f19d1f232a942"}, + {file = "greenlet-3.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:7124e16b4c55d417577c2077be379514321916d5790fa287c9ed6f23bd2ffd01"}, + {file = "greenlet-3.1.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:05175c27cb459dcfc05d026c4232f9de8913ed006d42713cb8a5137bd49375f1"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:935e943ec47c4afab8965954bf49bfa639c05d4ccf9ef6e924188f762145c0ff"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:667a9706c970cb552ede35aee17339a18e8f2a87a51fba2ed39ceeeb1004798a"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8a678974d1f3aa55f6cc34dc480169d58f2e6d8958895d68845fa4ab566509e"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efc0f674aa41b92da8c49e0346318c6075d734994c3c4e4430b1c3f853e498e4"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0153404a4bb921f0ff1abeb5ce8a5131da56b953eda6e14b88dc6bbc04d2049e"}, + {file = "greenlet-3.1.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:275f72decf9932639c1c6dd1013a1bc266438eb32710016a1c742df5da6e60a1"}, + {file = "greenlet-3.1.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c4aab7f6381f38a4b42f269057aee279ab0fc7bf2e929e3d4abfae97b682a12c"}, + {file = "greenlet-3.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:b42703b1cf69f2aa1df7d1030b9d77d3e584a70755674d60e710f0af570f3761"}, + {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1695e76146579f8c06c1509c7ce4dfe0706f49c6831a817ac04eebb2fd02011"}, + {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7876452af029456b3f3549b696bb36a06db7c90747740c5302f74a9e9fa14b13"}, + {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4ead44c85f8ab905852d3de8d86f6f8baf77109f9da589cb4fa142bd3b57b475"}, + {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8320f64b777d00dd7ccdade271eaf0cad6636343293a25074cc5566160e4de7b"}, + {file = 
"greenlet-3.1.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6510bf84a6b643dabba74d3049ead221257603a253d0a9873f55f6a59a65f822"}, + {file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:04b013dc07c96f83134b1e99888e7a79979f1a247e2a9f59697fa14b5862ed01"}, + {file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:411f015496fec93c1c8cd4e5238da364e1da7a124bcb293f085bf2860c32c6f6"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47da355d8687fd65240c364c90a31569a133b7b60de111c255ef5b606f2ae291"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98884ecf2ffb7d7fe6bd517e8eb99d31ff7855a840fa6d0d63cd07c037f6a981"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1d4aeb8891338e60d1ab6127af1fe45def5259def8094b9c7e34690c8858803"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db32b5348615a04b82240cc67983cb315309e88d444a288934ee6ceaebcad6cc"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dcc62f31eae24de7f8dce72134c8651c58000d3b1868e01392baea7c32c247de"}, + {file = "greenlet-3.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1d3755bcb2e02de341c55b4fca7a745a24a9e7212ac953f6b3a48d117d7257aa"}, + {file = "greenlet-3.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:b8da394b34370874b4572676f36acabac172602abf054cbc4ac910219f3340af"}, + {file = "greenlet-3.1.1-cp37-cp37m-win32.whl", hash = "sha256:a0dfc6c143b519113354e780a50381508139b07d2177cb6ad6a08278ec655798"}, + {file = "greenlet-3.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:54558ea205654b50c438029505def3834e80f0869a70fb15b871c29b4575ddef"}, + {file = "greenlet-3.1.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:346bed03fe47414091be4ad44786d1bd8bef0c3fcad6ed3dee074a032ab408a9"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfc59d69fc48664bc693842bd57acfdd490acafda1ab52c7836e3fc75c90a111"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d21e10da6ec19b457b82636209cbe2331ff4306b54d06fa04b7c138ba18c8a81"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37b9de5a96111fc15418819ab4c4432e4f3c2ede61e660b1e33971eba26ef9ba"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ef9ea3f137e5711f0dbe5f9263e8c009b7069d8a1acea822bd5e9dae0ae49c8"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:85f3ff71e2e60bd4b4932a043fbbe0f499e263c628390b285cb599154a3b03b1"}, + {file = "greenlet-3.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:95ffcf719966dd7c453f908e208e14cde192e09fde6c7186c8f1896ef778d8cd"}, + {file = "greenlet-3.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:03a088b9de532cbfe2ba2034b2b85e82df37874681e8c470d6fb2f8c04d7e4b7"}, + {file = "greenlet-3.1.1-cp38-cp38-win32.whl", hash = "sha256:8b8b36671f10ba80e159378df9c4f15c14098c4fd73a36b9ad715f057272fbef"}, + {file = "greenlet-3.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:7017b2be767b9d43cc31416aba48aab0d2309ee31b4dbf10a1d38fb7972bdf9d"}, + {file = "greenlet-3.1.1-cp39-cp39-macosx_11_0_universal2.whl", hash = 
"sha256:396979749bd95f018296af156201d6211240e7a23090f50a8d5d18c370084dc3"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca9d0ff5ad43e785350894d97e13633a66e2b50000e8a183a50a88d834752d42"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f6ff3b14f2df4c41660a7dec01045a045653998784bf8cfcb5a525bdffffbc8f"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94ebba31df2aa506d7b14866fed00ac141a867e63143fe5bca82a8e503b36437"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73aaad12ac0ff500f62cebed98d8789198ea0e6f233421059fa68a5aa7220145"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63e4844797b975b9af3a3fb8f7866ff08775f5426925e1e0bbcfe7932059a12c"}, + {file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7939aa3ca7d2a1593596e7ac6d59391ff30281ef280d8632fa03d81f7c5f955e"}, + {file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d0028e725ee18175c6e422797c407874da24381ce0690d6b9396c204c7f7276e"}, + {file = "greenlet-3.1.1-cp39-cp39-win32.whl", hash = "sha256:5e06afd14cbaf9e00899fae69b24a32f2196c19de08fcb9f4779dd4f004e5e7c"}, + {file = "greenlet-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:3319aa75e0e0639bc15ff54ca327e8dc7a6fe404003496e3c6925cd3142e0e22"}, + {file = "greenlet-3.1.1.tar.gz", hash = "sha256:4ce3ac6cdb6adf7946475d7ef31777c26d94bccc377e070a7986bd2d5c515467"}, +] + +[package.extras] +docs = ["Sphinx", "furo"] +test = ["objgraph", "psutil"] + +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = true +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[[package]] +name = "httpcore" +version = "1.0.6" +description = "A minimal low-level HTTP client." +optional = true +python-versions = ">=3.8" +files = [ + {file = "httpcore-1.0.6-py3-none-any.whl", hash = "sha256:27b59625743b85577a8c0e10e55b50b5368a4f2cfe8cc7bcfa9cf00829c2682f"}, + {file = "httpcore-1.0.6.tar.gz", hash = "sha256:73f6dbd6eb8c21bbf7ef8efad555481853f5f6acdeaff1edb0694289269ee17f"}, +] + +[package.dependencies] +certifi = "*" +h11 = ">=0.13,<0.15" + +[package.extras] +asyncio = ["anyio (>=4.0,<5.0)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +trio = ["trio (>=0.22.0,<1.0)"] + +[[package]] +name = "httpx" +version = "0.27.2" +description = "The next generation HTTP client." 
+optional = true +python-versions = ">=3.8" +files = [ + {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"}, + {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"}, +] + +[package.dependencies] +anyio = "*" +certifi = "*" +httpcore = "==1.*" +idna = "*" +sniffio = "*" + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +zstd = ["zstandard (>=0.18.0)"] + +[[package]] +name = "idna" +version = "3.10" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.6" +files = [ + {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, + {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, +] + +[package.extras] +all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] + +[[package]] +name = "imagesize" +version = "1.4.1" +description = "Getting image size from png/jpeg/jpeg2000/gif file" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b"}, + {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"}, +] + +[[package]] +name = "importlib-metadata" +version = "6.11.0" +description = "Read metadata from Python packages" +optional = true +python-versions = ">=3.8" +files = [ + {file = "importlib_metadata-6.11.0-py3-none-any.whl", hash = "sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b"}, + {file = "importlib_metadata-6.11.0.tar.gz", hash = "sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443"}, +] + +[package.dependencies] +zipp = ">=0.5" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] +perf = ["ipython"] +testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)", "pytest-ruff"] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "isodate" +version = "0.6.1" +description = "An ISO 8601 date/time/duration parser and formatter" +optional = false +python-versions = "*" +files = [ + {file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"}, + {file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"}, +] + +[package.dependencies] +six = "*" + +[[package]] +name = "jinja2" +version = "3.1.4" +description = "A very fast and expressive template engine." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, +] + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + +[[package]] +name = "joblib" +version = "1.4.2" +description = "Lightweight pipelining with Python functions" +optional = true +python-versions = ">=3.8" +files = [ + {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, + {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, +] + +[[package]] +name = "jsonpatch" +version = "1.33" +description = "Apply JSON-Patches (RFC 6902)" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" +files = [ + {file = "jsonpatch-1.33-py2.py3-none-any.whl", hash = "sha256:0ae28c0cd062bbd8b8ecc26d7d164fbbea9652a1a3693f3b956c1eae5145dade"}, + {file = "jsonpatch-1.33.tar.gz", hash = "sha256:9fcd4009c41e6d12348b4a0ff2563ba56a2923a7dfee731d004e212e1ee5030c"}, +] + +[package.dependencies] +jsonpointer = ">=1.9" + +[[package]] +name = "jsonpointer" +version = "3.0.0" +description = "Identify specific nodes in a JSON document (RFC 6901)" +optional = true +python-versions = ">=3.7" +files = [ + {file = "jsonpointer-3.0.0-py2.py3-none-any.whl", hash = "sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942"}, + {file = "jsonpointer-3.0.0.tar.gz", hash = "sha256:2b2d729f2091522d61c3b31f82e11870f60b68f43fbc705cb76bf4b832af59ef"}, +] + +[[package]] +name = "jsonref" +version = "0.2" +description = "An implementation of JSON Reference for Python" +optional = false +python-versions = "*" +files = [ + {file = "jsonref-0.2-py3-none-any.whl", hash = "sha256:b1e82fa0b62e2c2796a13e5401fe51790b248f6d9bf9d7212a3e31a3501b291f"}, + {file = "jsonref-0.2.tar.gz", hash = "sha256:f3c45b121cf6257eafabdc3a8008763aed1cd7da06dbabc59a9e4d2a5e4e6697"}, +] + +[[package]] +name = "jsonschema" +version = "3.2.0" +description = "An implementation of JSON Schema validation for Python" +optional = false +python-versions = "*" +files = [ + {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"}, + {file = "jsonschema-3.2.0.tar.gz", hash = "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"}, +] + +[package.dependencies] +attrs = ">=17.4.0" +pyrsistent = ">=0.14.0" +setuptools = "*" +six = ">=1.11.0" + +[package.extras] +format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] +format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] + +[[package]] +name = "kiwisolver" +version = "1.4.7" +description = "A fast implementation of the Cassowary constraint solver" +optional = true +python-versions = ">=3.8" +files = [ + {file = "kiwisolver-1.4.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8a9c83f75223d5e48b0bc9cb1bf2776cf01563e00ade8775ffe13b0b6e1af3a6"}, + {file = "kiwisolver-1.4.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:58370b1ffbd35407444d57057b57da5d6549d2d854fa30249771775c63b5fe17"}, + {file = "kiwisolver-1.4.7-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:aa0abdf853e09aff551db11fce173e2177d00786c688203f52c87ad7fcd91ef9"}, + {file = "kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8d53103597a252fb3ab8b5845af04c7a26d5e7ea8122303dd7a021176a87e8b9"}, + {file = "kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:88f17c5ffa8e9462fb79f62746428dd57b46eb931698e42e990ad63103f35e6c"}, + {file = "kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88a9ca9c710d598fd75ee5de59d5bda2684d9db36a9f50b6125eaea3969c2599"}, + {file = "kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f4d742cb7af1c28303a51b7a27aaee540e71bb8e24f68c736f6f2ffc82f2bf05"}, + {file = "kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e28c7fea2196bf4c2f8d46a0415c77a1c480cc0724722f23d7410ffe9842c407"}, + {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e968b84db54f9d42046cf154e02911e39c0435c9801681e3fc9ce8a3c4130278"}, + {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0c18ec74c0472de033e1bebb2911c3c310eef5649133dd0bedf2a169a1b269e5"}, + {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8f0ea6da6d393d8b2e187e6a5e3fb81f5862010a40c3945e2c6d12ae45cfb2ad"}, + {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:f106407dda69ae456dd1227966bf445b157ccc80ba0dff3802bb63f30b74e895"}, + {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:84ec80df401cfee1457063732d90022f93951944b5b58975d34ab56bb150dfb3"}, + {file = "kiwisolver-1.4.7-cp310-cp310-win32.whl", hash = "sha256:71bb308552200fb2c195e35ef05de12f0c878c07fc91c270eb3d6e41698c3bcc"}, + {file = "kiwisolver-1.4.7-cp310-cp310-win_amd64.whl", hash = "sha256:44756f9fd339de0fb6ee4f8c1696cfd19b2422e0d70b4cefc1cc7f1f64045a8c"}, + {file = "kiwisolver-1.4.7-cp310-cp310-win_arm64.whl", hash = "sha256:78a42513018c41c2ffd262eb676442315cbfe3c44eed82385c2ed043bc63210a"}, + {file = "kiwisolver-1.4.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d2b0e12a42fb4e72d509fc994713d099cbb15ebf1103545e8a45f14da2dfca54"}, + {file = "kiwisolver-1.4.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2a8781ac3edc42ea4b90bc23e7d37b665d89423818e26eb6df90698aa2287c95"}, + {file = "kiwisolver-1.4.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:46707a10836894b559e04b0fd143e343945c97fd170d69a2d26d640b4e297935"}, + {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef97b8df011141c9b0f6caf23b29379f87dd13183c978a30a3c546d2c47314cb"}, + {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ab58c12a2cd0fc769089e6d38466c46d7f76aced0a1f54c77652446733d2d02"}, + {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:803b8e1459341c1bb56d1c5c010406d5edec8a0713a0945851290a7930679b51"}, + {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9a9e8a507420fe35992ee9ecb302dab68550dedc0da9e2880dd88071c5fb052"}, + {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18077b53dc3bb490e330669a99920c5e6a496889ae8c63b58fbc57c3d7f33a18"}, + {file = "kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:6af936f79086a89b3680a280c47ea90b4df7047b5bdf3aa5c524bbedddb9e545"}, + {file = "kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:3abc5b19d24af4b77d1598a585b8a719beb8569a71568b66f4ebe1fb0449460b"}, + {file = "kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:933d4de052939d90afbe6e9d5273ae05fb836cc86c15b686edd4b3560cc0ee36"}, + {file = "kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:65e720d2ab2b53f1f72fb5da5fb477455905ce2c88aaa671ff0a447c2c80e8e3"}, + {file = "kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3bf1ed55088f214ba6427484c59553123fdd9b218a42bbc8c6496d6754b1e523"}, + {file = "kiwisolver-1.4.7-cp311-cp311-win32.whl", hash = "sha256:4c00336b9dd5ad96d0a558fd18a8b6f711b7449acce4c157e7343ba92dd0cf3d"}, + {file = "kiwisolver-1.4.7-cp311-cp311-win_amd64.whl", hash = "sha256:929e294c1ac1e9f615c62a4e4313ca1823ba37326c164ec720a803287c4c499b"}, + {file = "kiwisolver-1.4.7-cp311-cp311-win_arm64.whl", hash = "sha256:e33e8fbd440c917106b237ef1a2f1449dfbb9b6f6e1ce17c94cd6a1e0d438376"}, + {file = "kiwisolver-1.4.7-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:5360cc32706dab3931f738d3079652d20982511f7c0ac5711483e6eab08efff2"}, + {file = "kiwisolver-1.4.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:942216596dc64ddb25adb215c3c783215b23626f8d84e8eff8d6d45c3f29f75a"}, + {file = "kiwisolver-1.4.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:48b571ecd8bae15702e4f22d3ff6a0f13e54d3d00cd25216d5e7f658242065ee"}, + {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ad42ba922c67c5f219097b28fae965e10045ddf145d2928bfac2eb2e17673640"}, + {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:612a10bdae23404a72941a0fc8fa2660c6ea1217c4ce0dbcab8a8f6543ea9e7f"}, + {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e838bba3a3bac0fe06d849d29772eb1afb9745a59710762e4ba3f4cb8424483"}, + {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:22f499f6157236c19f4bbbd472fa55b063db77a16cd74d49afe28992dff8c258"}, + {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693902d433cf585133699972b6d7c42a8b9f8f826ebcaf0132ff55200afc599e"}, + {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4e77f2126c3e0b0d055f44513ed349038ac180371ed9b52fe96a32aa071a5107"}, + {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:657a05857bda581c3656bfc3b20e353c232e9193eb167766ad2dc58b56504948"}, + {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4bfa75a048c056a411f9705856abfc872558e33c055d80af6a380e3658766038"}, + {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:34ea1de54beef1c104422d210c47c7d2a4999bdecf42c7b5718fbe59a4cac383"}, + {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:90da3b5f694b85231cf93586dad5e90e2d71b9428f9aad96952c99055582f520"}, + {file = "kiwisolver-1.4.7-cp312-cp312-win32.whl", hash = "sha256:18e0cca3e008e17fe9b164b55735a325140a5a35faad8de92dd80265cd5eb80b"}, + {file = "kiwisolver-1.4.7-cp312-cp312-win_amd64.whl", hash = "sha256:58cb20602b18f86f83a5c87d3ee1c766a79c0d452f8def86d925e6c60fbf7bfb"}, + {file = "kiwisolver-1.4.7-cp312-cp312-win_arm64.whl", hash = 
"sha256:f5a8b53bdc0b3961f8b6125e198617c40aeed638b387913bf1ce78afb1b0be2a"}, + {file = "kiwisolver-1.4.7-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2e6039dcbe79a8e0f044f1c39db1986a1b8071051efba3ee4d74f5b365f5226e"}, + {file = "kiwisolver-1.4.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a1ecf0ac1c518487d9d23b1cd7139a6a65bc460cd101ab01f1be82ecf09794b6"}, + {file = "kiwisolver-1.4.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7ab9ccab2b5bd5702ab0803676a580fffa2aa178c2badc5557a84cc943fcf750"}, + {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f816dd2277f8d63d79f9c8473a79fe54047bc0467754962840782c575522224d"}, + {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf8bcc23ceb5a1b624572a1623b9f79d2c3b337c8c455405ef231933a10da379"}, + {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dea0bf229319828467d7fca8c7c189780aa9ff679c94539eed7532ebe33ed37c"}, + {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c06a4c7cf15ec739ce0e5971b26c93638730090add60e183530d70848ebdd34"}, + {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:913983ad2deb14e66d83c28b632fd35ba2b825031f2fa4ca29675e665dfecbe1"}, + {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5337ec7809bcd0f424c6b705ecf97941c46279cf5ed92311782c7c9c2026f07f"}, + {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4c26ed10c4f6fa6ddb329a5120ba3b6db349ca192ae211e882970bfc9d91420b"}, + {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c619b101e6de2222c1fcb0531e1b17bbffbe54294bfba43ea0d411d428618c27"}, + {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:073a36c8273647592ea332e816e75ef8da5c303236ec0167196793eb1e34657a"}, + {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:3ce6b2b0231bda412463e152fc18335ba32faf4e8c23a754ad50ffa70e4091ee"}, + {file = "kiwisolver-1.4.7-cp313-cp313-win32.whl", hash = "sha256:f4c9aee212bc89d4e13f58be11a56cc8036cabad119259d12ace14b34476fd07"}, + {file = "kiwisolver-1.4.7-cp313-cp313-win_amd64.whl", hash = "sha256:8a3ec5aa8e38fc4c8af308917ce12c536f1c88452ce554027e55b22cbbfbff76"}, + {file = "kiwisolver-1.4.7-cp313-cp313-win_arm64.whl", hash = "sha256:76c8094ac20ec259471ac53e774623eb62e6e1f56cd8690c67ce6ce4fcb05650"}, + {file = "kiwisolver-1.4.7-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5d5abf8f8ec1f4e22882273c423e16cae834c36856cac348cfbfa68e01c40f3a"}, + {file = "kiwisolver-1.4.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:aeb3531b196ef6f11776c21674dba836aeea9d5bd1cf630f869e3d90b16cfade"}, + {file = "kiwisolver-1.4.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b7d755065e4e866a8086c9bdada157133ff466476a2ad7861828e17b6026e22c"}, + {file = "kiwisolver-1.4.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08471d4d86cbaec61f86b217dd938a83d85e03785f51121e791a6e6689a3be95"}, + {file = "kiwisolver-1.4.7-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7bbfcb7165ce3d54a3dfbe731e470f65739c4c1f85bb1018ee912bae139e263b"}, + {file = "kiwisolver-1.4.7-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d34eb8494bea691a1a450141ebb5385e4b69d38bb8403b5146ad279f4b30fa3"}, + {file = 
"kiwisolver-1.4.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9242795d174daa40105c1d86aba618e8eab7bf96ba8c3ee614da8302a9f95503"}, + {file = "kiwisolver-1.4.7-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a0f64a48bb81af7450e641e3fe0b0394d7381e342805479178b3d335d60ca7cf"}, + {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8e045731a5416357638d1700927529e2b8ab304811671f665b225f8bf8d8f933"}, + {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:4322872d5772cae7369f8351da1edf255a604ea7087fe295411397d0cfd9655e"}, + {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:e1631290ee9271dffe3062d2634c3ecac02c83890ada077d225e081aca8aab89"}, + {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:edcfc407e4eb17e037bca59be0e85a2031a2ac87e4fed26d3e9df88b4165f92d"}, + {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:4d05d81ecb47d11e7f8932bd8b61b720bf0b41199358f3f5e36d38e28f0532c5"}, + {file = "kiwisolver-1.4.7-cp38-cp38-win32.whl", hash = "sha256:b38ac83d5f04b15e515fd86f312479d950d05ce2368d5413d46c088dda7de90a"}, + {file = "kiwisolver-1.4.7-cp38-cp38-win_amd64.whl", hash = "sha256:d83db7cde68459fc803052a55ace60bea2bae361fc3b7a6d5da07e11954e4b09"}, + {file = "kiwisolver-1.4.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3f9362ecfca44c863569d3d3c033dbe8ba452ff8eed6f6b5806382741a1334bd"}, + {file = "kiwisolver-1.4.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8df2eb9b2bac43ef8b082e06f750350fbbaf2887534a5be97f6cf07b19d9583"}, + {file = "kiwisolver-1.4.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f32d6edbc638cde7652bd690c3e728b25332acbadd7cad670cc4a02558d9c417"}, + {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:e2e6c39bd7b9372b0be21456caab138e8e69cc0fc1190a9dfa92bd45a1e6e904"}, + {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:dda56c24d869b1193fcc763f1284b9126550eaf84b88bbc7256e15028f19188a"}, + {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79849239c39b5e1fd906556c474d9b0439ea6792b637511f3fe3a41158d89ca8"}, + {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5e3bc157fed2a4c02ec468de4ecd12a6e22818d4f09cde2c31ee3226ffbefab2"}, + {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3da53da805b71e41053dc670f9a820d1157aae77b6b944e08024d17bcd51ef88"}, + {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8705f17dfeb43139a692298cb6637ee2e59c0194538153e83e9ee0c75c2eddde"}, + {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:82a5c2f4b87c26bb1a0ef3d16b5c4753434633b83d365cc0ddf2770c93829e3c"}, + {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce8be0466f4c0d585cdb6c1e2ed07232221df101a4c6f28821d2aa754ca2d9e2"}, + {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:409afdfe1e2e90e6ee7fc896f3df9a7fec8e793e58bfa0d052c8a82f99c37abb"}, + {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5b9c3f4ee0b9a439d2415012bd1b1cc2df59e4d6a9939f4d669241d30b414327"}, + {file = "kiwisolver-1.4.7-cp39-cp39-win32.whl", hash = "sha256:a79ae34384df2b615eefca647a2873842ac3b596418032bef9a7283675962644"}, + {file = "kiwisolver-1.4.7-cp39-cp39-win_amd64.whl", hash = 
"sha256:cf0438b42121a66a3a667de17e779330fc0f20b0d97d59d2f2121e182b0505e4"}, + {file = "kiwisolver-1.4.7-cp39-cp39-win_arm64.whl", hash = "sha256:764202cc7e70f767dab49e8df52c7455e8de0df5d858fa801a11aa0d882ccf3f"}, + {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:94252291e3fe68001b1dd747b4c0b3be12582839b95ad4d1b641924d68fd4643"}, + {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5b7dfa3b546da08a9f622bb6becdb14b3e24aaa30adba66749d38f3cc7ea9706"}, + {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bd3de6481f4ed8b734da5df134cd5a6a64fe32124fe83dde1e5b5f29fe30b1e6"}, + {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a91b5f9f1205845d488c928e8570dcb62b893372f63b8b6e98b863ebd2368ff2"}, + {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40fa14dbd66b8b8f470d5fc79c089a66185619d31645f9b0773b88b19f7223c4"}, + {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:eb542fe7933aa09d8d8f9d9097ef37532a7df6497819d16efe4359890a2f417a"}, + {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:bfa1acfa0c54932d5607e19a2c24646fb4c1ae2694437789129cf099789a3b00"}, + {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:eee3ea935c3d227d49b4eb85660ff631556841f6e567f0f7bda972df6c2c9935"}, + {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f3160309af4396e0ed04db259c3ccbfdc3621b5559b5453075e5de555e1f3a1b"}, + {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a17f6a29cf8935e587cc8a4dbfc8368c55edc645283db0ce9801016f83526c2d"}, + {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:10849fb2c1ecbfae45a693c070e0320a91b35dd4bcf58172c023b994283a124d"}, + {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:ac542bf38a8a4be2dc6b15248d36315ccc65f0743f7b1a76688ffb6b5129a5c2"}, + {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8b01aac285f91ca889c800042c35ad3b239e704b150cfd3382adfc9dcc780e39"}, + {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:48be928f59a1f5c8207154f935334d374e79f2b5d212826307d072595ad76a2e"}, + {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f37cfe618a117e50d8c240555331160d73d0411422b59b5ee217843d7b693608"}, + {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:599b5c873c63a1f6ed7eead644a8a380cfbdf5db91dcb6f85707aaab213b1674"}, + {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:801fa7802e5cfabe3ab0c81a34c323a319b097dfb5004be950482d882f3d7225"}, + {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:0c6c43471bc764fad4bc99c5c2d6d16a676b1abf844ca7c8702bdae92df01ee0"}, + {file = "kiwisolver-1.4.7.tar.gz", hash = "sha256:9893ff81bd7107f7b685d3017cc6583daadb4fc26e4a888350df530e41980a60"}, +] + +[[package]] +name = "langchain" +version = "0.1.16" +description = "Building applications with LLMs through composability" +optional = true +python-versions = "<4.0,>=3.8.1" +files = [ + 
{file = "langchain-0.1.16-py3-none-any.whl", hash = "sha256:bc074cc5e51fad79b9ead1572fc3161918d0f614a6c8f0460543d505ad249ac7"}, + {file = "langchain-0.1.16.tar.gz", hash = "sha256:b6bce78f8c071baa898884accfff15c3d81da2f0dd86c20e2f4c80b41463f49f"}, +] + +[package.dependencies] +aiohttp = ">=3.8.3,<4.0.0" +async-timeout = {version = ">=4.0.0,<5.0.0", markers = "python_version < \"3.11\""} +dataclasses-json = ">=0.5.7,<0.7" +jsonpatch = ">=1.33,<2.0" +langchain-community = ">=0.0.32,<0.1" +langchain-core = ">=0.1.42,<0.2.0" +langchain-text-splitters = ">=0.0.1,<0.1" +langsmith = ">=0.1.17,<0.2.0" +numpy = ">=1,<2" +pydantic = ">=1,<3" +PyYAML = ">=5.3" +requests = ">=2,<3" +SQLAlchemy = ">=1.4,<3" +tenacity = ">=8.1.0,<9.0.0" + +[package.extras] +azure = ["azure-ai-formrecognizer (>=3.2.1,<4.0.0)", "azure-ai-textanalytics (>=5.3.0,<6.0.0)", "azure-cognitiveservices-speech (>=1.28.0,<2.0.0)", "azure-core (>=1.26.4,<2.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "azure-search-documents (==11.4.0b8)", "openai (<2)"] +clarifai = ["clarifai (>=9.1.0)"] +cli = ["typer (>=0.9.0,<0.10.0)"] +cohere = ["cohere (>=4,<6)"] +docarray = ["docarray[hnswlib] (>=0.32.0,<0.33.0)"] +embeddings = ["sentence-transformers (>=2,<3)"] +extended-testing = ["aiosqlite (>=0.19.0,<0.20.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic (>=0.3.11,<0.4.0)", "arxiv (>=1.4,<2.0)", "assemblyai (>=0.17.0,<0.18.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "cassio (>=0.1.0,<0.2.0)", "chardet (>=5.1.0,<6.0.0)", "cohere (>=4,<6)", "couchbase (>=4.1.9,<5.0.0)", "dashvector (>=1.0.1,<2.0.0)", "databricks-vectorsearch (>=0.21,<0.22)", "datasets (>=2.15.0,<3.0.0)", "dgml-utils (>=0.3.0,<0.4.0)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "feedparser (>=6.0.10,<7.0.0)", "fireworks-ai (>=0.9.0,<0.10.0)", "geopandas (>=0.13.1,<0.14.0)", "gitpython (>=3.1.32,<4.0.0)", "google-cloud-documentai (>=2.20.1,<3.0.0)", "gql (>=3.4.1,<4.0.0)", "hologres-vector (>=0.0.6,<0.0.7)", "html2text (>=2020.1.16,<2021.0.0)", "javelin-sdk (>=0.1.8,<0.2.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "jsonschema (>1)", "langchain-openai (>=0.0.2,<0.1)", "lxml (>=4.9.3,<6.0)", "markdownify (>=0.11.6,<0.12.0)", "motor (>=3.3.1,<4.0.0)", "msal (>=1.25.0,<2.0.0)", "mwparserfromhell (>=0.6.4,<0.7.0)", "mwxml (>=0.3.3,<0.4.0)", "newspaper3k (>=0.2.8,<0.3.0)", "numexpr (>=2.8.6,<3.0.0)", "openai (<2)", "openapi-pydantic (>=0.3.2,<0.4.0)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "praw (>=7.7.1,<8.0.0)", "psychicapi (>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "rapidfuzz (>=3.1.1,<4.0.0)", "rapidocr-onnxruntime (>=1.3.2,<2.0.0)", "rdflib (==7.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "rspace_client (>=2.5.0,<3.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "sqlite-vss (>=0.1.2,<0.2.0)", "streamlit (>=1.18.0,<2.0.0)", "sympy (>=1.12,<2.0)", "telethon (>=1.28.5,<2.0.0)", "timescale-vector (>=0.0.1,<0.0.2)", "tqdm (>=4.48.0)", "upstash-redis (>=0.15.0,<0.16.0)", "xata (>=1.0.0a7,<2.0.0)", "xmltodict (>=0.13.0,<0.14.0)"] +javascript = ["esprima (>=4.0.1,<5.0.0)"] +llms = ["clarifai (>=9.1.0)", "cohere (>=4,<6)", "huggingface_hub (>=0,<1)", "manifest-ml (>=0.0.1,<0.0.2)", "nlpcloud (>=1,<2)", "openai (<2)", "openlm (>=0.0.5,<0.0.6)", "torch (>=1,<3)", "transformers 
(>=4,<5)"] +openai = ["openai (<2)", "tiktoken (>=0.3.2,<0.6.0)"] +qdrant = ["qdrant-client (>=1.3.1,<2.0.0)"] +text-helpers = ["chardet (>=5.1.0,<6.0.0)"] + +[[package]] +name = "langchain-community" +version = "0.0.32" +description = "Community contributed LangChain integrations." +optional = true +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "langchain_community-0.0.32-py3-none-any.whl", hash = "sha256:406977009999952d0705de3806de2b4867e9bb8eda8ca154a59c7a8ed58da38d"}, + {file = "langchain_community-0.0.32.tar.gz", hash = "sha256:1510217d646c8380f54e9850351f6d2a0b0dd73c501b666c6f4b40baa8160b29"}, +] + +[package.dependencies] +aiohttp = ">=3.8.3,<4.0.0" +dataclasses-json = ">=0.5.7,<0.7" +langchain-core = ">=0.1.41,<0.2.0" +langsmith = ">=0.1.0,<0.2.0" +numpy = ">=1,<2" +PyYAML = ">=5.3" +requests = ">=2,<3" +SQLAlchemy = ">=1.4,<3" +tenacity = ">=8.1.0,<9.0.0" + +[package.extras] +cli = ["typer (>=0.9.0,<0.10.0)"] +extended-testing = ["aiosqlite (>=0.19.0,<0.20.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic (>=0.3.11,<0.4.0)", "arxiv (>=1.4,<2.0)", "assemblyai (>=0.17.0,<0.18.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "azure-ai-documentintelligence (>=1.0.0b1,<2.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "cassio (>=0.1.0,<0.2.0)", "chardet (>=5.1.0,<6.0.0)", "cloudpickle (>=2.0.0)", "cohere (>=4,<5)", "databricks-vectorsearch (>=0.21,<0.22)", "datasets (>=2.15.0,<3.0.0)", "dgml-utils (>=0.3.0,<0.4.0)", "elasticsearch (>=8.12.0,<9.0.0)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "feedparser (>=6.0.10,<7.0.0)", "fireworks-ai (>=0.9.0,<0.10.0)", "friendli-client (>=1.2.4,<2.0.0)", "geopandas (>=0.13.1,<0.14.0)", "gitpython (>=3.1.32,<4.0.0)", "google-cloud-documentai (>=2.20.1,<3.0.0)", "gql (>=3.4.1,<4.0.0)", "gradientai (>=1.4.0,<2.0.0)", "hdbcli (>=2.19.21,<3.0.0)", "hologres-vector (>=0.0.6,<0.0.7)", "html2text (>=2020.1.16,<2021.0.0)", "httpx (>=0.24.1,<0.25.0)", "httpx-sse (>=0.4.0,<0.5.0)", "javelin-sdk (>=0.1.8,<0.2.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "jsonschema (>1)", "lxml (>=4.9.3,<6.0)", "markdownify (>=0.11.6,<0.12.0)", "motor (>=3.3.1,<4.0.0)", "msal (>=1.25.0,<2.0.0)", "mwparserfromhell (>=0.6.4,<0.7.0)", "mwxml (>=0.3.3,<0.4.0)", "newspaper3k (>=0.2.8,<0.3.0)", "numexpr (>=2.8.6,<3.0.0)", "nvidia-riva-client (>=2.14.0,<3.0.0)", "oci (>=2.119.1,<3.0.0)", "openai (<2)", "openapi-pydantic (>=0.3.2,<0.4.0)", "oracle-ads (>=2.9.1,<3.0.0)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "praw (>=7.7.1,<8.0.0)", "premai (>=0.3.25,<0.4.0)", "psychicapi (>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pyjwt (>=2.8.0,<3.0.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "rapidfuzz (>=3.1.1,<4.0.0)", "rapidocr-onnxruntime (>=1.3.2,<2.0.0)", "rdflib (==7.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "rspace_client (>=2.5.0,<3.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "sqlite-vss (>=0.1.2,<0.2.0)", "streamlit (>=1.18.0,<2.0.0)", "sympy (>=1.12,<2.0)", "telethon (>=1.28.5,<2.0.0)", "tidb-vector (>=0.0.3,<1.0.0)", "timescale-vector (>=0.0.1,<0.0.2)", "tqdm (>=4.48.0)", "tree-sitter (>=0.20.2,<0.21.0)", "tree-sitter-languages (>=1.8.0,<2.0.0)", "upstash-redis (>=0.15.0,<0.16.0)", "vdms (>=0.0.20,<0.0.21)", "xata (>=1.0.0a7,<2.0.0)", "xmltodict (>=0.13.0,<0.14.0)"] + +[[package]] +name = "langchain-core" +version = "0.1.42" +description = "Building applications with LLMs through 
composability" +optional = true +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "langchain_core-0.1.42-py3-none-any.whl", hash = "sha256:c5653ffa08a44f740295c157a24c0def4a753333f6a2c41f76bf431cd00be8b5"}, + {file = "langchain_core-0.1.42.tar.gz", hash = "sha256:40751bf60ea5d8e2b2efe65290db434717ee3834870c002e40e2811f09d814e6"}, +] + +[package.dependencies] +jsonpatch = ">=1.33,<2.0" +langsmith = ">=0.1.0,<0.2.0" +packaging = ">=23.2,<24.0" +pydantic = ">=1,<3" +PyYAML = ">=5.3" +tenacity = ">=8.1.0,<9.0.0" + +[package.extras] +extended-testing = ["jinja2 (>=3,<4)"] + +[[package]] +name = "langchain-text-splitters" +version = "0.0.2" +description = "LangChain text splitting utilities" +optional = true +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "langchain_text_splitters-0.0.2-py3-none-any.whl", hash = "sha256:13887f32705862c1e1454213cb7834a63aae57c26fcd80346703a1d09c46168d"}, + {file = "langchain_text_splitters-0.0.2.tar.gz", hash = "sha256:ac8927dc0ba08eba702f6961c9ed7df7cead8de19a9f7101ab2b5ea34201b3c1"}, +] + +[package.dependencies] +langchain-core = ">=0.1.28,<0.3" + +[package.extras] +extended-testing = ["beautifulsoup4 (>=4.12.3,<5.0.0)", "lxml (>=4.9.3,<6.0)"] + +[[package]] +name = "langdetect" +version = "1.0.9" +description = "Language detection library ported from Google's language-detection." +optional = true +python-versions = "*" +files = [ + {file = "langdetect-1.0.9-py2-none-any.whl", hash = "sha256:7cbc0746252f19e76f77c0b1690aadf01963be835ef0cd4b56dddf2a8f1dfc2a"}, + {file = "langdetect-1.0.9.tar.gz", hash = "sha256:cbc1fef89f8d062739774bd51eda3da3274006b3661d199c2655f6b3f6d605a0"}, +] + +[package.dependencies] +six = "*" + +[[package]] +name = "langsmith" +version = "0.1.142" +description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." +optional = true +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "langsmith-0.1.142-py3-none-any.whl", hash = "sha256:f639ca23c9a0bb77af5fb881679b2f66ff1f21f19d0bebf4e51375e7585a8b38"}, + {file = "langsmith-0.1.142.tar.gz", hash = "sha256:f8a84d100f3052233ff0a1d66ae14c5dfc20b7e41a1601de011384f16ee6cb82"}, +] + +[package.dependencies] +httpx = ">=0.23.0,<1" +orjson = ">=3.9.14,<4.0.0" +pydantic = [ + {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""}, + {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, +] +requests = ">=2,<3" +requests-toolbelt = ">=1.0.0,<2.0.0" + +[[package]] +name = "linkify-it-py" +version = "2.0.3" +description = "Links recognition library with FULL unicode support." +optional = false +python-versions = ">=3.7" +files = [ + {file = "linkify-it-py-2.0.3.tar.gz", hash = "sha256:68cda27e162e9215c17d786649d1da0021a451bdc436ef9e0fa0ba5234b9b048"}, + {file = "linkify_it_py-2.0.3-py3-none-any.whl", hash = "sha256:6bcbc417b0ac14323382aef5c5192c0075bf8a9d6b41820a2b66371eac6b6d79"}, +] + +[package.dependencies] +uc-micro-py = "*" + +[package.extras] +benchmark = ["pytest", "pytest-benchmark"] +dev = ["black", "flake8", "isort", "pre-commit", "pyproject-flake8"] +doc = ["myst-parser", "sphinx", "sphinx-book-theme"] +test = ["coverage", "pytest", "pytest-cov"] + +[[package]] +name = "lxml" +version = "5.3.0" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
+optional = true +python-versions = ">=3.6" +files = [ + {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656"}, + {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ae5fe5c4b525aa82b8076c1a59d642c17b6e8739ecf852522c6321852178119d"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:501d0d7e26b4d261fca8132854d845e4988097611ba2531408ec91cf3fd9d20a"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb66442c2546446944437df74379e9cf9e9db353e61301d1a0e26482f43f0dd8"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e41506fec7a7f9405b14aa2d5c8abbb4dbbd09d88f9496958b6d00cb4d45330"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f7d4a670107d75dfe5ad080bed6c341d18c4442f9378c9f58e5851e86eb79965"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41ce1f1e2c7755abfc7e759dc34d7d05fd221723ff822947132dc934d122fe22"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:44264ecae91b30e5633013fb66f6ddd05c006d3e0e884f75ce0b4755b3e3847b"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_ppc64le.whl", hash = "sha256:3c174dc350d3ec52deb77f2faf05c439331d6ed5e702fc247ccb4e6b62d884b7"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_s390x.whl", hash = "sha256:2dfab5fa6a28a0b60a20638dc48e6343c02ea9933e3279ccb132f555a62323d8"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b1c8c20847b9f34e98080da785bb2336ea982e7f913eed5809e5a3c872900f32"}, + {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2c86bf781b12ba417f64f3422cfc302523ac9cd1d8ae8c0f92a1c66e56ef2e86"}, + {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:c162b216070f280fa7da844531169be0baf9ccb17263cf5a8bf876fcd3117fa5"}, + {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:36aef61a1678cb778097b4a6eeae96a69875d51d1e8f4d4b491ab3cfb54b5a03"}, + {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f65e5120863c2b266dbcc927b306c5b78e502c71edf3295dfcb9501ec96e5fc7"}, + {file = "lxml-5.3.0-cp310-cp310-win32.whl", hash = "sha256:ef0c1fe22171dd7c7c27147f2e9c3e86f8bdf473fed75f16b0c2e84a5030ce80"}, + {file = "lxml-5.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:052d99051e77a4f3e8482c65014cf6372e61b0a6f4fe9edb98503bb5364cfee3"}, + {file = "lxml-5.3.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:74bcb423462233bc5d6066e4e98b0264e7c1bed7541fff2f4e34fe6b21563c8b"}, + {file = "lxml-5.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a3d819eb6f9b8677f57f9664265d0a10dd6551d227afb4af2b9cd7bdc2ccbf18"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5b8f5db71b28b8c404956ddf79575ea77aa8b1538e8b2ef9ec877945b3f46442"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c3406b63232fc7e9b8783ab0b765d7c59e7c59ff96759d8ef9632fca27c7ee4"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ecdd78ab768f844c7a1d4a03595038c166b609f6395e25af9b0f3f26ae1230f"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:168f2dfcfdedf611eb285efac1516c8454c8c99caf271dccda8943576b67552e"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa617107a410245b8660028a7483b68e7914304a6d4882b5ff3d2d3eb5948d8c"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:69959bd3167b993e6e710b99051265654133a98f20cec1d9b493b931942e9c16"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_ppc64le.whl", hash = "sha256:bd96517ef76c8654446fc3db9242d019a1bb5fe8b751ba414765d59f99210b79"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_s390x.whl", hash = "sha256:ab6dd83b970dc97c2d10bc71aa925b84788c7c05de30241b9e96f9b6d9ea3080"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:eec1bb8cdbba2925bedc887bc0609a80e599c75b12d87ae42ac23fd199445654"}, + {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6a7095eeec6f89111d03dabfe5883a1fd54da319c94e0fb104ee8f23616b572d"}, + {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:6f651ebd0b21ec65dfca93aa629610a0dbc13dbc13554f19b0113da2e61a4763"}, + {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:f422a209d2455c56849442ae42f25dbaaba1c6c3f501d58761c619c7836642ec"}, + {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:62f7fdb0d1ed2065451f086519865b4c90aa19aed51081979ecd05a21eb4d1be"}, + {file = "lxml-5.3.0-cp311-cp311-win32.whl", hash = "sha256:c6379f35350b655fd817cd0d6cbeef7f265f3ae5fedb1caae2eb442bbeae9ab9"}, + {file = "lxml-5.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c52100e2c2dbb0649b90467935c4b0de5528833c76a35ea1a2691ec9f1ee7a1"}, + {file = "lxml-5.3.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:e99f5507401436fdcc85036a2e7dc2e28d962550afe1cbfc07c40e454256a859"}, + {file = "lxml-5.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:384aacddf2e5813a36495233b64cb96b1949da72bef933918ba5c84e06af8f0e"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:874a216bf6afaf97c263b56371434e47e2c652d215788396f60477540298218f"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65ab5685d56914b9a2a34d67dd5488b83213d680b0c5d10b47f81da5a16b0b0e"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aac0bbd3e8dd2d9c45ceb82249e8bdd3ac99131a32b4d35c8af3cc9db1657179"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b369d3db3c22ed14c75ccd5af429086f166a19627e84a8fdade3f8f31426e52a"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c24037349665434f375645fa9d1f5304800cec574d0310f618490c871fd902b3"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:62d172f358f33a26d6b41b28c170c63886742f5b6772a42b59b4f0fa10526cb1"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_ppc64le.whl", hash = "sha256:c1f794c02903c2824fccce5b20c339a1a14b114e83b306ff11b597c5f71a1c8d"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_s390x.whl", hash = "sha256:5d6a6972b93c426ace71e0be9a6f4b2cfae9b1baed2eed2006076a746692288c"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:3879cc6ce938ff4eb4900d901ed63555c778731a96365e53fadb36437a131a99"}, + {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:74068c601baff6ff021c70f0935b0c7bc528baa8ea210c202e03757c68c5a4ff"}, + {file = 
"lxml-5.3.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ecd4ad8453ac17bc7ba3868371bffb46f628161ad0eefbd0a855d2c8c32dd81a"}, + {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7e2f58095acc211eb9d8b5771bf04df9ff37d6b87618d1cbf85f92399c98dae8"}, + {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e63601ad5cd8f860aa99d109889b5ac34de571c7ee902d6812d5d9ddcc77fa7d"}, + {file = "lxml-5.3.0-cp312-cp312-win32.whl", hash = "sha256:17e8d968d04a37c50ad9c456a286b525d78c4a1c15dd53aa46c1d8e06bf6fa30"}, + {file = "lxml-5.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:c1a69e58a6bb2de65902051d57fde951febad631a20a64572677a1052690482f"}, + {file = "lxml-5.3.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c72e9563347c7395910de6a3100a4840a75a6f60e05af5e58566868d5eb2d6a"}, + {file = "lxml-5.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e92ce66cd919d18d14b3856906a61d3f6b6a8500e0794142338da644260595cd"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d04f064bebdfef9240478f7a779e8c5dc32b8b7b0b2fc6a62e39b928d428e51"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c2fb570d7823c2bbaf8b419ba6e5662137f8166e364a8b2b91051a1fb40ab8b"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0c120f43553ec759f8de1fee2f4794452b0946773299d44c36bfe18e83caf002"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:562e7494778a69086f0312ec9689f6b6ac1c6b65670ed7d0267e49f57ffa08c4"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:423b121f7e6fa514ba0c7918e56955a1d4470ed35faa03e3d9f0e3baa4c7e492"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c00f323cc00576df6165cc9d21a4c21285fa6b9989c5c39830c3903dc4303ef3"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_ppc64le.whl", hash = "sha256:1fdc9fae8dd4c763e8a31e7630afef517eab9f5d5d31a278df087f307bf601f4"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_s390x.whl", hash = "sha256:658f2aa69d31e09699705949b5fc4719cbecbd4a97f9656a232e7d6c7be1a367"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:1473427aff3d66a3fa2199004c3e601e6c4500ab86696edffdbc84954c72d832"}, + {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a87de7dd873bf9a792bf1e58b1c3887b9264036629a5bf2d2e6579fe8e73edff"}, + {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:0d7b36afa46c97875303a94e8f3ad932bf78bace9e18e603f2085b652422edcd"}, + {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:cf120cce539453ae086eacc0130a324e7026113510efa83ab42ef3fcfccac7fb"}, + {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:df5c7333167b9674aa8ae1d4008fa4bc17a313cc490b2cca27838bbdcc6bb15b"}, + {file = "lxml-5.3.0-cp313-cp313-win32.whl", hash = "sha256:c802e1c2ed9f0c06a65bc4ed0189d000ada8049312cfeab6ca635e39c9608957"}, + {file = "lxml-5.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:406246b96d552e0503e17a1006fd27edac678b3fcc9f1be71a2f94b4ff61528d"}, + {file = "lxml-5.3.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:8f0de2d390af441fe8b2c12626d103540b5d850d585b18fcada58d972b74a74e"}, + {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:1afe0a8c353746e610bd9031a630a95bcfb1a720684c3f2b36c4710a0a96528f"}, + {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56b9861a71575f5795bde89256e7467ece3d339c9b43141dbdd54544566b3b94"}, + {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:9fb81d2824dff4f2e297a276297e9031f46d2682cafc484f49de182aa5e5df99"}, + {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:2c226a06ecb8cdef28845ae976da407917542c5e6e75dcac7cc33eb04aaeb237"}, + {file = "lxml-5.3.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:7d3d1ca42870cdb6d0d29939630dbe48fa511c203724820fc0fd507b2fb46577"}, + {file = "lxml-5.3.0-cp36-cp36m-win32.whl", hash = "sha256:094cb601ba9f55296774c2d57ad68730daa0b13dc260e1f941b4d13678239e70"}, + {file = "lxml-5.3.0-cp36-cp36m-win_amd64.whl", hash = "sha256:eafa2c8658f4e560b098fe9fc54539f86528651f61849b22111a9b107d18910c"}, + {file = "lxml-5.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cb83f8a875b3d9b458cada4f880fa498646874ba4011dc974e071a0a84a1b033"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25f1b69d41656b05885aa185f5fdf822cb01a586d1b32739633679699f220391"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23e0553b8055600b3bf4a00b255ec5c92e1e4aebf8c2c09334f8368e8bd174d6"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ada35dd21dc6c039259596b358caab6b13f4db4d4a7f8665764d616daf9cc1d"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:81b4e48da4c69313192d8c8d4311e5d818b8be1afe68ee20f6385d0e96fc9512"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:2bc9fd5ca4729af796f9f59cd8ff160fe06a474da40aca03fcc79655ddee1a8b"}, + {file = "lxml-5.3.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:07da23d7ee08577760f0a71d67a861019103e4812c87e2fab26b039054594cc5"}, + {file = "lxml-5.3.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:ea2e2f6f801696ad7de8aec061044d6c8c0dd4037608c7cab38a9a4d316bfb11"}, + {file = "lxml-5.3.0-cp37-cp37m-win32.whl", hash = "sha256:5c54afdcbb0182d06836cc3d1be921e540be3ebdf8b8a51ee3ef987537455f84"}, + {file = "lxml-5.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:f2901429da1e645ce548bf9171784c0f74f0718c3f6150ce166be39e4dd66c3e"}, + {file = "lxml-5.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c56a1d43b2f9ee4786e4658c7903f05da35b923fb53c11025712562d5cc02753"}, + {file = "lxml-5.3.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ee8c39582d2652dcd516d1b879451500f8db3fe3607ce45d7c5957ab2596040"}, + {file = "lxml-5.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fdf3a3059611f7585a78ee10399a15566356116a4288380921a4b598d807a22"}, + {file = "lxml-5.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:146173654d79eb1fc97498b4280c1d3e1e5d58c398fa530905c9ea50ea849b22"}, + {file = "lxml-5.3.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0a7056921edbdd7560746f4221dca89bb7a3fe457d3d74267995253f46343f15"}, + {file = "lxml-5.3.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:9e4b47ac0f5e749cfc618efdf4726269441014ae1d5583e047b452a32e221920"}, + {file = "lxml-5.3.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = 
"sha256:f914c03e6a31deb632e2daa881fe198461f4d06e57ac3d0e05bbcab8eae01945"}, + {file = "lxml-5.3.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:213261f168c5e1d9b7535a67e68b1f59f92398dd17a56d934550837143f79c42"}, + {file = "lxml-5.3.0-cp38-cp38-win32.whl", hash = "sha256:218c1b2e17a710e363855594230f44060e2025b05c80d1f0661258142b2add2e"}, + {file = "lxml-5.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:315f9542011b2c4e1d280e4a20ddcca1761993dda3afc7a73b01235f8641e903"}, + {file = "lxml-5.3.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1ffc23010330c2ab67fac02781df60998ca8fe759e8efde6f8b756a20599c5de"}, + {file = "lxml-5.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2b3778cb38212f52fac9fe913017deea2fdf4eb1a4f8e4cfc6b009a13a6d3fcc"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b0c7a688944891086ba192e21c5229dea54382f4836a209ff8d0a660fac06be"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:747a3d3e98e24597981ca0be0fd922aebd471fa99d0043a3842d00cdcad7ad6a"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86a6b24b19eaebc448dc56b87c4865527855145d851f9fc3891673ff97950540"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b11a5d918a6216e521c715b02749240fb07ae5a1fefd4b7bf12f833bc8b4fe70"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68b87753c784d6acb8a25b05cb526c3406913c9d988d51f80adecc2b0775d6aa"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:109fa6fede314cc50eed29e6e56c540075e63d922455346f11e4d7a036d2b8cf"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_ppc64le.whl", hash = "sha256:02ced472497b8362c8e902ade23e3300479f4f43e45f4105c85ef43b8db85229"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_s390x.whl", hash = "sha256:6b038cc86b285e4f9fea2ba5ee76e89f21ed1ea898e287dc277a25884f3a7dfe"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:7437237c6a66b7ca341e868cda48be24b8701862757426852c9b3186de1da8a2"}, + {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:7f41026c1d64043a36fda21d64c5026762d53a77043e73e94b71f0521939cc71"}, + {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:482c2f67761868f0108b1743098640fbb2a28a8e15bf3f47ada9fa59d9fe08c3"}, + {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:1483fd3358963cc5c1c9b122c80606a3a79ee0875bcac0204149fa09d6ff2727"}, + {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2dec2d1130a9cda5b904696cec33b2cfb451304ba9081eeda7f90f724097300a"}, + {file = "lxml-5.3.0-cp39-cp39-win32.whl", hash = "sha256:a0eabd0a81625049c5df745209dc7fcef6e2aea7793e5f003ba363610aa0a3ff"}, + {file = "lxml-5.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:89e043f1d9d341c52bf2af6d02e6adde62e0a46e6755d5eb60dc6e4f0b8aeca2"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7b1cd427cb0d5f7393c31b7496419da594fe600e6fdc4b105a54f82405e6626c"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51806cfe0279e06ed8500ce19479d757db42a30fd509940b1701be9c86a5ff9a"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee70d08fd60c9565ba8190f41a46a54096afa0eeb8f76bd66f2c25d3b1b83005"}, + {file = 
"lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:8dc2c0395bea8254d8daebc76dcf8eb3a95ec2a46fa6fae5eaccee366bfe02ce"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6ba0d3dcac281aad8a0e5b14c7ed6f9fa89c8612b47939fc94f80b16e2e9bc83"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:94d6c3782907b5e40e21cadf94b13b0842ac421192f26b84c45f13f3c9d5dc27"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c300306673aa0f3ed5ed9372b21867690a17dba38c68c44b287437c362ce486b"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78d9b952e07aed35fe2e1a7ad26e929595412db48535921c5013edc8aa4a35ce"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:01220dca0d066d1349bd6a1726856a78f7929f3878f7e2ee83c296c69495309e"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:2d9b8d9177afaef80c53c0a9e30fa252ff3036fb1c6494d427c066a4ce6a282f"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:20094fc3f21ea0a8669dc4c61ed7fa8263bd37d97d93b90f28fc613371e7a875"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ace2c2326a319a0bb8a8b0e5b570c764962e95818de9f259ce814ee666603f19"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92e67a0be1639c251d21e35fe74df6bcc40cba445c2cda7c4a967656733249e2"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd5350b55f9fecddc51385463a4f67a5da829bc741e38cf689f38ec9023f54ab"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c1fefd7e3d00921c44dc9ca80a775af49698bbfd92ea84498e56acffd4c5469"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:71a8dd38fbd2f2319136d4ae855a7078c69c9a38ae06e0c17c73fd70fc6caad8"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:97acf1e1fd66ab53dacd2c35b319d7e548380c2e9e8c54525c6e76d21b1ae3b1"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:68934b242c51eb02907c5b81d138cb977b2129a0a75a8f8b60b01cb8586c7b21"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b710bc2b8292966b23a6a0121f7a6c51d45d2347edcc75f016ac123b8054d3f2"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18feb4b93302091b1541221196a2155aa296c363fd233814fa11e181adebc52f"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:3eb44520c4724c2e1a57c0af33a379eee41792595023f367ba3952a2d96c2aab"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:609251a0ca4770e5a8768ff902aa02bf636339c5a93f9349b48eb1f606f7f3e9"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:516f491c834eb320d6c843156440fe7fc0d50b33e44387fcec5b02f0bc118a4c"}, + {file = "lxml-5.3.0.tar.gz", hash = "sha256:4e109ca30d1edec1ac60cdbe341905dc3b8f55b16855e03a54aaf59e51ec8c6f"}, +] + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html-clean = ["lxml-html-clean"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=3.0.11)"] + +[[package]] +name = "markdown" 
+version = "3.7" +description = "Python implementation of John Gruber's Markdown." +optional = true +python-versions = ">=3.8" +files = [ + {file = "Markdown-3.7-py3-none-any.whl", hash = "sha256:7eb6df5690b81a1d7942992c97fad2938e956e79df20cbc6186e9c3a77b1c803"}, + {file = "markdown-3.7.tar.gz", hash = "sha256:2ae2471477cfd02dbbf038d5d9bc226d40def84b4fe2986e49b59b6b472bbed2"}, +] + +[package.extras] +docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"] +testing = ["coverage", "pyyaml"] + +[[package]] +name = "markdown-it-py" +version = "3.0.0" +description = "Python port of markdown-it. Markdown parsing, done right!" +optional = false +python-versions = ">=3.8" +files = [ + {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, + {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, +] + +[package.dependencies] +linkify-it-py = {version = ">=1,<3", optional = true, markers = "extra == \"linkify\""} +mdit-py-plugins = {version = "*", optional = true, markers = "extra == \"plugins\""} +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + +[[package]] +name = "markupsafe" +version = "3.0.2" +description = "Safely add untrusted strings to HTML/XML markup." 
+optional = false +python-versions = ">=3.9" +files = [ + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win32.whl", hash = "sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225"}, + {file = 
"MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d"}, + {file = 
"MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"}, + {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, +] + +[[package]] +name = "marshmallow" +version = "3.23.1" +description = "A lightweight library for converting complex datatypes to and from native Python datatypes." 
+optional = true +python-versions = ">=3.9" +files = [ + {file = "marshmallow-3.23.1-py3-none-any.whl", hash = "sha256:fece2eb2c941180ea1b7fcbd4a83c51bfdd50093fdd3ad2585ee5e1df2508491"}, + {file = "marshmallow-3.23.1.tar.gz", hash = "sha256:3a8dfda6edd8dcdbf216c0ede1d1e78d230a6dc9c5a088f58c4083b974a0d468"}, +] + +[package.dependencies] +packaging = ">=17.0" + +[package.extras] +dev = ["marshmallow[tests]", "pre-commit (>=3.5,<5.0)", "tox"] +docs = ["alabaster (==1.0.0)", "autodocsumm (==0.2.14)", "sphinx (==8.1.3)", "sphinx-issues (==5.0.0)", "sphinx-version-warning (==1.1.2)"] +tests = ["pytest", "simplejson"] + +[[package]] +name = "matplotlib" +version = "3.9.2" +description = "Python plotting package" +optional = true +python-versions = ">=3.9" +files = [ + {file = "matplotlib-3.9.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:9d78bbc0cbc891ad55b4f39a48c22182e9bdaea7fc0e5dbd364f49f729ca1bbb"}, + {file = "matplotlib-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c375cc72229614632c87355366bdf2570c2dac01ac66b8ad048d2dabadf2d0d4"}, + {file = "matplotlib-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d94ff717eb2bd0b58fe66380bd8b14ac35f48a98e7c6765117fe67fb7684e64"}, + {file = "matplotlib-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab68d50c06938ef28681073327795c5db99bb4666214d2d5f880ed11aeaded66"}, + {file = "matplotlib-3.9.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:65aacf95b62272d568044531e41de26285d54aec8cb859031f511f84bd8b495a"}, + {file = "matplotlib-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:3fd595f34aa8a55b7fc8bf9ebea8aa665a84c82d275190a61118d33fbc82ccae"}, + {file = "matplotlib-3.9.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d8dd059447824eec055e829258ab092b56bb0579fc3164fa09c64f3acd478772"}, + {file = "matplotlib-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c797dac8bb9c7a3fd3382b16fe8f215b4cf0f22adccea36f1545a6d7be310b41"}, + {file = "matplotlib-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d719465db13267bcef19ea8954a971db03b9f48b4647e3860e4bc8e6ed86610f"}, + {file = "matplotlib-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8912ef7c2362f7193b5819d17dae8629b34a95c58603d781329712ada83f9447"}, + {file = "matplotlib-3.9.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:7741f26a58a240f43bee74965c4882b6c93df3e7eb3de160126d8c8f53a6ae6e"}, + {file = "matplotlib-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:ae82a14dab96fbfad7965403c643cafe6515e386de723e498cf3eeb1e0b70cc7"}, + {file = "matplotlib-3.9.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:ac43031375a65c3196bee99f6001e7fa5bdfb00ddf43379d3c0609bdca042df9"}, + {file = "matplotlib-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:be0fc24a5e4531ae4d8e858a1a548c1fe33b176bb13eff7f9d0d38ce5112a27d"}, + {file = "matplotlib-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf81de2926c2db243c9b2cbc3917619a0fc85796c6ba4e58f541df814bbf83c7"}, + {file = "matplotlib-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6ee45bc4245533111ced13f1f2cace1e7f89d1c793390392a80c139d6cf0e6c"}, + {file = "matplotlib-3.9.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:306c8dfc73239f0e72ac50e5a9cf19cc4e8e331dd0c54f5e69ca8758550f1e1e"}, + {file = "matplotlib-3.9.2-cp312-cp312-win_amd64.whl", hash = 
"sha256:5413401594cfaff0052f9d8b1aafc6d305b4bd7c4331dccd18f561ff7e1d3bd3"}, + {file = "matplotlib-3.9.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:18128cc08f0d3cfff10b76baa2f296fc28c4607368a8402de61bb3f2eb33c7d9"}, + {file = "matplotlib-3.9.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4876d7d40219e8ae8bb70f9263bcbe5714415acfdf781086601211335e24f8aa"}, + {file = "matplotlib-3.9.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d9f07a80deab4bb0b82858a9e9ad53d1382fd122be8cde11080f4e7dfedb38b"}, + {file = "matplotlib-3.9.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7c0410f181a531ec4e93bbc27692f2c71a15c2da16766f5ba9761e7ae518413"}, + {file = "matplotlib-3.9.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:909645cce2dc28b735674ce0931a4ac94e12f5b13f6bb0b5a5e65e7cea2c192b"}, + {file = "matplotlib-3.9.2-cp313-cp313-win_amd64.whl", hash = "sha256:f32c7410c7f246838a77d6d1eff0c0f87f3cb0e7c4247aebea71a6d5a68cab49"}, + {file = "matplotlib-3.9.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:37e51dd1c2db16ede9cfd7b5cabdfc818b2c6397c83f8b10e0e797501c963a03"}, + {file = "matplotlib-3.9.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b82c5045cebcecd8496a4d694d43f9cc84aeeb49fe2133e036b207abe73f4d30"}, + {file = "matplotlib-3.9.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f053c40f94bc51bc03832a41b4f153d83f2062d88c72b5e79997072594e97e51"}, + {file = "matplotlib-3.9.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbe196377a8248972f5cede786d4c5508ed5f5ca4a1e09b44bda889958b33f8c"}, + {file = "matplotlib-3.9.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5816b1e1fe8c192cbc013f8f3e3368ac56fbecf02fb41b8f8559303f24c5015e"}, + {file = "matplotlib-3.9.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:cef2a73d06601437be399908cf13aee74e86932a5ccc6ccdf173408ebc5f6bb2"}, + {file = "matplotlib-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e0830e188029c14e891fadd99702fd90d317df294c3298aad682739c5533721a"}, + {file = "matplotlib-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ba9c1299c920964e8d3857ba27173b4dbb51ca4bab47ffc2c2ba0eb5e2cbc5"}, + {file = "matplotlib-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1cd93b91ab47a3616b4d3c42b52f8363b88ca021e340804c6ab2536344fad9ca"}, + {file = "matplotlib-3.9.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6d1ce5ed2aefcdce11904fc5bbea7d9c21fff3d5f543841edf3dea84451a09ea"}, + {file = "matplotlib-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:b2696efdc08648536efd4e1601b5fd491fd47f4db97a5fbfd175549a7365c1b2"}, + {file = "matplotlib-3.9.2-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:d52a3b618cb1cbb769ce2ee1dcdb333c3ab6e823944e9a2d36e37253815f9556"}, + {file = "matplotlib-3.9.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:039082812cacd6c6bec8e17a9c1e6baca230d4116d522e81e1f63a74d01d2e21"}, + {file = "matplotlib-3.9.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6758baae2ed64f2331d4fd19be38b7b4eae3ecec210049a26b6a4f3ae1c85dcc"}, + {file = "matplotlib-3.9.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:050598c2b29e0b9832cde72bcf97627bf00262adbc4a54e2b856426bb2ef0697"}, + {file = "matplotlib-3.9.2.tar.gz", hash = "sha256:96ab43906269ca64a6366934106fa01534454a69e471b7bf3d79083981aaab92"}, +] + +[package.dependencies] +contourpy = ">=1.0.1" +cycler = ">=0.10" +fonttools = 
">=4.22.0" +kiwisolver = ">=1.3.1" +numpy = ">=1.23" +packaging = ">=20.0" +pillow = ">=8" +pyparsing = ">=2.3.1" +python-dateutil = ">=2.7" + +[package.extras] +dev = ["meson-python (>=0.13.1)", "numpy (>=1.25)", "pybind11 (>=2.6)", "setuptools (>=64)", "setuptools_scm (>=7)"] + +[[package]] +name = "mccabe" +version = "0.7.0" +description = "McCabe checker, plugin for flake8" +optional = false +python-versions = ">=3.6" +files = [ + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, +] + +[[package]] +name = "mdit-py-plugins" +version = "0.4.2" +description = "Collection of plugins for markdown-it-py" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mdit_py_plugins-0.4.2-py3-none-any.whl", hash = "sha256:0c673c3f889399a33b95e88d2f0d111b4447bdfea7f237dab2d488f459835636"}, + {file = "mdit_py_plugins-0.4.2.tar.gz", hash = "sha256:5f2cd1fdb606ddf152d37ec30e46101a60512bc0e5fa1a7002c36647b09e26b5"}, +] + +[package.dependencies] +markdown-it-py = ">=1.0.0,<4.0.0" + +[package.extras] +code-style = ["pre-commit"] +rtd = ["myst-parser", "sphinx-book-theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + +[[package]] +name = "memray" +version = "1.14.0" +description = "A memory profiler for Python applications" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "memray-1.14.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:745d9014cb662065501441a7b534c29914fe2b68398b37385aba9f4a1c51c723"}, + {file = "memray-1.14.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f62a402ca1a7126f749544c3d6493672d6330ffd37d59ba230bc73e5143b3bc2"}, + {file = "memray-1.14.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:36840f39277b1871ecb5a9592dd1aa517a17b9f855f4e3ff08aa328a9d305e69"}, + {file = "memray-1.14.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3c7933ca70c0d59d0ce9b1064a6eda86231248759b46ed6dabedf489039d1aa1"}, + {file = "memray-1.14.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75a5907345ff845652e709ddce3171a9ba2d65c62e8bd49a99131066e2a7ce3b"}, + {file = "memray-1.14.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:88c89c3797834eec177a89ad528699c75b94e2ed08c00754141eae69c520b894"}, + {file = "memray-1.14.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:d6087f291fd68acdf0a833efb57bc0f192c98ae89b4377c690c28313e78d029c"}, + {file = "memray-1.14.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e6ba7bff9dfa37bf3b80a5b83b50eadf20afb1f0e8de4a0139019154086d6bed"}, + {file = "memray-1.14.0-cp311-cp311-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9bb0cfe1b755a860435cd52047b2e3f4f7b0c3887e0c1bf98da7127948284a91"}, + {file = "memray-1.14.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:638ba74e1735a40b6595fee7f37b426b9a95d244091a1f5df3dc5d98df1cbd4b"}, + {file = 
"memray-1.14.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7227ebf024cb0688a68ed91ed3e05c61a13751a9e875807195076b827bfde464"}, + {file = "memray-1.14.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:248dea8cfb5a615345e28b7e25c94377a8d198da3b6957ee443afa6f4ff1b733"}, + {file = "memray-1.14.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7d03f6be66aa259df7fa50082876fbe6461108d77d46c1f720c46067d60685d4"}, + {file = "memray-1.14.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:9af9d30b1e484fd8591c9a7f322fd50b9192a2bce660be92385a01555af9968b"}, + {file = "memray-1.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c4088b391c04796c888ac751b5d387f6e8212b3515d4c53ba540c65a6efe4bda"}, + {file = "memray-1.14.0-cp312-cp312-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:af8aee7e8e5cac1e4130f1184b3e03b6bb08264e4ba1696551791ed3f8fb824e"}, + {file = "memray-1.14.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4352f9e85957f2cbe45a9e1c87dfc19d2df77e93dcd8a558794a683eeee57b7b"}, + {file = "memray-1.14.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5953f0d2aa31b23d4cce20236a03d90b7c71903709a57b456d6494bfe6f470b7"}, + {file = "memray-1.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e4ccaca04365efcda51036fe2add980030e33cfc4f3a194a01f530a5c923c65"}, + {file = "memray-1.14.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f85a27eb8a65c810161bb992116a66d328546f78a4a4c7c1868949651b917c08"}, + {file = "memray-1.14.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:958d57f7149b8fa4831785394f2a7ace93dbc2be6c49a1c07987a8972986474a"}, + {file = "memray-1.14.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:287a01953bc44dd0a32549a23bdacb5f9734e345ca289fa3923867c637715056"}, + {file = "memray-1.14.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfc17cba35d98e3d2ca20ab995f17eec3edba7138b062cbc1aa36d53d9d2d955"}, + {file = "memray-1.14.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c82342cead930ca50235f59740ca238808f9c33ef31d994712972966beb6226e"}, + {file = "memray-1.14.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a22a826b4047e839310514f4889c24e45a66ea222fca19ac0ae7b2f89bbb0281"}, + {file = "memray-1.14.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:344f3c73b97ffc8f1666b404deafbc31a19e6b2881341b706aa7ec20afb0e8b1"}, + {file = "memray-1.14.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a43455233d534e9c0e8dabe827d451124874a6455b2afcbcd60b823241ea5843"}, + {file = "memray-1.14.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e05a3b6bc82ef01821beaee98e86bd8de2ada06cb8781add9c40a3ae4a040383"}, + {file = "memray-1.14.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3bc27e5483c70236c9379b99277b4ea8fa4b3f73a99e37af81190649bd877881"}, + {file = "memray-1.14.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6a7e5604448b2a78e329addfb099384515d3f973a03711c4e2a7b6c9f7f34f53"}, + {file = "memray-1.14.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:443885a96ab9f67d46288240e2593b5c3ecb2c507ddb4e3b10695e104403d001"}, + {file = "memray-1.14.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:52a45d96ed717d8efb645e99646a92dd21a2ca38bdb823fe22e38c429cba9513"}, + {file = "memray-1.14.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = 
"sha256:72febec7b287667e8ea9ee3e879a4da19a4318bc47e211da815be74acd961994"}, + {file = "memray-1.14.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4e07bdc3a4979b335c2b6b93a81b807d5aacd8dbbea56c41c6899a8bc0d2beb3"}, + {file = "memray-1.14.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b5e729d03caf426dc45a258270537a603794ecc067ccfd92f9c67ba9332e788"}, + {file = "memray-1.14.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:1d0a1397b5387b75dc9d9759beb022cb360948584840e850474d7d39ad267f85"}, + {file = "memray-1.14.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:c119b600e7c665e0713f09e25f9ee09733a98035688ecc1ec8fd168fa37a77f6"}, + {file = "memray-1.14.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:29a2e7d84d1652ef4664bcceb155630979b4797180b70da35525d963a4ca707f"}, + {file = "memray-1.14.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:b3b8d46b6447cdecba3ba100d47c62e78cdad58b00b2d6ba004d6bad318c8668"}, + {file = "memray-1.14.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:57f9bf3f1c648f1ea877a84c21c449fdafd8cc105763ada6023e36bae9b45eb8"}, + {file = "memray-1.14.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b7a59346d242fc39041d87a71cb6cf45baf492ffbb69da9690de49346be64a8"}, + {file = "memray-1.14.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:11fb00105572b70f2aca8b787ce9748b0c94672fbb6334f1604f7f813ca3dca6"}, + {file = "memray-1.14.0.tar.gz", hash = "sha256:b5d8874b7b215551f0ae9fa8aef3f2f52321a6460dc0141aaf9374709e6b0eb7"}, +] + +[package.dependencies] +jinja2 = ">=2.9" +rich = ">=11.2.0" +textual = ">=0.41.0" + +[package.extras] +benchmark = ["asv"] +dev = ["Cython", "asv", "black", "bump2version", "check-manifest", "flake8", "furo", "greenlet", "ipython", "isort", "mypy", "packaging", "pytest", "pytest-cov", "pytest-textual-snapshot", "setuptools", "sphinx", "sphinx-argparse", "textual (>=0.43,!=0.65.2,!=0.66)", "towncrier"] +docs = ["IPython", "bump2version", "furo", "sphinx", "sphinx-argparse", "towncrier"] +lint = ["black", "check-manifest", "flake8", "isort", "mypy"] +test = ["Cython", "greenlet", "ipython", "packaging", "pytest", "pytest-cov", "pytest-textual-snapshot", "setuptools", "textual (>=0.43,!=0.65.2,!=0.66)"] + +[[package]] +name = "multidict" +version = "6.1.0" +description = "multidict implementation" +optional = true +python-versions = ">=3.8" +files = [ + {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60"}, + {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1"}, + {file = "multidict-6.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a114d03b938376557927ab23f1e950827c3b893ccb94b62fd95d430fd0e5cf53"}, + {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1c416351ee6271b2f49b56ad7f308072f6f44b37118d69c2cad94f3fa8a40d5"}, + {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b5d83030255983181005e6cfbac1617ce9746b219bc2aad52201ad121226581"}, + {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3e97b5e938051226dc025ec80980c285b053ffb1e25a3db2a3aa3bc046bf7f56"}, + {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:d618649d4e70ac6efcbba75be98b26ef5078faad23592f9b51ca492953012429"}, + {file = "multidict-6.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10524ebd769727ac77ef2278390fb0068d83f3acb7773792a5080f2b0abf7748"}, + {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ff3827aef427c89a25cc96ded1759271a93603aba9fb977a6d264648ebf989db"}, + {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:06809f4f0f7ab7ea2cabf9caca7d79c22c0758b58a71f9d32943ae13c7ace056"}, + {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:f179dee3b863ab1c59580ff60f9d99f632f34ccb38bf67a33ec6b3ecadd0fd76"}, + {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:aaed8b0562be4a0876ee3b6946f6869b7bcdb571a5d1496683505944e268b160"}, + {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c8b88a2ccf5493b6c8da9076fb151ba106960a2df90c2633f342f120751a9e7"}, + {file = "multidict-6.1.0-cp310-cp310-win32.whl", hash = "sha256:4a9cb68166a34117d6646c0023c7b759bf197bee5ad4272f420a0141d7eb03a0"}, + {file = "multidict-6.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:20b9b5fbe0b88d0bdef2012ef7dee867f874b72528cf1d08f1d59b0e3850129d"}, + {file = "multidict-6.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3efe2c2cb5763f2f1b275ad2bf7a287d3f7ebbef35648a9726e3b69284a4f3d6"}, + {file = "multidict-6.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c7053d3b0353a8b9de430a4f4b4268ac9a4fb3481af37dfe49825bf45ca24156"}, + {file = "multidict-6.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:27e5fc84ccef8dfaabb09d82b7d179c7cf1a3fbc8a966f8274fcb4ab2eb4cadb"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e2b90b43e696f25c62656389d32236e049568b39320e2735d51f08fd362761b"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d83a047959d38a7ff552ff94be767b7fd79b831ad1cd9920662db05fec24fe72"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1a9dd711d0877a1ece3d2e4fea11a8e75741ca21954c919406b44e7cf971304"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec2abea24d98246b94913b76a125e855eb5c434f7c46546046372fe60f666351"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4867cafcbc6585e4b678876c489b9273b13e9fff9f6d6d66add5e15d11d926cb"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5b48204e8d955c47c55b72779802b219a39acc3ee3d0116d5080c388970b76e3"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:d8fff389528cad1618fb4b26b95550327495462cd745d879a8c7c2115248e399"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a7a9541cd308eed5e30318430a9c74d2132e9a8cb46b901326272d780bf2d423"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:da1758c76f50c39a2efd5e9859ce7d776317eb1dd34317c8152ac9251fc574a3"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c943a53e9186688b45b323602298ab727d8865d8c9ee0b17f8d62d14b56f0753"}, + {file = "multidict-6.1.0-cp311-cp311-win32.whl", hash = "sha256:90f8717cb649eea3504091e640a1b8568faad18bd4b9fcd692853a04475a4b80"}, + {file = 
"multidict-6.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:82176036e65644a6cc5bd619f65f6f19781e8ec2e5330f51aa9ada7504cc1926"}, + {file = "multidict-6.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b04772ed465fa3cc947db808fa306d79b43e896beb677a56fb2347ca1a49c1fa"}, + {file = "multidict-6.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6180c0ae073bddeb5a97a38c03f30c233e0a4d39cd86166251617d1bbd0af436"}, + {file = "multidict-6.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:071120490b47aa997cca00666923a83f02c7fbb44f71cf7f136df753f7fa8761"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50b3a2710631848991d0bf7de077502e8994c804bb805aeb2925a981de58ec2e"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b58c621844d55e71c1b7f7c498ce5aa6985d743a1a59034c57a905b3f153c1ef"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55b6d90641869892caa9ca42ff913f7ff1c5ece06474fbd32fb2cf6834726c95"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b820514bfc0b98a30e3d85462084779900347e4d49267f747ff54060cc33925"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10a9b09aba0c5b48c53761b7c720aaaf7cf236d5fe394cd399c7ba662d5f9966"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1e16bf3e5fc9f44632affb159d30a437bfe286ce9e02754759be5536b169b305"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76f364861c3bfc98cbbcbd402d83454ed9e01a5224bb3a28bf70002a230f73e2"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:820c661588bd01a0aa62a1283f20d2be4281b086f80dad9e955e690c75fb54a2"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:0e5f362e895bc5b9e67fe6e4ded2492d8124bdf817827f33c5b46c2fe3ffaca6"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ec660d19bbc671e3a6443325f07263be452c453ac9e512f5eb935e7d4ac28b3"}, + {file = "multidict-6.1.0-cp312-cp312-win32.whl", hash = "sha256:58130ecf8f7b8112cdb841486404f1282b9c86ccb30d3519faf301b2e5659133"}, + {file = "multidict-6.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:188215fc0aafb8e03341995e7c4797860181562380f81ed0a87ff455b70bf1f1"}, + {file = "multidict-6.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d569388c381b24671589335a3be6e1d45546c2988c2ebe30fdcada8457a31008"}, + {file = "multidict-6.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:052e10d2d37810b99cc170b785945421141bf7bb7d2f8799d431e7db229c385f"}, + {file = "multidict-6.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f90c822a402cb865e396a504f9fc8173ef34212a342d92e362ca498cad308e28"}, + {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b225d95519a5bf73860323e633a664b0d85ad3d5bede6d30d95b35d4dfe8805b"}, + {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:23bfd518810af7de1116313ebd9092cb9aa629beb12f6ed631ad53356ed6b86c"}, + {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c09fcfdccdd0b57867577b719c69e347a436b86cd83747f179dbf0cc0d4c1f3"}, + {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:bf6bea52ec97e95560af5ae576bdac3aa3aae0b6758c6efa115236d9e07dae44"}, + {file = "multidict-6.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57feec87371dbb3520da6192213c7d6fc892d5589a93db548331954de8248fd2"}, + {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0c3f390dc53279cbc8ba976e5f8035eab997829066756d811616b652b00a23a3"}, + {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:59bfeae4b25ec05b34f1956eaa1cb38032282cd4dfabc5056d0a1ec4d696d3aa"}, + {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b2f59caeaf7632cc633b5cf6fc449372b83bbdf0da4ae04d5be36118e46cc0aa"}, + {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:37bb93b2178e02b7b618893990941900fd25b6b9ac0fa49931a40aecdf083fe4"}, + {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4e9f48f58c2c523d5a06faea47866cd35b32655c46b443f163d08c6d0ddb17d6"}, + {file = "multidict-6.1.0-cp313-cp313-win32.whl", hash = "sha256:3a37ffb35399029b45c6cc33640a92bef403c9fd388acce75cdc88f58bd19a81"}, + {file = "multidict-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9aa71e15d9d9beaad2c6b9319edcdc0a49a43ef5c0a4c8265ca9ee7d6c67774"}, + {file = "multidict-6.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:db7457bac39421addd0c8449933ac32d8042aae84a14911a757ae6ca3eef1392"}, + {file = "multidict-6.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d094ddec350a2fb899fec68d8353c78233debde9b7d8b4beeafa70825f1c281a"}, + {file = "multidict-6.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5845c1fd4866bb5dd3125d89b90e57ed3138241540897de748cdf19de8a2fca2"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9079dfc6a70abe341f521f78405b8949f96db48da98aeb43f9907f342f627cdc"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3914f5aaa0f36d5d60e8ece6a308ee1c9784cd75ec8151062614657a114c4478"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c08be4f460903e5a9d0f76818db3250f12e9c344e79314d1d570fc69d7f4eae4"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d093be959277cb7dee84b801eb1af388b6ad3ca6a6b6bf1ed7585895789d027d"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3702ea6872c5a2a4eeefa6ffd36b042e9773f05b1f37ae3ef7264b1163c2dcf6"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:2090f6a85cafc5b2db085124d752757c9d251548cedabe9bd31afe6363e0aff2"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:f67f217af4b1ff66c68a87318012de788dd95fcfeb24cc889011f4e1c7454dfd"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:189f652a87e876098bbc67b4da1049afb5f5dfbaa310dd67c594b01c10388db6"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:6bb5992037f7a9eff7991ebe4273ea7f51f1c1c511e6a2ce511d0e7bdb754492"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:ac10f4c2b9e770c4e393876e35a7046879d195cd123b4f116d299d442b335bcd"}, + {file = "multidict-6.1.0-cp38-cp38-win32.whl", hash = "sha256:e27bbb6d14416713a8bd7aaa1313c0fc8d44ee48d74497a0ff4c3a1b6ccb5167"}, + {file = "multidict-6.1.0-cp38-cp38-win_amd64.whl", hash = 
"sha256:22f3105d4fb15c8f57ff3959a58fcab6ce36814486500cd7485651230ad4d4ef"}, + {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4e18b656c5e844539d506a0a06432274d7bd52a7487e6828c63a63d69185626c"}, + {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a185f876e69897a6f3325c3f19f26a297fa058c5e456bfcff8015e9a27e83ae1"}, + {file = "multidict-6.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab7c4ceb38d91570a650dba194e1ca87c2b543488fe9309b4212694174fd539c"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e617fb6b0b6953fffd762669610c1c4ffd05632c138d61ac7e14ad187870669c"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16e5f4bf4e603eb1fdd5d8180f1a25f30056f22e55ce51fb3d6ad4ab29f7d96f"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f4c035da3f544b1882bac24115f3e2e8760f10a0107614fc9839fd232200b875"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:957cf8e4b6e123a9eea554fa7ebc85674674b713551de587eb318a2df3e00255"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:483a6aea59cb89904e1ceabd2b47368b5600fb7de78a6e4a2c2987b2d256cf30"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:87701f25a2352e5bf7454caa64757642734da9f6b11384c1f9d1a8e699758057"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:682b987361e5fd7a139ed565e30d81fd81e9629acc7d925a205366877d8c8657"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce2186a7df133a9c895dea3331ddc5ddad42cdd0d1ea2f0a51e5d161e4762f28"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:9f636b730f7e8cb19feb87094949ba54ee5357440b9658b2a32a5ce4bce53972"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:73eae06aa53af2ea5270cc066dcaf02cc60d2994bbb2c4ef5764949257d10f43"}, + {file = "multidict-6.1.0-cp39-cp39-win32.whl", hash = "sha256:1ca0083e80e791cffc6efce7660ad24af66c8d4079d2a750b29001b53ff59ada"}, + {file = "multidict-6.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:aa466da5b15ccea564bdab9c89175c762bc12825f4659c11227f515cee76fa4a"}, + {file = "multidict-6.1.0-py3-none-any.whl", hash = "sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506"}, + {file = "multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a"}, +] + +[package.dependencies] +typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""} + +[[package]] +name = "mypy" +version = "1.13.0" +description = "Optional static typing for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mypy-1.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6607e0f1dd1fb7f0aca14d936d13fd19eba5e17e1cd2a14f808fa5f8f6d8f60a"}, + {file = "mypy-1.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a21be69bd26fa81b1f80a61ee7ab05b076c674d9b18fb56239d72e21d9f4c80"}, + {file = "mypy-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b2353a44d2179846a096e25691d54d59904559f4232519d420d64da6828a3a7"}, + {file = "mypy-1.13.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0730d1c6a2739d4511dc4253f8274cdd140c55c32dfb0a4cf8b7a43f40abfa6f"}, + {file = 
"mypy-1.13.0-cp310-cp310-win_amd64.whl", hash = "sha256:c5fc54dbb712ff5e5a0fca797e6e0aa25726c7e72c6a5850cfd2adbc1eb0a372"}, + {file = "mypy-1.13.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:581665e6f3a8a9078f28d5502f4c334c0c8d802ef55ea0e7276a6e409bc0d82d"}, + {file = "mypy-1.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3ddb5b9bf82e05cc9a627e84707b528e5c7caaa1c55c69e175abb15a761cec2d"}, + {file = "mypy-1.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:20c7ee0bc0d5a9595c46f38beb04201f2620065a93755704e141fcac9f59db2b"}, + {file = "mypy-1.13.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3790ded76f0b34bc9c8ba4def8f919dd6a46db0f5a6610fb994fe8efdd447f73"}, + {file = "mypy-1.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:51f869f4b6b538229c1d1bcc1dd7d119817206e2bc54e8e374b3dfa202defcca"}, + {file = "mypy-1.13.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5c7051a3461ae84dfb5dd15eff5094640c61c5f22257c8b766794e6dd85e72d5"}, + {file = "mypy-1.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:39bb21c69a5d6342f4ce526e4584bc5c197fd20a60d14a8624d8743fffb9472e"}, + {file = "mypy-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:164f28cb9d6367439031f4c81e84d3ccaa1e19232d9d05d37cb0bd880d3f93c2"}, + {file = "mypy-1.13.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a4c1bfcdbce96ff5d96fc9b08e3831acb30dc44ab02671eca5953eadad07d6d0"}, + {file = "mypy-1.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0affb3a79a256b4183ba09811e3577c5163ed06685e4d4b46429a271ba174d2"}, + {file = "mypy-1.13.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a7b44178c9760ce1a43f544e595d35ed61ac2c3de306599fa59b38a6048e1aa7"}, + {file = "mypy-1.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5d5092efb8516d08440e36626f0153b5006d4088c1d663d88bf79625af3d1d62"}, + {file = "mypy-1.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2904956dac40ced10931ac967ae63c5089bd498542194b436eb097a9f77bc8"}, + {file = "mypy-1.13.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7bfd8836970d33c2105562650656b6846149374dc8ed77d98424b40b09340ba7"}, + {file = "mypy-1.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:9f73dba9ec77acb86457a8fc04b5239822df0c14a082564737833d2963677dbc"}, + {file = "mypy-1.13.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:100fac22ce82925f676a734af0db922ecfea991e1d7ec0ceb1e115ebe501301a"}, + {file = "mypy-1.13.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7bcb0bb7f42a978bb323a7c88f1081d1b5dee77ca86f4100735a6f541299d8fb"}, + {file = "mypy-1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bde31fc887c213e223bbfc34328070996061b0833b0a4cfec53745ed61f3519b"}, + {file = "mypy-1.13.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:07de989f89786f62b937851295ed62e51774722e5444a27cecca993fc3f9cd74"}, + {file = "mypy-1.13.0-cp38-cp38-win_amd64.whl", hash = "sha256:4bde84334fbe19bad704b3f5b78c4abd35ff1026f8ba72b29de70dda0916beb6"}, + {file = "mypy-1.13.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0246bcb1b5de7f08f2826451abd947bf656945209b140d16ed317f65a17dc7dc"}, + {file = "mypy-1.13.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7f5b7deae912cf8b77e990b9280f170381fdfbddf61b4ef80927edd813163732"}, + {file = "mypy-1.13.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:7029881ec6ffb8bc233a4fa364736789582c738217b133f1b55967115288a2bc"}, + {file = "mypy-1.13.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3e38b980e5681f28f033f3be86b099a247b13c491f14bb8b1e1e134d23bb599d"}, + {file = "mypy-1.13.0-cp39-cp39-win_amd64.whl", hash = "sha256:a6789be98a2017c912ae6ccb77ea553bbaf13d27605d2ca20a76dfbced631b24"}, + {file = "mypy-1.13.0-py3-none-any.whl", hash = "sha256:9c250883f9fd81d212e0952c92dbfcc96fc237f4b7c92f56ac81fd48460b3e5a"}, + {file = "mypy-1.13.0.tar.gz", hash = "sha256:0291a61b6fbf3e6673e3405cfcc0e7650bebc7939659fdca2702958038bd835e"}, +] + +[package.dependencies] +mypy-extensions = ">=1.0.0" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = ">=4.6.0" + +[package.extras] +dmypy = ["psutil (>=4.0)"] +faster-cache = ["orjson"] +install-types = ["pip"] +mypyc = ["setuptools (>=50)"] +reports = ["lxml"] + +[[package]] +name = "mypy-extensions" +version = "1.0.0" +description = "Type system extensions for programs checked with the mypy type checker." +optional = false +python-versions = ">=3.5" +files = [ + {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, + {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, +] + +[[package]] +name = "nltk" +version = "3.8.1" +description = "Natural Language Toolkit" +optional = true +python-versions = ">=3.7" +files = [ + {file = "nltk-3.8.1-py3-none-any.whl", hash = "sha256:fd5c9109f976fa86bcadba8f91e47f5e9293bd034474752e92a520f81c93dda5"}, + {file = "nltk-3.8.1.zip", hash = "sha256:1834da3d0682cba4f2cede2f9aad6b0fafb6461ba451db0efb6f9c39798d64d3"}, +] + +[package.dependencies] +click = "*" +joblib = "*" +regex = ">=2021.8.3" +tqdm = "*" + +[package.extras] +all = ["matplotlib", "numpy", "pyparsing", "python-crfsuite", "requests", "scikit-learn", "scipy", "twython"] +corenlp = ["requests"] +machine-learning = ["numpy", "python-crfsuite", "scikit-learn", "scipy"] +plot = ["matplotlib"] +tgrep = ["pyparsing"] +twitter = ["twython"] + +[[package]] +name = "numpy" +version = "1.26.4" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, + {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"}, + {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"}, + {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"}, + {file = 
"numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"}, + {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"}, + {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"}, + {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"}, + {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"}, + {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"}, + {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"}, + {file = 
"numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"}, + {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, +] + +[[package]] +name = "openai" +version = "0.27.9" +description = "Python client library for the OpenAI API" +optional = true +python-versions = ">=3.7.1" +files = [ + {file = "openai-0.27.9-py3-none-any.whl", hash = "sha256:6a3cf8e276d1a6262b50562fbc0cba7967cfebb78ed827d375986b48fdad6475"}, + {file = "openai-0.27.9.tar.gz", hash = "sha256:b687761c82f5ebb6f61efc791b2083d2d068277b94802d4d1369efe39851813d"}, +] + +[package.dependencies] +aiohttp = "*" +matplotlib = {version = "*", optional = true, markers = "extra == \"embeddings\""} +numpy = {version = "*", optional = true, markers = "extra == \"embeddings\""} +openpyxl = {version = ">=3.0.7", optional = true, markers = "extra == \"embeddings\""} +pandas = {version = ">=1.2.3", optional = true, markers = "extra == \"embeddings\""} +pandas-stubs = {version = ">=1.1.0.11", optional = true, markers = "extra == \"embeddings\""} +plotly = {version = "*", optional = true, markers = "extra == \"embeddings\""} +requests = ">=2.20" +scikit-learn = {version = ">=1.0.2", optional = true, markers = "extra == \"embeddings\""} +scipy = {version = "*", optional = true, markers = "extra == \"embeddings\""} +tenacity = {version = ">=8.0.1", optional = true, markers = "extra == \"embeddings\""} +tqdm = "*" + +[package.extras] +datalib = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] +dev = ["black (>=21.6b0,<22.0)", "pytest (==6.*)", "pytest-asyncio", "pytest-mock"] +embeddings = ["matplotlib", "numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "plotly", "scikit-learn (>=1.0.2)", "scipy", "tenacity (>=8.0.1)"] +wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "wandb"] + +[[package]] +name = "openpyxl" +version = "3.1.5" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +optional = true +python-versions = ">=3.8" +files = [ + {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"}, + {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"}, +] + +[package.dependencies] +et-xmlfile = "*" + +[[package]] +name = "orjson" +version = "3.10.11" +description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" +optional = false +python-versions = ">=3.8" +files = [ + {file = "orjson-3.10.11-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:6dade64687f2bd7c090281652fe18f1151292d567a9302b34c2dbb92a3872f1f"}, + {file = "orjson-3.10.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82f07c550a6ccd2b9290849b22316a609023ed851a87ea888c0456485a7d196a"}, + {file = "orjson-3.10.11-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd9a187742d3ead9df2e49240234d728c67c356516cf4db018833a86f20ec18c"}, + {file = 
"orjson-3.10.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:77b0fed6f209d76c1c39f032a70df2d7acf24b1812ca3e6078fd04e8972685a3"}, + {file = "orjson-3.10.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:63fc9d5fe1d4e8868f6aae547a7b8ba0a2e592929245fff61d633f4caccdcdd6"}, + {file = "orjson-3.10.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65cd3e3bb4fbb4eddc3c1e8dce10dc0b73e808fcb875f9fab40c81903dd9323e"}, + {file = "orjson-3.10.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6f67c570602300c4befbda12d153113b8974a3340fdcf3d6de095ede86c06d92"}, + {file = "orjson-3.10.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1f39728c7f7d766f1f5a769ce4d54b5aaa4c3f92d5b84817053cc9995b977acc"}, + {file = "orjson-3.10.11-cp310-none-win32.whl", hash = "sha256:1789d9db7968d805f3d94aae2c25d04014aae3a2fa65b1443117cd462c6da647"}, + {file = "orjson-3.10.11-cp310-none-win_amd64.whl", hash = "sha256:5576b1e5a53a5ba8f8df81872bb0878a112b3ebb1d392155f00f54dd86c83ff6"}, + {file = "orjson-3.10.11-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:1444f9cb7c14055d595de1036f74ecd6ce15f04a715e73f33bb6326c9cef01b6"}, + {file = "orjson-3.10.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdec57fe3b4bdebcc08a946db3365630332dbe575125ff3d80a3272ebd0ddafe"}, + {file = "orjson-3.10.11-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4eed32f33a0ea6ef36ccc1d37f8d17f28a1d6e8eefae5928f76aff8f1df85e67"}, + {file = "orjson-3.10.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80df27dd8697242b904f4ea54820e2d98d3f51f91e97e358fc13359721233e4b"}, + {file = "orjson-3.10.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:705f03cee0cb797256d54de6695ef219e5bc8c8120b6654dd460848d57a9af3d"}, + {file = "orjson-3.10.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03246774131701de8e7059b2e382597da43144a9a7400f178b2a32feafc54bd5"}, + {file = "orjson-3.10.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8b5759063a6c940a69c728ea70d7c33583991c6982915a839c8da5f957e0103a"}, + {file = "orjson-3.10.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:677f23e32491520eebb19c99bb34675daf5410c449c13416f7f0d93e2cf5f981"}, + {file = "orjson-3.10.11-cp311-none-win32.whl", hash = "sha256:a11225d7b30468dcb099498296ffac36b4673a8398ca30fdaec1e6c20df6aa55"}, + {file = "orjson-3.10.11-cp311-none-win_amd64.whl", hash = "sha256:df8c677df2f9f385fcc85ab859704045fa88d4668bc9991a527c86e710392bec"}, + {file = "orjson-3.10.11-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:360a4e2c0943da7c21505e47cf6bd725588962ff1d739b99b14e2f7f3545ba51"}, + {file = "orjson-3.10.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:496e2cb45de21c369079ef2d662670a4892c81573bcc143c4205cae98282ba97"}, + {file = "orjson-3.10.11-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7dfa8db55c9792d53c5952900c6a919cfa377b4f4534c7a786484a6a4a350c19"}, + {file = "orjson-3.10.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:51f3382415747e0dbda9dade6f1e1a01a9d37f630d8c9049a8ed0e385b7a90c0"}, + {file = "orjson-3.10.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f35a1b9f50a219f470e0e497ca30b285c9f34948d3c8160d5ad3a755d9299433"}, + {file = 
"orjson-3.10.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2f3b7c5803138e67028dde33450e054c87e0703afbe730c105f1fcd873496d5"}, + {file = "orjson-3.10.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f91d9eb554310472bd09f5347950b24442600594c2edc1421403d7610a0998fd"}, + {file = "orjson-3.10.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:dfbb2d460a855c9744bbc8e36f9c3a997c4b27d842f3d5559ed54326e6911f9b"}, + {file = "orjson-3.10.11-cp312-none-win32.whl", hash = "sha256:d4a62c49c506d4d73f59514986cadebb7e8d186ad510c518f439176cf8d5359d"}, + {file = "orjson-3.10.11-cp312-none-win_amd64.whl", hash = "sha256:f1eec3421a558ff7a9b010a6c7effcfa0ade65327a71bb9b02a1c3b77a247284"}, + {file = "orjson-3.10.11-cp313-cp313-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:c46294faa4e4d0eb73ab68f1a794d2cbf7bab33b1dda2ac2959ffb7c61591899"}, + {file = "orjson-3.10.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:52e5834d7d6e58a36846e059d00559cb9ed20410664f3ad156cd2cc239a11230"}, + {file = "orjson-3.10.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2fc947e5350fdce548bfc94f434e8760d5cafa97fb9c495d2fef6757aa02ec0"}, + {file = "orjson-3.10.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0efabbf839388a1dab5b72b5d3baedbd6039ac83f3b55736eb9934ea5494d258"}, + {file = "orjson-3.10.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a3f29634260708c200c4fe148e42b4aae97d7b9fee417fbdd74f8cfc265f15b0"}, + {file = "orjson-3.10.11-cp313-none-win32.whl", hash = "sha256:1a1222ffcee8a09476bbdd5d4f6f33d06d0d6642df2a3d78b7a195ca880d669b"}, + {file = "orjson-3.10.11-cp313-none-win_amd64.whl", hash = "sha256:bc274ac261cc69260913b2d1610760e55d3c0801bb3457ba7b9004420b6b4270"}, + {file = "orjson-3.10.11-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:19b3763e8bbf8ad797df6b6b5e0fc7c843ec2e2fc0621398534e0c6400098f87"}, + {file = "orjson-3.10.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1be83a13312e5e58d633580c5eb8d0495ae61f180da2722f20562974188af205"}, + {file = "orjson-3.10.11-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:afacfd1ab81f46dedd7f6001b6d4e8de23396e4884cd3c3436bd05defb1a6446"}, + {file = "orjson-3.10.11-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cb4d0bea56bba596723d73f074c420aec3b2e5d7d30698bc56e6048066bd560c"}, + {file = "orjson-3.10.11-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:96ed1de70fcb15d5fed529a656df29f768187628727ee2788344e8a51e1c1350"}, + {file = "orjson-3.10.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4bfb30c891b530f3f80e801e3ad82ef150b964e5c38e1fb8482441c69c35c61c"}, + {file = "orjson-3.10.11-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d496c74fc2b61341e3cefda7eec21b7854c5f672ee350bc55d9a4997a8a95204"}, + {file = "orjson-3.10.11-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:655a493bac606655db9a47fe94d3d84fc7f3ad766d894197c94ccf0c5408e7d3"}, + {file = "orjson-3.10.11-cp38-none-win32.whl", hash = "sha256:b9546b278c9fb5d45380f4809e11b4dd9844ca7aaf1134024503e134ed226161"}, + {file = "orjson-3.10.11-cp38-none-win_amd64.whl", hash = "sha256:b592597fe551d518f42c5a2eb07422eb475aa8cfdc8c51e6da7054b836b26782"}, + {file = "orjson-3.10.11-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = 
"sha256:c95f2ecafe709b4e5c733b5e2768ac569bed308623c85806c395d9cca00e08af"}, + {file = "orjson-3.10.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:80c00d4acded0c51c98754fe8218cb49cb854f0f7eb39ea4641b7f71732d2cb7"}, + {file = "orjson-3.10.11-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:461311b693d3d0a060439aa669c74f3603264d4e7a08faa68c47ae5a863f352d"}, + {file = "orjson-3.10.11-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52ca832f17d86a78cbab86cdc25f8c13756ebe182b6fc1a97d534051c18a08de"}, + {file = "orjson-3.10.11-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f4c57ea78a753812f528178aa2f1c57da633754c91d2124cb28991dab4c79a54"}, + {file = "orjson-3.10.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7fcfc6f7ca046383fb954ba528587e0f9336828b568282b27579c49f8e16aad"}, + {file = "orjson-3.10.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:86b9dd983857970c29e4c71bb3e95ff085c07d3e83e7c46ebe959bac07ebd80b"}, + {file = "orjson-3.10.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:4d83f87582d223e54efb2242a79547611ba4ebae3af8bae1e80fa9a0af83bb7f"}, + {file = "orjson-3.10.11-cp39-none-win32.whl", hash = "sha256:9fd0ad1c129bc9beb1154c2655f177620b5beaf9a11e0d10bac63ef3fce96950"}, + {file = "orjson-3.10.11-cp39-none-win_amd64.whl", hash = "sha256:10f416b2a017c8bd17f325fb9dee1fb5cdd7a54e814284896b7c3f2763faa017"}, + {file = "orjson-3.10.11.tar.gz", hash = "sha256:e35b6d730de6384d5b2dab5fd23f0d76fae8bbc8c353c2f78210aa5fa4beb3ef"}, +] + +[[package]] +name = "packaging" +version = "23.2" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"}, + {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"}, +] + +[[package]] +name = "pandas" +version = "2.2.2" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"}, + {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"}, + {file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"}, + {file = 
"pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"}, + {file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"}, + {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92fd6b027924a7e178ac202cfbe25e53368db90d56872d20ffae94b96c7acc57"}, + {file = "pandas-2.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:640cef9aa381b60e296db324337a554aeeb883ead99dc8f6c18e81a93942f5f4"}, + {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.7" + +[package.extras] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", 
"gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.9.2)"] + +[[package]] +name = "pandas-stubs" +version = "2.2.3.241009" +description = "Type annotations for pandas" +optional = true +python-versions = ">=3.10" +files = [ + {file = "pandas_stubs-2.2.3.241009-py3-none-any.whl", hash = "sha256:3a6f8f142105a42550be677ba741ba532621f4e0acad2155c0e7b2450f114cfa"}, + {file = "pandas_stubs-2.2.3.241009.tar.gz", hash = "sha256:d4ab618253f0acf78a5d0d2bfd6dffdd92d91a56a69bdc8144e5a5c6d25be3b5"}, +] + +[package.dependencies] +numpy = ">=1.23.5" +types-pytz = ">=2022.1.1" + +[[package]] +name = "pastel" +version = "0.2.1" +description = "Bring colors to your terminal." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "pastel-0.2.1-py2.py3-none-any.whl", hash = "sha256:4349225fcdf6c2bb34d483e523475de5bb04a5c10ef711263452cb37d7dd4364"}, + {file = "pastel-0.2.1.tar.gz", hash = "sha256:e6581ac04e973cac858828c6202c1e1e81fee1dc7de7683f3e1ffe0bfd8a573d"}, +] + +[[package]] +name = "pdf2image" +version = "1.16.3" +description = "A wrapper around the pdftoppm and pdftocairo command line tools to convert PDF to a PIL Image list." 
+optional = true +python-versions = "*" +files = [ + {file = "pdf2image-1.16.3-py3-none-any.whl", hash = "sha256:b6154164af3677211c22cbb38b2bd778b43aca02758e962fe1e231f6d3b0e380"}, + {file = "pdf2image-1.16.3.tar.gz", hash = "sha256:74208810c2cef4d9e347769b8e62a52303982ddb4f2dfd744c7ab4b940ae287e"}, +] + +[package.dependencies] +pillow = "*" + +[[package]] +name = "pdfminer-six" +version = "20221105" +description = "PDF parser and analyzer" +optional = true +python-versions = ">=3.6" +files = [ + {file = "pdfminer.six-20221105-py3-none-any.whl", hash = "sha256:1eaddd712d5b2732f8ac8486824533514f8ba12a0787b3d5fe1e686cd826532d"}, + {file = "pdfminer.six-20221105.tar.gz", hash = "sha256:8448ab7b939d18b64820478ecac5394f482d7a79f5f7eaa7703c6c959c175e1d"}, +] + +[package.dependencies] +charset-normalizer = ">=2.0.0" +cryptography = ">=36.0.0" + +[package.extras] +dev = ["black", "mypy (==0.931)", "nox", "pytest"] +docs = ["sphinx", "sphinx-argparse"] +image = ["Pillow"] + +[[package]] +name = "pdoc" +version = "15.0.0" +description = "API Documentation for Python Projects" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pdoc-15.0.0-py3-none-any.whl", hash = "sha256:151b0187a25eaf827099e981d6dbe3a4f68aeb18d0d637c24edcab788d5540f1"}, + {file = "pdoc-15.0.0.tar.gz", hash = "sha256:b761220d3ba129cd87e6da1bb7b62c8e799973ab9c595de7ba1a514850d86da5"}, +] + +[package.dependencies] +Jinja2 = ">=2.11.0" +MarkupSafe = ">=1.1.1" +pygments = ">=2.12.0" + +[[package]] +name = "pendulum" +version = "2.1.2" +description = "Python datetimes made easy" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "pendulum-2.1.2-cp27-cp27m-macosx_10_15_x86_64.whl", hash = "sha256:b6c352f4bd32dff1ea7066bd31ad0f71f8d8100b9ff709fb343f3b86cee43efe"}, + {file = "pendulum-2.1.2-cp27-cp27m-win_amd64.whl", hash = "sha256:318f72f62e8e23cd6660dbafe1e346950281a9aed144b5c596b2ddabc1d19739"}, + {file = "pendulum-2.1.2-cp35-cp35m-macosx_10_15_x86_64.whl", hash = "sha256:0731f0c661a3cb779d398803655494893c9f581f6488048b3fb629c2342b5394"}, + {file = "pendulum-2.1.2-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:3481fad1dc3f6f6738bd575a951d3c15d4b4ce7c82dce37cf8ac1483fde6e8b0"}, + {file = "pendulum-2.1.2-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:9702069c694306297ed362ce7e3c1ef8404ac8ede39f9b28b7c1a7ad8c3959e3"}, + {file = "pendulum-2.1.2-cp35-cp35m-win_amd64.whl", hash = "sha256:fb53ffa0085002ddd43b6ca61a7b34f2d4d7c3ed66f931fe599e1a531b42af9b"}, + {file = "pendulum-2.1.2-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:c501749fdd3d6f9e726086bf0cd4437281ed47e7bca132ddb522f86a1645d360"}, + {file = "pendulum-2.1.2-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:c807a578a532eeb226150d5006f156632df2cc8c5693d778324b43ff8c515dd0"}, + {file = "pendulum-2.1.2-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:2d1619a721df661e506eff8db8614016f0720ac171fe80dda1333ee44e684087"}, + {file = "pendulum-2.1.2-cp36-cp36m-win_amd64.whl", hash = "sha256:f888f2d2909a414680a29ae74d0592758f2b9fcdee3549887779cd4055e975db"}, + {file = "pendulum-2.1.2-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:e95d329384717c7bf627bf27e204bc3b15c8238fa8d9d9781d93712776c14002"}, + {file = "pendulum-2.1.2-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:4c9c689747f39d0d02a9f94fcee737b34a5773803a64a5fdb046ee9cac7442c5"}, + {file = "pendulum-2.1.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:1245cd0075a3c6d889f581f6325dd8404aca5884dea7223a5566c38aab94642b"}, + {file = 
"pendulum-2.1.2-cp37-cp37m-win_amd64.whl", hash = "sha256:db0a40d8bcd27b4fb46676e8eb3c732c67a5a5e6bfab8927028224fbced0b40b"}, + {file = "pendulum-2.1.2-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:f5e236e7730cab1644e1b87aca3d2ff3e375a608542e90fe25685dae46310116"}, + {file = "pendulum-2.1.2-cp38-cp38-manylinux1_i686.whl", hash = "sha256:de42ea3e2943171a9e95141f2eecf972480636e8e484ccffaf1e833929e9e052"}, + {file = "pendulum-2.1.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7c5ec650cb4bec4c63a89a0242cc8c3cebcec92fcfe937c417ba18277d8560be"}, + {file = "pendulum-2.1.2-cp38-cp38-win_amd64.whl", hash = "sha256:33fb61601083f3eb1d15edeb45274f73c63b3c44a8524703dc143f4212bf3269"}, + {file = "pendulum-2.1.2-cp39-cp39-manylinux1_i686.whl", hash = "sha256:29c40a6f2942376185728c9a0347d7c0f07905638c83007e1d262781f1e6953a"}, + {file = "pendulum-2.1.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:94b1fc947bfe38579b28e1cccb36f7e28a15e841f30384b5ad6c5e31055c85d7"}, + {file = "pendulum-2.1.2.tar.gz", hash = "sha256:b06a0ca1bfe41c990bbf0c029f0b6501a7f2ec4e38bfec730712015e8860f207"}, +] + +[package.dependencies] +python-dateutil = ">=2.6,<3.0" +pytzdata = ">=2020.1" + +[[package]] +name = "pillow" +version = "11.0.0" +description = "Python Imaging Library (Fork)" +optional = true +python-versions = ">=3.9" +files = [ + {file = "pillow-11.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:6619654954dc4936fcff82db8eb6401d3159ec6be81e33c6000dfd76ae189947"}, + {file = "pillow-11.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b3c5ac4bed7519088103d9450a1107f76308ecf91d6dabc8a33a2fcfb18d0fba"}, + {file = "pillow-11.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a65149d8ada1055029fcb665452b2814fe7d7082fcb0c5bed6db851cb69b2086"}, + {file = "pillow-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88a58d8ac0cc0e7f3a014509f0455248a76629ca9b604eca7dc5927cc593c5e9"}, + {file = "pillow-11.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c26845094b1af3c91852745ae78e3ea47abf3dbcd1cf962f16b9a5fbe3ee8488"}, + {file = "pillow-11.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:1a61b54f87ab5786b8479f81c4b11f4d61702830354520837f8cc791ebba0f5f"}, + {file = "pillow-11.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:674629ff60030d144b7bca2b8330225a9b11c482ed408813924619c6f302fdbb"}, + {file = "pillow-11.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:598b4e238f13276e0008299bd2482003f48158e2b11826862b1eb2ad7c768b97"}, + {file = "pillow-11.0.0-cp310-cp310-win32.whl", hash = "sha256:9a0f748eaa434a41fccf8e1ee7a3eed68af1b690e75328fd7a60af123c193b50"}, + {file = "pillow-11.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:a5629742881bcbc1f42e840af185fd4d83a5edeb96475a575f4da50d6ede337c"}, + {file = "pillow-11.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:ee217c198f2e41f184f3869f3e485557296d505b5195c513b2bfe0062dc537f1"}, + {file = "pillow-11.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1c1d72714f429a521d8d2d018badc42414c3077eb187a59579f28e4270b4b0fc"}, + {file = "pillow-11.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:499c3a1b0d6fc8213519e193796eb1a86a1be4b1877d678b30f83fd979811d1a"}, + {file = "pillow-11.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8b2351c85d855293a299038e1f89db92a2f35e8d2f783489c6f0b2b5f3fe8a3"}, + {file = "pillow-11.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:6f4dba50cfa56f910241eb7f883c20f1e7b1d8f7d91c750cd0b318bad443f4d5"}, + {file = "pillow-11.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:5ddbfd761ee00c12ee1be86c9c0683ecf5bb14c9772ddbd782085779a63dd55b"}, + {file = "pillow-11.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:45c566eb10b8967d71bf1ab8e4a525e5a93519e29ea071459ce517f6b903d7fa"}, + {file = "pillow-11.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b4fd7bd29610a83a8c9b564d457cf5bd92b4e11e79a4ee4716a63c959699b306"}, + {file = "pillow-11.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:cb929ca942d0ec4fac404cbf520ee6cac37bf35be479b970c4ffadf2b6a1cad9"}, + {file = "pillow-11.0.0-cp311-cp311-win32.whl", hash = "sha256:006bcdd307cc47ba43e924099a038cbf9591062e6c50e570819743f5607404f5"}, + {file = "pillow-11.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:52a2d8323a465f84faaba5236567d212c3668f2ab53e1c74c15583cf507a0291"}, + {file = "pillow-11.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:16095692a253047fe3ec028e951fa4221a1f3ed3d80c397e83541a3037ff67c9"}, + {file = "pillow-11.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d2c0a187a92a1cb5ef2c8ed5412dd8d4334272617f532d4ad4de31e0495bd923"}, + {file = "pillow-11.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:084a07ef0821cfe4858fe86652fffac8e187b6ae677e9906e192aafcc1b69903"}, + {file = "pillow-11.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8069c5179902dcdce0be9bfc8235347fdbac249d23bd90514b7a47a72d9fecf4"}, + {file = "pillow-11.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f02541ef64077f22bf4924f225c0fd1248c168f86e4b7abdedd87d6ebaceab0f"}, + {file = "pillow-11.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fcb4621042ac4b7865c179bb972ed0da0218a076dc1820ffc48b1d74c1e37fe9"}, + {file = "pillow-11.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:00177a63030d612148e659b55ba99527803288cea7c75fb05766ab7981a8c1b7"}, + {file = "pillow-11.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8853a3bf12afddfdf15f57c4b02d7ded92c7a75a5d7331d19f4f9572a89c17e6"}, + {file = "pillow-11.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3107c66e43bda25359d5ef446f59c497de2b5ed4c7fdba0894f8d6cf3822dafc"}, + {file = "pillow-11.0.0-cp312-cp312-win32.whl", hash = "sha256:86510e3f5eca0ab87429dd77fafc04693195eec7fd6a137c389c3eeb4cfb77c6"}, + {file = "pillow-11.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:8ec4a89295cd6cd4d1058a5e6aec6bf51e0eaaf9714774e1bfac7cfc9051db47"}, + {file = "pillow-11.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:27a7860107500d813fcd203b4ea19b04babe79448268403172782754870dac25"}, + {file = "pillow-11.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:bcd1fb5bb7b07f64c15618c89efcc2cfa3e95f0e3bcdbaf4642509de1942a699"}, + {file = "pillow-11.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0e038b0745997c7dcaae350d35859c9715c71e92ffb7e0f4a8e8a16732150f38"}, + {file = "pillow-11.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ae08bd8ffc41aebf578c2af2f9d8749d91f448b3bfd41d7d9ff573d74f2a6b2"}, + {file = "pillow-11.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d69bfd8ec3219ae71bcde1f942b728903cad25fafe3100ba2258b973bd2bc1b2"}, + {file = "pillow-11.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:61b887f9ddba63ddf62fd02a3ba7add935d053b6dd7d58998c630e6dbade8527"}, + {file = "pillow-11.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = 
"sha256:c6a660307ca9d4867caa8d9ca2c2658ab685de83792d1876274991adec7b93fa"}, + {file = "pillow-11.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:73e3a0200cdda995c7e43dd47436c1548f87a30bb27fb871f352a22ab8dcf45f"}, + {file = "pillow-11.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fba162b8872d30fea8c52b258a542c5dfd7b235fb5cb352240c8d63b414013eb"}, + {file = "pillow-11.0.0-cp313-cp313-win32.whl", hash = "sha256:f1b82c27e89fffc6da125d5eb0ca6e68017faf5efc078128cfaa42cf5cb38798"}, + {file = "pillow-11.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:8ba470552b48e5835f1d23ecb936bb7f71d206f9dfeee64245f30c3270b994de"}, + {file = "pillow-11.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:846e193e103b41e984ac921b335df59195356ce3f71dcfd155aa79c603873b84"}, + {file = "pillow-11.0.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4ad70c4214f67d7466bea6a08061eba35c01b1b89eaa098040a35272a8efb22b"}, + {file = "pillow-11.0.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6ec0d5af64f2e3d64a165f490d96368bb5dea8b8f9ad04487f9ab60dc4bb6003"}, + {file = "pillow-11.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c809a70e43c7977c4a42aefd62f0131823ebf7dd73556fa5d5950f5b354087e2"}, + {file = "pillow-11.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:4b60c9520f7207aaf2e1d94de026682fc227806c6e1f55bba7606d1c94dd623a"}, + {file = "pillow-11.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1e2688958a840c822279fda0086fec1fdab2f95bf2b717b66871c4ad9859d7e8"}, + {file = "pillow-11.0.0-cp313-cp313t-win32.whl", hash = "sha256:607bbe123c74e272e381a8d1957083a9463401f7bd01287f50521ecb05a313f8"}, + {file = "pillow-11.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5c39ed17edea3bc69c743a8dd3e9853b7509625c2462532e62baa0732163a904"}, + {file = "pillow-11.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:75acbbeb05b86bc53cbe7b7e6fe00fbcf82ad7c684b3ad82e3d711da9ba287d3"}, + {file = "pillow-11.0.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2e46773dc9f35a1dd28bd6981332fd7f27bec001a918a72a79b4133cf5291dba"}, + {file = "pillow-11.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2679d2258b7f1192b378e2893a8a0a0ca472234d4c2c0e6bdd3380e8dfa21b6a"}, + {file = "pillow-11.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eda2616eb2313cbb3eebbe51f19362eb434b18e3bb599466a1ffa76a033fb916"}, + {file = "pillow-11.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20ec184af98a121fb2da42642dea8a29ec80fc3efbaefb86d8fdd2606619045d"}, + {file = "pillow-11.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:8594f42df584e5b4bb9281799698403f7af489fba84c34d53d1c4bfb71b7c4e7"}, + {file = "pillow-11.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:c12b5ae868897c7338519c03049a806af85b9b8c237b7d675b8c5e089e4a618e"}, + {file = "pillow-11.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:70fbbdacd1d271b77b7721fe3cdd2d537bbbd75d29e6300c672ec6bb38d9672f"}, + {file = "pillow-11.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5178952973e588b3f1360868847334e9e3bf49d19e169bbbdfaf8398002419ae"}, + {file = "pillow-11.0.0-cp39-cp39-win32.whl", hash = "sha256:8c676b587da5673d3c75bd67dd2a8cdfeb282ca38a30f37950511766b26858c4"}, + {file = "pillow-11.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:94f3e1780abb45062287b4614a5bc0874519c86a777d4a7ad34978e86428b8dd"}, + {file = "pillow-11.0.0-cp39-cp39-win_arm64.whl", hash = "sha256:290f2cc809f9da7d6d622550bbf4c1e57518212da51b6a30fe8e0a270a5b78bd"}, + {file = 
"pillow-11.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1187739620f2b365de756ce086fdb3604573337cc28a0d3ac4a01ab6b2d2a6d2"}, + {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fbbcb7b57dc9c794843e3d1258c0fbf0f48656d46ffe9e09b63bbd6e8cd5d0a2"}, + {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d203af30149ae339ad1b4f710d9844ed8796e97fda23ffbc4cc472968a47d0b"}, + {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21a0d3b115009ebb8ac3d2ebec5c2982cc693da935f4ab7bb5c8ebe2f47d36f2"}, + {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:73853108f56df97baf2bb8b522f3578221e56f646ba345a372c78326710d3830"}, + {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e58876c91f97b0952eb766123bfef372792ab3f4e3e1f1a2267834c2ab131734"}, + {file = "pillow-11.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:224aaa38177597bb179f3ec87eeefcce8e4f85e608025e9cfac60de237ba6316"}, + {file = "pillow-11.0.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:5bd2d3bdb846d757055910f0a59792d33b555800813c3b39ada1829c372ccb06"}, + {file = "pillow-11.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:375b8dd15a1f5d2feafff536d47e22f69625c1aa92f12b339ec0b2ca40263273"}, + {file = "pillow-11.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:daffdf51ee5db69a82dd127eabecce20729e21f7a3680cf7cbb23f0829189790"}, + {file = "pillow-11.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7326a1787e3c7b0429659e0a944725e1b03eeaa10edd945a86dead1913383944"}, + {file = "pillow-11.0.0.tar.gz", hash = "sha256:72bacbaf24ac003fea9bff9837d1eedb6088758d41e100c1552930151f677739"}, +] + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=8.1)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] +fpx = ["olefile"] +mic = ["olefile"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] +typing = ["typing-extensions"] +xmp = ["defusedxml"] + +[[package]] +name = "platformdirs" +version = "4.3.6" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, + {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, +] + +[package.extras] +docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] +type = ["mypy (>=1.11.2)"] + +[[package]] +name = "plotly" +version = "5.24.1" +description = "An open-source, interactive data visualization library for Python" +optional = true +python-versions = ">=3.8" +files = [ + {file = "plotly-5.24.1-py3-none-any.whl", hash = "sha256:f67073a1e637eb0dc3e46324d9d51e2fe76e9727c892dde64ddf1e1b51f29089"}, + {file = "plotly-5.24.1.tar.gz", hash = "sha256:dbc8ac8339d248a4bcc36e08a5659bacfe1b079390b8953533f4eb22169b4bae"}, +] + +[package.dependencies] +packaging = "*" +tenacity = ">=6.2.0" + +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "poethepoet" +version = "0.24.4" +description = "A task runner that works well with poetry." +optional = false +python-versions = ">=3.8" +files = [ + {file = "poethepoet-0.24.4-py3-none-any.whl", hash = "sha256:fb4ea35d7f40fe2081ea917d2e4102e2310fda2cde78974050ca83896e229075"}, + {file = "poethepoet-0.24.4.tar.gz", hash = "sha256:ff4220843a87c888cbcb5312c8905214701d0af60ac7271795baa8369b428fef"}, +] + +[package.dependencies] +pastel = ">=0.2.1,<0.3.0" +tomli = ">=1.2.2" + +[package.extras] +poetry-plugin = ["poetry (>=1.0,<2.0)"] + +[[package]] +name = "propcache" +version = "0.2.0" +description = "Accelerated property cache" +optional = true +python-versions = ">=3.8" +files = [ + {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58"}, + {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b"}, + {file = "propcache-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:33ac8f098df0585c0b53009f039dfd913b38c1d2edafed0cedcc0c32a05aa110"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97e48e8875e6c13909c800fa344cd54cc4b2b0db1d5f911f840458a500fde2c2"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388f3217649d6d59292b722d940d4d2e1e6a7003259eb835724092a1cca0203a"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f571aea50ba5623c308aa146eb650eebf7dbe0fd8c5d946e28343cb3b5aad577"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3dfafb44f7bb35c0c06eda6b2ab4bfd58f02729e7c4045e179f9a861b07c9850"}, + {file = 
"propcache-0.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3ebe9a75be7ab0b7da2464a77bb27febcb4fab46a34f9288f39d74833db7f61"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d2f0d0f976985f85dfb5f3d685697ef769faa6b71993b46b295cdbbd6be8cc37"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:a3dc1a4b165283bd865e8f8cb5f0c64c05001e0718ed06250d8cac9bec115b48"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9e0f07b42d2a50c7dd2d8675d50f7343d998c64008f1da5fef888396b7f84630"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e63e3e1e0271f374ed489ff5ee73d4b6e7c60710e1f76af5f0e1a6117cd26394"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:56bb5c98f058a41bb58eead194b4db8c05b088c93d94d5161728515bd52b052b"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7665f04d0c7f26ff8bb534e1c65068409bf4687aa2534faf7104d7182debb336"}, + {file = "propcache-0.2.0-cp310-cp310-win32.whl", hash = "sha256:7cf18abf9764746b9c8704774d8b06714bcb0a63641518a3a89c7f85cc02c2ad"}, + {file = "propcache-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:cfac69017ef97db2438efb854edf24f5a29fd09a536ff3a992b75990720cdc99"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:63f13bf09cc3336eb04a837490b8f332e0db41da66995c9fd1ba04552e516354"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:608cce1da6f2672a56b24a015b42db4ac612ee709f3d29f27a00c943d9e851de"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:466c219deee4536fbc83c08d09115249db301550625c7fef1c5563a584c9bc87"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc2db02409338bf36590aa985a461b2c96fce91f8e7e0f14c50c5fcc4f229016"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a6ed8db0a556343d566a5c124ee483ae113acc9a557a807d439bcecc44e7dfbb"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:91997d9cb4a325b60d4e3f20967f8eb08dfcb32b22554d5ef78e6fd1dda743a2"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c7dde9e533c0a49d802b4f3f218fa9ad0a1ce21f2c2eb80d5216565202acab4"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffcad6c564fe6b9b8916c1aefbb37a362deebf9394bd2974e9d84232e3e08504"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:97a58a28bcf63284e8b4d7b460cbee1edaab24634e82059c7b8c09e65284f178"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:945db8ee295d3af9dbdbb698cce9bbc5c59b5c3fe328bbc4387f59a8a35f998d"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:39e104da444a34830751715f45ef9fc537475ba21b7f1f5b0f4d71a3b60d7fe2"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c5ecca8f9bab618340c8e848d340baf68bcd8ad90a8ecd7a4524a81c1764b3db"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:c436130cc779806bdf5d5fae0d848713105472b8566b75ff70048c47d3961c5b"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:191db28dc6dcd29d1a3e063c3be0b40688ed76434622c53a284e5427565bbd9b"}, + {file = "propcache-0.2.0-cp311-cp311-win32.whl", hash = "sha256:5f2564ec89058ee7c7989a7b719115bdfe2a2fb8e7a4543b8d1c0cc4cf6478c1"}, + {file = "propcache-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6e2e54267980349b723cff366d1e29b138b9a60fa376664a157a342689553f71"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2ee7606193fb267be4b2e3b32714f2d58cad27217638db98a60f9efb5efeccc2"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:91ee8fc02ca52e24bcb77b234f22afc03288e1dafbb1f88fe24db308910c4ac7"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e900bad2a8456d00a113cad8c13343f3b1f327534e3589acc2219729237a2e8"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f52a68c21363c45297aca15561812d542f8fc683c85201df0bebe209e349f793"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e41d67757ff4fbc8ef2af99b338bfb955010444b92929e9e55a6d4dcc3c4f09"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a64e32f8bd94c105cc27f42d3b658902b5bcc947ece3c8fe7bc1b05982f60e89"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55346705687dbd7ef0d77883ab4f6fabc48232f587925bdaf95219bae072491e"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00181262b17e517df2cd85656fcd6b4e70946fe62cd625b9d74ac9977b64d8d9"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6994984550eaf25dd7fc7bd1b700ff45c894149341725bb4edc67f0ffa94efa4"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:56295eb1e5f3aecd516d91b00cfd8bf3a13991de5a479df9e27dd569ea23959c"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:439e76255daa0f8151d3cb325f6dd4a3e93043e6403e6491813bcaaaa8733887"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f6475a1b2ecb310c98c28d271a30df74f9dd436ee46d09236a6b750a7599ce57"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3444cdba6628accf384e349014084b1cacd866fbb88433cd9d279d90a54e0b23"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4a9d9b4d0a9b38d1c391bb4ad24aa65f306c6f01b512e10a8a34a2dc5675d348"}, + {file = "propcache-0.2.0-cp312-cp312-win32.whl", hash = "sha256:69d3a98eebae99a420d4b28756c8ce6ea5a29291baf2dc9ff9414b42676f61d5"}, + {file = "propcache-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:ad9c9b99b05f163109466638bd30ada1722abb01bbb85c739c50b6dc11f92dc3"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ecddc221a077a8132cf7c747d5352a15ed763b674c0448d811f408bf803d9ad7"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0e53cb83fdd61cbd67202735e6a6687a7b491c8742dfc39c9e01e80354956763"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92fe151145a990c22cbccf9ae15cae8ae9eddabfc949a219c9f667877e40853d"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6a21ef516d36909931a2967621eecb256018aeb11fc48656e3257e73e2e247a"}, + {file = 
"propcache-0.2.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f88a4095e913f98988f5b338c1d4d5d07dbb0b6bad19892fd447484e483ba6b"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a5b3bb545ead161be780ee85a2b54fdf7092815995661947812dde94a40f6fb"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67aeb72e0f482709991aa91345a831d0b707d16b0257e8ef88a2ad246a7280bf"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c997f8c44ec9b9b0bcbf2d422cc00a1d9b9c681f56efa6ca149a941e5560da2"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2a66df3d4992bc1d725b9aa803e8c5a66c010c65c741ad901e260ece77f58d2f"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:3ebbcf2a07621f29638799828b8d8668c421bfb94c6cb04269130d8de4fb7136"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1235c01ddaa80da8235741e80815ce381c5267f96cc49b1477fdcf8c047ef325"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3947483a381259c06921612550867b37d22e1df6d6d7e8361264b6d037595f44"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d5bed7f9805cc29c780f3aee05de3262ee7ce1f47083cfe9f77471e9d6777e83"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4a91d44379f45f5e540971d41e4626dacd7f01004826a18cb048e7da7e96544"}, + {file = "propcache-0.2.0-cp313-cp313-win32.whl", hash = "sha256:f902804113e032e2cdf8c71015651c97af6418363bea8d78dc0911d56c335032"}, + {file = "propcache-0.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:8f188cfcc64fb1266f4684206c9de0e80f54622c3f22a910cbd200478aeae61e"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:53d1bd3f979ed529f0805dd35ddaca330f80a9a6d90bc0121d2ff398f8ed8861"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:83928404adf8fb3d26793665633ea79b7361efa0287dfbd372a7e74311d51ee6"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:77a86c261679ea5f3896ec060be9dc8e365788248cc1e049632a1be682442063"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:218db2a3c297a3768c11a34812e63b3ac1c3234c3a086def9c0fee50d35add1f"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7735e82e3498c27bcb2d17cb65d62c14f1100b71723b68362872bca7d0913d90"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:20a617c776f520c3875cf4511e0d1db847a076d720714ae35ffe0df3e440be68"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67b69535c870670c9f9b14a75d28baa32221d06f6b6fa6f77a0a13c5a7b0a5b9"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4569158070180c3855e9c0791c56be3ceeb192defa2cdf6a3f39e54319e56b89"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:db47514ffdbd91ccdc7e6f8407aac4ee94cc871b15b577c1c324236b013ddd04"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_armv7l.whl", hash = "sha256:2a60ad3e2553a74168d275a0ef35e8c0a965448ffbc3b300ab3a5bb9956c2162"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_i686.whl", hash = 
"sha256:662dd62358bdeaca0aee5761de8727cfd6861432e3bb828dc2a693aa0471a563"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:25a1f88b471b3bc911d18b935ecb7115dff3a192b6fef46f0bfaf71ff4f12418"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:f60f0ac7005b9f5a6091009b09a419ace1610e163fa5deaba5ce3484341840e7"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:74acd6e291f885678631b7ebc85d2d4aec458dd849b8c841b57ef04047833bed"}, + {file = "propcache-0.2.0-cp38-cp38-win32.whl", hash = "sha256:d9b6ddac6408194e934002a69bcaadbc88c10b5f38fb9307779d1c629181815d"}, + {file = "propcache-0.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:676135dcf3262c9c5081cc8f19ad55c8a64e3f7282a21266d05544450bffc3a5"}, + {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:25c8d773a62ce0451b020c7b29a35cfbc05de8b291163a7a0f3b7904f27253e6"}, + {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:375a12d7556d462dc64d70475a9ee5982465fbb3d2b364f16b86ba9135793638"}, + {file = "propcache-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1ec43d76b9677637a89d6ab86e1fef70d739217fefa208c65352ecf0282be957"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f45eec587dafd4b2d41ac189c2156461ebd0c1082d2fe7013571598abb8505d1"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc092ba439d91df90aea38168e11f75c655880c12782facf5cf9c00f3d42b562"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa1076244f54bb76e65e22cb6910365779d5c3d71d1f18b275f1dfc7b0d71b4d"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:682a7c79a2fbf40f5dbb1eb6bfe2cd865376deeac65acf9beb607505dced9e12"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e40876731f99b6f3c897b66b803c9e1c07a989b366c6b5b475fafd1f7ba3fb8"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:363ea8cd3c5cb6679f1c2f5f1f9669587361c062e4899fce56758efa928728f8"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:140fbf08ab3588b3468932974a9331aff43c0ab8a2ec2c608b6d7d1756dbb6cb"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e70fac33e8b4ac63dfc4c956fd7d85a0b1139adcfc0d964ce288b7c527537fea"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b33d7a286c0dc1a15f5fc864cc48ae92a846df287ceac2dd499926c3801054a6"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f6d5749fdd33d90e34c2efb174c7e236829147a2713334d708746e94c4bde40d"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22aa8f2272d81d9317ff5756bb108021a056805ce63dd3630e27d042c8092798"}, + {file = "propcache-0.2.0-cp39-cp39-win32.whl", hash = "sha256:73e4b40ea0eda421b115248d7e79b59214411109a5bc47d0d48e4c73e3b8fcf9"}, + {file = "propcache-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:9517d5e9e0731957468c29dbfd0f976736a0e55afaea843726e887f36fe017df"}, + {file = "propcache-0.2.0-py3-none-any.whl", hash = "sha256:2ccc28197af5313706511fab3a8b66dcd6da067a1331372c82ea1cb74285e036"}, + {file = "propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70"}, +] + +[[package]] +name = "psutil" +version = "6.1.0" 
+description = "Cross-platform lib for process and system monitoring in Python." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +files = [ + {file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"}, + {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"}, + {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:000d1d1ebd634b4efb383f4034437384e44a6d455260aaee2eca1e9c1b55f047"}, + {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:5cd2bcdc75b452ba2e10f0e8ecc0b57b827dd5d7aaffbc6821b2a9a242823a76"}, + {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:045f00a43c737f960d273a83973b2511430d61f283a44c96bf13a6e829ba8fdc"}, + {file = "psutil-6.1.0-cp27-none-win32.whl", hash = "sha256:9118f27452b70bb1d9ab3198c1f626c2499384935aaf55388211ad982611407e"}, + {file = "psutil-6.1.0-cp27-none-win_amd64.whl", hash = "sha256:a8506f6119cff7015678e2bce904a4da21025cc70ad283a53b099e7620061d85"}, + {file = "psutil-6.1.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6e2dcd475ce8b80522e51d923d10c7871e45f20918e027ab682f94f1c6351688"}, + {file = "psutil-6.1.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0895b8414afafc526712c498bd9de2b063deaac4021a3b3c34566283464aff8e"}, + {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9dcbfce5d89f1d1f2546a2090f4fcf87c7f669d1d90aacb7d7582addece9fb38"}, + {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:498c6979f9c6637ebc3a73b3f87f9eb1ec24e1ce53a7c5173b8508981614a90b"}, + {file = "psutil-6.1.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d905186d647b16755a800e7263d43df08b790d709d575105d419f8b6ef65423a"}, + {file = "psutil-6.1.0-cp36-cp36m-win32.whl", hash = "sha256:6d3fbbc8d23fcdcb500d2c9f94e07b1342df8ed71b948a2649b5cb060a7c94ca"}, + {file = "psutil-6.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:1209036fbd0421afde505a4879dee3b2fd7b1e14fee81c0069807adcbbcca747"}, + {file = "psutil-6.1.0-cp37-abi3-win32.whl", hash = "sha256:1ad45a1f5d0b608253b11508f80940985d1d0c8f6111b5cb637533a0e6ddc13e"}, + {file = "psutil-6.1.0-cp37-abi3-win_amd64.whl", hash = "sha256:a8fb3752b491d246034fa4d279ff076501588ce8cbcdbb62c32fd7a377d996be"}, + {file = "psutil-6.1.0.tar.gz", hash = "sha256:353815f59a7f64cdaca1c0307ee13558a0512f6db064e92fe833784f08539c7a"}, +] + +[package.extras] +dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "wheel"] +test = ["pytest", "pytest-xdist", "setuptools"] + +[[package]] +name = "pyarrow" +version = "15.0.2" +description = "Python library for Apache Arrow" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pyarrow-15.0.2-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:88b340f0a1d05b5ccc3d2d986279045655b1fe8e41aba6ca44ea28da0d1455d8"}, + {file = "pyarrow-15.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eaa8f96cecf32da508e6c7f69bb8401f03745c050c1dd42ec2596f2e98deecac"}, + {file = "pyarrow-15.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:23c6753ed4f6adb8461e7c383e418391b8d8453c5d67e17f416c3a5d5709afbd"}, + {file = "pyarrow-15.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f639c059035011db8c0497e541a8a45d98a58dbe34dc8fadd0ef128f2cee46e5"}, + {file = "pyarrow-15.0.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:290e36a59a0993e9a5224ed2fb3e53375770f07379a0ea03ee2fce2e6d30b423"}, + {file = "pyarrow-15.0.2-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:06c2bb2a98bc792f040bef31ad3e9be6a63d0cb39189227c08a7d955db96816e"}, + {file = "pyarrow-15.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:f7a197f3670606a960ddc12adbe8075cea5f707ad7bf0dffa09637fdbb89f76c"}, + {file = "pyarrow-15.0.2-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:5f8bc839ea36b1f99984c78e06e7a06054693dc2af8920f6fb416b5bca9944e4"}, + {file = "pyarrow-15.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f5e81dfb4e519baa6b4c80410421528c214427e77ca0ea9461eb4097c328fa33"}, + {file = "pyarrow-15.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a4f240852b302a7af4646c8bfe9950c4691a419847001178662a98915fd7ee7"}, + {file = "pyarrow-15.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e7d9cfb5a1e648e172428c7a42b744610956f3b70f524aa3a6c02a448ba853e"}, + {file = "pyarrow-15.0.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:2d4f905209de70c0eb5b2de6763104d5a9a37430f137678edfb9a675bac9cd98"}, + {file = "pyarrow-15.0.2-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:90adb99e8ce5f36fbecbbc422e7dcbcbed07d985eed6062e459e23f9e71fd197"}, + {file = "pyarrow-15.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:b116e7fd7889294cbd24eb90cd9bdd3850be3738d61297855a71ac3b8124ee38"}, + {file = "pyarrow-15.0.2-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:25335e6f1f07fdaa026a61c758ee7d19ce824a866b27bba744348fa73bb5a440"}, + {file = "pyarrow-15.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:90f19e976d9c3d8e73c80be84ddbe2f830b6304e4c576349d9360e335cd627fc"}, + {file = "pyarrow-15.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a22366249bf5fd40ddacc4f03cd3160f2d7c247692945afb1899bab8a140ddfb"}, + {file = "pyarrow-15.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c2a335198f886b07e4b5ea16d08ee06557e07db54a8400cc0d03c7f6a22f785f"}, + {file = "pyarrow-15.0.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:3e6d459c0c22f0b9c810a3917a1de3ee704b021a5fb8b3bacf968eece6df098f"}, + {file = "pyarrow-15.0.2-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:033b7cad32198754d93465dcfb71d0ba7cb7cd5c9afd7052cab7214676eec38b"}, + {file = "pyarrow-15.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:29850d050379d6e8b5a693098f4de7fd6a2bea4365bfd073d7c57c57b95041ee"}, + {file = "pyarrow-15.0.2-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:7167107d7fb6dcadb375b4b691b7e316f4368f39f6f45405a05535d7ad5e5058"}, + {file = "pyarrow-15.0.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e85241b44cc3d365ef950432a1b3bd44ac54626f37b2e3a0cc89c20e45dfd8bf"}, + {file = "pyarrow-15.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:248723e4ed3255fcd73edcecc209744d58a9ca852e4cf3d2577811b6d4b59818"}, + {file = "pyarrow-15.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ff3bdfe6f1b81ca5b73b70a8d482d37a766433823e0c21e22d1d7dde76ca33f"}, + {file = "pyarrow-15.0.2-cp38-cp38-manylinux_2_28_aarch64.whl", hash = 
"sha256:f3d77463dee7e9f284ef42d341689b459a63ff2e75cee2b9302058d0d98fe142"}, + {file = "pyarrow-15.0.2-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:8c1faf2482fb89766e79745670cbca04e7018497d85be9242d5350cba21357e1"}, + {file = "pyarrow-15.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:28f3016958a8e45a1069303a4a4f6a7d4910643fc08adb1e2e4a7ff056272ad3"}, + {file = "pyarrow-15.0.2-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:89722cb64286ab3d4daf168386f6968c126057b8c7ec3ef96302e81d8cdb8ae4"}, + {file = "pyarrow-15.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cd0ba387705044b3ac77b1b317165c0498299b08261d8122c96051024f953cd5"}, + {file = "pyarrow-15.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad2459bf1f22b6a5cdcc27ebfd99307d5526b62d217b984b9f5c974651398832"}, + {file = "pyarrow-15.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58922e4bfece8b02abf7159f1f53a8f4d9f8e08f2d988109126c17c3bb261f22"}, + {file = "pyarrow-15.0.2-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:adccc81d3dc0478ea0b498807b39a8d41628fa9210729b2f718b78cb997c7c91"}, + {file = "pyarrow-15.0.2-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:8bd2baa5fe531571847983f36a30ddbf65261ef23e496862ece83bdceb70420d"}, + {file = "pyarrow-15.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6669799a1d4ca9da9c7e06ef48368320f5856f36f9a4dd31a11839dda3f6cc8c"}, + {file = "pyarrow-15.0.2.tar.gz", hash = "sha256:9c9bc803cb3b7bfacc1e96ffbfd923601065d9d3f911179d81e72d99fd74a3d9"}, +] + +[package.dependencies] +numpy = ">=1.16.6,<2" + +[[package]] +name = "pycodestyle" +version = "2.11.1" +description = "Python style guide checker" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pycodestyle-2.11.1-py2.py3-none-any.whl", hash = "sha256:44fe31000b2d866f2e41841b18528a505fbd7fef9017b04eff4e2648a0fadc67"}, + {file = "pycodestyle-2.11.1.tar.gz", hash = "sha256:41ba0e7afc9752dfb53ced5489e89f8186be00e599e712660695b7a75ff2663f"}, +] + +[[package]] +name = "pycparser" +version = "2.22" +description = "C parser in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, + {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, +] + +[[package]] +name = "pydantic" +version = "2.9.2" +description = "Data validation using Python type hints" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic-2.9.2-py3-none-any.whl", hash = "sha256:f048cec7b26778210e28a0459867920654d48e5e62db0958433636cde4254f12"}, + {file = "pydantic-2.9.2.tar.gz", hash = "sha256:d155cef71265d1e9807ed1c32b4c8deec042a44a50a4188b25ac67ecd81a9c0f"}, +] + +[package.dependencies] +annotated-types = ">=0.6.0" +pydantic-core = "2.23.4" +typing-extensions = [ + {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, +] + +[package.extras] +email = ["email-validator (>=2.0.0)"] +timezone = ["tzdata"] + +[[package]] +name = "pydantic-core" +version = "2.23.4" +description = "Core functionality for Pydantic validation and serialization" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic_core-2.23.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:b10bd51f823d891193d4717448fab065733958bdb6a6b351967bd349d48d5c9b"}, + {file = "pydantic_core-2.23.4-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:4fc714bdbfb534f94034efaa6eadd74e5b93c8fa6315565a222f7b6f42ca1166"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63e46b3169866bd62849936de036f901a9356e36376079b05efa83caeaa02ceb"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed1a53de42fbe34853ba90513cea21673481cd81ed1be739f7f2efb931b24916"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cfdd16ab5e59fc31b5e906d1a3f666571abc367598e3e02c83403acabc092e07"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:255a8ef062cbf6674450e668482456abac99a5583bbafb73f9ad469540a3a232"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a7cd62e831afe623fbb7aabbb4fe583212115b3ef38a9f6b71869ba644624a2"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f09e2ff1f17c2b51f2bc76d1cc33da96298f0a036a137f5440ab3ec5360b624f"}, + {file = "pydantic_core-2.23.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e38e63e6f3d1cec5a27e0afe90a085af8b6806ee208b33030e65b6516353f1a3"}, + {file = "pydantic_core-2.23.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0dbd8dbed2085ed23b5c04afa29d8fd2771674223135dc9bc937f3c09284d071"}, + {file = "pydantic_core-2.23.4-cp310-none-win32.whl", hash = "sha256:6531b7ca5f951d663c339002e91aaebda765ec7d61b7d1e3991051906ddde119"}, + {file = "pydantic_core-2.23.4-cp310-none-win_amd64.whl", hash = "sha256:7c9129eb40958b3d4500fa2467e6a83356b3b61bfff1b414c7361d9220f9ae8f"}, + {file = "pydantic_core-2.23.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:77733e3892bb0a7fa797826361ce8a9184d25c8dffaec60b7ffe928153680ba8"}, + {file = "pydantic_core-2.23.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b84d168f6c48fabd1f2027a3d1bdfe62f92cade1fb273a5d68e621da0e44e6d"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df49e7a0861a8c36d089c1ed57d308623d60416dab2647a4a17fe050ba85de0e"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ff02b6d461a6de369f07ec15e465a88895f3223eb75073ffea56b84d9331f607"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:996a38a83508c54c78a5f41456b0103c30508fed9abcad0a59b876d7398f25fd"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d97683ddee4723ae8c95d1eddac7c192e8c552da0c73a925a89fa8649bf13eea"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:216f9b2d7713eb98cb83c80b9c794de1f6b7e3145eef40400c62e86cee5f4e1e"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6f783e0ec4803c787bcea93e13e9932edab72068f68ecffdf86a99fd5918878b"}, + {file = "pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d0776dea117cf5272382634bd2a5c1b6eb16767c223c6a5317cd3e2a757c61a0"}, + {file = "pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d5f7a395a8cf1621939692dba2a6b6a830efa6b3cee787d82c7de1ad2930de64"}, + {file = "pydantic_core-2.23.4-cp311-none-win32.whl", hash = "sha256:74b9127ffea03643e998e0c5ad9bd3811d3dac8c676e47db17b0ee7c3c3bf35f"}, + {file = 
"pydantic_core-2.23.4-cp311-none-win_amd64.whl", hash = "sha256:98d134c954828488b153d88ba1f34e14259284f256180ce659e8d83e9c05eaa3"}, + {file = "pydantic_core-2.23.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f3e0da4ebaef65158d4dfd7d3678aad692f7666877df0002b8a522cdf088f231"}, + {file = "pydantic_core-2.23.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f69a8e0b033b747bb3e36a44e7732f0c99f7edd5cea723d45bc0d6e95377ffee"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:723314c1d51722ab28bfcd5240d858512ffd3116449c557a1336cbe3919beb87"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bb2802e667b7051a1bebbfe93684841cc9351004e2badbd6411bf357ab8d5ac8"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d18ca8148bebe1b0a382a27a8ee60350091a6ddaf475fa05ef50dc35b5df6327"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33e3d65a85a2a4a0dc3b092b938a4062b1a05f3a9abde65ea93b233bca0e03f2"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:128585782e5bfa515c590ccee4b727fb76925dd04a98864182b22e89a4e6ed36"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:68665f4c17edcceecc112dfed5dbe6f92261fb9d6054b47d01bf6371a6196126"}, + {file = "pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:20152074317d9bed6b7a95ade3b7d6054845d70584216160860425f4fbd5ee9e"}, + {file = "pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9261d3ce84fa1d38ed649c3638feefeae23d32ba9182963e465d58d62203bd24"}, + {file = "pydantic_core-2.23.4-cp312-none-win32.whl", hash = "sha256:4ba762ed58e8d68657fc1281e9bb72e1c3e79cc5d464be146e260c541ec12d84"}, + {file = "pydantic_core-2.23.4-cp312-none-win_amd64.whl", hash = "sha256:97df63000f4fea395b2824da80e169731088656d1818a11b95f3b173747b6cd9"}, + {file = "pydantic_core-2.23.4-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7530e201d10d7d14abce4fb54cfe5b94a0aefc87da539d0346a484ead376c3cc"}, + {file = "pydantic_core-2.23.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:df933278128ea1cd77772673c73954e53a1c95a4fdf41eef97c2b779271bd0bd"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cb3da3fd1b6a5d0279a01877713dbda118a2a4fc6f0d821a57da2e464793f05"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42c6dcb030aefb668a2b7009c85b27f90e51e6a3b4d5c9bc4c57631292015b0d"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:696dd8d674d6ce621ab9d45b205df149399e4bb9aa34102c970b721554828510"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2971bb5ffe72cc0f555c13e19b23c85b654dd2a8f7ab493c262071377bfce9f6"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8394d940e5d400d04cad4f75c0598665cbb81aecefaca82ca85bd28264af7f9b"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0dff76e0602ca7d4cdaacc1ac4c005e0ce0dcfe095d5b5259163a80d3a10d327"}, + {file = "pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = 
"sha256:7d32706badfe136888bdea71c0def994644e09fff0bfe47441deaed8e96fdbc6"}, + {file = "pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed541d70698978a20eb63d8c5d72f2cc6d7079d9d90f6b50bad07826f1320f5f"}, + {file = "pydantic_core-2.23.4-cp313-none-win32.whl", hash = "sha256:3d5639516376dce1940ea36edf408c554475369f5da2abd45d44621cb616f769"}, + {file = "pydantic_core-2.23.4-cp313-none-win_amd64.whl", hash = "sha256:5a1504ad17ba4210df3a045132a7baeeba5a200e930f57512ee02909fc5c4cb5"}, + {file = "pydantic_core-2.23.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:d4488a93b071c04dc20f5cecc3631fc78b9789dd72483ba15d423b5b3689b555"}, + {file = "pydantic_core-2.23.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:81965a16b675b35e1d09dd14df53f190f9129c0202356ed44ab2728b1c905658"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ffa2ebd4c8530079140dd2d7f794a9d9a73cbb8e9d59ffe24c63436efa8f271"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:61817945f2fe7d166e75fbfb28004034b48e44878177fc54d81688e7b85a3665"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:29d2c342c4bc01b88402d60189f3df065fb0dda3654744d5a165a5288a657368"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5e11661ce0fd30a6790e8bcdf263b9ec5988e95e63cf901972107efc49218b13"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d18368b137c6295db49ce7218b1a9ba15c5bc254c96d7c9f9e924a9bc7825ad"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ec4e55f79b1c4ffb2eecd8a0cfba9955a2588497d96851f4c8f99aa4a1d39b12"}, + {file = "pydantic_core-2.23.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:374a5e5049eda9e0a44c696c7ade3ff355f06b1fe0bb945ea3cac2bc336478a2"}, + {file = "pydantic_core-2.23.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5c364564d17da23db1106787675fc7af45f2f7b58b4173bfdd105564e132e6fb"}, + {file = "pydantic_core-2.23.4-cp38-none-win32.whl", hash = "sha256:d7a80d21d613eec45e3d41eb22f8f94ddc758a6c4720842dc74c0581f54993d6"}, + {file = "pydantic_core-2.23.4-cp38-none-win_amd64.whl", hash = "sha256:5f5ff8d839f4566a474a969508fe1c5e59c31c80d9e140566f9a37bba7b8d556"}, + {file = "pydantic_core-2.23.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a4fa4fc04dff799089689f4fd502ce7d59de529fc2f40a2c8836886c03e0175a"}, + {file = "pydantic_core-2.23.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0a7df63886be5e270da67e0966cf4afbae86069501d35c8c1b3b6c168f42cb36"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcedcd19a557e182628afa1d553c3895a9f825b936415d0dbd3cd0bbcfd29b4b"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f54b118ce5de9ac21c363d9b3caa6c800341e8c47a508787e5868c6b79c9323"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86d2f57d3e1379a9525c5ab067b27dbb8a0642fb5d454e17a9ac434f9ce523e3"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de6d1d1b9e5101508cb37ab0d972357cac5235f5c6533d1071964c47139257df"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:1278e0d324f6908e872730c9102b0112477a7f7cf88b308e4fc36ce1bdb6d58c"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a6b5099eeec78827553827f4c6b8615978bb4b6a88e5d9b93eddf8bb6790f55"}, + {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e55541f756f9b3ee346b840103f32779c695a19826a4c442b7954550a0972040"}, + {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a5c7ba8ffb6d6f8f2ab08743be203654bb1aaa8c9dcb09f82ddd34eadb695605"}, + {file = "pydantic_core-2.23.4-cp39-none-win32.whl", hash = "sha256:37b0fe330e4a58d3c58b24d91d1eb102aeec675a3db4c292ec3928ecd892a9a6"}, + {file = "pydantic_core-2.23.4-cp39-none-win_amd64.whl", hash = "sha256:1498bec4c05c9c787bde9125cfdcc63a41004ff167f495063191b863399b1a29"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f455ee30a9d61d3e1a15abd5068827773d6e4dc513e795f380cdd59932c782d5"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1e90d2e3bd2c3863d48525d297cd143fe541be8bbf6f579504b9712cb6b643ec"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e203fdf807ac7e12ab59ca2bfcabb38c7cf0b33c41efeb00f8e5da1d86af480"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e08277a400de01bc72436a0ccd02bdf596631411f592ad985dcee21445bd0068"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f220b0eea5965dec25480b6333c788fb72ce5f9129e8759ef876a1d805d00801"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d06b0c8da4f16d1d1e352134427cb194a0a6e19ad5db9161bf32b2113409e728"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ba1a0996f6c2773bd83e63f18914c1de3c9dd26d55f4ac302a7efe93fb8e7433"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:9a5bce9d23aac8f0cf0836ecfc033896aa8443b501c58d0602dbfd5bd5b37753"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:78ddaaa81421a29574a682b3179d4cf9e6d405a09b99d93ddcf7e5239c742e21"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:883a91b5dd7d26492ff2f04f40fbb652de40fcc0afe07e8129e8ae779c2110eb"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88ad334a15b32a791ea935af224b9de1bf99bcd62fabf745d5f3442199d86d59"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:233710f069d251feb12a56da21e14cca67994eab08362207785cf8c598e74577"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:19442362866a753485ba5e4be408964644dd6a09123d9416c54cd49171f50744"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:624e278a7d29b6445e4e813af92af37820fafb6dcc55c012c834f9e26f9aaaef"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f5ef8f42bec47f21d07668a043f077d507e5bf4e668d5c6dfe6aaba89de1a5b8"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:aea443fffa9fbe3af1a9ba721a87f926fe548d32cab71d188a6ede77d0ff244e"}, + {file = "pydantic_core-2.23.4.tar.gz", hash = 
"sha256:2584f7cf844ac4d970fba483a717dbe10c1c1c96a969bf65d61ffe94df1b2863"}, +] + +[package.dependencies] +typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" + +[[package]] +name = "pyflakes" +version = "3.1.0" +description = "passive checker of Python programs" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyflakes-3.1.0-py2.py3-none-any.whl", hash = "sha256:4132f6d49cb4dae6819e5379898f2b8cce3c5f23994194c24b77d5da2e36f774"}, + {file = "pyflakes-3.1.0.tar.gz", hash = "sha256:a0aae034c444db0071aa077972ba4768d40c830d9539fd45bf4cd3f8f6992efc"}, +] + +[[package]] +name = "pygments" +version = "2.18.0" +description = "Pygments is a syntax highlighting package written in Python." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a"}, + {file = "pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199"}, +] + +[package.extras] +windows-terminal = ["colorama (>=0.4.6)"] + +[[package]] +name = "pyjwt" +version = "2.9.0" +description = "JSON Web Token implementation in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "PyJWT-2.9.0-py3-none-any.whl", hash = "sha256:3b02fb0f44517787776cf48f2ae25d8e14f300e6d7545a4315cee571a415e850"}, + {file = "pyjwt-2.9.0.tar.gz", hash = "sha256:7e1e5b56cc735432a7369cbfa0efe50fa113ebecdc04ae6922deba8b84582d0c"}, +] + +[package.extras] +crypto = ["cryptography (>=3.4.0)"] +dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx", "sphinx-rtd-theme", "zope.interface"] +docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] +tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] + +[[package]] +name = "pyparsing" +version = "3.2.0" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +optional = true +python-versions = ">=3.9" +files = [ + {file = "pyparsing-3.2.0-py3-none-any.whl", hash = "sha256:93d9577b88da0bbea8cc8334ee8b918ed014968fd2ec383e868fb8afb1ccef84"}, + {file = "pyparsing-3.2.0.tar.gz", hash = "sha256:cbf74e27246d595d9a74b186b810f6fbb86726dbf3b9532efb343f6d7294fe9c"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + +[[package]] +name = "pyproject-flake8" +version = "6.1.0" +description = "pyproject-flake8 (`pflake8`), a monkey patching wrapper to connect flake8 with pyproject.toml configuration" +optional = false +python-versions = ">=3.8.1" +files = [ + {file = "pyproject_flake8-6.1.0-py3-none-any.whl", hash = "sha256:86ea5559263c098e1aa4f866776aa2cf45362fd91a576b9fd8fbbbb55db12c4e"}, + {file = "pyproject_flake8-6.1.0.tar.gz", hash = "sha256:6da8e5a264395e0148bc11844c6fb50546f1fac83ac9210f7328664135f9e70f"}, +] + +[package.dependencies] +flake8 = "6.1.0" +tomli = {version = "*", markers = "python_version < \"3.11\""} + +[[package]] +name = "pyrate-limiter" +version = "3.1.1" +description = "Python Rate-Limiter using Leaky-Bucket Algorithm" +optional = false +python-versions = ">=3.8,<4.0" +files = [ + {file = "pyrate_limiter-3.1.1-py3-none-any.whl", hash = "sha256:c51906f1d51d56dc992ff6c26e8300e32151bc6cfa3e6559792e31971dfd4e2b"}, + {file = "pyrate_limiter-3.1.1.tar.gz", hash = "sha256:2f57eda712687e6eccddf6afe8f8a15b409b97ed675fe64a626058f12863b7b7"}, +] + +[package.extras] +all = ["filelock (>=3.0)", "redis (>=5.0.0,<6.0.0)"] +docs = ["furo (>=2022.3.4,<2023.0.0)", "myst-parser (>=0.17)", "sphinx (>=4.3.0,<5.0.0)", 
"sphinx-autodoc-typehints (>=1.17,<2.0)", "sphinx-copybutton (>=0.5)", "sphinxcontrib-apidoc (>=0.3,<0.4)"] + +[[package]] +name = "pyrsistent" +version = "0.20.0" +description = "Persistent/Functional/Immutable data structures" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyrsistent-0.20.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8c3aba3e01235221e5b229a6c05f585f344734bd1ad42a8ac51493d74722bbce"}, + {file = "pyrsistent-0.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1beb78af5423b879edaf23c5591ff292cf7c33979734c99aa66d5914ead880f"}, + {file = "pyrsistent-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21cc459636983764e692b9eba7144cdd54fdec23ccdb1e8ba392a63666c60c34"}, + {file = "pyrsistent-0.20.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f5ac696f02b3fc01a710427585c855f65cd9c640e14f52abe52020722bb4906b"}, + {file = "pyrsistent-0.20.0-cp310-cp310-win32.whl", hash = "sha256:0724c506cd8b63c69c7f883cc233aac948c1ea946ea95996ad8b1380c25e1d3f"}, + {file = "pyrsistent-0.20.0-cp310-cp310-win_amd64.whl", hash = "sha256:8441cf9616d642c475684d6cf2520dd24812e996ba9af15e606df5f6fd9d04a7"}, + {file = "pyrsistent-0.20.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0f3b1bcaa1f0629c978b355a7c37acd58907390149b7311b5db1b37648eb6958"}, + {file = "pyrsistent-0.20.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cdd7ef1ea7a491ae70d826b6cc64868de09a1d5ff9ef8d574250d0940e275b8"}, + {file = "pyrsistent-0.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cae40a9e3ce178415040a0383f00e8d68b569e97f31928a3a8ad37e3fde6df6a"}, + {file = "pyrsistent-0.20.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6288b3fa6622ad8a91e6eb759cfc48ff3089e7c17fb1d4c59a919769314af224"}, + {file = "pyrsistent-0.20.0-cp311-cp311-win32.whl", hash = "sha256:7d29c23bdf6e5438c755b941cef867ec2a4a172ceb9f50553b6ed70d50dfd656"}, + {file = "pyrsistent-0.20.0-cp311-cp311-win_amd64.whl", hash = "sha256:59a89bccd615551391f3237e00006a26bcf98a4d18623a19909a2c48b8e986ee"}, + {file = "pyrsistent-0.20.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:09848306523a3aba463c4b49493a760e7a6ca52e4826aa100ee99d8d39b7ad1e"}, + {file = "pyrsistent-0.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a14798c3005ec892bbada26485c2eea3b54109cb2533713e355c806891f63c5e"}, + {file = "pyrsistent-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b14decb628fac50db5e02ee5a35a9c0772d20277824cfe845c8a8b717c15daa3"}, + {file = "pyrsistent-0.20.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e2c116cc804d9b09ce9814d17df5edf1df0c624aba3b43bc1ad90411487036d"}, + {file = "pyrsistent-0.20.0-cp312-cp312-win32.whl", hash = "sha256:e78d0c7c1e99a4a45c99143900ea0546025e41bb59ebc10182e947cf1ece9174"}, + {file = "pyrsistent-0.20.0-cp312-cp312-win_amd64.whl", hash = "sha256:4021a7f963d88ccd15b523787d18ed5e5269ce57aa4037146a2377ff607ae87d"}, + {file = "pyrsistent-0.20.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:79ed12ba79935adaac1664fd7e0e585a22caa539dfc9b7c7c6d5ebf91fb89054"}, + {file = "pyrsistent-0.20.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f920385a11207dc372a028b3f1e1038bb244b3ec38d448e6d8e43c6b3ba20e98"}, + 
{file = "pyrsistent-0.20.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f5c2d012671b7391803263419e31b5c7c21e7c95c8760d7fc35602353dee714"}, + {file = "pyrsistent-0.20.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef3992833fbd686ee783590639f4b8343a57f1f75de8633749d984dc0eb16c86"}, + {file = "pyrsistent-0.20.0-cp38-cp38-win32.whl", hash = "sha256:881bbea27bbd32d37eb24dd320a5e745a2a5b092a17f6debc1349252fac85423"}, + {file = "pyrsistent-0.20.0-cp38-cp38-win_amd64.whl", hash = "sha256:6d270ec9dd33cdb13f4d62c95c1a5a50e6b7cdd86302b494217137f760495b9d"}, + {file = "pyrsistent-0.20.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ca52d1ceae015859d16aded12584c59eb3825f7b50c6cfd621d4231a6cc624ce"}, + {file = "pyrsistent-0.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b318ca24db0f0518630e8b6f3831e9cba78f099ed5c1d65ffe3e023003043ba0"}, + {file = "pyrsistent-0.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fed2c3216a605dc9a6ea50c7e84c82906e3684c4e80d2908208f662a6cbf9022"}, + {file = "pyrsistent-0.20.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e14c95c16211d166f59c6611533d0dacce2e25de0f76e4c140fde250997b3ca"}, + {file = "pyrsistent-0.20.0-cp39-cp39-win32.whl", hash = "sha256:f058a615031eea4ef94ead6456f5ec2026c19fb5bd6bfe86e9665c4158cf802f"}, + {file = "pyrsistent-0.20.0-cp39-cp39-win_amd64.whl", hash = "sha256:58b8f6366e152092194ae68fefe18b9f0b4f89227dfd86a07770c3d86097aebf"}, + {file = "pyrsistent-0.20.0-py3-none-any.whl", hash = "sha256:c55acc4733aad6560a7f5f818466631f07efc001fd023f34a6c203f8b6df0f0b"}, + {file = "pyrsistent-0.20.0.tar.gz", hash = "sha256:4c48f78f62ab596c679086084d0dd13254ae4f3d6c72a83ffdf5ebdef8f265a4"}, +] + +[[package]] +name = "pytesseract" +version = "0.3.10" +description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR" +optional = true +python-versions = ">=3.7" +files = [ + {file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"}, + {file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"}, +] + +[package.dependencies] +packaging = ">=21.3" +Pillow = ">=8.0.0" + +[[package]] +name = "pytest" +version = "7.4.4" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, + {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "pytest-cov" +version = "6.0.0" +description = "Pytest plugin for measuring coverage." 
+optional = false +python-versions = ">=3.9" +files = [ + {file = "pytest-cov-6.0.0.tar.gz", hash = "sha256:fde0b595ca248bb8e2d76f020b465f3b107c9632e6a1d1705f17834c89dcadc0"}, + {file = "pytest_cov-6.0.0-py3-none-any.whl", hash = "sha256:eee6f1b9e61008bd34975a4d5bab25801eb31898b032dd55addc93e96fcaaa35"}, +] + +[package.dependencies] +coverage = {version = ">=7.5", extras = ["toml"]} +pytest = ">=4.6" + +[package.extras] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"] + +[[package]] +name = "pytest-httpserver" +version = "1.1.0" +description = "pytest-httpserver is a httpserver for pytest" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest_httpserver-1.1.0-py3-none-any.whl", hash = "sha256:7ef88be8ed3354b6784daa3daa75a422370327c634053cefb124903fa8d73a41"}, + {file = "pytest_httpserver-1.1.0.tar.gz", hash = "sha256:6b1cb0199e2ed551b1b94d43f096863bbf6ae5bcd7c75c2c06845e5ce2dc8701"}, +] + +[package.dependencies] +Werkzeug = ">=2.0.0" + +[[package]] +name = "pytest-memray" +version = "1.7.0" +description = "A simple plugin to use with pytest" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest_memray-1.7.0-py3-none-any.whl", hash = "sha256:b896718c1adf6d0cd339dfaaaa5620f035c9919e1199a79b3453804a1254306f"}, + {file = "pytest_memray-1.7.0.tar.gz", hash = "sha256:c18fa907d2210b42f4096c093e2d3416dfc002dcaa450ef3f9ba819bc3dd8f5f"}, +] + +[package.dependencies] +memray = ">=1.12" +pytest = ">=7.2" + +[package.extras] +docs = ["furo (>=2022.12.7)", "sphinx (>=6.1.3)", "sphinx-argparse (>=0.4)", "sphinx-inline-tabs (>=2022.1.2b11)", "sphinxcontrib-programoutput (>=0.17)", "towncrier (>=22.12)"] +lint = ["black (==22.12)", "isort (==5.11.4)", "mypy (==0.991)", "ruff (==0.0.272)"] +test = ["anyio (>=4.4.0)", "covdefaults (>=2.2.2)", "coverage (>=7.0.5)", "flaky (>=3.7)", "pytest (>=7.2)", "pytest-xdist (>=3.1)"] + +[[package]] +name = "pytest-mock" +version = "3.14.0" +description = "Thin-wrapper around the mock package for easier use with pytest" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-mock-3.14.0.tar.gz", hash = "sha256:2719255a1efeceadbc056d6bf3df3d1c5015530fb40cf347c0f9afac88410bd0"}, + {file = "pytest_mock-3.14.0-py3-none-any.whl", hash = "sha256:0b72c38033392a5f4621342fe11e9219ac11ec9d375f8e2a0c164539e0d70f6f"}, +] + +[package.dependencies] +pytest = ">=6.2.5" + +[package.extras] +dev = ["pre-commit", "pytest-asyncio", "tox"] + +[[package]] +name = "python-calamine" +version = "0.2.3" +description = "Python binding for Rust's library for reading excel and odf file - calamine" +optional = true +python-versions = ">=3.8" +files = [ + {file = "python_calamine-0.2.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:f292a03591b1cab1537424851b74baa33b0a55affc315248a7592ba3de1c3e83"}, + {file = "python_calamine-0.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6cfbd23d1147f53fd70fddfb38af2a98896ecad069c9a4120e77358a6fc43b39"}, + {file = "python_calamine-0.2.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:847373d0152bafd92b739c911de8c2d23e32ea93d9358bf32b58ed4ace382ae7"}, + {file = "python_calamine-0.2.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1e0dcdc796eb4b4907618392c4b71146812774ca30bf6162a711b63e54214912"}, + {file = "python_calamine-0.2.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b2ee8250638ad174aa22a3776ebd41500cf88af62346f1c857505158d2685852"}, + {file = 
"python_calamine-0.2.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9ac718eb8e9753b986f329aec5dea964005a79115c622a2671fccd0c563d345a"}, + {file = "python_calamine-0.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa1baf404027779cb298d15939a5268eb3d477c86a7a8f4cad0734ea513876c2"}, + {file = "python_calamine-0.2.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dc36a85f1a182e49fc318b3e91f06f390d3889ce8c843721cb03a68ca4c7e4ce"}, + {file = "python_calamine-0.2.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:11e2a74da47adc502c776e399972864802a20d358001a1cfaefb13c36a5116c0"}, + {file = "python_calamine-0.2.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f19c8eb9f2182cca54c274145b6c8409776b7c08ee5be8a61d44f0448dc55192"}, + {file = "python_calamine-0.2.3-cp310-none-win32.whl", hash = "sha256:37367f85282d87c0d9453cb3caec5a74f2720252bfbc1365d627e9fe12251e56"}, + {file = "python_calamine-0.2.3-cp310-none-win_amd64.whl", hash = "sha256:6d73ef3131b3a7c3894a533857b02fc50198fb65528cbf869742555d1497ee52"}, + {file = "python_calamine-0.2.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:e5a36cca8b447295e9edddbe055857bdfdec56cb78554455a03bacd78e3c45a0"}, + {file = "python_calamine-0.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7b5b0803c70269d93b67c42f03e5711a7ba02166fd473a6cb89ef71632167154"}, + {file = "python_calamine-0.2.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:73766349215f69854afb092ef891cb1ff253f4b6611342566c469b46516c6ada"}, + {file = "python_calamine-0.2.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3bf4cf41518541016b9442082360a83f3579955a872cfca5cec50acc3101cce5"}, + {file = "python_calamine-0.2.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7f1f6dab7b44deed8cf7b45a6d6d2743b622ba5e21a8b73f52ef1064cc5e3638"}, + {file = "python_calamine-0.2.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1991261d40be3d577ce48c0884c6403aefd1cbef5dcc451e039746aa1d185931"}, + {file = "python_calamine-0.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f675e7f45d9e3f1430f3114701133432c279aba06442e743220f6b648023b5ee"}, + {file = "python_calamine-0.2.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8bb7444454cff2c1ad44e7f1a1be776845cbad8f1210d868c7058d2183b3da74"}, + {file = "python_calamine-0.2.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7a604306cd5ceca720f0426deb49192f2ede5eedd1597b7ff4fa9659a36dc462"}, + {file = "python_calamine-0.2.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b95afd1a1cd3871d472aa117537b8731c1609756347874b251300cff152176a5"}, + {file = "python_calamine-0.2.3-cp311-none-win32.whl", hash = "sha256:a0ae5a740c9d97b2842d948a91f926a0fab278d247d816fe786219b94507c5a2"}, + {file = "python_calamine-0.2.3-cp311-none-win_amd64.whl", hash = "sha256:a32c64e74673fb0203ad877c6ba4832de7976fd31c79c637552b567d295ff6b5"}, + {file = "python_calamine-0.2.3-cp311-none-win_arm64.whl", hash = "sha256:f8c4c9e7ade09b4122c59e3e0da7e5fba872a0e47d3076702185a4ffdf99dec4"}, + {file = "python_calamine-0.2.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:40e5f75c4a7bb2105e3bd65e7b4656e085c6d86e46af1c56468a2f87c2ed639a"}, + {file = "python_calamine-0.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3557bdd36060db4929f42bf4c2c728a76af60ccc95d5c98f2110331d993a7299"}, + {file = 
"python_calamine-0.2.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baa75b28686f9dc727d26a97b41c6a2a6ca1d2c679139b6199edbae2782e7c77"}, + {file = "python_calamine-0.2.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d2c8577b00e13f5f43b1c03a2eca01848c3b24467ebaf597729d1e483613c110"}, + {file = "python_calamine-0.2.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4639255202380251833a9ab75c077e687ebbef2120f54030b2dc46eb6ce43105"}, + {file = "python_calamine-0.2.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:583656c6a6e8efac8951cd72459e2d84eea5f2617214ebc7e1c96217b44a0fa1"}, + {file = "python_calamine-0.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68fc61b34a1d82d3eee2109d323268dd455107dfb639b027aa5c388e2781273c"}, + {file = "python_calamine-0.2.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:64bb1f212275ed0288f578ee817e5cad4a063cfe5c38bf4c4dc6968957cb95b0"}, + {file = "python_calamine-0.2.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a7da299c1676dc34cd5f0adf93e92139afbfb832722d5d50a696ac180885aabb"}, + {file = "python_calamine-0.2.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:599752629ab0c5231159c5bea4f94795dd9b11a36c02dd5bd0613cf257ecd710"}, + {file = "python_calamine-0.2.3-cp312-none-win32.whl", hash = "sha256:fc73da2863c3251862583d64c0d07fe907f489a86a205e2b6ac94a39a1df1b42"}, + {file = "python_calamine-0.2.3-cp312-none-win_amd64.whl", hash = "sha256:a8d1662b4767f863c17ea4c1afc3c3fe3174d7b007ae77349d481e6792d142fe"}, + {file = "python_calamine-0.2.3-cp312-none-win_arm64.whl", hash = "sha256:87af11076364ade6f3da9e33993b6f55ec8dfd5f017129de688fd6d94d7bc24a"}, + {file = "python_calamine-0.2.3-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1ae98e1db1d3e74df08291f66d872bf7a4c47d96d39f8f589bff5dab873fbd13"}, + {file = "python_calamine-0.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bc270e8827191e7125600c97b61b3c78ec17d394820c2607c801f93c3475a0aa"}, + {file = "python_calamine-0.2.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c25b18eca7976aac0748fc122fa5109be66801d94b77a7676125fb825a8b67b9"}, + {file = "python_calamine-0.2.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:484330c0a917879afc615dc15e5ad925953a726f1a839ce3c35504a5befdae0c"}, + {file = "python_calamine-0.2.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c15ccb20f49eb6f824664ca8ec741edf09679977c2d41d13a02f0532f71a318b"}, + {file = "python_calamine-0.2.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19421a1b8a808333c39b03e007b74c85220700ceed1229449a21d51803d0671b"}, + {file = "python_calamine-0.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0cd8e3069c57a26eea5e6d3addb3dab812cc39b70f0cd11246d6f6592b7f293"}, + {file = "python_calamine-0.2.3-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d13822a6669a00da497394719a1fa63033ab79858fd653d330a6a7a681a5f6ce"}, + {file = "python_calamine-0.2.3-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:767db722eeb9c4d3847a87e4c3c4c9cc3e48938efaed4c507a5dd538a6bc5910"}, + {file = "python_calamine-0.2.3-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:4cac4095c25c64ef091fd994f62c5169f3ab0eec39c5bdbd0f319cac633b8183"}, + {file = "python_calamine-0.2.3-cp313-none-win32.whl", hash = 
"sha256:79aab3dc2c54525896b24002756e12fe09ec573efc2787285c244520bc17c39f"}, + {file = "python_calamine-0.2.3-cp313-none-win_amd64.whl", hash = "sha256:bd6606c893493eb555db5e63aef85b87fd806e6a0aa59bad0dbb591b88db2a0d"}, + {file = "python_calamine-0.2.3-cp313-none-win_arm64.whl", hash = "sha256:9f7b93851c941efba8387bb3c004437541230e8253230868204a079f1dacc21a"}, + {file = "python_calamine-0.2.3-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:5fa0395816ecff641b5df7ee3a2a953fb0f449a88f780e1c8b762b94578fdb9c"}, + {file = "python_calamine-0.2.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7397213b734e71434be06c3391ba9c23660215dc5e1c5601b8141f9f623fef84"}, + {file = "python_calamine-0.2.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be628b380f190b4140801731786f14d59d5a25c54398a724543181e6f46e71d3"}, + {file = "python_calamine-0.2.3-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d7fc182ebd15dd629d5c355207b125fd2301f109bc6cd2d91b1e67626fdbec1f"}, + {file = "python_calamine-0.2.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ae983b57379225f44102e0ff2f3724428174d0156ac42b1b69ed7f63ce105b1"}, + {file = "python_calamine-0.2.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98592f79f46cd2d74cd7f4e69ef2031a51138159a5852efe56fa5bc289c106b4"}, + {file = "python_calamine-0.2.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:660347ae698f63f4a495b60411e913cfa448b149e7f51434934782559df6158f"}, + {file = "python_calamine-0.2.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fef87aa0b533c15e22ddb1bd6c257b3de9616c7a4ed3ca00c3c19e4cd8825d08"}, + {file = "python_calamine-0.2.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:06ab4232827eed11f6a40ddca5dd9015fe73a10c1cf71a4ab2aa26e63f3d1ffb"}, + {file = "python_calamine-0.2.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a6f64365bfc2cf6acefc3a618c7f25f64c317be3187d50dba3a2ccdbf405f911"}, + {file = "python_calamine-0.2.3-cp38-none-win32.whl", hash = "sha256:08b4b35d5943574ab44e87e4ccc2250f14ce7e8b34ad437ff95c1ae845823d0e"}, + {file = "python_calamine-0.2.3-cp38-none-win_amd64.whl", hash = "sha256:cd9b57326453be8ab52807cde90f3a61a008ed22a69489b41e9edbf66fb86a68"}, + {file = "python_calamine-0.2.3-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:b439270ac6283a2e00abaae167ed35dececaa73f394bf5be8bf8631f3c9757fc"}, + {file = "python_calamine-0.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:38b6d1c315feaacfa95336f7d8d82bdc9fc75854ceae3dd003f075a4cf943582"}, + {file = "python_calamine-0.2.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:411812b0ffcf042be71408ae82b6fcc8dd70e2ee9ba8e8024a70242f7bce305e"}, + {file = "python_calamine-0.2.3-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4086c857d2cd1bf388bab6f18ca6ae453fb6618b8f3547e76447dc759b9a3a2a"}, + {file = "python_calamine-0.2.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c6b43b8d0b556cb6e9fa9280cc6a61945fcef0005622590c45fa1471705476b5"}, + {file = "python_calamine-0.2.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ce29ebf7b8bd978ef7aaf7755489f67f056327a53ef112a9b24c7a90970f9467"}, + {file = "python_calamine-0.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:042385ce2ba386ef72bd678ed44ee6d4a5de20c9561c3cd1ecd2a57bfdc874cc"}, + {file = 
"python_calamine-0.2.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9e55fd471afd1c50ad88b442ef20c57d7efd38c7c300992708aa2cff943a29b9"}, + {file = "python_calamine-0.2.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4972a653bd54a4513e9419c26576429b391cdb4b417e7afa46469089ee7c10ee"}, + {file = "python_calamine-0.2.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:206524d140eb7d2999791afd4dfd62ceed531af3cfa487ff2b8b8fdc4b7c2b50"}, + {file = "python_calamine-0.2.3-cp39-none-win32.whl", hash = "sha256:e5a2c540d631343ba9f16be2afbb7b9fa187b3ced1b292ecc4cfcd51b8859bef"}, + {file = "python_calamine-0.2.3-cp39-none-win_amd64.whl", hash = "sha256:af65a13551d6575468d7cfcc61028df5d4218796dc4886419049e136148694e6"}, + {file = "python_calamine-0.2.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:10f28b56fb84bd622e23f32881fd17b07ab039e7f2cacdfb6101dce702e77970"}, + {file = "python_calamine-0.2.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d00cef2e12e4b6660b5fab13f936194263e7e11f707f7951b1867995278051df"}, + {file = "python_calamine-0.2.3-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7aebcbd105e49516dd1831f05a0ffca7c9b85f855bf3a9c68f9bc509a212e381"}, + {file = "python_calamine-0.2.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d5a9182590f5ad12e08a0ba9b72dfe0e6b1780ff95153926e2f4564a6018a14"}, + {file = "python_calamine-0.2.3-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2af3805806088acc7b4d766b58b03d08947a7100e1ef26e55509161adbb36201"}, + {file = "python_calamine-0.2.3-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:5283e049cc36a0e2442f72d0c2c156dc1e7dc7ca48cba02d52c5cb223525b5c3"}, + {file = "python_calamine-0.2.3-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:9b7d0ef322f073099ea69e4a3db8c31ff4c4f7cdf4cd333f0577ab0c9320eaf5"}, + {file = "python_calamine-0.2.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0bcd07be6953efb08340ccb19b9ae0732b104a9e672edf1ffd2d6b3cc226d815"}, + {file = "python_calamine-0.2.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7a8b12de6e2329643dd6b0a56570b853b94149ca7b1b323db3f69a06f61ec1e2"}, + {file = "python_calamine-0.2.3-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:cad27b0e491060dc72653ccd9288301120b23261e3e374f2401cc133547615d4"}, + {file = "python_calamine-0.2.3-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:303e2f2a1bdfaf428db7aca50d954667078c0cdf1b585ff090dfca2fac9107d7"}, + {file = "python_calamine-0.2.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a21187b6ebcdabdfe2113df11c2a522b9adc02bcf54bd3ba424ca8c6762cd9b"}, + {file = "python_calamine-0.2.3-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2773094cc62602f6bcc2acd8e905b3e2292daf6a6c24ddbc85f41065604fd9d4"}, + {file = "python_calamine-0.2.3-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:6de5646a9ec3d24b5089ed174f4dcee13620e65e20dc463097c00e803c81f86f"}, + {file = "python_calamine-0.2.3-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e976c948ab18e9fee589994b68878381e1e393d870362babf9634258deb4f13b"}, + {file = "python_calamine-0.2.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:00fdfd24d13d8b04619dd933be4888bc6a70427e217fb179f3a1f71f2e377219"}, + {file = "python_calamine-0.2.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = 
"sha256:ab7d60482520508ebf00476cde1b97011084a2e73ac49b2ca32003547e7444c9"}, + {file = "python_calamine-0.2.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:00c915fc67b0b4e1ddd000d374bd808d947f2ecb0f6051a4669a77abada4b7b8"}, + {file = "python_calamine-0.2.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c869fe1b568a2a970b13dd59a58a13a81a667aff2f365a95a577555585ff14bc"}, + {file = "python_calamine-0.2.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:602ebad70b176a41f22547d6bb99a6d32a531a11dbf74720f3984e6bf98c94ab"}, + {file = "python_calamine-0.2.3-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f6a7c4eb79803ee7cdfd00a0b8267c60c33f25da8bb9275f6168a4dd1a54db76"}, + {file = "python_calamine-0.2.3-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:68275fed9dcbe90a9185c9919980933e4feea925db178461f0cdb336a2587021"}, + {file = "python_calamine-0.2.3-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:5efc667fd002db9482a7b9f2c70b41fa69c86e18206132be1a0adcad3c998c17"}, + {file = "python_calamine-0.2.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:d2d845cbcd767c7b85c616849f0c6cd619662adb98d86af2a3fd8630d6acc48d"}, + {file = "python_calamine-0.2.3.tar.gz", hash = "sha256:d6b3858c3756629d9b4a166de0facfa6c8033fa0b73dcddd3d82144f3170c0dc"}, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "python-docx" +version = "1.1.2" +description = "Create, read, and update Microsoft Word .docx files." 
+optional = true +python-versions = ">=3.7" +files = [ + {file = "python_docx-1.1.2-py3-none-any.whl", hash = "sha256:08c20d6058916fb19853fcf080f7f42b6270d89eac9fa5f8c15f691c0017fabe"}, + {file = "python_docx-1.1.2.tar.gz", hash = "sha256:0cf1f22e95b9002addca7948e16f2cd7acdfd498047f1941ca5d293db7762efd"}, +] + +[package.dependencies] +lxml = ">=3.1.0" +typing-extensions = ">=4.9.0" + +[[package]] +name = "python-iso639" +version = "2024.10.22" +description = "ISO 639 language codes, names, and other associated information" +optional = true +python-versions = ">=3.8" +files = [ + {file = "python_iso639-2024.10.22-py3-none-any.whl", hash = "sha256:02d3ce2e01c6896b30b9cbbd3e1c8ee0d7221250b5d63ea9803e0d2a81fd1047"}, + {file = "python_iso639-2024.10.22.tar.gz", hash = "sha256:750f21b6a0bc6baa24253a3d8aae92b582bf93aa40988361cd96852c2c6d9a52"}, +] + +[package.extras] +dev = ["black (==24.10.0)", "build (==1.2.1)", "flake8 (==7.1.1)", "pytest (==8.3.3)", "requests (==2.32.3)", "twine (==5.1.1)"] + +[[package]] +name = "python-magic" +version = "0.4.27" +description = "File type identification using libmagic" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "python-magic-0.4.27.tar.gz", hash = "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b"}, + {file = "python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"}, +] + +[[package]] +name = "python-pptx" +version = "0.6.21" +description = "Generate and manipulate Open XML PowerPoint (.pptx) files" +optional = true +python-versions = "*" +files = [ + {file = "python-pptx-0.6.21.tar.gz", hash = "sha256:7798a2aaf89563565b3c7120c0acfe9aff775db0db3580544e3bf4840c2e378f"}, +] + +[package.dependencies] +lxml = ">=3.1.0" +Pillow = ">=3.3.2" +XlsxWriter = ">=0.5.7" + +[[package]] +name = "python-snappy" +version = "0.7.3" +description = "Python library for the snappy compression library from Google" +optional = true +python-versions = "*" +files = [ + {file = "python_snappy-0.7.3-py3-none-any.whl", hash = "sha256:074c0636cfcd97e7251330f428064050ac81a52c62ed884fc2ddebbb60ed7f50"}, + {file = "python_snappy-0.7.3.tar.gz", hash = "sha256:40216c1badfb2d38ac781ecb162a1d0ec40f8ee9747e610bcfefdfa79486cee3"}, +] + +[package.dependencies] +cramjam = "*" + +[[package]] +name = "python-ulid" +version = "3.0.0" +description = "Universally unique lexicographically sortable identifier" +optional = false +python-versions = ">=3.9" +files = [ + {file = "python_ulid-3.0.0-py3-none-any.whl", hash = "sha256:e4c4942ff50dbd79167ad01ac725ec58f924b4018025ce22c858bfcff99a5e31"}, + {file = "python_ulid-3.0.0.tar.gz", hash = "sha256:e50296a47dc8209d28629a22fc81ca26c00982c78934bd7766377ba37ea49a9f"}, +] + +[package.extras] +pydantic = ["pydantic (>=2.0)"] + +[[package]] +name = "pytz" +version = "2024.1" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, + {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, +] + +[[package]] +name = "pytzdata" +version = "2020.1" +description = "The Olson timezone database for Python." 
+optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "pytzdata-2020.1-py2.py3-none-any.whl", hash = "sha256:e1e14750bcf95016381e4d472bad004eef710f2d6417240904070b3d6654485f"}, + {file = "pytzdata-2020.1.tar.gz", hash = "sha256:3efa13b335a00a8de1d345ae41ec78dd11c9f8807f522d39850f2dd828681540"}, +] + +[[package]] +name = "pyyaml" +version = "6.0.2" +description = "YAML parser and emitter for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, + {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"}, + {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"}, + {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"}, + {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"}, + {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"}, + {file = 
"PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"}, + {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"}, + {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"}, + {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"}, + {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"}, + {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"}, + {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"}, + {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"}, + {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"}, + {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"}, + {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, + {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, +] + +[[package]] +name = "rapidfuzz" +version = "3.10.1" +description = "rapid fuzzy string matching" +optional = true +python-versions = ">=3.9" +files = [ + {file = "rapidfuzz-3.10.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f17d9f21bf2f2f785d74f7b0d407805468b4c173fa3e52c86ec94436b338e74a"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b31f358a70efc143909fb3d75ac6cd3c139cd41339aa8f2a3a0ead8315731f2b"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f4f43f2204b56a61448ec2dd061e26fd344c404da99fb19f3458200c5874ba2"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9d81bf186a453a2757472133b24915768abc7c3964194406ed93e170e16c21cb"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3611c8f45379a12063d70075c75134f2a8bd2e4e9b8a7995112ddae95ca1c982"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3c3b537b97ac30da4b73930fa8a4fe2f79c6d1c10ad535c5c09726612cd6bed9"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231ef1ec9cf7b59809ce3301006500b9d564ddb324635f4ea8f16b3e2a1780da"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ed4f3adc1294834955b7e74edd3c6bd1aad5831c007f2d91ea839e76461a5879"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:7b6015da2e707bf632a71772a2dbf0703cff6525732c005ad24987fe86e8ec32"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1b35a118d61d6f008e8e3fb3a77674d10806a8972c7b8be433d6598df4d60b01"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:bc308d79a7e877226f36bdf4e149e3ed398d8277c140be5c1fd892ec41739e6d"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f017dbfecc172e2d0c37cf9e3d519179d71a7f16094b57430dffc496a098aa17"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-win32.whl", hash = 
"sha256:36c0e1483e21f918d0f2f26799fe5ac91c7b0c34220b73007301c4f831a9c4c7"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-win_amd64.whl", hash = "sha256:10746c1d4c8cd8881c28a87fd7ba0c9c102346dfe7ff1b0d021cdf093e9adbff"}, + {file = "rapidfuzz-3.10.1-cp310-cp310-win_arm64.whl", hash = "sha256:dfa64b89dcb906835e275187569e51aa9d546a444489e97aaf2cc84011565fbe"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:92958ae075c87fef393f835ed02d4fe8d5ee2059a0934c6c447ea3417dfbf0e8"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ba7521e072c53e33c384e78615d0718e645cab3c366ecd3cc8cb732befd94967"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00d02cbd75d283c287471b5b3738b3e05c9096150f93f2d2dfa10b3d700f2db9"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:efa1582a397da038e2f2576c9cd49b842f56fde37d84a6b0200ffebc08d82350"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f12912acee1f506f974f58de9fdc2e62eea5667377a7e9156de53241c05fdba8"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666d5d8b17becc3f53447bcb2b6b33ce6c2df78792495d1fa82b2924cd48701a"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26f71582c0d62445067ee338ddad99b655a8f4e4ed517a90dcbfbb7d19310474"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8a2ef08b27167bcff230ffbfeedd4c4fa6353563d6aaa015d725dd3632fc3de7"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:365e4fc1a2b95082c890f5e98489b894e6bf8c338c6ac89bb6523c2ca6e9f086"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1996feb7a61609fa842e6b5e0c549983222ffdedaf29644cc67e479902846dfe"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:cf654702f144beaa093103841a2ea6910d617d0bb3fccb1d1fd63c54dde2cd49"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ec108bf25de674781d0a9a935030ba090c78d49def3d60f8724f3fc1e8e75024"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-win32.whl", hash = "sha256:031f8b367e5d92f7a1e27f7322012f3c321c3110137b43cc3bf678505583ef48"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-win_amd64.whl", hash = "sha256:f98f36c6a1bb9a6c8bbec99ad87c8c0e364f34761739b5ea9adf7b48129ae8cf"}, + {file = "rapidfuzz-3.10.1-cp311-cp311-win_arm64.whl", hash = "sha256:f1da2028cb4e41be55ee797a82d6c1cf589442504244249dfeb32efc608edee7"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1340b56340896bede246f612b6ecf685f661a56aabef3d2512481bfe23ac5835"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2316515169b7b5a453f0ce3adbc46c42aa332cae9f2edb668e24d1fc92b2f2bb"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e06fe6a12241ec1b72c0566c6b28cda714d61965d86569595ad24793d1ab259"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d99c1cd9443b19164ec185a7d752f4b4db19c066c136f028991a480720472e23"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a1d9aa156ed52d3446388ba4c2f335e312191d1ca9d1f5762ee983cf23e4ecf6"}, + {file = 
"rapidfuzz-3.10.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:54bcf4efaaee8e015822be0c2c28214815f4f6b4f70d8362cfecbd58a71188ac"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0c955e32afdbfdf6e9ee663d24afb25210152d98c26d22d399712d29a9b976b"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:191633722203f5b7717efcb73a14f76f3b124877d0608c070b827c5226d0b972"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:195baad28057ec9609e40385991004e470af9ef87401e24ebe72c064431524ab"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:0fff4a6b87c07366662b62ae994ffbeadc472e72f725923f94b72a3db49f4671"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4ffed25f9fdc0b287f30a98467493d1e1ce5b583f6317f70ec0263b3c97dbba6"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d02cf8e5af89a9ac8f53c438ddff6d773f62c25c6619b29db96f4aae248177c0"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-win32.whl", hash = "sha256:f3bb81d4fe6a5d20650f8c0afcc8f6e1941f6fecdb434f11b874c42467baded0"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-win_amd64.whl", hash = "sha256:aaf83e9170cb1338922ae42d320699dccbbdca8ffed07faeb0b9257822c26e24"}, + {file = "rapidfuzz-3.10.1-cp312-cp312-win_arm64.whl", hash = "sha256:c5da802a0d085ad81b0f62828fb55557996c497b2d0b551bbdfeafd6d447892f"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fc22d69a1c9cccd560a5c434c0371b2df0f47c309c635a01a913e03bbf183710"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:38b0dac2c8e057562b8f0d8ae5b663d2d6a28c5ab624de5b73cef9abb6129a24"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6fde3bbb14e92ce8fcb5c2edfff72e474d0080cadda1c97785bf4822f037a309"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9141fb0592e55f98fe9ac0f3ce883199b9c13e262e0bf40c5b18cdf926109d16"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:237bec5dd1bfc9b40bbd786cd27949ef0c0eb5fab5eb491904c6b5df59d39d3c"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18123168cba156ab5794ea6de66db50f21bb3c66ae748d03316e71b27d907b95"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b75fe506c8e02769cc47f5ab21ce3e09b6211d3edaa8f8f27331cb6988779be"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9da82aa4b46973aaf9e03bb4c3d6977004648c8638febfc0f9d237e865761270"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:c34c022d5ad564f1a5a57a4a89793bd70d7bad428150fb8ff2760b223407cdcf"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:1e96c84d6c2a0ca94e15acb5399118fff669f4306beb98a6d8ec6f5dccab4412"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e8e154b84a311263e1aca86818c962e1fa9eefdd643d1d5d197fcd2738f88cb9"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:335fee93188f8cd585552bb8057228ce0111bd227fa81bfd40b7df6b75def8ab"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-win32.whl", hash = "sha256:6729b856166a9e95c278410f73683957ea6100c8a9d0a8dbe434c49663689255"}, + {file = 
"rapidfuzz-3.10.1-cp313-cp313-win_amd64.whl", hash = "sha256:0e06d99ad1ad97cb2ef7f51ec6b1fedd74a3a700e4949353871cf331d07b382a"}, + {file = "rapidfuzz-3.10.1-cp313-cp313-win_arm64.whl", hash = "sha256:8d1b7082104d596a3eb012e0549b2634ed15015b569f48879701e9d8db959dbb"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:779027d3307e1a2b1dc0c03c34df87a470a368a1a0840a9d2908baf2d4067956"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:440b5608ab12650d0390128d6858bc839ae77ffe5edf0b33a1551f2fa9860651"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82cac41a411e07a6f3dc80dfbd33f6be70ea0abd72e99c59310819d09f07d945"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:958473c9f0bca250590200fd520b75be0dbdbc4a7327dc87a55b6d7dc8d68552"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ef60dfa73749ef91cb6073be1a3e135f4846ec809cc115f3cbfc6fe283a5584"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7fbac18f2c19fc983838a60611e67e3262e36859994c26f2ee85bb268de2355"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a0d519ff39db887cd73f4e297922786d548f5c05d6b51f4e6754f452a7f4296"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:bebb7bc6aeb91cc57e4881b222484c26759ca865794187217c9dcea6c33adae6"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:fe07f8b9c3bb5c5ad1d2c66884253e03800f4189a60eb6acd6119ebaf3eb9894"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:bfa48a4a2d45a41457f0840c48e579db157a927f4e97acf6e20df8fc521c79de"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:2cf44d01bfe8ee605b7eaeecbc2b9ca64fc55765f17b304b40ed8995f69d7716"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1e6bbca9246d9eedaa1c84e04a7f555493ba324d52ae4d9f3d9ddd1b740dcd87"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-win32.whl", hash = "sha256:567f88180f2c1423b4fe3f3ad6e6310fc97b85bdba574801548597287fc07028"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-win_amd64.whl", hash = "sha256:6b2cd7c29d6ecdf0b780deb587198f13213ac01c430ada6913452fd0c40190fc"}, + {file = "rapidfuzz-3.10.1-cp39-cp39-win_arm64.whl", hash = "sha256:9f912d459e46607ce276128f52bea21ebc3e9a5ccf4cccfef30dd5bddcf47be8"}, + {file = "rapidfuzz-3.10.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ac4452f182243cfab30ba4668ef2de101effaedc30f9faabb06a095a8c90fd16"}, + {file = "rapidfuzz-3.10.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:565c2bd4f7d23c32834652b27b51dd711814ab614b4e12add8476be4e20d1cf5"}, + {file = "rapidfuzz-3.10.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:187d9747149321607be4ccd6f9f366730078bed806178ec3eeb31d05545e9e8f"}, + {file = "rapidfuzz-3.10.1-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:616290fb9a8fa87e48cb0326d26f98d4e29f17c3b762c2d586f2b35c1fd2034b"}, + {file = "rapidfuzz-3.10.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:073a5b107e17ebd264198b78614c0206fa438cce749692af5bc5f8f484883f50"}, + {file = "rapidfuzz-3.10.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:39c4983e2e2ccb9732f3ac7d81617088822f4a12291d416b09b8a1eadebb3e29"}, + {file = 
"rapidfuzz-3.10.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ac7adee6bcf0c6fee495d877edad1540a7e0f5fc208da03ccb64734b43522d7a"}, + {file = "rapidfuzz-3.10.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:425f4ac80b22153d391ee3f94bc854668a0c6c129f05cf2eaf5ee74474ddb69e"}, + {file = "rapidfuzz-3.10.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65a2fa13e8a219f9b5dcb9e74abe3ced5838a7327e629f426d333dfc8c5a6e66"}, + {file = "rapidfuzz-3.10.1-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:75561f3df9a906aaa23787e9992b228b1ab69007932dc42070f747103e177ba8"}, + {file = "rapidfuzz-3.10.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:edd062490537e97ca125bc6c7f2b7331c2b73d21dc304615afe61ad1691e15d5"}, + {file = "rapidfuzz-3.10.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cfcc8feccf63245a22dfdd16e222f1a39771a44b870beb748117a0e09cbb4a62"}, + {file = "rapidfuzz-3.10.1.tar.gz", hash = "sha256:5a15546d847a915b3f42dc79ef9b0c78b998b4e2c53b252e7166284066585979"}, +] + +[package.extras] +all = ["numpy"] + +[[package]] +name = "regex" +version = "2024.11.6" +description = "Alternative regular expression module, to replace re." +optional = true +python-versions = ">=3.8" +files = [ + {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff590880083d60acc0433f9c3f713c51f7ac6ebb9adf889c79a261ecf541aa91"}, + {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:658f90550f38270639e83ce492f27d2c8d2cd63805c65a13a14d36ca126753f0"}, + {file = "regex-2024.11.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:164d8b7b3b4bcb2068b97428060b2a53be050085ef94eca7f240e7947f1b080e"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d3660c82f209655a06b587d55e723f0b813d3a7db2e32e5e7dc64ac2a9e86fde"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d22326fcdef5e08c154280b71163ced384b428343ae16a5ab2b3354aed12436e"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1ac758ef6aebfc8943560194e9fd0fa18bcb34d89fd8bd2af18183afd8da3a2"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:997d6a487ff00807ba810e0f8332c18b4eb8d29463cfb7c820dc4b6e7562d0cf"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:02a02d2bb04fec86ad61f3ea7f49c015a0681bf76abb9857f945d26159d2968c"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f02f93b92358ee3f78660e43b4b0091229260c5d5c408d17d60bf26b6c900e86"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:06eb1be98df10e81ebaded73fcd51989dcf534e3c753466e4b60c4697a003b67"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:040df6fe1a5504eb0f04f048e6d09cd7c7110fef851d7c567a6b6e09942feb7d"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fdabbfc59f2c6edba2a6622c647b716e34e8e3867e0ab975412c5c2f79b82da2"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8447d2d39b5abe381419319f942de20b7ecd60ce86f16a23b0698f22e1b70008"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = 
"sha256:da8f5fc57d1933de22a9e23eec290a0d8a5927a5370d24bda9a6abe50683fe62"}, + {file = "regex-2024.11.6-cp310-cp310-win32.whl", hash = "sha256:b489578720afb782f6ccf2840920f3a32e31ba28a4b162e13900c3e6bd3f930e"}, + {file = "regex-2024.11.6-cp310-cp310-win_amd64.whl", hash = "sha256:5071b2093e793357c9d8b2929dfc13ac5f0a6c650559503bb81189d0a3814519"}, + {file = "regex-2024.11.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5478c6962ad548b54a591778e93cd7c456a7a29f8eca9c49e4f9a806dcc5d638"}, + {file = "regex-2024.11.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2c89a8cc122b25ce6945f0423dc1352cb9593c68abd19223eebbd4e56612c5b7"}, + {file = "regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:94d87b689cdd831934fa3ce16cc15cd65748e6d689f5d2b8f4f4df2065c9fa20"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1062b39a0a2b75a9c694f7a08e7183a80c63c0d62b301418ffd9c35f55aaa114"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:167ed4852351d8a750da48712c3930b031f6efdaa0f22fa1933716bfcd6bf4a3"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d548dafee61f06ebdb584080621f3e0c23fff312f0de1afc776e2a2ba99a74f"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a19f302cd1ce5dd01a9099aaa19cae6173306d1302a43b627f62e21cf18ac0"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bec9931dfb61ddd8ef2ebc05646293812cb6b16b60cf7c9511a832b6f1854b55"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9714398225f299aa85267fd222f7142fcb5c769e73d7733344efc46f2ef5cf89"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:202eb32e89f60fc147a41e55cb086db2a3f8cb82f9a9a88440dcfc5d37faae8d"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:4181b814e56078e9b00427ca358ec44333765f5ca1b45597ec7446d3a1ef6e34"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:068376da5a7e4da51968ce4c122a7cd31afaaec4fccc7856c92f63876e57b51d"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ac10f2c4184420d881a3475fb2c6f4d95d53a8d50209a2500723d831036f7c45"}, + {file = "regex-2024.11.6-cp311-cp311-win32.whl", hash = "sha256:c36f9b6f5f8649bb251a5f3f66564438977b7ef8386a52460ae77e6070d309d9"}, + {file = "regex-2024.11.6-cp311-cp311-win_amd64.whl", hash = "sha256:02e28184be537f0e75c1f9b2f8847dc51e08e6e171c6bde130b2687e0c33cf60"}, + {file = "regex-2024.11.6-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:52fb28f528778f184f870b7cf8f225f5eef0a8f6e3778529bdd40c7b3920796a"}, + {file = "regex-2024.11.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdd6028445d2460f33136c55eeb1f601ab06d74cb3347132e1c24250187500d9"}, + {file = "regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:805e6b60c54bf766b251e94526ebad60b7de0c70f70a4e6210ee2891acb70bf2"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b85c2530be953a890eaffde05485238f07029600e8f098cdf1848d414a8b45e4"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bb26437975da7dc36b7efad18aa9dd4ea569d2357ae6b783bf1118dabd9ea577"}, + {file = 
"regex-2024.11.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abfa5080c374a76a251ba60683242bc17eeb2c9818d0d30117b4486be10c59d3"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b7fa6606c2881c1db9479b0eaa11ed5dfa11c8d60a474ff0e095099f39d98e"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c32f75920cf99fe6b6c539c399a4a128452eaf1af27f39bce8909c9a3fd8cbe"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:982e6d21414e78e1f51cf595d7f321dcd14de1f2881c5dc6a6e23bbbbd68435e"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a7c2155f790e2fb448faed6dd241386719802296ec588a8b9051c1f5c481bc29"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149f5008d286636e48cd0b1dd65018548944e495b0265b45e1bffecce1ef7f39"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:e5364a4502efca094731680e80009632ad6624084aff9a23ce8c8c6820de3e51"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0a86e7eeca091c09e021db8eb72d54751e527fa47b8d5787caf96d9831bd02ad"}, + {file = "regex-2024.11.6-cp312-cp312-win32.whl", hash = "sha256:32f9a4c643baad4efa81d549c2aadefaeba12249b2adc5af541759237eee1c54"}, + {file = "regex-2024.11.6-cp312-cp312-win_amd64.whl", hash = "sha256:a93c194e2df18f7d264092dc8539b8ffb86b45b899ab976aa15d48214138e81b"}, + {file = "regex-2024.11.6-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a6ba92c0bcdf96cbf43a12c717eae4bc98325ca3730f6b130ffa2e3c3c723d84"}, + {file = "regex-2024.11.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:525eab0b789891ac3be914d36893bdf972d483fe66551f79d3e27146191a37d4"}, + {file = "regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:086a27a0b4ca227941700e0b31425e7a28ef1ae8e5e05a33826e17e47fbfdba0"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bde01f35767c4a7899b7eb6e823b125a64de314a8ee9791367c9a34d56af18d0"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b583904576650166b3d920d2bcce13971f6f9e9a396c673187f49811b2769dc7"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c4de13f06a0d54fa0d5ab1b7138bfa0d883220965a29616e3ea61b35d5f5fc7"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3cde6e9f2580eb1665965ce9bf17ff4952f34f5b126beb509fee8f4e994f143c"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0d7f453dca13f40a02b79636a339c5b62b670141e63efd511d3f8f73fba162b3"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:59dfe1ed21aea057a65c6b586afd2a945de04fc7db3de0a6e3ed5397ad491b07"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b97c1e0bd37c5cd7902e65f410779d39eeda155800b65fc4d04cc432efa9bc6e"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f9d1e379028e0fc2ae3654bac3cbbef81bf3fd571272a42d56c24007979bafb6"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:13291b39131e2d002a7940fb176e120bec5145f3aeb7621be6534e46251912c4"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:4f51f88c126370dcec4908576c5a627220da6c09d0bff31cfa89f2523843316d"}, + {file = "regex-2024.11.6-cp313-cp313-win32.whl", hash = "sha256:63b13cfd72e9601125027202cad74995ab26921d8cd935c25f09c630436348ff"}, + {file = "regex-2024.11.6-cp313-cp313-win_amd64.whl", hash = "sha256:2b3361af3198667e99927da8b84c1b010752fa4b1115ee30beaa332cabc3ef1a"}, + {file = "regex-2024.11.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:3a51ccc315653ba012774efca4f23d1d2a8a8f278a6072e29c7147eee7da446b"}, + {file = "regex-2024.11.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ad182d02e40de7459b73155deb8996bbd8e96852267879396fb274e8700190e3"}, + {file = "regex-2024.11.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ba9b72e5643641b7d41fa1f6d5abda2c9a263ae835b917348fc3c928182ad467"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40291b1b89ca6ad8d3f2b82782cc33807f1406cf68c8d440861da6304d8ffbbd"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cdf58d0e516ee426a48f7b2c03a332a4114420716d55769ff7108c37a09951bf"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a36fdf2af13c2b14738f6e973aba563623cb77d753bbbd8d414d18bfaa3105dd"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1cee317bfc014c2419a76bcc87f071405e3966da434e03e13beb45f8aced1a6"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50153825ee016b91549962f970d6a4442fa106832e14c918acd1c8e479916c4f"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ea1bfda2f7162605f6e8178223576856b3d791109f15ea99a9f95c16a7636fb5"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:df951c5f4a1b1910f1a99ff42c473ff60f8225baa1cdd3539fe2819d9543e9df"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:072623554418a9911446278f16ecb398fb3b540147a7828c06e2011fa531e773"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f654882311409afb1d780b940234208a252322c24a93b442ca714d119e68086c"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:89d75e7293d2b3e674db7d4d9b1bee7f8f3d1609428e293771d1a962617150cc"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:f65557897fc977a44ab205ea871b690adaef6b9da6afda4790a2484b04293a5f"}, + {file = "regex-2024.11.6-cp38-cp38-win32.whl", hash = "sha256:6f44ec28b1f858c98d3036ad5d7d0bfc568bdd7a74f9c24e25f41ef1ebfd81a4"}, + {file = "regex-2024.11.6-cp38-cp38-win_amd64.whl", hash = "sha256:bb8f74f2f10dbf13a0be8de623ba4f9491faf58c24064f32b65679b021ed0001"}, + {file = "regex-2024.11.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5704e174f8ccab2026bd2f1ab6c510345ae8eac818b613d7d73e785f1310f839"}, + {file = "regex-2024.11.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:220902c3c5cc6af55d4fe19ead504de80eb91f786dc102fbd74894b1551f095e"}, + {file = "regex-2024.11.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5e7e351589da0850c125f1600a4c4ba3c722efefe16b297de54300f08d734fbf"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5056b185ca113c88e18223183aa1a50e66507769c9640a6ff75859619d73957b"}, + {file = 
"regex-2024.11.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e34b51b650b23ed3354b5a07aab37034d9f923db2a40519139af34f485f77d0"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5670bce7b200273eee1840ef307bfa07cda90b38ae56e9a6ebcc9f50da9c469b"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08986dce1339bc932923e7d1232ce9881499a0e02925f7402fb7c982515419ef"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:93c0b12d3d3bc25af4ebbf38f9ee780a487e8bf6954c115b9f015822d3bb8e48"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:764e71f22ab3b305e7f4c21f1a97e1526a25ebdd22513e251cf376760213da13"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:f056bf21105c2515c32372bbc057f43eb02aae2fda61052e2f7622c801f0b4e2"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:69ab78f848845569401469da20df3e081e6b5a11cb086de3eed1d48f5ed57c95"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:86fddba590aad9208e2fa8b43b4c098bb0ec74f15718bb6a704e3c63e2cef3e9"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:684d7a212682996d21ca12ef3c17353c021fe9de6049e19ac8481ec35574a70f"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a03e02f48cd1abbd9f3b7e3586d97c8f7a9721c436f51a5245b3b9483044480b"}, + {file = "regex-2024.11.6-cp39-cp39-win32.whl", hash = "sha256:41758407fc32d5c3c5de163888068cfee69cb4c2be844e7ac517a52770f9af57"}, + {file = "regex-2024.11.6-cp39-cp39-win_amd64.whl", hash = "sha256:b2837718570f95dd41675328e111345f9b7095d821bac435aac173ac80b19983"}, + {file = "regex-2024.11.6.tar.gz", hash = "sha256:7ab159b063c52a0333c884e4679f8d7a85112ee3078fe3d9004b2dd875585519"}, +] + +[[package]] +name = "requests" +version = "2.32.3" +description = "Python HTTP for Humans." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "requests-cache" +version = "1.2.1" +description = "A persistent cache for python requests" +optional = false +python-versions = ">=3.8" +files = [ + {file = "requests_cache-1.2.1-py3-none-any.whl", hash = "sha256:1285151cddf5331067baa82598afe2d47c7495a1334bfe7a7d329b43e9fd3603"}, + {file = "requests_cache-1.2.1.tar.gz", hash = "sha256:68abc986fdc5b8d0911318fbb5f7c80eebcd4d01bfacc6685ecf8876052511d1"}, +] + +[package.dependencies] +attrs = ">=21.2" +cattrs = ">=22.2" +platformdirs = ">=2.5" +requests = ">=2.22" +url-normalize = ">=1.4" +urllib3 = ">=1.25.5" + +[package.extras] +all = ["boto3 (>=1.15)", "botocore (>=1.18)", "itsdangerous (>=2.0)", "pymongo (>=3)", "pyyaml (>=6.0.1)", "redis (>=3)", "ujson (>=5.4)"] +bson = ["bson (>=0.5)"] +docs = ["furo (>=2023.3,<2024.0)", "linkify-it-py (>=2.0,<3.0)", "myst-parser (>=1.0,<2.0)", "sphinx (>=5.0.2,<6.0.0)", "sphinx-autodoc-typehints (>=1.19)", "sphinx-automodapi (>=0.14)", "sphinx-copybutton (>=0.5)", "sphinx-design (>=0.2)", "sphinx-notfound-page (>=0.8)", "sphinxcontrib-apidoc (>=0.3)", "sphinxext-opengraph (>=0.9)"] +dynamodb = ["boto3 (>=1.15)", "botocore (>=1.18)"] +json = ["ujson (>=5.4)"] +mongodb = ["pymongo (>=3)"] +redis = ["redis (>=3)"] +security = ["itsdangerous (>=2.0)"] +yaml = ["pyyaml (>=6.0.1)"] + +[[package]] +name = "requests-mock" +version = "1.12.1" +description = "Mock out responses from the requests package" +optional = false +python-versions = ">=3.5" +files = [ + {file = "requests-mock-1.12.1.tar.gz", hash = "sha256:e9e12e333b525156e82a3c852f22016b9158220d2f47454de9cae8a77d371401"}, + {file = "requests_mock-1.12.1-py2.py3-none-any.whl", hash = "sha256:b1e37054004cdd5e56c84454cc7df12b25f90f382159087f4b6915aaeef39563"}, +] + +[package.dependencies] +requests = ">=2.22,<3" + +[package.extras] +fixture = ["fixtures"] + +[[package]] +name = "requests-toolbelt" +version = "1.0.0" +description = "A utility belt for advanced users of python-requests" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, + {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, +] + +[package.dependencies] +requests = ">=2.0.1,<3.0.0" + +[[package]] +name = "rich" +version = "13.9.4" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, + {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" +typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < 
\"3.11\""} + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + +[[package]] +name = "scikit-learn" +version = "1.5.2" +description = "A set of python modules for machine learning and data mining" +optional = true +python-versions = ">=3.9" +files = [ + {file = "scikit_learn-1.5.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:299406827fb9a4f862626d0fe6c122f5f87f8910b86fe5daa4c32dcd742139b6"}, + {file = "scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2d4cad1119c77930b235579ad0dc25e65c917e756fe80cab96aa3b9428bd3fb0"}, + {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c412ccc2ad9bf3755915e3908e677b367ebc8d010acbb3f182814524f2e5540"}, + {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a686885a4b3818d9e62904d91b57fa757fc2bed3e465c8b177be652f4dd37c8"}, + {file = "scikit_learn-1.5.2-cp310-cp310-win_amd64.whl", hash = "sha256:c15b1ca23d7c5f33cc2cb0a0d6aaacf893792271cddff0edbd6a40e8319bc113"}, + {file = "scikit_learn-1.5.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:03b6158efa3faaf1feea3faa884c840ebd61b6484167c711548fce208ea09445"}, + {file = "scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:1ff45e26928d3b4eb767a8f14a9a6efbf1cbff7c05d1fb0f95f211a89fd4f5de"}, + {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f763897fe92d0e903aa4847b0aec0e68cadfff77e8a0687cabd946c89d17e675"}, + {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8b0ccd4a902836493e026c03256e8b206656f91fbcc4fde28c57a5b752561f1"}, + {file = "scikit_learn-1.5.2-cp311-cp311-win_amd64.whl", hash = "sha256:6c16d84a0d45e4894832b3c4d0bf73050939e21b99b01b6fd59cbb0cf39163b6"}, + {file = "scikit_learn-1.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f932a02c3f4956dfb981391ab24bda1dbd90fe3d628e4b42caef3e041c67707a"}, + {file = "scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3b923d119d65b7bd555c73be5423bf06c0105678ce7e1f558cb4b40b0a5502b1"}, + {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, + {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, + {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, + {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, + {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, + {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca64b3089a6d9b9363cd3546f8978229dcbb737aceb2c12144ee3f70f95684b7"}, + {file = "scikit_learn-1.5.2-cp39-cp39-win_amd64.whl", hash = "sha256:3bed4909ba187aca80580fe2ef370d9180dcf18e621a27c4cf2ef10d279a7efe"}, + {file = "scikit_learn-1.5.2.tar.gz", hash = "sha256:b4237ed7b3fdd0a4882792e68ef2545d5baa50aca3bb45aa7df468138ad8f94d"}, +] + 
+[package.dependencies] +joblib = ">=1.2.0" +numpy = ">=1.19.5" +scipy = ">=1.6.0" +threadpoolctl = ">=3.1.0" + +[package.extras] +benchmark = ["matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "pandas (>=1.1.5)"] +build = ["cython (>=3.0.10)", "meson-python (>=0.16.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pydata-sphinx-theme (>=0.15.3)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)", "sphinx (>=7.3.7)", "sphinx-copybutton (>=0.5.2)", "sphinx-design (>=0.5.0)", "sphinx-design (>=0.6.0)", "sphinx-gallery (>=0.16.0)", "sphinx-prompt (>=1.4.0)", "sphinx-remove-toctrees (>=1.0.0.post1)", "sphinxcontrib-sass (>=0.3.4)", "sphinxext-opengraph (>=0.9.1)"] +examples = ["matplotlib (>=3.3.4)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)"] +install = ["joblib (>=1.2.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)", "threadpoolctl (>=3.1.0)"] +maintenance = ["conda-lock (==2.5.6)"] +tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pyarrow (>=12.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.2.1)", "scikit-image (>=0.17.2)"] + +[[package]] +name = "scipy" +version = "1.14.1" +description = "Fundamental algorithms for scientific computing in Python" +optional = true +python-versions = ">=3.10" +files = [ + {file = "scipy-1.14.1-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:b28d2ca4add7ac16ae8bb6632a3c86e4b9e4d52d3e34267f6e1b0c1f8d87e389"}, + {file = "scipy-1.14.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d0d2821003174de06b69e58cef2316a6622b60ee613121199cb2852a873f8cf3"}, + {file = "scipy-1.14.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8bddf15838ba768bb5f5083c1ea012d64c9a444e16192762bd858f1e126196d0"}, + {file = "scipy-1.14.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:97c5dddd5932bd2a1a31c927ba5e1463a53b87ca96b5c9bdf5dfd6096e27efc3"}, + {file = "scipy-1.14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ff0a7e01e422c15739ecd64432743cf7aae2b03f3084288f399affcefe5222d"}, + {file = "scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69"}, + {file = "scipy-1.14.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8426251ad1e4ad903a4514712d2fa8fdd5382c978010d1c6f5f37ef286a713ad"}, + {file = "scipy-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:a49f6ed96f83966f576b33a44257d869756df6cf1ef4934f59dd58b25e0327e5"}, + {file = "scipy-1.14.1-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675"}, + {file = "scipy-1.14.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2"}, + {file = "scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3a1b111fac6baec1c1d92f27e76511c9e7218f1695d61b59e05e0fe04dc59617"}, + {file = "scipy-1.14.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8475230e55549ab3f207bff11ebfc91c805dc3463ef62eda3ccf593254524ce8"}, + {file = "scipy-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:278266012eb69f4a720827bdd2dc54b2271c97d84255b2faaa8f161a158c3b37"}, + {file = 
"scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2"}, + {file = "scipy-1.14.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b05d43735bb2f07d689f56f7b474788a13ed8adc484a85aa65c0fd931cf9ccd2"}, + {file = "scipy-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94"}, + {file = "scipy-1.14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d"}, + {file = "scipy-1.14.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07"}, + {file = "scipy-1.14.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2843f2d527d9eebec9a43e6b406fb7266f3af25a751aa91d62ff416f54170bc5"}, + {file = "scipy-1.14.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:eb58ca0abd96911932f688528977858681a59d61a7ce908ffd355957f7025cfc"}, + {file = "scipy-1.14.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30ac8812c1d2aab7131a79ba62933a2a76f582d5dbbc695192453dae67ad6310"}, + {file = "scipy-1.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066"}, + {file = "scipy-1.14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:edaf02b82cd7639db00dbff629995ef185c8df4c3ffa71a5562a595765a06ce1"}, + {file = "scipy-1.14.1-cp312-cp312-win_amd64.whl", hash = "sha256:2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f"}, + {file = "scipy-1.14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1729560c906963fc8389f6aac023739ff3983e727b1a4d87696b7bf108316a79"}, + {file = "scipy-1.14.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:4079b90df244709e675cdc8b93bfd8a395d59af40b72e339c2287c91860deb8e"}, + {file = "scipy-1.14.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e0cf28db0f24a38b2a0ca33a85a54852586e43cf6fd876365c86e0657cfe7d73"}, + {file = "scipy-1.14.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:0c2f95de3b04e26f5f3ad5bb05e74ba7f68b837133a4492414b3afd79dfe540e"}, + {file = "scipy-1.14.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b99722ea48b7ea25e8e015e8341ae74624f72e5f21fc2abd45f3a93266de4c5d"}, + {file = "scipy-1.14.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5149e3fd2d686e42144a093b206aef01932a0059c2a33ddfa67f5f035bdfe13e"}, + {file = "scipy-1.14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4f5a7c49323533f9103d4dacf4e4f07078f360743dec7f7596949149efeec06"}, + {file = "scipy-1.14.1-cp313-cp313-win_amd64.whl", hash = "sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84"}, + {file = "scipy-1.14.1.tar.gz", hash = "sha256:5a275584e726026a5699459aa72f828a610821006228e841b94275c4a7c08417"}, +] + +[package.dependencies] +numpy = ">=1.23.5,<2.3" + +[package.extras] +dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodestyle", "pydevtool", "rich-click", "ruff (>=0.0.292)", "types-psutil", "typing_extensions"] +doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.13.1)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<=7.3.7)", "sphinx-design (>=0.4.0)"] +test = ["Cython", "array-api-strict (>=2.0)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", 
"pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] + +[[package]] +name = "serpyco-rs" +version = "1.11.0" +description = "" +optional = false +python-versions = ">=3.9" +files = [ + {file = "serpyco_rs-1.11.0-cp310-cp310-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:4b2bd933539bd8c84315e2fb5ae52ef7a58ace5a6dfe3f8b73f74dc71216779e"}, + {file = "serpyco_rs-1.11.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:627f957889ff73c4d2269fc7b6bba93212381befe03633e7cb5495de66ba9a33"}, + {file = "serpyco_rs-1.11.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b0933620abc01434023e0e3e22255b7e4ab9b427b5a9a5ee00834656d792377a"}, + {file = "serpyco_rs-1.11.0-cp310-cp310-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:9ce46683d92e34abb20304817fc5ac6cb141a06fc7468dedb1d8865a8a9682f6"}, + {file = "serpyco_rs-1.11.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bda437d86e8859bf91c189c1f4650899822f6d6d7b02b48f5729da904eb7bb7d"}, + {file = "serpyco_rs-1.11.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a72bfbd282af17ebe76d122639013e802c09902543fdbbd828fb2159ec9755e"}, + {file = "serpyco_rs-1.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d4808df5384e3e8581e31a90ba7a1fa501c0837b1f174284bb8a4555b6864ea"}, + {file = "serpyco_rs-1.11.0-cp310-none-win_amd64.whl", hash = "sha256:c7b60aef4c16d68efb0d6241f05d0a434d873d98449cbb4366b0d385f0a7172b"}, + {file = "serpyco_rs-1.11.0-cp311-cp311-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:8d47ee577cf4d69b53917615cb031ad8708eb2f59fe78194b1968c13130fc2f7"}, + {file = "serpyco_rs-1.11.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6090d9a1487237cdd4e9362a823eede23249602019b917e7bd57846179286e79"}, + {file = "serpyco_rs-1.11.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7192eb3df576386fefd595ea31ae25c62522841ffec7e7aeb37a80b55bdc3213"}, + {file = "serpyco_rs-1.11.0-cp311-cp311-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b52ef8affb7e71b9b98a7d5216d6a7ad03b04e990acb147cd9211c8b931c5487"}, + {file = "serpyco_rs-1.11.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3480e09e473560c60e74aaa789e6b4d079637371aae0a98235440111464bbba7"}, + {file = "serpyco_rs-1.11.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c92e36b0ab6fe866601c2331f7e99c809a126d21963c03d8a5c29331526deed"}, + {file = "serpyco_rs-1.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84f497361952d4566bc1f77e9e15a84a2614f593cc671fbf0a0fa80046f9c3d7"}, + {file = "serpyco_rs-1.11.0-cp311-none-win_amd64.whl", hash = "sha256:37fc1cf192bef9784fbf1f4e03cec21750b9e704bef55cc0442f71a715eee920"}, + {file = "serpyco_rs-1.11.0-cp312-cp312-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:3ea93d485f03dc8b0cfb0d477f0ad2e86e78f0461b53010656ab5b4db1b41fb0"}, + {file = "serpyco_rs-1.11.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7772410d15694b03f9c5500a2c47d62eed76e191bea4087ad042250346b1a38e"}, + {file = "serpyco_rs-1.11.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42118463c1679846cffd2f06f47744c9b9eb33c5d0448afd88ea19e1a81a8ddd"}, + {file = 
"serpyco_rs-1.11.0-cp312-cp312-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:79481a455b76cc56021dc55bb6d5bdda1b2b32bcb6a1ee711b597140d112e9b1"}, + {file = "serpyco_rs-1.11.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8fd79051f9af9591fc03cf7d3033ff180416301f6a4fd3d1e3d92ebd2d68697"}, + {file = "serpyco_rs-1.11.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d29c8f9aeed734a3b51f7349d04ec9063516ffa4e10b632d75e9b1309e4930e4"}, + {file = "serpyco_rs-1.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15609158b0d9591ffa118302cd9d0039970cb3faf91dce32975f7d276e7411d5"}, + {file = "serpyco_rs-1.11.0-cp312-none-win_amd64.whl", hash = "sha256:00081eae77fbf4c5d88371c5586317ab02ccb293a330b460869a283edf2b7b69"}, + {file = "serpyco_rs-1.11.0-cp313-cp313-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:3028893366a1985adcedb13fa8f6f98c087c185efc427f94c2ccdafa40f45832"}, + {file = "serpyco_rs-1.11.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c18bf511316f3abf648a68ee62ef88617bec57d3fcde69466b4361102715ae5"}, + {file = "serpyco_rs-1.11.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e7dde9ef09cdfaf7c62378186b9e29f54ec76114be4c347be6a06dd559c5681e"}, + {file = "serpyco_rs-1.11.0-cp313-cp313-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:18500ebc5e75285841e35585a238629a990b709e14f68933233640d15ca17d5f"}, + {file = "serpyco_rs-1.11.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f47c23132d4e03982703a7630aa09877b41e499722142f76b6153f6619b612f3"}, + {file = "serpyco_rs-1.11.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5f8e6ba499f6a0825bee0d8f8764569d367af871b563fc6512c171474e8e5383"}, + {file = "serpyco_rs-1.11.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15438a076047c34cff6601a977df54948e8d39d1a86f89d05c48bc60f4c12a61"}, + {file = "serpyco_rs-1.11.0-cp313-none-win_amd64.whl", hash = "sha256:84ee2c109415bd81904fc9abb9aec86a5dd13166808c21142cf23ec639f683bd"}, + {file = "serpyco_rs-1.11.0-cp39-cp39-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:5c97c16c865261577fac4effeccc7ef5e0a1e8e35e7a3ee6c90c77c3a4cd7ff9"}, + {file = "serpyco_rs-1.11.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47825e70f86fd6ef7c4a835dea3d6e8eef4fee354ed7b39ced99f31aba74a86e"}, + {file = "serpyco_rs-1.11.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:24d220220365110edba2f778f41ab3cf396883da0f26e1361a3ada9bd0227f73"}, + {file = "serpyco_rs-1.11.0-cp39-cp39-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:3a46f334af5a9d77acc6e1e58f355ae497900a2798929371f0545e274f6e6166"}, + {file = "serpyco_rs-1.11.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:29d72b748acce4b4e3c7c9724e1eb33d033a1c26b08a698b393e0288060e0901"}, + {file = "serpyco_rs-1.11.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e2b8b6f205e8cc038d4d30dd0e70eece7bbecc816eb2f3787c330dc2218e232d"}, + {file = "serpyco_rs-1.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:038d748bfff31f150f0c3edab2766b8843edb952cb1bd3bf547886beb0912dae"}, + {file = "serpyco_rs-1.11.0-cp39-none-win_amd64.whl", hash = "sha256:0fee1c89ec2cb013dc232e4ebef88e2844357ce8631063b56639dbfb83762f20"}, + {file = 
"serpyco_rs-1.11.0.tar.gz", hash = "sha256:70a844615ffb229e6e89c204b3ab7404aacaf2838911814c7d847969b8da2e3a"}, +] + +[package.dependencies] +attributes-doc = "*" +typing-extensions = "*" + +[[package]] +name = "setuptools" +version = "75.3.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "setuptools-75.3.0-py3-none-any.whl", hash = "sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd"}, + {file = "setuptools-75.3.0.tar.gz", hash = "sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686"}, +] + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.5.2)"] +core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.collections", "jaraco.functools", "jaraco.text (>=3.7)", "more-itertools", "more-itertools (>=8.8)", "packaging", "packaging (>=24)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test (>=5.5)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.12.*)", "pytest-mypy"] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[[package]] +name = "sniffio" +version = "1.3.1" +description = "Sniff out which async library your code is running under" +optional = true +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, + {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, +] + +[[package]] +name = "snowballstemmer" +version = "2.2.0" +description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." +optional = true +python-versions = "*" +files = [ + {file = "snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"}, + {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"}, +] + +[[package]] +name = "soupsieve" +version = "2.6" +description = "A modern CSS selector implementation for Beautiful Soup." 
+optional = true +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"}, + {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"}, +] + +[[package]] +name = "sphinx" +version = "4.2.0" +description = "Python documentation generator" +optional = true +python-versions = ">=3.6" +files = [ + {file = "Sphinx-4.2.0-py3-none-any.whl", hash = "sha256:98a535c62a4fcfcc362528592f69b26f7caec587d32cd55688db580be0287ae0"}, + {file = "Sphinx-4.2.0.tar.gz", hash = "sha256:94078db9184491e15bce0a56d9186e0aec95f16ac20b12d00e06d4e36f1058a6"}, +] + +[package.dependencies] +alabaster = ">=0.7,<0.8" +babel = ">=1.3" +colorama = {version = ">=0.3.5", markers = "sys_platform == \"win32\""} +docutils = ">=0.14,<0.18" +imagesize = "*" +Jinja2 = ">=2.3" +packaging = "*" +Pygments = ">=2.0" +requests = ">=2.5.0" +setuptools = "*" +snowballstemmer = ">=1.1" +sphinxcontrib-applehelp = "*" +sphinxcontrib-devhelp = "*" +sphinxcontrib-htmlhelp = ">=2.0.0" +sphinxcontrib-jsmath = "*" +sphinxcontrib-qthelp = "*" +sphinxcontrib-serializinghtml = ">=1.1.5" + +[package.extras] +docs = ["sphinxcontrib-websupport"] +lint = ["docutils-stubs", "flake8 (>=3.5.0)", "isort", "mypy (>=0.900)", "types-pkg-resources", "types-requests", "types-typed-ast"] +test = ["cython", "html5lib", "pytest", "pytest-cov", "typed-ast"] + +[[package]] +name = "sphinx-rtd-theme" +version = "1.0.0" +description = "Read the Docs theme for Sphinx" +optional = true +python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*" +files = [ + {file = "sphinx_rtd_theme-1.0.0-py2.py3-none-any.whl", hash = "sha256:4d35a56f4508cfee4c4fb604373ede6feae2a306731d533f409ef5c3496fdbd8"}, + {file = "sphinx_rtd_theme-1.0.0.tar.gz", hash = "sha256:eec6d497e4c2195fa0e8b2016b337532b8a699a68bcb22a512870e16925c6a5c"}, +] + +[package.dependencies] +docutils = "<0.18" +sphinx = ">=1.6" + +[package.extras] +dev = ["bump2version", "sphinxcontrib-httpdomain", "transifex-client"] + +[[package]] +name = "sphinxcontrib-applehelp" +version = "2.0.0" +description = "sphinxcontrib-applehelp is a Sphinx extension which outputs Apple help books" +optional = true +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5"}, + {file = "sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-devhelp" +version = "2.0.0" +description = "sphinxcontrib-devhelp is a sphinx extension which outputs Devhelp documents" +optional = true +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2"}, + {file = "sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-htmlhelp" +version = "2.1.0" +description = "sphinxcontrib-htmlhelp is a sphinx extension which renders HTML help files" +optional = true +python-versions = ">=3.9" +files = [ + {file = 
"sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8"}, + {file = "sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["html5lib", "pytest"] + +[[package]] +name = "sphinxcontrib-jsmath" +version = "1.0.1" +description = "A sphinx extension which renders display math in HTML via JavaScript" +optional = true +python-versions = ">=3.5" +files = [ + {file = "sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"}, + {file = "sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178"}, +] + +[package.extras] +test = ["flake8", "mypy", "pytest"] + +[[package]] +name = "sphinxcontrib-qthelp" +version = "2.0.0" +description = "sphinxcontrib-qthelp is a sphinx extension which outputs QtHelp documents" +optional = true +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb"}, + {file = "sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["defusedxml (>=0.7.1)", "pytest"] + +[[package]] +name = "sphinxcontrib-serializinghtml" +version = "2.0.0" +description = "sphinxcontrib-serializinghtml is a sphinx extension which outputs \"serialized\" HTML files (json and pickle)" +optional = true +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331"}, + {file = "sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + +[[package]] +name = "sqlalchemy" +version = "2.0.35" +description = "Database Abstraction Library" +optional = true +python-versions = ">=3.7" +files = [ + {file = "SQLAlchemy-2.0.35-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:67219632be22f14750f0d1c70e62f204ba69d28f62fd6432ba05ab295853de9b"}, + {file = "SQLAlchemy-2.0.35-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4668bd8faf7e5b71c0319407b608f278f279668f358857dbfd10ef1954ac9f90"}, + {file = "SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb8bea573863762bbf45d1e13f87c2d2fd32cee2dbd50d050f83f87429c9e1ea"}, + {file = "SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f552023710d4b93d8fb29a91fadf97de89c5926c6bd758897875435f2a939f33"}, + {file = "SQLAlchemy-2.0.35-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:016b2e665f778f13d3c438651dd4de244214b527a275e0acf1d44c05bc6026a9"}, + {file = "SQLAlchemy-2.0.35-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7befc148de64b6060937231cbff8d01ccf0bfd75aa26383ffdf8d82b12ec04ff"}, + {file = "SQLAlchemy-2.0.35-cp310-cp310-win32.whl", hash = "sha256:22b83aed390e3099584b839b93f80a0f4a95ee7f48270c97c90acd40ee646f0b"}, + {file = "SQLAlchemy-2.0.35-cp310-cp310-win_amd64.whl", hash = 
"sha256:a29762cd3d116585278ffb2e5b8cc311fb095ea278b96feef28d0b423154858e"}, + {file = "SQLAlchemy-2.0.35-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e21f66748ab725ade40fa7af8ec8b5019c68ab00b929f6643e1b1af461eddb60"}, + {file = "SQLAlchemy-2.0.35-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8a6219108a15fc6d24de499d0d515c7235c617b2540d97116b663dade1a54d62"}, + {file = "SQLAlchemy-2.0.35-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:042622a5306c23b972192283f4e22372da3b8ddf5f7aac1cc5d9c9b222ab3ff6"}, + {file = "SQLAlchemy-2.0.35-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:627dee0c280eea91aed87b20a1f849e9ae2fe719d52cbf847c0e0ea34464b3f7"}, + {file = "SQLAlchemy-2.0.35-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:4fdcd72a789c1c31ed242fd8c1bcd9ea186a98ee8e5408a50e610edfef980d71"}, + {file = "SQLAlchemy-2.0.35-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:89b64cd8898a3a6f642db4eb7b26d1b28a497d4022eccd7717ca066823e9fb01"}, + {file = "SQLAlchemy-2.0.35-cp311-cp311-win32.whl", hash = "sha256:6a93c5a0dfe8d34951e8a6f499a9479ffb9258123551fa007fc708ae2ac2bc5e"}, + {file = "SQLAlchemy-2.0.35-cp311-cp311-win_amd64.whl", hash = "sha256:c68fe3fcde03920c46697585620135b4ecfdfc1ed23e75cc2c2ae9f8502c10b8"}, + {file = "SQLAlchemy-2.0.35-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:eb60b026d8ad0c97917cb81d3662d0b39b8ff1335e3fabb24984c6acd0c900a2"}, + {file = "SQLAlchemy-2.0.35-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6921ee01caf375363be5e9ae70d08ce7ca9d7e0e8983183080211a062d299468"}, + {file = "SQLAlchemy-2.0.35-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8cdf1a0dbe5ced887a9b127da4ffd7354e9c1a3b9bb330dce84df6b70ccb3a8d"}, + {file = "SQLAlchemy-2.0.35-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93a71c8601e823236ac0e5d087e4f397874a421017b3318fd92c0b14acf2b6db"}, + {file = "SQLAlchemy-2.0.35-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e04b622bb8a88f10e439084486f2f6349bf4d50605ac3e445869c7ea5cf0fa8c"}, + {file = "SQLAlchemy-2.0.35-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1b56961e2d31389aaadf4906d453859f35302b4eb818d34a26fab72596076bb8"}, + {file = "SQLAlchemy-2.0.35-cp312-cp312-win32.whl", hash = "sha256:0f9f3f9a3763b9c4deb8c5d09c4cc52ffe49f9876af41cc1b2ad0138878453cf"}, + {file = "SQLAlchemy-2.0.35-cp312-cp312-win_amd64.whl", hash = "sha256:25b0f63e7fcc2a6290cb5f7f5b4fc4047843504983a28856ce9b35d8f7de03cc"}, + {file = "SQLAlchemy-2.0.35-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:f021d334f2ca692523aaf7bbf7592ceff70c8594fad853416a81d66b35e3abf9"}, + {file = "SQLAlchemy-2.0.35-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05c3f58cf91683102f2f0265c0db3bd3892e9eedabe059720492dbaa4f922da1"}, + {file = "SQLAlchemy-2.0.35-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:032d979ce77a6c2432653322ba4cbeabf5a6837f704d16fa38b5a05d8e21fa00"}, + {file = "SQLAlchemy-2.0.35-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:2e795c2f7d7249b75bb5f479b432a51b59041580d20599d4e112b5f2046437a3"}, + {file = "SQLAlchemy-2.0.35-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:cc32b2990fc34380ec2f6195f33a76b6cdaa9eecf09f0c9404b74fc120aef36f"}, + {file = "SQLAlchemy-2.0.35-cp37-cp37m-win32.whl", hash = "sha256:9509c4123491d0e63fb5e16199e09f8e262066e58903e84615c301dde8fa2e87"}, + {file = "SQLAlchemy-2.0.35-cp37-cp37m-win_amd64.whl", hash = 
"sha256:3655af10ebcc0f1e4e06c5900bb33e080d6a1fa4228f502121f28a3b1753cde5"}, + {file = "SQLAlchemy-2.0.35-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4c31943b61ed8fdd63dfd12ccc919f2bf95eefca133767db6fbbd15da62078ec"}, + {file = "SQLAlchemy-2.0.35-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a62dd5d7cc8626a3634208df458c5fe4f21200d96a74d122c83bc2015b333bc1"}, + {file = "SQLAlchemy-2.0.35-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0630774b0977804fba4b6bbea6852ab56c14965a2b0c7fc7282c5f7d90a1ae72"}, + {file = "SQLAlchemy-2.0.35-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d625eddf7efeba2abfd9c014a22c0f6b3796e0ffb48f5d5ab106568ef01ff5a"}, + {file = "SQLAlchemy-2.0.35-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:ada603db10bb865bbe591939de854faf2c60f43c9b763e90f653224138f910d9"}, + {file = "SQLAlchemy-2.0.35-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c41411e192f8d3ea39ea70e0fae48762cd11a2244e03751a98bd3c0ca9a4e936"}, + {file = "SQLAlchemy-2.0.35-cp38-cp38-win32.whl", hash = "sha256:d299797d75cd747e7797b1b41817111406b8b10a4f88b6e8fe5b5e59598b43b0"}, + {file = "SQLAlchemy-2.0.35-cp38-cp38-win_amd64.whl", hash = "sha256:0375a141e1c0878103eb3d719eb6d5aa444b490c96f3fedab8471c7f6ffe70ee"}, + {file = "SQLAlchemy-2.0.35-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ccae5de2a0140d8be6838c331604f91d6fafd0735dbdcee1ac78fc8fbaba76b4"}, + {file = "SQLAlchemy-2.0.35-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2a275a806f73e849e1c309ac11108ea1a14cd7058577aba962cd7190e27c9e3c"}, + {file = "SQLAlchemy-2.0.35-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:732e026240cdd1c1b2e3ac515c7a23820430ed94292ce33806a95869c46bd139"}, + {file = "SQLAlchemy-2.0.35-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:890da8cd1941fa3dab28c5bac3b9da8502e7e366f895b3b8e500896f12f94d11"}, + {file = "SQLAlchemy-2.0.35-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:c0d8326269dbf944b9201911b0d9f3dc524d64779a07518199a58384c3d37a44"}, + {file = "SQLAlchemy-2.0.35-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b76d63495b0508ab9fc23f8152bac63205d2a704cd009a2b0722f4c8e0cba8e0"}, + {file = "SQLAlchemy-2.0.35-cp39-cp39-win32.whl", hash = "sha256:69683e02e8a9de37f17985905a5eca18ad651bf592314b4d3d799029797d0eb3"}, + {file = "SQLAlchemy-2.0.35-cp39-cp39-win_amd64.whl", hash = "sha256:aee110e4ef3c528f3abbc3c2018c121e708938adeeff9006428dd7c8555e9b3f"}, + {file = "SQLAlchemy-2.0.35-py3-none-any.whl", hash = "sha256:2ab3f0336c0387662ce6221ad30ab3a5e6499aab01b9790879b6578fd9b8faa1"}, + {file = "sqlalchemy-2.0.35.tar.gz", hash = "sha256:e11d7ea4d24f0a262bccf9a7cd6284c976c5369dac21db237cff59586045ab9f"}, +] + +[package.dependencies] +greenlet = {version = "!=0.4.17", markers = "python_version < \"3.13\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} +typing-extensions = ">=4.6.0" + +[package.extras] +aiomysql = ["aiomysql (>=0.2.0)", "greenlet (!=0.4.17)"] +aioodbc = ["aioodbc", "greenlet (!=0.4.17)"] +aiosqlite = ["aiosqlite", "greenlet (!=0.4.17)", "typing_extensions (!=3.10.0.1)"] +asyncio = ["greenlet (!=0.4.17)"] +asyncmy = ["asyncmy (>=0.2.3,!=0.2.4,!=0.2.6)", "greenlet (!=0.4.17)"] +mariadb-connector = ["mariadb (>=1.0.1,!=1.1.2,!=1.1.5)"] +mssql = ["pyodbc"] +mssql-pymssql = ["pymssql"] +mssql-pyodbc 
= ["pyodbc"] +mypy = ["mypy (>=0.910)"] +mysql = ["mysqlclient (>=1.4.0)"] +mysql-connector = ["mysql-connector-python"] +oracle = ["cx_oracle (>=8)"] +oracle-oracledb = ["oracledb (>=1.0.1)"] +postgresql = ["psycopg2 (>=2.7)"] +postgresql-asyncpg = ["asyncpg", "greenlet (!=0.4.17)"] +postgresql-pg8000 = ["pg8000 (>=1.29.1)"] +postgresql-psycopg = ["psycopg (>=3.0.7)"] +postgresql-psycopg2binary = ["psycopg2-binary"] +postgresql-psycopg2cffi = ["psycopg2cffi"] +postgresql-psycopgbinary = ["psycopg[binary] (>=3.0.7)"] +pymysql = ["pymysql"] +sqlcipher = ["sqlcipher3_binary"] + +[[package]] +name = "tabulate" +version = "0.9.0" +description = "Pretty-print tabular data" +optional = true +python-versions = ">=3.7" +files = [ + {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, + {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, +] + +[package.extras] +widechars = ["wcwidth"] + +[[package]] +name = "tenacity" +version = "8.5.0" +description = "Retry code until it succeeds" +optional = true +python-versions = ">=3.8" +files = [ + {file = "tenacity-8.5.0-py3-none-any.whl", hash = "sha256:b594c2a5945830c267ce6b79a166228323ed52718f30302c1359836112346687"}, + {file = "tenacity-8.5.0.tar.gz", hash = "sha256:8bc6c0c8a09b31e6cad13c47afbed1a567518250a9a171418582ed8d9c20ca78"}, +] + +[package.extras] +doc = ["reno", "sphinx"] +test = ["pytest", "tornado (>=4.5)", "typeguard"] + +[[package]] +name = "textual" +version = "0.85.2" +description = "Modern Text User Interface framework" +optional = false +python-versions = "<4.0.0,>=3.8.1" +files = [ + {file = "textual-0.85.2-py3-none-any.whl", hash = "sha256:9ccdeb6b8a6a0ff72d497f714934f2e524f2eb67783b459fb08b1339ee537dc0"}, + {file = "textual-0.85.2.tar.gz", hash = "sha256:2a416995c49d5381a81d0a6fd23925cb0e3f14b4f239ed05f35fa3c981bb1df2"}, +] + +[package.dependencies] +markdown-it-py = {version = ">=2.1.0", extras = ["linkify", "plugins"]} +platformdirs = ">=3.6.0,<5" +rich = ">=13.3.3" +typing-extensions = ">=4.4.0,<5.0.0" + +[package.extras] +syntax = ["tree-sitter (>=0.20.1,<0.21.0)", "tree-sitter-languages (==1.10.2)"] + +[[package]] +name = "threadpoolctl" +version = "3.5.0" +description = "threadpoolctl" +optional = true +python-versions = ">=3.8" +files = [ + {file = "threadpoolctl-3.5.0-py3-none-any.whl", hash = "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467"}, + {file = "threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107"}, +] + +[[package]] +name = "tiktoken" +version = "0.4.0" +description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" +optional = true +python-versions = ">=3.8" +files = [ + {file = "tiktoken-0.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:176cad7f053d2cc82ce7e2a7c883ccc6971840a4b5276740d0b732a2b2011f8a"}, + {file = "tiktoken-0.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:450d504892b3ac80207700266ee87c932df8efea54e05cefe8613edc963c1285"}, + {file = "tiktoken-0.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00d662de1e7986d129139faf15e6a6ee7665ee103440769b8dedf3e7ba6ac37f"}, + {file = "tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5727d852ead18b7927b8adf558a6f913a15c7766725b23dbe21d22e243041b28"}, + {file = "tiktoken-0.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = 
"sha256:c06cd92b09eb0404cedce3702fa866bf0d00e399439dad3f10288ddc31045422"}, + {file = "tiktoken-0.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9ec161e40ed44e4210d3b31e2ff426b4a55e8254f1023e5d2595cb60044f8ea6"}, + {file = "tiktoken-0.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:1e8fa13cf9889d2c928b9e258e9dbbbf88ab02016e4236aae76e3b4f82dd8288"}, + {file = "tiktoken-0.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bb2341836b725c60d0ab3c84970b9b5f68d4b733a7bcb80fb25967e5addb9920"}, + {file = "tiktoken-0.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2ca30367ad750ee7d42fe80079d3092bd35bb266be7882b79c3bd159b39a17b0"}, + {file = "tiktoken-0.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3dc3df19ddec79435bb2a94ee46f4b9560d0299c23520803d851008445671197"}, + {file = "tiktoken-0.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d980fa066e962ef0f4dad0222e63a484c0c993c7a47c7dafda844ca5aded1f3"}, + {file = "tiktoken-0.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:329f548a821a2f339adc9fbcfd9fc12602e4b3f8598df5593cfc09839e9ae5e4"}, + {file = "tiktoken-0.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b1a038cee487931a5caaef0a2e8520e645508cde21717eacc9af3fbda097d8bb"}, + {file = "tiktoken-0.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:08efa59468dbe23ed038c28893e2a7158d8c211c3dd07f2bbc9a30e012512f1d"}, + {file = "tiktoken-0.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f3020350685e009053829c1168703c346fb32c70c57d828ca3742558e94827a9"}, + {file = "tiktoken-0.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ba16698c42aad8190e746cd82f6a06769ac7edd415d62ba027ea1d99d958ed93"}, + {file = "tiktoken-0.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c15d9955cc18d0d7ffcc9c03dc51167aedae98542238b54a2e659bd25fe77ed"}, + {file = "tiktoken-0.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64e1091c7103100d5e2c6ea706f0ec9cd6dc313e6fe7775ef777f40d8c20811e"}, + {file = "tiktoken-0.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e87751b54eb7bca580126353a9cf17a8a8eaadd44edaac0e01123e1513a33281"}, + {file = "tiktoken-0.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e063b988b8ba8b66d6cc2026d937557437e79258095f52eaecfafb18a0a10c03"}, + {file = "tiktoken-0.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:9c6dd439e878172dc163fced3bc7b19b9ab549c271b257599f55afc3a6a5edef"}, + {file = "tiktoken-0.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8d1d97f83697ff44466c6bef5d35b6bcdb51e0125829a9c0ed1e6e39fb9a08fb"}, + {file = "tiktoken-0.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1b6bce7c68aa765f666474c7c11a7aebda3816b58ecafb209afa59c799b0dd2d"}, + {file = "tiktoken-0.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a73286c35899ca51d8d764bc0b4d60838627ce193acb60cc88aea60bddec4fd"}, + {file = "tiktoken-0.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0394967d2236a60fd0aacef26646b53636423cc9c70c32f7c5124ebe86f3093"}, + {file = "tiktoken-0.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:dae2af6f03ecba5f679449fa66ed96585b2fa6accb7fd57d9649e9e398a94f44"}, + {file = "tiktoken-0.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:55e251b1da3c293432179cf7c452cfa35562da286786be5a8b1ee3405c2b0dd2"}, + {file = "tiktoken-0.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:c835d0ee1f84a5aa04921717754eadbc0f0a56cf613f78dfc1cf9ad35f6c3fea"}, + {file = 
"tiktoken-0.4.0.tar.gz", hash = "sha256:59b20a819969735b48161ced9b92f05dc4519c17be4015cfb73b65270a243620"}, +] + +[package.dependencies] +regex = ">=2022.1.18" +requests = ">=2.26.0" + +[package.extras] +blobfile = ["blobfile (>=2)"] + +[[package]] +name = "tomli" +version = "2.0.2" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38"}, + {file = "tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed"}, +] + +[[package]] +name = "tqdm" +version = "4.67.0" +description = "Fast, Extensible Progress Meter" +optional = true +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.67.0-py3-none-any.whl", hash = "sha256:0cd8af9d56911acab92182e88d763100d4788bdf421d251616040cc4d44863be"}, + {file = "tqdm-4.67.0.tar.gz", hash = "sha256:fe5a6f95e6fe0b9755e9469b77b9c3cf850048224ecaa8293d7d2d31f97d869a"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] +discord = ["requests"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + +[[package]] +name = "types-pytz" +version = "2024.2.0.20241003" +description = "Typing stubs for pytz" +optional = true +python-versions = ">=3.8" +files = [ + {file = "types-pytz-2024.2.0.20241003.tar.gz", hash = "sha256:575dc38f385a922a212bac00a7d6d2e16e141132a3c955078f4a4fd13ed6cb44"}, + {file = "types_pytz-2024.2.0.20241003-py3-none-any.whl", hash = "sha256:3e22df1336c0c6ad1d29163c8fda82736909eb977281cb823c57f8bae07118b7"}, +] + +[[package]] +name = "typing-extensions" +version = "4.12.2" +description = "Backported and Experimental Type Hints for Python 3.8+" +optional = false +python-versions = ">=3.8" +files = [ + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, +] + +[[package]] +name = "typing-inspect" +version = "0.9.0" +description = "Runtime inspection utilities for typing module." +optional = true +python-versions = "*" +files = [ + {file = "typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f"}, + {file = "typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78"}, +] + +[package.dependencies] +mypy-extensions = ">=0.3.0" +typing-extensions = ">=3.7.4" + +[[package]] +name = "tzdata" +version = "2024.2" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd"}, + {file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"}, +] + +[[package]] +name = "uc-micro-py" +version = "1.0.3" +description = "Micro subset of unicode data files for linkify-it-py projects." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "uc-micro-py-1.0.3.tar.gz", hash = "sha256:d321b92cff673ec58027c04015fcaa8bb1e005478643ff4a500882eaab88c48a"}, + {file = "uc_micro_py-1.0.3-py3-none-any.whl", hash = "sha256:db1dffff340817673d7b466ec86114a9dc0e9d4d9b5ba229d9d60e5c12600cd5"}, +] + +[package.extras] +test = ["coverage", "pytest", "pytest-cov"] + +[[package]] +name = "unstructured" +version = "0.10.27" +description = "A library that prepares raw documents for downstream ML tasks." +optional = true +python-versions = ">=3.7.0" +files = [ + {file = "unstructured-0.10.27-py3-none-any.whl", hash = "sha256:3a8a8e44302388ddc39c184059e8b4458f1cdc58032540b9af7d85f6c3eca3be"}, + {file = "unstructured-0.10.27.tar.gz", hash = "sha256:f567b5c4385993a9ab48db5563dd7b413aac4f2002bb22e6250496ea8f440f5e"}, +] + +[package.dependencies] +backoff = "*" +beautifulsoup4 = "*" +chardet = "*" +dataclasses-json = "*" +emoji = "*" +filetype = "*" +langdetect = "*" +lxml = "*" +nltk = "*" +numpy = "*" +python-docx = {version = ">=1.0.1", optional = true, markers = "extra == \"docx\""} +python-iso639 = "*" +python-magic = "*" +python-pptx = {version = "<=0.6.21", optional = true, markers = "extra == \"pptx\""} +rapidfuzz = "*" +requests = "*" +tabulate = "*" +typing-extensions = "*" + +[package.extras] +airtable = ["pyairtable"] +all-docs = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +azure = ["adlfs", "fsspec (==2023.9.1)"] +azure-cognitive-search = ["azure-search-documents"] +bedrock = ["boto3", "langchain"] +biomed = ["bs4"] +box = ["boxfs", "fsspec (==2023.9.1)"] +confluence = ["atlassian-python-api"] +csv = ["pandas"] +delta-table = ["deltalake", "fsspec (==2023.9.1)"] +discord = ["discord-py"] +doc = ["python-docx (>=1.0.1)"] +docx = ["python-docx (>=1.0.1)"] +dropbox = ["dropboxdrivefs", "fsspec (==2023.9.1)"] +elasticsearch = ["elasticsearch", "jq"] +embed-huggingface = ["huggingface", "langchain", "sentence-transformers"] +epub = ["pypandoc"] +gcs = ["bs4", "fsspec (==2023.9.1)", "gcsfs"] +github = ["pygithub (>1.58.0)"] +gitlab = ["python-gitlab"] +google-drive = ["google-api-python-client"] +huggingface = ["langdetect", "sacremoses", "sentencepiece", "torch", "transformers"] +image = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)"] +jira = ["atlassian-python-api"] +local-inference = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +md = ["markdown"] +msg = ["msg-parser"] +notion = ["htmlBuilder", "notion-client"] +odt = ["pypandoc", "python-docx (>=1.0.1)"] +onedrive = ["Office365-REST-Python-Client (<2.4.3)", "bs4", "msal"] +openai = ["langchain", "openai", "tiktoken"] +org = ["pypandoc"] +outlook = ["Office365-REST-Python-Client (<2.4.3)", "msal"] +paddleocr = ["unstructured.paddleocr (==2.6.1.3)"] +pdf = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)"] +ppt = ["python-pptx (<=0.6.21)"] +pptx = ["python-pptx (<=0.6.21)"] +reddit = ["praw"] +rst = ["pypandoc"] +rtf = ["pypandoc"] +s3 = ["fsspec (==2023.9.1)", "s3fs"] +salesforce = ["simple-salesforce"] 
+sharepoint = ["Office365-REST-Python-Client (<2.4.3)", "msal"] +slack = ["slack-sdk"] +tsv = ["pandas"] +wikipedia = ["wikipedia"] +xlsx = ["networkx", "openpyxl", "pandas", "xlrd"] + +[[package]] +name = "unstructured-pytesseract" +version = "0.3.13" +description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR" +optional = true +python-versions = ">=3.8" +files = [ + {file = "unstructured.pytesseract-0.3.13-py3-none-any.whl", hash = "sha256:8001bc860470d56185176eb3ceb4623e888eba058ca3b30af79003784bc40e19"}, + {file = "unstructured.pytesseract-0.3.13.tar.gz", hash = "sha256:ff2e6391496e457dbf4b4e327f4a4577cce18921ea6570dc74bd64381b10e963"}, +] + +[package.dependencies] +packaging = ">=21.3" +Pillow = ">=8.0.0" + +[[package]] +name = "url-normalize" +version = "1.4.3" +description = "URL normalization for Python" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +files = [ + {file = "url-normalize-1.4.3.tar.gz", hash = "sha256:d23d3a070ac52a67b83a1c59a0e68f8608d1cd538783b401bc9de2c0fac999b2"}, + {file = "url_normalize-1.4.3-py2.py3-none-any.whl", hash = "sha256:ec3c301f04e5bb676d333a7fa162fa977ad2ca04b7e652bfc9fac4e405728eed"}, +] + +[package.dependencies] +six = "*" + +[[package]] +name = "urllib3" +version = "2.2.3" +description = "HTTP library with thread-safe connection pooling, file post, and more." +optional = false +python-versions = ">=3.8" +files = [ + {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, + {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +h2 = ["h2 (>=4,<5)"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[[package]] +name = "wcmatch" +version = "8.4" +description = "Wildcard/glob file name matcher." +optional = false +python-versions = ">=3.7" +files = [ + {file = "wcmatch-8.4-py3-none-any.whl", hash = "sha256:dc7351e5a7f8bbf4c6828d51ad20c1770113f5f3fd3dfe2a03cfde2a63f03f98"}, + {file = "wcmatch-8.4.tar.gz", hash = "sha256:ba4fc5558f8946bf1ffc7034b05b814d825d694112499c86035e0e4d398b6a67"}, +] + +[package.dependencies] +bracex = ">=2.1.1" + +[[package]] +name = "werkzeug" +version = "3.1.3" +description = "The comprehensive WSGI web application library." +optional = false +python-versions = ">=3.9" +files = [ + {file = "werkzeug-3.1.3-py3-none-any.whl", hash = "sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e"}, + {file = "werkzeug-3.1.3.tar.gz", hash = "sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746"}, +] + +[package.dependencies] +MarkupSafe = ">=2.1.1" + +[package.extras] +watchdog = ["watchdog (>=2.3)"] + +[[package]] +name = "wrapt" +version = "1.16.0" +description = "Module for decorators, wrappers and monkey patching." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "wrapt-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4"}, + {file = "wrapt-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136"}, + {file = "wrapt-1.16.0-cp310-cp310-win32.whl", hash = "sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d"}, + {file = "wrapt-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2"}, + {file = "wrapt-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09"}, + {file = "wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d"}, + {file = "wrapt-1.16.0-cp311-cp311-win32.whl", hash = "sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362"}, + {file = "wrapt-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89"}, + {file = "wrapt-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b"}, + {file = "wrapt-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36"}, + {file = 
"wrapt-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c"}, + {file = "wrapt-1.16.0-cp312-cp312-win32.whl", hash = "sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc"}, + {file = "wrapt-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8"}, + {file = "wrapt-1.16.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465"}, + {file = "wrapt-1.16.0-cp36-cp36m-win32.whl", hash = "sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e"}, + {file = "wrapt-1.16.0-cp36-cp36m-win_amd64.whl", hash = "sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966"}, + {file = "wrapt-1.16.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228"}, + {file = 
"wrapt-1.16.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c"}, + {file = "wrapt-1.16.0-cp37-cp37m-win32.whl", hash = "sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c"}, + {file = "wrapt-1.16.0-cp37-cp37m-win_amd64.whl", hash = "sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00"}, + {file = "wrapt-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0"}, + {file = "wrapt-1.16.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6"}, + {file = "wrapt-1.16.0-cp38-cp38-win32.whl", hash = "sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b"}, + {file = "wrapt-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41"}, + {file = "wrapt-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2"}, + {file = "wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537"}, + {file = "wrapt-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3"}, + {file = "wrapt-1.16.0-cp39-cp39-win_amd64.whl", 
hash = "sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35"}, + {file = "wrapt-1.16.0-py3-none-any.whl", hash = "sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1"}, + {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, +] + +[[package]] +name = "xlsxwriter" +version = "3.2.0" +description = "A Python module for creating Excel XLSX files." +optional = true +python-versions = ">=3.6" +files = [ + {file = "XlsxWriter-3.2.0-py3-none-any.whl", hash = "sha256:ecfd5405b3e0e228219bcaf24c2ca0915e012ca9464a14048021d21a995d490e"}, + {file = "XlsxWriter-3.2.0.tar.gz", hash = "sha256:9977d0c661a72866a61f9f7a809e25ebbb0fb7036baa3b9fe74afcfca6b3cb8c"}, +] + +[[package]] +name = "xmltodict" +version = "0.13.0" +description = "Makes working with XML feel like you are working with JSON" +optional = false +python-versions = ">=3.4" +files = [ + {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, + {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, +] + +[[package]] +name = "yarl" +version = "1.17.1" +description = "Yet another URL library" +optional = true +python-versions = ">=3.9" +files = [ + {file = "yarl-1.17.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1794853124e2f663f0ea54efb0340b457f08d40a1cef78edfa086576179c91"}, + {file = "yarl-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fbea1751729afe607d84acfd01efd95e3b31db148a181a441984ce9b3d3469da"}, + {file = "yarl-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8ee427208c675f1b6e344a1f89376a9613fc30b52646a04ac0c1f6587c7e46ec"}, + {file = "yarl-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b74ff4767d3ef47ffe0cd1d89379dc4d828d4873e5528976ced3b44fe5b0a21"}, + {file = "yarl-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:62a91aefff3d11bf60e5956d340eb507a983a7ec802b19072bb989ce120cd948"}, + {file = "yarl-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:846dd2e1243407133d3195d2d7e4ceefcaa5f5bf7278f0a9bda00967e6326b04"}, + {file = "yarl-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e844be8d536afa129366d9af76ed7cb8dfefec99f5f1c9e4f8ae542279a6dc3"}, + {file = "yarl-1.17.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc7c92c1baa629cb03ecb0c3d12564f172218fb1739f54bf5f3881844daadc6d"}, + {file = "yarl-1.17.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ae3476e934b9d714aa8000d2e4c01eb2590eee10b9d8cd03e7983ad65dfbfcba"}, + {file = "yarl-1.17.1-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:c7e177c619342e407415d4f35dec63d2d134d951e24b5166afcdfd1362828e17"}, + {file = "yarl-1.17.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:64cc6e97f14cf8a275d79c5002281f3040c12e2e4220623b5759ea7f9868d6a5"}, + {file = "yarl-1.17.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:84c063af19ef5130084db70ada40ce63a84f6c1ef4d3dbc34e5e8c4febb20822"}, + {file = "yarl-1.17.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:482c122b72e3c5ec98f11457aeb436ae4aecca75de19b3d1de7cf88bc40db82f"}, + {file = "yarl-1.17.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:380e6c38ef692b8fd5a0f6d1fa8774d81ebc08cfbd624b1bca62a4d4af2f9931"}, + {file = 
"yarl-1.17.1-cp310-cp310-win32.whl", hash = "sha256:16bca6678a83657dd48df84b51bd56a6c6bd401853aef6d09dc2506a78484c7b"}, + {file = "yarl-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:561c87fea99545ef7d692403c110b2f99dced6dff93056d6e04384ad3bc46243"}, + {file = "yarl-1.17.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:cbad927ea8ed814622305d842c93412cb47bd39a496ed0f96bfd42b922b4a217"}, + {file = "yarl-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fca4b4307ebe9c3ec77a084da3a9d1999d164693d16492ca2b64594340999988"}, + {file = "yarl-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff5c6771c7e3511a06555afa317879b7db8d640137ba55d6ab0d0c50425cab75"}, + {file = "yarl-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b29beab10211a746f9846baa39275e80034e065460d99eb51e45c9a9495bcca"}, + {file = "yarl-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a52a1ffdd824fb1835272e125385c32fd8b17fbdefeedcb4d543cc23b332d74"}, + {file = "yarl-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:58c8e9620eb82a189c6c40cb6b59b4e35b2ee68b1f2afa6597732a2b467d7e8f"}, + {file = "yarl-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d216e5d9b8749563c7f2c6f7a0831057ec844c68b4c11cb10fc62d4fd373c26d"}, + {file = "yarl-1.17.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:881764d610e3269964fc4bb3c19bb6fce55422828e152b885609ec176b41cf11"}, + {file = "yarl-1.17.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8c79e9d7e3d8a32d4824250a9c6401194fb4c2ad9a0cec8f6a96e09a582c2cc0"}, + {file = "yarl-1.17.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:299f11b44d8d3a588234adbe01112126010bd96d9139c3ba7b3badd9829261c3"}, + {file = "yarl-1.17.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:cc7d768260f4ba4ea01741c1b5fe3d3a6c70eb91c87f4c8761bbcce5181beafe"}, + {file = "yarl-1.17.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:de599af166970d6a61accde358ec9ded821234cbbc8c6413acfec06056b8e860"}, + {file = "yarl-1.17.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2b24ec55fad43e476905eceaf14f41f6478780b870eda5d08b4d6de9a60b65b4"}, + {file = "yarl-1.17.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9fb815155aac6bfa8d86184079652c9715c812d506b22cfa369196ef4e99d1b4"}, + {file = "yarl-1.17.1-cp311-cp311-win32.whl", hash = "sha256:7615058aabad54416ddac99ade09a5510cf77039a3b903e94e8922f25ed203d7"}, + {file = "yarl-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:14bc88baa44e1f84164a392827b5defb4fa8e56b93fecac3d15315e7c8e5d8b3"}, + {file = "yarl-1.17.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:327828786da2006085a4d1feb2594de6f6d26f8af48b81eb1ae950c788d97f61"}, + {file = "yarl-1.17.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cc353841428d56b683a123a813e6a686e07026d6b1c5757970a877195f880c2d"}, + {file = "yarl-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c73df5b6e8fabe2ddb74876fb82d9dd44cbace0ca12e8861ce9155ad3c886139"}, + {file = "yarl-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bdff5e0995522706c53078f531fb586f56de9c4c81c243865dd5c66c132c3b5"}, + {file = "yarl-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:06157fb3c58f2736a5e47c8fcbe1afc8b5de6fb28b14d25574af9e62150fcaac"}, + {file = "yarl-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:1654ec814b18be1af2c857aa9000de7a601400bd4c9ca24629b18486c2e35463"}, + {file = "yarl-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f6595c852ca544aaeeb32d357e62c9c780eac69dcd34e40cae7b55bc4fb1147"}, + {file = "yarl-1.17.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:459e81c2fb920b5f5df744262d1498ec2c8081acdcfe18181da44c50f51312f7"}, + {file = "yarl-1.17.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7e48cdb8226644e2fbd0bdb0a0f87906a3db07087f4de77a1b1b1ccfd9e93685"}, + {file = "yarl-1.17.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:d9b6b28a57feb51605d6ae5e61a9044a31742db557a3b851a74c13bc61de5172"}, + {file = "yarl-1.17.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e594b22688d5747b06e957f1ef822060cb5cb35b493066e33ceac0cf882188b7"}, + {file = "yarl-1.17.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:5f236cb5999ccd23a0ab1bd219cfe0ee3e1c1b65aaf6dd3320e972f7ec3a39da"}, + {file = "yarl-1.17.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:a2a64e62c7a0edd07c1c917b0586655f3362d2c2d37d474db1a509efb96fea1c"}, + {file = "yarl-1.17.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d0eea830b591dbc68e030c86a9569826145df485b2b4554874b07fea1275a199"}, + {file = "yarl-1.17.1-cp312-cp312-win32.whl", hash = "sha256:46ddf6e0b975cd680eb83318aa1d321cb2bf8d288d50f1754526230fcf59ba96"}, + {file = "yarl-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:117ed8b3732528a1e41af3aa6d4e08483c2f0f2e3d3d7dca7cf538b3516d93df"}, + {file = "yarl-1.17.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:5d1d42556b063d579cae59e37a38c61f4402b47d70c29f0ef15cee1acaa64488"}, + {file = "yarl-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c0167540094838ee9093ef6cc2c69d0074bbf84a432b4995835e8e5a0d984374"}, + {file = "yarl-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2f0a6423295a0d282d00e8701fe763eeefba8037e984ad5de44aa349002562ac"}, + {file = "yarl-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5b078134f48552c4d9527db2f7da0b5359abd49393cdf9794017baec7506170"}, + {file = "yarl-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d401f07261dc5aa36c2e4efc308548f6ae943bfff20fcadb0a07517a26b196d8"}, + {file = "yarl-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b5f1ac7359e17efe0b6e5fec21de34145caef22b260e978336f325d5c84e6938"}, + {file = "yarl-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f63d176a81555984e91f2c84c2a574a61cab7111cc907e176f0f01538e9ff6e"}, + {file = "yarl-1.17.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9e275792097c9f7e80741c36de3b61917aebecc08a67ae62899b074566ff8556"}, + {file = "yarl-1.17.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:81713b70bea5c1386dc2f32a8f0dab4148a2928c7495c808c541ee0aae614d67"}, + {file = "yarl-1.17.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:aa46dce75078fceaf7cecac5817422febb4355fbdda440db55206e3bd288cfb8"}, + {file = "yarl-1.17.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1ce36ded585f45b1e9bb36d0ae94765c6608b43bd2e7f5f88079f7a85c61a4d3"}, + {file = "yarl-1.17.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:2d374d70fdc36f5863b84e54775452f68639bc862918602d028f89310a034ab0"}, + {file = "yarl-1.17.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = 
"sha256:2d9f0606baaec5dd54cb99667fcf85183a7477f3766fbddbe3f385e7fc253299"}, + {file = "yarl-1.17.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b0341e6d9a0c0e3cdc65857ef518bb05b410dbd70d749a0d33ac0f39e81a4258"}, + {file = "yarl-1.17.1-cp313-cp313-win32.whl", hash = "sha256:2e7ba4c9377e48fb7b20dedbd473cbcbc13e72e1826917c185157a137dac9df2"}, + {file = "yarl-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:949681f68e0e3c25377462be4b658500e85ca24323d9619fdc41f68d46a1ffda"}, + {file = "yarl-1.17.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8994b29c462de9a8fce2d591028b986dbbe1b32f3ad600b2d3e1c482c93abad6"}, + {file = "yarl-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f9cbfbc5faca235fbdf531b93aa0f9f005ec7d267d9d738761a4d42b744ea159"}, + {file = "yarl-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b40d1bf6e6f74f7c0a567a9e5e778bbd4699d1d3d2c0fe46f4b717eef9e96b95"}, + {file = "yarl-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f5efe0661b9fcd6246f27957f6ae1c0eb29bc60552820f01e970b4996e016004"}, + {file = "yarl-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b5c4804e4039f487e942c13381e6c27b4b4e66066d94ef1fae3f6ba8b953f383"}, + {file = "yarl-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b5d6a6c9602fd4598fa07e0389e19fe199ae96449008d8304bf5d47cb745462e"}, + {file = "yarl-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f4c9156c4d1eb490fe374fb294deeb7bc7eaccda50e23775b2354b6a6739934"}, + {file = "yarl-1.17.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6324274b4e0e2fa1b3eccb25997b1c9ed134ff61d296448ab8269f5ac068c4c"}, + {file = "yarl-1.17.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d8a8b74d843c2638f3864a17d97a4acda58e40d3e44b6303b8cc3d3c44ae2d29"}, + {file = "yarl-1.17.1-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:7fac95714b09da9278a0b52e492466f773cfe37651cf467a83a1b659be24bf71"}, + {file = "yarl-1.17.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:c180ac742a083e109c1a18151f4dd8675f32679985a1c750d2ff806796165b55"}, + {file = "yarl-1.17.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:578d00c9b7fccfa1745a44f4eddfdc99d723d157dad26764538fbdda37209857"}, + {file = "yarl-1.17.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:1a3b91c44efa29e6c8ef8a9a2b583347998e2ba52c5d8280dbd5919c02dfc3b5"}, + {file = "yarl-1.17.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a7ac5b4984c468ce4f4a553df281450df0a34aefae02e58d77a0847be8d1e11f"}, + {file = "yarl-1.17.1-cp39-cp39-win32.whl", hash = "sha256:7294e38f9aa2e9f05f765b28ffdc5d81378508ce6dadbe93f6d464a8c9594473"}, + {file = "yarl-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:eb6dce402734575e1a8cc0bb1509afca508a400a57ce13d306ea2c663bad1138"}, + {file = "yarl-1.17.1-py3-none-any.whl", hash = "sha256:f1790a4b1e8e8e028c391175433b9c8122c39b46e1663228158e61e6f915bf06"}, + {file = "yarl-1.17.1.tar.gz", hash = "sha256:067a63fcfda82da6b198fa73079b1ca40b7c9b7994995b6ee38acda728b64d47"}, +] + +[package.dependencies] +idna = ">=2.0" +multidict = ">=4.0" +propcache = ">=0.2.0" + +[[package]] +name = "zipp" +version = "3.20.2" +description = "Backport of pathlib-compatible object wrapper for zip files" +optional = true +python-versions = ">=3.8" +files = [ + {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"}, + {file = "zipp-3.20.2.tar.gz", 
hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"}, +] + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] +type = ["pytest-mypy"] + +[extras] +file-based = ["avro", "fastavro", "markdown", "pdf2image", "pdfminer.six", "pyarrow", "pytesseract", "python-calamine", "python-snappy", "unstructured", "unstructured.pytesseract"] +sphinx-docs = ["Sphinx", "sphinx-rtd-theme"] +sql = ["sqlalchemy"] +vector-db-based = ["cohere", "langchain", "openai", "tiktoken"] + +[metadata] +lock-version = "2.0" +python-versions = "^3.10" +content-hash = "04616bc5fdd4e0993c2b8ff4ed31eda425c13fb72b3a75e6414caf282ed24148" diff --git a/airbyte-cdk/python/pyproject.toml b/airbyte-cdk/python/pyproject.toml new file mode 100644 index 000000000000..7286fa5979bd --- /dev/null +++ b/airbyte-cdk/python/pyproject.toml @@ -0,0 +1,134 @@ +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] +name = "airbyte-cdk" +version = "6.5.2" +description = "A framework for writing Airbyte Connectors." +authors = ["Airbyte "] +license = "MIT" +readme = "README.md" +homepage = "https://github.com/airbytehq/airbyte" +repository = "https://github.com/airbytehq/airbyte" +documentation = "https://docs.airbyte.io/" +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Topic :: Scientific/Engineering", + "Topic :: Software Development :: Libraries :: Python Modules", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.10", +] +keywords = ["airbyte", "connector-development-kit", "cdk"] + + +[tool.poetry.dependencies] +python = "^3.10" +airbyte-protocol-models-dataclasses = "^0.13" +backoff = "*" +cachetools = "*" +Deprecated = "~1.2" +dpath = "^2.1.6" +genson = "1.2.2" +isodate = "~0.6.1" +Jinja2 = "~3.1.2" +jsonref = "~0.2" +jsonschema = "~3.2.0" +pandas = "2.2.2" +pendulum = "<3.0.0" +psutil = "6.1.0" +pydantic = "^2.7" +pyrate-limiter = "~3.1.0" +python-dateutil = "*" +python-ulid = "^3.0.0" +PyYAML = "^6.0.1" +requests = "*" +requests_cache = "*" +wcmatch = "8.4" +# Extras depedencies +avro = { version = "~1.11.2", optional = true } +cohere = { version = "4.21", optional = true } +fastavro = { version = "~1.8.0", optional = true } +langchain = { version = "0.1.16", optional = true } +langchain_core = { version = "0.1.42", optional = true } +markdown = { version = "*", optional = true } +openai = { version = "0.27.9", extras = ["embeddings"], optional = true } +pdf2image = { version = "1.16.3", optional = true } +"pdfminer.six" = { version = "20221105", optional = true } +pyarrow = { version = "~15.0.0", optional = true } +pytesseract = { version = "0.3.10", optional = true } +python-calamine = { version = "0.2.3", optional = true } +python-snappy = { version = "0.7.3", optional = true } +Sphinx = { version = "~4.2", optional = true } +sphinx-rtd-theme = { version = "~1.0", optional = true } +tiktoken = { version = "0.4.0", optional = true } +nltk = { version = "3.8.1", optional = true } +# This will ensure that even when you run poetry install or pip install, the compatible version of numpy will always be chosen. 
+# airbyte-ci will try to install latest version when --use-local-cdk is used, resulting in the conflict. +numpy = "<2" +unstructured = { version = "0.10.27", extras = ["docx", "pptx"], optional = true } +"unstructured.pytesseract" = { version = ">=0.3.12", optional = true } +pyjwt = "^2.8.0" +cryptography = "^42.0.5" +pytz = "2024.1" +orjson = "^3.10.7" +serpyco-rs = "^1.10.2" +sqlalchemy = {version = "^2.0,!=2.0.36", optional = true } +xmltodict = "^0.13.0" + +[tool.poetry.group.dev.dependencies] +freezegun = "*" +mypy = "*" +asyncio = "3.4.3" +poethepoet = "^0.24.2" +pyproject-flake8 = "^6.1.0" +pytest = "^7" +pytest-memray = "^1.6.0" +pytest-cov = "*" +pytest-httpserver = "*" +pytest-mock = "*" +requests-mock = "*" +pdoc = "^15.0.0" + +[tool.poetry.extras] +file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "python-snappy"] +sphinx-docs = ["Sphinx", "sphinx-rtd-theme"] +vector-db-based = ["langchain", "openai", "cohere", "tiktoken"] +sql = ["sqlalchemy"] + +[tool.ruff] +# Setting python version to at least 3.10 avoids `from __future__ import annotations`. +target-version = "py310" +# This is consistent with airbytehq/airbyte root pyproject.toml Black rule defined. +line-length = 140 + +[tool.poe.tasks] +# Build tasks +assemble = {cmd = "bin/generate-component-manifest-dagger.sh", help = "Generate component manifest files."} +build-package = {cmd = "poetry build", help = "Build the python package: source and wheels archives."} +build = {sequence = ["assemble", "build-package"], help = "Run all tasks to build the package."} + +# Check tasks +lint = {cmd = "pflake8 --config ../../pyproject.toml ./", help = "Lint with flake8."} +type-check = {cmd = "bin/run-mypy-on-modified-files.sh", help = "Type check modified files with mypy."} +unit-test-with-cov = {cmd = "pytest -s unit_tests -c pytest.ini --cov=airbyte_cdk --cov-report=term --cov-config ../../pyproject.toml", help = "Run unit tests and create a coverage report."} +# TODO: find a version of the modified mypy check that works both locally and in CI. +check-lockfile = {cmd = "poetry check", help = "Check the poetry lock file."} +check-local = {sequence = ["lint", "type-check", "check-lockfile", "unit-test-with-cov"], help = "Lint all code, type-check modified files, and run unit tests."} +check-ci = {sequence = ["check-lockfile", "build", "lint", "unit-test-with-cov"], help = "Build the package, lint and run unit tests. Does not include type-checking."} + +# Build and check +pre-push = {sequence = ["build", "check-local"], help = "Run all build and check tasks."} + +# API Docs with PDoc +docs-generate = {env = {PDOC_ALLOW_EXEC = "1"}, cmd = "python -m docs.generate run"} +docs-preview = {shell = "poe docs-generate && open docs/generated/index.html"} + + +[tool.airbyte_ci] +python_versions = ["3.10", "3.11"] +optional_poetry_groups = ["dev"] +poetry_extras = ["file-based", "sphinx-docs", "vector-db-based"] +poe_tasks = ["check-ci"] +mount_docker_socket = true diff --git a/airbyte-cdk/python/pyproject.toml-bak b/airbyte-cdk/python/pyproject.toml-bak new file mode 100644 index 000000000000..af6c4aab583c --- /dev/null +++ b/airbyte-cdk/python/pyproject.toml-bak @@ -0,0 +1,127 @@ +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] +name = "airbyte-cdk" +version = "6.5.2" +description = "A framework for writing Airbyte Connectors." 
+authors = ["Airbyte "] +license = "MIT" +readme = "README.md" +homepage = "https://github.com/airbytehq/airbyte" +repository = "https://github.com/airbytehq/airbyte" +documentation = "https://docs.airbyte.io/" +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Topic :: Scientific/Engineering", + "Topic :: Software Development :: Libraries :: Python Modules", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.10", +] +keywords = ["airbyte", "connector-development-kit", "cdk"] + + +[tool.poetry.dependencies] +python = "^3.10" +airbyte-protocol-models-dataclasses = "^0.13" +backoff = "*" +cachetools = "*" +Deprecated = "~1.2" +dpath = "^2.1.6" +genson = "1.2.2" +isodate = "~0.6.1" +Jinja2 = "~3.1.2" +jsonref = "~0.2" +jsonschema = "~3.2.0" +pandas = "2.2.2" +pendulum = "<3.0.0" +psutil = "6.1.0" +pydantic = "^2.7" +pyrate-limiter = "~3.1.0" +python-dateutil = "*" +PyYAML = "^6.0.1" +requests = "*" +requests_cache = "*" +wcmatch = "8.4" +# Extras depedencies +avro = { version = "~1.11.2", optional = true } +cohere = { version = "4.21", optional = true } +fastavro = { version = "~1.8.0", optional = true } +langchain = { version = "0.1.16", optional = true } +langchain_core = { version = "0.1.42", optional = true } +markdown = { version = "*", optional = true } +openai = { version = "0.27.9", extras = ["embeddings"], optional = true } +pdf2image = { version = "1.16.3", optional = true } +"pdfminer.six" = { version = "20221105", optional = true } +pyarrow = { version = "~15.0.0", optional = true } +pytesseract = { version = "0.3.10", optional = true } +python-calamine = { version = "0.2.3", optional = true } +python-snappy = { version = "0.7.3", optional = true } +Sphinx = { version = "~4.2", optional = true } +sphinx-rtd-theme = { version = "~1.0", optional = true } +tiktoken = { version = "0.4.0", optional = true } +nltk = { version = "3.8.1", optional = true } +# This will ensure that even when you run poetry install or pip install, the compatible version of numpy will always be chosen. +# airbyte-ci will try to install latest version when --use-local-cdk is used, resulting in the conflict. +numpy = "<2" +unstructured = { version = "0.10.27", extras = ["docx", "pptx"], optional = true } +"unstructured.pytesseract" = { version = ">=0.3.12", optional = true } +pyjwt = "^2.8.0" +cryptography = "^42.0.5" +pytz = "2024.1" +orjson = "^3.10.7" +serpyco-rs = "^1.10.2" +sqlalchemy = {version = "^2.0,!=2.0.36", optional = true } +xmltodict = "^0.13.0" + +[tool.poetry.group.dev.dependencies] +freezegun = "*" +mypy = "*" +asyncio = "3.4.3" +poethepoet = "^0.24.2" +pyproject-flake8 = "^6.1.0" +pytest = "^7" +pytest-memray = "^1.6.0" +pytest-cov = "*" +pytest-httpserver = "*" +pytest-mock = "*" +requests-mock = "*" + +[tool.poetry.extras] +file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "python-snappy"] +sphinx-docs = ["Sphinx", "sphinx-rtd-theme"] +vector-db-based = ["langchain", "openai", "cohere", "tiktoken"] +sql = ["sqlalchemy"] + +[tool.ruff] +# Setting python version to at least 3.10 avoids `from __future__ import annotations`. +target-version = "py310" +# This is consistent with airbytehq/airbyte root pyproject.toml Black rule defined. 
+line-length = 140 + +[tool.poe.tasks] +# Build tasks +assemble = {cmd = "bin/generate-component-manifest-dagger.sh", help = "Generate component manifest files."} +build-package = {cmd = "poetry build", help = "Build the python package: source and wheels archives."} +build = {sequence = ["assemble", "build-package"], help = "Run all tasks to build the package."} + +# Check tasks +lint = {cmd = "pflake8 --config ../../pyproject.toml ./", help = "Lint with flake8."} +type-check = {cmd = "bin/run-mypy-on-modified-files.sh", help = "Type check modified files with mypy."} +unit-test-with-cov = {cmd = "pytest -s unit_tests -c pytest.ini --cov=airbyte_cdk --cov-report=term --cov-config ../../pyproject.toml", help = "Run unit tests and create a coverage report."} +# TODO: find a version of the modified mypy check that works both locally and in CI. +check-lockfile = {cmd = "poetry check", help = "Check the poetry lock file."} +check-local = {sequence = ["lint", "type-check", "check-lockfile", "unit-test-with-cov"], help = "Lint all code, type-check modified files, and run unit tests."} +check-ci = {sequence = ["check-lockfile", "build", "lint", "unit-test-with-cov"], help = "Build the package, lint and run unit tests. Does not include type-checking."} + +# Build and check +pre-push = {sequence = ["build", "check-local"], help = "Run all build and check tasks."} + +[tool.airbyte_ci] +python_versions = ["3.10", "3.11"] +optional_poetry_groups = ["dev"] +poetry_extras = ["file-based", "sphinx-docs", "vector-db-based"] +poe_tasks = ["check-ci"] +mount_docker_socket = true diff --git a/airbyte-cdk/python/pytest.ini b/airbyte-cdk/python/pytest.ini new file mode 100644 index 000000000000..6a5cafc1f6ec --- /dev/null +++ b/airbyte-cdk/python/pytest.ini @@ -0,0 +1,7 @@ +[pytest] +log_cli = 1 +log_cli_level = INFO +log_cli_format = %(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s) +log_cli_date_format=%Y-%m-%d %H:%M:%S +filterwarnings = + ignore::airbyte_cdk.sources.source.ExperimentalClassWarning diff --git a/airbyte-cdk/python/reference_docs/Makefile b/airbyte-cdk/python/reference_docs/Makefile new file mode 100644 index 000000000000..c7f9263975e8 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/Makefile @@ -0,0 +1,23 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = _source +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +rst: + sphinx-apidoc -f -F -o $(SOURCEDIR)/api ../airbyte_cdk diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.destinations.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.destinations.rst new file mode 100644 index 000000000000..e554319db2fb --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.destinations.rst @@ -0,0 +1,19 @@ + +Submodules +---------- + +airbyte\_cdk.destinations.destination module +-------------------------------------------- + +.. 
automodule:: airbyte_cdk.destinations.destination + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.destinations + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.models.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.models.rst new file mode 100644 index 000000000000..14d3065dd76c --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.models.rst @@ -0,0 +1,19 @@ + +Submodules +---------- + +airbyte\_cdk.models.airbyte\_protocol module +-------------------------------------------- + +.. automodule:: airbyte_cdk.models.airbyte_protocol + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.models + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.rst new file mode 100644 index 000000000000..4d270b65e6ee --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.rst @@ -0,0 +1,54 @@ + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + airbyte_cdk.destinations + airbyte_cdk.models + airbyte_cdk.sources + airbyte_cdk.utils + +Submodules +---------- + +airbyte\_cdk.connector module +----------------------------- + +.. automodule:: airbyte_cdk.connector + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.entrypoint module +------------------------------ + +.. automodule:: airbyte_cdk.entrypoint + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.exception\_handler module +-------------------------------------- + +.. automodule:: airbyte_cdk.exception_handler + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.logger module +-------------------------- + +.. automodule:: airbyte_cdk.logger + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.auth.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.auth.rst new file mode 100644 index 000000000000..43c8a518e5d7 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.auth.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.auth.oauth module +-------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.auth.oauth + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.auth.token module +-------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.auth.token + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.auth + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.checks.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.checks.rst new file mode 100644 index 000000000000..d4d275419f54 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.checks.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.checks.check\_stream module +------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.checks.check_stream + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.checks.connection\_checker module +------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.checks.connection_checker + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.checks + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.datetime.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.datetime.rst new file mode 100644 index 000000000000..f523d1b1736a --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.datetime.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.datetime.datetime\_parser module +----------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.datetime.datetime_parser + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.datetime.min\_max\_datetime module +------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.datetime.min_max_datetime + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.datetime + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.decoders.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.decoders.rst new file mode 100644 index 000000000000..3d4a362b1064 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.decoders.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.decoders.decoder module +-------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.decoders.decoder + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.decoders.json\_decoder module +-------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.decoders.json_decoder + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.decoders + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.extractors.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.extractors.rst new file mode 100644 index 000000000000..3b901d5c9f1e --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.extractors.rst @@ -0,0 +1,51 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.extractors.dpath\_extractor module +------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.extractors.dpath_extractor + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.extractors.http\_selector module +----------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.extractors.http_selector + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.extractors.record\_extractor module +-------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.extractors.record_extractor + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.extractors.record\_filter module +----------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.extractors.record_filter + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.extractors.record\_selector module +------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.extractors.record_selector + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.extractors + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.interpolation.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.interpolation.rst new file mode 100644 index 000000000000..22ab8838517b --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.interpolation.rst @@ -0,0 +1,59 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.interpolation.interpolated\_boolean module +--------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.interpolation.interpolated_boolean + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.interpolation.interpolated\_mapping module +--------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.interpolation.interpolated_mapping + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.interpolation.interpolated\_string module +-------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.interpolation.interpolated_string + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.interpolation.interpolation module +------------------------------------------------------------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.interpolation.interpolation + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.interpolation.jinja module +----------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.interpolation.jinja + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.interpolation.macros module +------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.interpolation.macros + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.interpolation + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.parsers.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.parsers.rst new file mode 100644 index 000000000000..c5f9fdb8b8ec --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.parsers.rst @@ -0,0 +1,59 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.parsers.class\_types\_registry module +---------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.parsers.class_types_registry + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.parsers.config\_parser module +-------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.parsers.config_parser + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.parsers.default\_implementation\_registry module +--------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.parsers.default_implementation_registry + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.parsers.factory module +------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.parsers.factory + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.parsers.undefined\_reference\_exception module +------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.parsers.undefined_reference_exception + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.parsers.yaml\_parser module +------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.parsers.yaml_parser + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.parsers + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.rst new file mode 100644 index 000000000000..0fa4a8c4070d --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.rst @@ -0,0 +1,51 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.backoff\_strategies.constant\_backoff\_strategy module +------------------------------------------------------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.constant_backoff_strategy + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.backoff\_strategies.exponential\_backoff\_strategy module +--------------------------------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.exponential_backoff_strategy + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.backoff\_strategies.header\_helper module +----------------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.header_helper + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.backoff\_strategies.wait\_time\_from\_header\_backoff\_strategy module +---------------------------------------------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.wait_time_from_header_backoff_strategy + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.backoff\_strategies.wait\_until\_time\_from\_header\_backoff\_strategy module +----------------------------------------------------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.wait_until_time_from_header_backoff_strategy + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.error_handlers.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.error_handlers.rst new file mode 100644 index 000000000000..5b69c8b19ce7 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.error_handlers.rst @@ -0,0 +1,75 @@ + +Subpackages +----------- + +.. 
toctree:: + :maxdepth: 4 + + airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.backoff\_strategy module +------------------------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategy + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.composite\_error\_handler module +-------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.composite_error_handler + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.default\_error\_handler module +------------------------------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.default_error_handler + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.error\_handler module +--------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.error_handler + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.http\_response\_filter module +----------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.http_response_filter + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.response\_action module +----------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.response_action + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.response\_status module +----------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.response_status + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.paginators.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.paginators.rst new file mode 100644 index 000000000000..fd91f3f08af4 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.paginators.rst @@ -0,0 +1,43 @@ + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + airbyte_cdk.sources.declarative.requesters.paginators.strategies + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.paginators.limit\_paginator module +------------------------------------------------------------------------------ + +.. 
automodule:: airbyte_cdk.sources.declarative.requesters.paginators.default_paginator + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.paginators.no\_pagination module +---------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.paginators.no_pagination + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.paginators.paginator module +----------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.paginators.paginator + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.paginators + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.paginators.strategies.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.paginators.strategies.rst new file mode 100644 index 000000000000..86f929120e33 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.paginators.strategies.rst @@ -0,0 +1,43 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.paginators.strategies.cursor\_pagination\_strategy module +----------------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.paginators.strategies.cursor_pagination_strategy + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.paginators.strategies.offset\_increment module +------------------------------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.paginators.strategies.offset_increment + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.paginators.strategies.page\_increment module +---------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.paginators.strategies.page_increment + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.paginators.strategies.pagination\_strategy module +--------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.requesters.paginators.strategies + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.request_headers.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.request_headers.rst new file mode 100644 index 000000000000..ebb492b2a50b --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.request_headers.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.request\_headers.interpolated\_request\_header\_provider module +----------------------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.request_headers.interpolated_request_header_provider + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.request\_headers.request\_header\_provider module +--------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.request_headers.request_header_provider + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.request_headers + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.request_options.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.request_options.rst new file mode 100644 index 000000000000..ff8eb074f6d1 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.request_options.rst @@ -0,0 +1,35 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.request\_options.interpolated\_request\_input\_provider module +---------------------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_input_provider + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.request\_options.interpolated\_request\_options\_provider module +------------------------------------------------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_options_provider + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.request\_options.request\_options\_provider module +---------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.requesters.request_options + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.retriers.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.retriers.rst new file mode 100644 index 000000000000..65e58aec2e73 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.retriers.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.retriers.default\_retrier module +---------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.retriers.default_retrier + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.retriers.retrier module +------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.retriers.retrier + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.retriers + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.rst new file mode 100644 index 000000000000..63a9dc689e6e --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.rst @@ -0,0 +1,45 @@ + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + airbyte_cdk.sources.declarative.requesters.error_handlers + airbyte_cdk.sources.declarative.requesters.paginators + airbyte_cdk.sources.declarative.requesters.request_options + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.http\_requester module +------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.http_requester + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.request\_option module +------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.request_option + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.requester module +------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.requester + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.retrievers.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.retrievers.rst new file mode 100644 index 000000000000..763c663648a9 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.retrievers.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.retrievers.retriever module +------------------------------------------------------------ + +.. 
automodule:: airbyte_cdk.sources.declarative.retrievers.retriever + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.retrievers.simple\_retriever module +-------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.retrievers.simple_retriever + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.retrievers + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.rst new file mode 100644 index 000000000000..0ffe29a4ae01 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.rst @@ -0,0 +1,78 @@ + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + airbyte_cdk.sources.declarative.auth + airbyte_cdk.sources.declarative.checks + airbyte_cdk.sources.declarative.datetime + airbyte_cdk.sources.declarative.decoders + airbyte_cdk.sources.declarative.extractors + airbyte_cdk.sources.declarative.interpolation + airbyte_cdk.sources.declarative.parsers + airbyte_cdk.sources.declarative.requesters + airbyte_cdk.sources.declarative.retrievers + airbyte_cdk.sources.declarative.schema + airbyte_cdk.sources.declarative.stream_slicers + airbyte_cdk.sources.declarative.transformations + +Submodules +---------- + +airbyte\_cdk.sources.declarative.create\_partial module +------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.create_partial + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.declarative\_source module +----------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.declarative_source + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.declarative\_stream module +----------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.declarative_stream + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.exceptions module +-------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.exceptions + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.types module +--------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.types + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.yaml\_declarative\_source module +----------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.yaml_declarative_source + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: airbyte_cdk.sources.declarative + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.schema.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.schema.rst new file mode 100644 index 000000000000..d0da7b6a7127 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.schema.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.schema.json\_schema module +----------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.schema.json_schema + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.schema.schema\_loader module +------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.schema.schema_loader + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.schema + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.states.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.states.rst new file mode 100644 index 000000000000..f4d331f8d3da --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.states.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.states.dict\_state module +---------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.states.dict_state + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.states.state module +---------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.states.state + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.states + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.stream_slicers.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.stream_slicers.rst new file mode 100644 index 000000000000..ed5f6c179977 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.stream_slicers.rst @@ -0,0 +1,59 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.stream\_slicers.cartesian\_product\_stream\_slicer module +------------------------------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.stream_slicers.cartesian_product_stream_slicer + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.stream\_slicers.datetime\_stream\_slicer module +-------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.stream_slicers.datetime_stream_slicer + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.stream\_slicers.list\_stream\_slicer module +---------------------------------------------------------------------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.stream_slicers.list_stream_slicer + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.stream\_slicers.single\_slice module +--------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.stream_slicers.single_slice + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.stream\_slicers.stream\_slicer module +---------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.stream_slicers.stream_slicer + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.stream\_slicers.substream\_slicer module +------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.stream_slicers.substream_slicer + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.stream_slicers + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.transformations.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.transformations.rst new file mode 100644 index 000000000000..031b1af23d2c --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.transformations.rst @@ -0,0 +1,35 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.transformations.add\_fields module +------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.transformations.add_fields + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.transformations.remove\_fields module +---------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.transformations.remove_fields + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.transformations.transformation module +---------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.transformations.transformation + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.transformations + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.deprecated.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.deprecated.rst new file mode 100644 index 000000000000..3d19a99f04f0 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.deprecated.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.deprecated.base\_source module +--------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.deprecated.base_source + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.deprecated.client module +--------------------------------------------- + +.. automodule:: airbyte_cdk.sources.deprecated.client + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: airbyte_cdk.sources.deprecated + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.rst new file mode 100644 index 000000000000..0a25c34ae005 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.rst @@ -0,0 +1,47 @@ + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + airbyte_cdk.sources.declarative + airbyte_cdk.sources.deprecated + airbyte_cdk.sources.singer + airbyte_cdk.sources.streams + airbyte_cdk.sources.utils + +Submodules +---------- + +airbyte\_cdk.sources.abstract\_source module +-------------------------------------------- + +.. automodule:: airbyte_cdk.sources.abstract_source + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.config module +---------------------------------- + +.. automodule:: airbyte_cdk.sources.config + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.source module +---------------------------------- + +.. automodule:: airbyte_cdk.sources.source + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.singer.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.singer.rst new file mode 100644 index 000000000000..37af436cbc75 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.singer.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.singer.singer\_helpers module +-------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.singer.singer_helpers + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.singer.source module +----------------------------------------- + +.. automodule:: airbyte_cdk.sources.singer.source + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.singer + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.http.auth.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.http.auth.rst new file mode 100644 index 000000000000..049da2b74e72 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.http.auth.rst @@ -0,0 +1,35 @@ + +Submodules +---------- + +airbyte\_cdk.sources.streams.http.auth.core module +-------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.streams.http.auth.core + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.streams.http.auth.oauth module +--------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.streams.http.auth.oauth + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.streams.http.auth.token module +--------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.streams.http.auth.token + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: airbyte_cdk.sources.streams.http.auth + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.http.requests_native_auth.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.http.requests_native_auth.rst new file mode 100644 index 000000000000..b2a7bc7d8ec5 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.http.requests_native_auth.rst @@ -0,0 +1,43 @@ + +Submodules +---------- + +airbyte\_cdk.sources.streams.http.requests\_native\_auth.abstract\_oauth module +------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.streams.http.requests_native_auth.abstract_oauth + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.streams.http.requests\_native\_auth.abstract\_token module +------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.streams.http.requests_native_auth.abstract_token + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.streams.http.requests\_native\_auth.oauth module +--------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.streams.http.requests_native_auth.oauth + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.streams.http.requests\_native\_auth.token module +--------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.streams.http.requests_native_auth.token + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.streams.http.requests_native_auth + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.http.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.http.rst new file mode 100644 index 000000000000..920db700aa92 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.http.rst @@ -0,0 +1,44 @@ + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + airbyte_cdk.sources.streams.http.auth + airbyte_cdk.sources.streams.http.requests_native_auth + +Submodules +---------- + +airbyte\_cdk.sources.streams.http.exceptions module +--------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.streams.http.exceptions + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.streams.http.http module +--------------------------------------------- + +.. automodule:: airbyte_cdk.sources.streams.http.http + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.streams.http.rate\_limiting module +------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.streams.http.rate_limiting + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: airbyte_cdk.sources.streams.http + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.rst new file mode 100644 index 000000000000..e00b5d0f36f0 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.rst @@ -0,0 +1,27 @@ + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + airbyte_cdk.sources.streams.http + +Submodules +---------- + +airbyte\_cdk.sources.streams.core module +---------------------------------------- + +.. automodule:: airbyte_cdk.sources.streams.core + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.streams + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.utils.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.utils.rst new file mode 100644 index 000000000000..8b53a23a7199 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.utils.rst @@ -0,0 +1,51 @@ + +Submodules +---------- + +airbyte\_cdk.sources.utils.casing module +---------------------------------------- + +.. automodule:: airbyte_cdk.sources.utils.casing + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.utils.catalog\_helpers module +-------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.utils.catalog_helpers + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.utils.schema\_helpers module +------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.utils.schema_helpers + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.utils.schema\_models module +------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.utils.schema_models + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.utils.transform module +------------------------------------------- + +.. automodule:: airbyte_cdk.sources.utils.transform + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.utils.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.utils.rst new file mode 100644 index 000000000000..d65a8f8ec055 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.utils.rst @@ -0,0 +1,35 @@ + +Submodules +---------- + +airbyte\_cdk.utils.airbyte\_secrets\_utils module +------------------------------------------------- + +.. automodule:: airbyte_cdk.utils.airbyte_secrets_utils + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.utils.event\_timing module +--------------------------------------- + +.. automodule:: airbyte_cdk.utils.event_timing + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.utils.traced\_exception module +------------------------------------------- + +.. automodule:: airbyte_cdk.utils.traced_exception + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: airbyte_cdk.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/modules.rst b/airbyte-cdk/python/reference_docs/_source/api/modules.rst new file mode 100644 index 000000000000..453898da5f38 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/modules.rst @@ -0,0 +1,7 @@ +airbyte_cdk +=========== + +.. toctree:: + :maxdepth: 4 + + airbyte_cdk diff --git a/airbyte-cdk/python/reference_docs/_source/conf.py b/airbyte-cdk/python/reference_docs/_source/conf.py new file mode 100644 index 000000000000..c661adb42534 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/conf.py @@ -0,0 +1,66 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys + +sys.path.insert(0, os.path.abspath("../..")) + + +# -- Project information ----------------------------------------------------- + +project = "Airbyte Connector Development Kit" +copyright = "2021, Airbyte" +author = "Airbyte" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", # Support for NumPy and Google style docstrings +] # API docs + +source_suffix = {".rst": "restructuredtext", ".md": "markdown"} + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ['_static'] +html_logo = "../_static/img/airbyte_new_logo.svg" +html_theme_options = { + "logo_only": True, + "display_version": False, +} diff --git a/airbyte-cdk/python/reference_docs/_source/index.rst b/airbyte-cdk/python/reference_docs/_source/index.rst new file mode 100644 index 000000000000..cc6beef3ede9 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/index.rst @@ -0,0 +1,36 @@ +Welcome to Airbyte Connector Development Kit's documentation! +============================================================= +This documentation is autogenerated from source code comments. More extensive overviews and conceptual explanations will be coming soon. 
+======================================================================================================================================== + +.. toctree:: + :maxdepth: 4 + :caption: Destinations + + api/airbyte_cdk.destinations + +.. toctree:: + :maxdepth: 4 + :caption: Models + + api/airbyte_cdk.models + +.. toctree:: + :maxdepth: 4 + :caption: Sources + + api/airbyte_cdk.sources + +.. toctree:: + :maxdepth: 4 + :caption: Utils + + api/airbyte_cdk.utils + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/airbyte-cdk/python/reference_docs/_source/templates/master_doc.rst_t b/airbyte-cdk/python/reference_docs/_source/templates/master_doc.rst_t new file mode 100644 index 000000000000..082c0ce4ed8c --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/templates/master_doc.rst_t @@ -0,0 +1,19 @@ +Welcome to Airbyte Connector Development Kit's documentation! +============================================================= +This documentation is autogenerated from source code comments. More extensive overviews and conceptual explanations will be coming soon. +======================================================================================================================================== +{% for module in top_modules %} +.. toctree:: + :maxdepth: {{ maxdepth }} + :caption: {{ module.caption }} + + {{module.path}} +{% endfor %} + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/airbyte-cdk/python/reference_docs/_source/templates/package.rst_t b/airbyte-cdk/python/reference_docs/_source/templates/package.rst_t new file mode 100644 index 000000000000..8c7700310618 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/templates/package.rst_t @@ -0,0 +1,51 @@ +{%- macro automodule(modname, options) -%} +.. automodule:: {{ modname }} +{%- for option in options %} + :{{ option }}: +{%- endfor %} +{%- endmacro %} + +{%- macro toctree(docnames) -%} +.. toctree:: + :maxdepth: {{ maxdepth }} +{% for docname in docnames %} + {{ docname }} +{%- endfor %} +{%- endmacro %} + +{%- if is_namespace %} +.. py:module:: {{ pkgname }} +{% endif %} + +{%- if modulefirst and not is_namespace %} +{{ automodule(pkgname, automodule_options) }} +{% endif %} + +{%- if subpackages %} +Subpackages +----------- + +{{ toctree(subpackages) }} +{% endif %} + +{%- if submodules %} +Submodules +---------- +{% if separatemodules %} +{{ toctree(submodules) }} +{% else %} +{%- for submodule in submodules %} +{% if show_headings %} +{{- [submodule, "module"] | join(" ") | e | heading(2) }} +{% endif %} +{{ automodule(submodule, automodule_options) }} +{% endfor %} +{%- endif %} +{%- endif %} + +{%- if not modulefirst and not is_namespace %} +Module contents +--------------- + +{{ automodule(pkgname, automodule_options) }} +{% endif %} diff --git a/airbyte-cdk/python/reference_docs/_static/img/airbyte_new_logo.svg b/airbyte-cdk/python/reference_docs/_static/img/airbyte_new_logo.svg new file mode 100644 index 000000000000..463f23913c1a --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_static/img/airbyte_new_logo.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/airbyte-cdk/python/reference_docs/generate_rst_schema.py b/airbyte-cdk/python/reference_docs/generate_rst_schema.py new file mode 100755 index 000000000000..b401d2e4d7b3 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/generate_rst_schema.py @@ -0,0 +1,46 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import sys +from os import path +from typing import Any, Dict + +from sphinx.cmd.quickstart import QuickstartRenderer +from sphinx.ext.apidoc import get_parser, main, recurse_tree, write_file +from sphinx.locale import __ +from sphinx.util import ensuredir + + +def write_master_file(templatedir: str, master_name: str, values: Dict, opts: Any): + template = QuickstartRenderer(templatedir=templatedir) + opts.destdir = opts.destdir[: opts.destdir.rfind("/")] + write_file(master_name, template.render(f"{templatedir}/master_doc.rst_t", values), opts) + + +if __name__ == "__main__": + parser = get_parser() + parser.add_argument("--master", metavar="MASTER", default="index", help=__("master document name")) + args = parser.parse_args(sys.argv[1:]) + + rootpath = path.abspath(args.module_path) + + # normalize opts + if args.header is None: + args.header = rootpath.split(path.sep)[-1] + if args.suffix.startswith("."): + args.suffix = args.suffix[1:] + if not path.isdir(rootpath): + print(__(f"{rootpath} is not a directory."), file=sys.stderr) + sys.exit(1) + if not args.dryrun: + ensuredir(args.destdir) + excludes = [path.abspath(exclude) for exclude in args.exclude_pattern] + modules = recurse_tree(rootpath, excludes, args, args.templatedir) + + template_values = { + "top_modules": [{"path": f"api/{module}", "caption": module.split(".")[1].title()} for module in modules if module.count(".") == 1], + "maxdepth": args.maxdepth, + } + write_master_file(templatedir=args.templatedir, master_name=args.master, values=template_values, opts=args) + main() diff --git a/airbyte-cdk/python/sphinx-docs.md b/airbyte-cdk/python/sphinx-docs.md new file mode 100644 index 000000000000..055055cf61ab --- /dev/null +++ b/airbyte-cdk/python/sphinx-docs.md @@ -0,0 +1,96 @@ +# Sphinx Docs + +We use the [Sphinx](https://www.sphinx-doc.org/) library +to automatically generate the docs for the [airbyte-cdk](https://pypi.org/project/airbyte-cdk/). + +## Updating the docs structure (manually) + +The documentation structure is defined in `airbyte-cdk/python/reference_docs/_source` using `.rst` files. + +See the [reStructuredText docs](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html) +for the key concepts. + +Note that `index.rst` is the main index file, where we define the layout of the main +docs page and its relation to the other sections. + +Each time a new module is added to `airbyte-cdk/python/airbyte_cdk`, you'll need to update the Sphinx rst schema. + +Let's walk through an example: + +- Suppose we're going to add a new package `airbyte_cdk/new_package`; +- Let this package contain a few modules: `airbyte_cdk/new_package/module1.py` and `airbyte_cdk/new_package/module2.py`; +- The above structure should be reflected in the `rst` config as follows: + - Add this block directly into `index.rst`: + + ``` + .. toctree:: + :maxdepth: 2 + :caption: New Package + + api/airbyte_cdk.new_package + ``` + + - Add a new file `api/airbyte_cdk.new_package.rst` with the following content: + + ``` + Submodules + ---------- + + airbyte\_cdk.new\_package.module1 module + -------------------------------------------- + + .. automodule:: airbyte_cdk.new_package.module1 + :members: + :undoc-members: + :show-inheritance: + + airbyte\_cdk.new\_package.module2 module + -------------------------------------------- + + .. automodule:: airbyte_cdk.new_package.module2 + :members: + :undoc-members: + :show-inheritance: + + Module contents + --------------- + + ..
automodule:: airbyte_cdk.new_package + :members: + :undoc-members: + :show-inheritance: + ``` + +For more examples, see `airbyte-cdk/python/reference_docs/_source` +and the [reStructuredText docs](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html). + +## Updating the docs structure (automatically) + +It's also possible to generate the `.rst` files automatically using the `generate_rst_schema.py` script. + +Update this script if you need to change the appearance or structure of the docs. + +To generate the docs, +run `python generate_rst_schema.py -o _source/api ../../python/airbyte_cdk -f -t _source/templates` +from the `airbyte-cdk/python/reference_docs` root. + +## Building the docs locally + +Once the `rst` files correctly represent the current project structure, you can build the docs locally. +This is useful after each `airbyte-cdk` update, especially if the package structure has changed. + +- Install the Sphinx dependencies with `pip install ".[sphinx-docs]"`; +- Run `make html` from the `airbyte-cdk/python/reference_docs` root; +- Check `airbyte-cdk/python/reference_docs/_build` for the newly built documentation. + +## Publishing to Read the Docs + +Our Sphinx docs setup is meant to be published to [readthedocs](https://readthedocs.org/). +After the airbyte-cdk package is updated, it is worth checking the docs published at https://airbyte-cdk.readthedocs.io/en/latest/ +to confirm the latest build. + +The publishing process is automatic and is triggered via a GitHub incoming webhook. +See https://docs.readthedocs.io/en/stable/webhooks.html. + +To check build logs and status, see https://readthedocs.org/projects/airbyte-cdk/builds/. +You can also trigger a build manually there if needed. + +The publishing configuration lives in `.readthedocs.yaml`. +See https://docs.readthedocs.io/en/stable/config-file/v2.html for a description of the config format. diff --git a/airbyte-cdk/python/unit_tests/__init__.py b/airbyte-cdk/python/unit_tests/__init__.py new file mode 100644 index 000000000000..51e56f3ad0e1 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/__init__.py @@ -0,0 +1,7 @@ +# THIS STOPS SOME MODELS TESTS FROM FALLING OVER. IT'S A HACK, WE SHOULD PIN DOWN WHAT'S ACTUALLY GOING ON HERE + +# Import the thing that needs to be imported to stop the tests from falling over +from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource + +# "Use" the thing so that the linter doesn't complain +placeholder = ManifestDeclarativeSource diff --git a/airbyte-cdk/python/unit_tests/conftest.py b/airbyte-cdk/python/unit_tests/conftest.py new file mode 100644 index 000000000000..ab0c9bb847a4 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/conftest.py @@ -0,0 +1,33 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
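The `conftest.py` whose diff begins here defines two pieces of shared test infrastructure used across the unit tests: a freezegun-based `mock_sleep` fixture that turns `time.sleep` calls into ticks of a frozen clock, and a `slow` marker that is skipped when pytest is run with `--skipslow`. As a rough, illustrative sketch (not part of this diff; the test name and the 60-second value are made up for illustration), a test consuming both could look like this:

```python
# Illustrative sketch only: a hypothetical test consuming the fixture and marker
# defined in unit_tests/conftest.py below.
import time

import pytest


@pytest.mark.slow  # collected normally, but skipped when pytest runs with --skipslow
def test_backoff_does_not_really_wait(mock_sleep):
    start = time.time()  # frozen by the mock_sleep fixture (freezegun)
    time.sleep(60)       # no real delay: the monkeypatched sleep ticks the frozen clock
    assert time.time() - start == pytest.approx(60)
```

This keeps backoff- and rate-limit-related tests fast while still letting `--skipslow` exclude genuinely slow cases.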
+# + +import datetime + +import freezegun +import pytest + + +@pytest.fixture() +def mock_sleep(monkeypatch): + with freezegun.freeze_time(datetime.datetime.now(), ignore=["_pytest.runner", "_pytest.terminal"]) as frozen_datetime: + monkeypatch.setattr("time.sleep", lambda x: frozen_datetime.tick(x)) + yield + + +def pytest_addoption(parser): + parser.addoption( + "--skipslow", action="store_true", default=False, help="skip slow tests" + ) + + +def pytest_configure(config): + config.addinivalue_line("markers", "slow: mark test as slow to run") + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--skipslow"): + skip_slow = pytest.mark.skip(reason="--skipslow option has been provided and this test is marked as slow") + for item in items: + if "slow" in item.keywords: + item.add_marker(skip_slow) diff --git a/airbyte-cdk/python/unit_tests/connector_builder/__init__.py b/airbyte-cdk/python/unit_tests/connector_builder/__init__.py new file mode 100644 index 000000000000..c941b3045795 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/connector_builder/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/connector_builder/test_connector_builder_handler.py b/airbyte-cdk/python/unit_tests/connector_builder/test_connector_builder_handler.py new file mode 100644 index 000000000000..7189c8027d85 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/connector_builder/test_connector_builder_handler.py @@ -0,0 +1,994 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import copy +import dataclasses +import json +import logging +import os +from unittest import mock +from unittest.mock import MagicMock, patch + +import pytest +import requests +from airbyte_cdk import connector_builder +from airbyte_cdk.connector_builder.connector_builder_handler import ( + DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE, + DEFAULT_MAXIMUM_NUMBER_OF_SLICES, + DEFAULT_MAXIMUM_RECORDS, + TestReadLimits, + create_source, + get_limits, + resolve_manifest, +) +from airbyte_cdk.connector_builder.main import handle_connector_builder_request, handle_request, read_stream +from airbyte_cdk.connector_builder.models import LogMessage, StreamRead, StreamReadPages, StreamReadSlices +from airbyte_cdk.models import ( + AirbyteLogMessage, + AirbyteMessage, + AirbyteMessageSerializer, + AirbyteRecordMessage, + AirbyteStateMessage, + AirbyteStream, + AirbyteStreamState, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteCatalogSerializer, + ConfiguredAirbyteStream, + ConnectorSpecification, + DestinationSyncMode, + Level, + StreamDescriptor, + SyncMode, +) +from airbyte_cdk.models import Type +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream +from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource +from airbyte_cdk.sources.declarative.retrievers import SimpleRetrieverTestReadDecorator +from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever +from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets, update_secrets +from orjson import orjson +from unit_tests.connector_builder.utils import create_configured_catalog + +_stream_name = "stream_with_custom_requester" +_stream_primary_key = "id" +_stream_url_base = "https://api.sendgrid.com" +_stream_options = {"name": _stream_name, "primary_key": _stream_primary_key, "url_base": _stream_url_base} +_page_size = 2 + +_A_STATE 
= [ + AirbyteStateMessage( + type="STREAM", stream=AirbyteStreamState(stream_descriptor=StreamDescriptor(name=_stream_name), stream_state={"key": "value"}) + ) +] + +_A_PER_PARTITION_STATE = [ + AirbyteStateMessage( + type="STREAM", + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name=_stream_name), + stream_state={ + "states": [ + { + "partition": {"key": "value"}, + "cursor": {"item_id": 0}, + }, + ], + "parent_state": {}, + }, + ), + ) +] + +MANIFEST = { + "version": "0.30.3", + "definitions": { + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": _page_size, + "page_size_option": {"inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"inject_into": "path", "type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ response._metadata.next }}", + "page_size": _page_size, + }, + }, + "partition_router": { + "type": "ListPartitionRouter", + "values": ["0", "1", "2", "3", "4", "5", "6", "7"], + "cursor_field": "item_id", + }, + "" + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"a_param": "10"}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + "streams": [ + { + "type": "DeclarativeStream", + "$parameters": _stream_options, + "retriever": "#/definitions/retriever", + }, + ], + "check": {"type": "CheckStream", "stream_names": ["lists"]}, + "spec": { + "connection_specification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": [], + "properties": {}, + "additionalProperties": True, + }, + "type": "Spec", + }, +} + +OAUTH_MANIFEST = { + "version": "0.30.3", + "definitions": { + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": _page_size, + "page_size_option": {"inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"inject_into": "path", "type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ response.next }}", + "page_size": _page_size, + }, + }, + "partition_router": { + "type": "ListPartitionRouter", + "values": ["0", "1", "2", "3", "4", "5", "6", "7"], + "cursor_field": "item_id", + }, + "" + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "OAuthAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"a_param": "10"}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + "streams": [ + { + "type": "DeclarativeStream", + "$parameters": _stream_options, + "retriever": "#/definitions/retriever", + }, + ], + "check": {"type": "CheckStream", "stream_names": ["lists"]}, + "spec": { + "connection_specification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": [], + "properties": {}, + "additionalProperties": True, + }, + "type": "Spec", + }, +} + +RESOLVE_MANIFEST_CONFIG = { + "__injected_declarative_manifest": MANIFEST, + "__command": "resolve_manifest", +} + +TEST_READ_CONFIG = { + "__injected_declarative_manifest": MANIFEST, + "__command": "test_read", + "__test_read_config": {"max_pages_per_slice": 2, "max_slices": 5, "max_records": 10}, +} + +DUMMY_CATALOG = { + "streams": [ + { + "stream": { + "name": "dummy_stream", + "json_schema": {"$schema": "http://json-schema.org/draft-07/schema#", "type": "object", "properties": {}}, + "supported_sync_modes": 
["full_refresh"], + "source_defined_cursor": False, + }, + "sync_mode": "full_refresh", + "destination_sync_mode": "overwrite", + } + ] +} + +CONFIGURED_CATALOG = { + "streams": [ + { + "stream": { + "name": _stream_name, + "json_schema": {"$schema": "http://json-schema.org/draft-07/schema#", "type": "object", "properties": {}}, + "supported_sync_modes": ["full_refresh"], + "source_defined_cursor": False, + }, + "sync_mode": "full_refresh", + "destination_sync_mode": "overwrite", + } + ] +} + +MOCK_RESPONSE = { + "result": [ + {"id": 1, "name": "Nora Moon", "position": "director"}, + {"id": 2, "name": "Hae Sung Jung", "position": "cinematographer"}, + {"id": 3, "name": "Arthur Zenneranski", "position": "composer"}, + ] +} + + +@pytest.fixture +def valid_resolve_manifest_config_file(tmp_path): + config_file = tmp_path / "config.json" + config_file.write_text(json.dumps(RESOLVE_MANIFEST_CONFIG)) + return config_file + + +@pytest.fixture +def valid_read_config_file(tmp_path): + config_file = tmp_path / "config.json" + config_file.write_text(json.dumps(TEST_READ_CONFIG)) + return config_file + + +@pytest.fixture +def dummy_catalog(tmp_path): + config_file = tmp_path / "catalog.json" + config_file.write_text(json.dumps(DUMMY_CATALOG)) + return config_file + + +@pytest.fixture +def configured_catalog(tmp_path): + config_file = tmp_path / "catalog.json" + config_file.write_text(json.dumps(CONFIGURED_CATALOG)) + return config_file + + +@pytest.fixture +def invalid_config_file(tmp_path): + invalid_config = copy.deepcopy(RESOLVE_MANIFEST_CONFIG) + invalid_config["__command"] = "bad_command" + config_file = tmp_path / "config.json" + config_file.write_text(json.dumps(invalid_config)) + return config_file + + +def _mocked_send(self, request, **kwargs) -> requests.Response: + """ + Mocks the outbound send operation to provide faster and more reliable responses compared to actual API requests + """ + response = requests.Response() + response.request = request + response.status_code = 200 + response.headers = {"header": "value"} + response_body = MOCK_RESPONSE + response._content = json.dumps(response_body).encode("utf-8") + return response + + +def test_handle_resolve_manifest(valid_resolve_manifest_config_file, dummy_catalog): + with mock.patch.object(connector_builder.main, "handle_connector_builder_request", return_value=AirbyteMessage(type=MessageType.RECORD)) as patched_handle: + handle_request(["read", "--config", str(valid_resolve_manifest_config_file), "--catalog", str(dummy_catalog)]) + assert patched_handle.call_count == 1 + + +def test_handle_test_read(valid_read_config_file, configured_catalog): + with mock.patch.object(connector_builder.main, "handle_connector_builder_request", return_value=AirbyteMessage(type=MessageType.RECORD)) as patch: + handle_request(["read", "--config", str(valid_read_config_file), "--catalog", str(configured_catalog)]) + assert patch.call_count == 1 + + +def test_resolve_manifest(valid_resolve_manifest_config_file): + config = copy.deepcopy(RESOLVE_MANIFEST_CONFIG) + command = "resolve_manifest" + config["__command"] = command + source = ManifestDeclarativeSource(MANIFEST) + limits = TestReadLimits() + resolved_manifest = handle_connector_builder_request( + source, command, config, create_configured_catalog("dummy_stream"), _A_STATE, limits + ) + + expected_resolved_manifest = { + "type": "DeclarativeSource", + "version": "0.30.3", + "definitions": { + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": _page_size, + "page_size_option": 
{"inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"inject_into": "path", "type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ response._metadata.next }}", + "page_size": _page_size, + }, + }, + "partition_router": { + "type": "ListPartitionRouter", + "values": ["0", "1", "2", "3", "4", "5", "6", "7"], + "cursor_field": "item_id", + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"a_param": "10"}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + "streams": [ + { + "type": "DeclarativeStream", + "retriever": { + "type": "SimpleRetriever", + "paginator": { + "type": "DefaultPaginator", + "page_size": _page_size, + "page_size_option": { + "type": "RequestOption", + "inject_into": "request_parameter", + "field_name": "page_size", + "name": _stream_name, + "primary_key": _stream_primary_key, + "url_base": _stream_url_base, + "$parameters": _stream_options, + }, + "page_token_option": { + "type": "RequestPath", + "inject_into": "path", + "name": _stream_name, + "primary_key": _stream_primary_key, + "url_base": _stream_url_base, + "$parameters": _stream_options, + }, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ response._metadata.next }}", + "name": _stream_name, + "primary_key": _stream_primary_key, + "url_base": _stream_url_base, + "$parameters": _stream_options, + "page_size": _page_size, + }, + "name": _stream_name, + "primary_key": _stream_primary_key, + "url_base": _stream_url_base, + "$parameters": _stream_options, + }, + "requester": { + "type": "HttpRequester", + "path": "/v3/marketing/lists", + "authenticator": { + "type": "BearerAuthenticator", + "api_token": "{{ config.apikey }}", + "name": _stream_name, + "primary_key": _stream_primary_key, + "url_base": _stream_url_base, + "$parameters": _stream_options, + }, + "request_parameters": {"a_param": "10"}, + "name": _stream_name, + "primary_key": _stream_primary_key, + "url_base": _stream_url_base, + "$parameters": _stream_options, + }, + "partition_router": { + "type": "ListPartitionRouter", + "values": ["0", "1", "2", "3", "4", "5", "6", "7"], + "cursor_field": "item_id", + "name": _stream_name, + "primary_key": _stream_primary_key, + "url_base": _stream_url_base, + "$parameters": _stream_options, + }, + "record_selector": { + "type": "RecordSelector", + "extractor": { + "type": "DpathExtractor", + "field_path": ["result"], + "name": _stream_name, + "primary_key": _stream_primary_key, + "url_base": _stream_url_base, + "$parameters": _stream_options, + }, + "name": _stream_name, + "primary_key": _stream_primary_key, + "url_base": _stream_url_base, + "$parameters": _stream_options, + }, + "name": _stream_name, + "primary_key": _stream_primary_key, + "url_base": _stream_url_base, + "$parameters": _stream_options, + }, + "name": _stream_name, + "primary_key": _stream_primary_key, + "url_base": _stream_url_base, + "$parameters": _stream_options, + }, + ], + "check": {"type": "CheckStream", "stream_names": ["lists"]}, + "spec": { + "connection_specification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": [], + "properties": {}, + "additionalProperties": True, + }, + "type": "Spec", + }, + } + assert resolved_manifest.record.data["manifest"] == expected_resolved_manifest + assert resolved_manifest.record.stream == 
"resolve_manifest" + + +def test_resolve_manifest_error_returns_error_response(): + class MockManifestDeclarativeSource: + @property + def resolved_manifest(self): + raise ValueError + + source = MockManifestDeclarativeSource() + response = resolve_manifest(source) + assert "Error resolving manifest" in response.trace.error.message + + +def test_read(): + config = TEST_READ_CONFIG + source = ManifestDeclarativeSource(MANIFEST) + + real_record = AirbyteRecordMessage(data={"id": "1234", "key": "value"}, emitted_at=1, stream=_stream_name) + stream_read = StreamRead( + logs=[{"message": "here be a log message"}], + slices=[ + StreamReadSlices( + pages=[StreamReadPages(records=[real_record], request=None, response=None)], + slice_descriptor=None, + state=None, + ) + ], + auxiliary_requests=[], + test_read_limit_reached=False, + inferred_schema=None, + inferred_datetime_formats=None, + latest_config_update={}, + ) + + expected_airbyte_message = AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream=_stream_name, + data={ + "logs": [{"message": "here be a log message"}], + "slices": [ + {"pages": [{"records": [real_record], "request": None, "response": None}], "slice_descriptor": None, "state": None} + ], + "test_read_limit_reached": False, + "auxiliary_requests": [], + "inferred_schema": None, + "inferred_datetime_formats": None, + "latest_config_update": {}, + }, + emitted_at=1, + ), + ) + limits = TestReadLimits() + with patch("airbyte_cdk.connector_builder.message_grouper.MessageGrouper.get_message_groups", return_value=stream_read) as mock: + output_record = handle_connector_builder_request( + source, "test_read", config, ConfiguredAirbyteCatalogSerializer.load(CONFIGURED_CATALOG), _A_STATE, limits + ) + mock.assert_called_with(source, config, ConfiguredAirbyteCatalogSerializer.load(CONFIGURED_CATALOG), _A_STATE, limits.max_records) + output_record.record.emitted_at = 1 + assert ( + orjson.dumps(AirbyteMessageSerializer.dump(output_record)).decode() + == orjson.dumps(AirbyteMessageSerializer.dump(expected_airbyte_message)).decode() + ) + + +def test_config_update(): + manifest = copy.deepcopy(MANIFEST) + manifest["definitions"]["retriever"]["requester"]["authenticator"] = { + "type": "OAuthAuthenticator", + "token_refresh_endpoint": "https://oauth.endpoint.com/tokens/bearer", + "client_id": "{{ config['credentials']['client_id'] }}", + "client_secret": "{{ config['credentials']['client_secret'] }}", + "refresh_token": "{{ config['credentials']['refresh_token'] }}", + "refresh_token_updater": {}, + } + config = copy.deepcopy(TEST_READ_CONFIG) + config["__injected_declarative_manifest"] = manifest + config["credentials"] = { + "client_id": "a client id", + "client_secret": "a client secret", + "refresh_token": "a refresh token", + } + source = ManifestDeclarativeSource(manifest) + + refresh_request_response = { + "access_token": "an updated access token", + "refresh_token": "an updated refresh token", + "expires_in": 3600, + } + with patch( + "airbyte_cdk.sources.streams.http.requests_native_auth.SingleUseRefreshTokenOauth2Authenticator._get_refresh_access_token_response", + return_value=refresh_request_response, + ): + output = handle_connector_builder_request( + source, + "test_read", + config, + ConfiguredAirbyteCatalogSerializer.load(CONFIGURED_CATALOG), + _A_PER_PARTITION_STATE, + TestReadLimits(), + ) + assert output.record.data["latest_config_update"] + + +@patch("traceback.TracebackException.from_exception") +def 
test_read_returns_error_response(mock_from_exception): + class MockDeclarativeStream: + @property + def primary_key(self): + return [[]] + + @property + def cursor_field(self): + return [] + + class MockManifestDeclarativeSource: + def streams(self, config): + return [MockDeclarativeStream()] + + def read(self, logger, config, catalog, state): + raise ValueError("error_message") + + def spec(self, logger: logging.Logger) -> ConnectorSpecification: + connector_specification = mock.Mock() + connector_specification.connectionSpecification = {} + return connector_specification + + @property + def check_config_against_spec(self): + return False + + stack_trace = "a stack trace" + mock_from_exception.return_value = stack_trace + + source = MockManifestDeclarativeSource() + limits = TestReadLimits() + response = read_stream(source, TEST_READ_CONFIG, ConfiguredAirbyteCatalogSerializer.load(CONFIGURED_CATALOG), _A_STATE, limits) + + expected_stream_read = StreamRead( + logs=[LogMessage("error_message", "ERROR", "error_message", "a stack trace")], + slices=[], + test_read_limit_reached=False, + auxiliary_requests=[], + inferred_schema=None, + inferred_datetime_formats={}, + latest_config_update=None, + ) + + expected_message = AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage(stream=_stream_name, data=dataclasses.asdict(expected_stream_read), emitted_at=1), + ) + response.record.emitted_at = 1 + assert response == expected_message + + +def test_handle_429_response(): + response = _create_429_page_response({"result": [{"error": "too many requests"}], "_metadata": {"next": "next"}}) + + # Add backoff strategy to avoid default endless backoff loop + TEST_READ_CONFIG["__injected_declarative_manifest"]["definitions"]["retriever"]["requester"]["error_handler"] = { + "backoff_strategies": [{"type": "ConstantBackoffStrategy", "backoff_time_in_seconds": 5}] + } + + config = TEST_READ_CONFIG + limits = TestReadLimits() + source = create_source(config, limits) + + with patch("requests.Session.send", return_value=response) as mock_send: + response = handle_connector_builder_request( + source, "test_read", config, ConfiguredAirbyteCatalogSerializer.load(CONFIGURED_CATALOG), _A_PER_PARTITION_STATE, limits + ) + + mock_send.assert_called_once() + + +@pytest.mark.parametrize( + "command", + [ + pytest.param("check", id="test_check_command_error"), + pytest.param("spec", id="test_spec_command_error"), + pytest.param("discover", id="test_discover_command_error"), + pytest.param(None, id="test_command_is_none_error"), + pytest.param("", id="test_command_is_empty_error"), + ], +) +def test_invalid_protocol_command(command, valid_resolve_manifest_config_file): + config = copy.deepcopy(RESOLVE_MANIFEST_CONFIG) + config["__command"] = "resolve_manifest" + with pytest.raises(SystemExit): + handle_request([command, "--config", str(valid_resolve_manifest_config_file), "--catalog", ""]) + + +def test_missing_command(valid_resolve_manifest_config_file): + with pytest.raises(SystemExit): + handle_request(["--config", str(valid_resolve_manifest_config_file), "--catalog", ""]) + + +def test_missing_catalog(valid_resolve_manifest_config_file): + with pytest.raises(SystemExit): + handle_request(["read", "--config", str(valid_resolve_manifest_config_file)]) + + +def test_missing_config(valid_resolve_manifest_config_file): + with pytest.raises(SystemExit): + handle_request(["read", "--catalog", str(valid_resolve_manifest_config_file)]) + + +def test_invalid_config_command(invalid_config_file, dummy_catalog): + 
with pytest.raises(ValueError): + handle_request(["read", "--config", str(invalid_config_file), "--catalog", str(dummy_catalog)]) + + +@pytest.fixture +def manifest_declarative_source(): + return mock.Mock(spec=ManifestDeclarativeSource, autospec=True) + + +def create_mock_retriever(name, url_base, path): + http_stream = mock.Mock(spec=SimpleRetriever, autospec=True) + http_stream.name = name + http_stream.requester = MagicMock() + http_stream.requester.get_url_base.return_value = url_base + http_stream.requester.get_path.return_value = path + http_stream._paginator_path.return_value = None + return http_stream + + +def create_mock_declarative_stream(http_stream): + declarative_stream = mock.Mock(spec=DeclarativeStream, autospec=True) + declarative_stream.retriever = http_stream + return declarative_stream + + +@pytest.mark.parametrize( + "test_name, config, expected_max_records, expected_max_slices, expected_max_pages_per_slice", + [ + ( + "test_no_test_read_config", + {}, + DEFAULT_MAXIMUM_RECORDS, + DEFAULT_MAXIMUM_NUMBER_OF_SLICES, + DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE, + ), + ( + "test_no_values_set", + {"__test_read_config": {}}, + DEFAULT_MAXIMUM_RECORDS, + DEFAULT_MAXIMUM_NUMBER_OF_SLICES, + DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE, + ), + ("test_values_are_set", {"__test_read_config": {"max_slices": 1, "max_pages_per_slice": 2, "max_records": 3}}, 3, 1, 2), + ], +) +def test_get_limits(test_name, config, expected_max_records, expected_max_slices, expected_max_pages_per_slice): + limits = get_limits(config) + assert limits.max_records == expected_max_records + assert limits.max_pages_per_slice == expected_max_pages_per_slice + assert limits.max_slices == expected_max_slices + + +def test_create_source(): + max_records = 3 + max_pages_per_slice = 2 + max_slices = 1 + limits = TestReadLimits(max_records, max_pages_per_slice, max_slices) + + config = {"__injected_declarative_manifest": MANIFEST} + + source = create_source(config, limits) + + assert isinstance(source, ManifestDeclarativeSource) + assert source._constructor._limit_pages_fetched_per_slice == limits.max_pages_per_slice + assert source._constructor._limit_slices_fetched == limits.max_slices + assert source._constructor._disable_cache + + +def request_log_message(request: dict) -> AirbyteMessage: + return AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=Level.INFO, message=f"request:{json.dumps(request)}")) + + +def response_log_message(response: dict) -> AirbyteMessage: + return AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=Level.INFO, message=f"response:{json.dumps(response)}")) + + +def _create_request(): + url = "https://example.com/api" + headers = {"Content-Type": "application/json"} + return requests.Request("POST", url, headers=headers, json={"key": "value"}).prepare() + + +def _create_response(body, request): + response = requests.Response() + response.status_code = 200 + response._content = bytes(json.dumps(body), "utf-8") + response.headers["Content-Type"] = "application/json" + response.request = request + return response + + +def _create_429_response(body, request): + response = requests.Response() + response.status_code = 429 + response._content = bytes(json.dumps(body), "utf-8") + response.headers["Content-Type"] = "application/json" + response.request = request + return response + + +def _create_page_response(response_body): + request = _create_request() + return _create_response(response_body, request) + + +def _create_429_page_response(response_body): + request = 
_create_request() + return _create_429_response(response_body, request) + + +@patch.object( + requests.Session, + "send", + side_effect=( + _create_page_response({"result": [{"id": 0}, {"id": 1}], "_metadata": {"next": "next"}}), + _create_page_response({"result": [{"id": 2}], "_metadata": {"next": "next"}}), + ) + * 10, +) +def test_read_source(mock_http_stream): + """ + This test sort of acts as an integration test for the connector builder. + + Each slice has two pages + The first page has two records + The second page one record + + The response._metadata.next field in the first page tells the paginator to fetch the next page. + """ + max_records = 100 + max_pages_per_slice = 2 + max_slices = 3 + limits = TestReadLimits(max_records, max_pages_per_slice, max_slices) + + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name=_stream_name, json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ) + ] + ) + + config = {"__injected_declarative_manifest": MANIFEST} + + source = create_source(config, limits) + + output_data = read_stream(source, config, catalog, _A_PER_PARTITION_STATE, limits).record.data + slices = output_data["slices"] + + assert len(slices) == max_slices + for s in slices: + pages = s["pages"] + assert len(pages) == max_pages_per_slice + + first_page, second_page = pages[0], pages[1] + assert len(first_page["records"]) == _page_size + assert len(second_page["records"]) == 1 + + streams = source.streams(config) + for s in streams: + assert isinstance(s.retriever, SimpleRetrieverTestReadDecorator) + + +@patch.object( + requests.Session, + "send", + side_effect=( + _create_page_response({"result": [{"id": 0}, {"id": 1}], "_metadata": {"next": "next"}}), + _create_page_response({"result": [{"id": 2}], "_metadata": {"next": "next"}}), + ), +) +def test_read_source_single_page_single_slice(mock_http_stream): + max_records = 100 + max_pages_per_slice = 1 + max_slices = 1 + limits = TestReadLimits(max_records, max_pages_per_slice, max_slices) + + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name=_stream_name, json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ) + ] + ) + + config = {"__injected_declarative_manifest": MANIFEST} + + source = create_source(config, limits) + + output_data = read_stream(source, config, catalog, _A_PER_PARTITION_STATE, limits).record.data + slices = output_data["slices"] + + assert len(slices) == max_slices + for s in slices: + pages = s["pages"] + assert len(pages) == max_pages_per_slice + + first_page = pages[0] + assert len(first_page["records"]) == _page_size + + streams = source.streams(config) + for s in streams: + assert isinstance(s.retriever, SimpleRetrieverTestReadDecorator) + + +@pytest.mark.parametrize( + "deployment_mode, url_base, expected_error", + [ + pytest.param("CLOUD", "https://airbyte.com/api/v1/characters", None, id="test_cloud_read_with_public_endpoint"), + pytest.param("CLOUD", "https://10.0.27.27", "AirbyteTracedException", id="test_cloud_read_with_private_endpoint"), + pytest.param("CLOUD", "https://localhost:80/api/v1/cast", "AirbyteTracedException", id="test_cloud_read_with_localhost"), + pytest.param("CLOUD", "http://unsecured.protocol/api/v1", "InvalidSchema", id="test_cloud_read_with_unsecured_endpoint"), + pytest.param("CLOUD", 
"https://domainwithoutextension", "Invalid URL", id="test_cloud_read_with_invalid_url_endpoint"), + pytest.param("OSS", "https://airbyte.com/api/v1/", None, id="test_oss_read_with_public_endpoint"), + pytest.param("OSS", "https://10.0.27.27/api/v1/", None, id="test_oss_read_with_private_endpoint"), + ], +) +@patch.object(requests.Session, "send", _mocked_send) +def test_handle_read_external_requests(deployment_mode, url_base, expected_error): + """ + This test acts like an integration test for the connector builder when it receives Test Read requests. + + The scenario being tested is whether requests should be denied if they are done on an unsecure channel or are made to internal + endpoints when running on Cloud or OSS deployments + """ + + limits = TestReadLimits(max_records=100, max_pages_per_slice=1, max_slices=1) + + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name=_stream_name, json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ) + ] + ) + + test_manifest = MANIFEST + test_manifest["streams"][0]["$parameters"]["url_base"] = url_base + config = {"__injected_declarative_manifest": test_manifest} + + source = create_source(config, limits) + + with mock.patch.dict(os.environ, {"DEPLOYMENT_MODE": deployment_mode}, clear=False): + output_data = read_stream(source, config, catalog, _A_PER_PARTITION_STATE, limits).record.data + if expected_error: + assert len(output_data["logs"]) > 0, "Expected at least one log message with the expected error" + error_message = output_data["logs"][0] + assert error_message["level"] == "ERROR" + assert expected_error in error_message["stacktrace"] + else: + page_records = output_data["slices"][0]["pages"][0] + assert len(page_records) == len(MOCK_RESPONSE["result"]) + + +@pytest.mark.parametrize( + "deployment_mode, token_url, expected_error", + [ + pytest.param("CLOUD", "https://airbyte.com/tokens/bearer", None, id="test_cloud_read_with_public_endpoint"), + pytest.param("CLOUD", "https://10.0.27.27/tokens/bearer", "AirbyteTracedException", id="test_cloud_read_with_private_endpoint"), + pytest.param("CLOUD", "http://unsecured.protocol/tokens/bearer", "InvalidSchema", id="test_cloud_read_with_unsecured_endpoint"), + pytest.param("CLOUD", "https://domainwithoutextension", "Invalid URL", id="test_cloud_read_with_invalid_url_endpoint"), + pytest.param("OSS", "https://airbyte.com/tokens/bearer", None, id="test_oss_read_with_public_endpoint"), + pytest.param("OSS", "https://10.0.27.27/tokens/bearer", None, id="test_oss_read_with_private_endpoint"), + ], +) +@patch.object(requests.Session, "send", _mocked_send) +def test_handle_read_external_oauth_request(deployment_mode, token_url, expected_error): + """ + This test acts like an integration test for the connector builder when it receives Test Read requests. 
+ + The scenario being tested is whether requests should be denied if they are done on an unsecure channel or are made to internal + endpoints when running on Cloud or OSS deployments + """ + + limits = TestReadLimits(max_records=100, max_pages_per_slice=1, max_slices=1) + + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name=_stream_name, json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ) + ] + ) + + oauth_authenticator_config: dict[str, str] = { + "type": "OAuthAuthenticator", + "token_refresh_endpoint": token_url, + "client_id": "greta", + "client_secret": "teo", + "refresh_token": "john", + } + + test_manifest = MANIFEST + test_manifest["definitions"]["retriever"]["requester"]["authenticator"] = oauth_authenticator_config + config = {"__injected_declarative_manifest": test_manifest} + + source = create_source(config, limits) + + with mock.patch.dict(os.environ, {"DEPLOYMENT_MODE": deployment_mode}, clear=False): + output_data = read_stream(source, config, catalog, _A_PER_PARTITION_STATE, limits).record.data + if expected_error: + assert len(output_data["logs"]) > 0, "Expected at least one log message with the expected error" + error_message = output_data["logs"][0] + assert error_message["level"] == "ERROR" + assert expected_error in error_message["stacktrace"] + + +def test_read_stream_exception_with_secrets(): + # Define the test parameters + config = {"__injected_declarative_manifest": "test_manifest", "api_key": "super_secret_key"} + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name=_stream_name, json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ) + ] + ) + state = [] + limits = TestReadLimits() + + # Add the secret to be filtered + update_secrets([config["api_key"]]) + + # Mock the source + mock_source = MagicMock() + + # Patch the handler to raise an exception + with patch("airbyte_cdk.connector_builder.message_grouper.MessageGrouper.get_message_groups") as mock_handler: + mock_handler.side_effect = Exception("Test exception with secret key: super_secret_key") + + # Call the read_stream function and check for the correct error message + response = read_stream(mock_source, config, catalog, state, limits) + + # Check if the error message contains the filtered secret + filtered_message = filter_secrets("Test exception with secret key: super_secret_key") + assert response.type == Type.TRACE + assert filtered_message in response.trace.error.message + assert "super_secret_key" not in response.trace.error.message diff --git a/airbyte-cdk/python/unit_tests/connector_builder/test_message_grouper.py b/airbyte-cdk/python/unit_tests/connector_builder/test_message_grouper.py new file mode 100644 index 000000000000..6a4c7a99e4a8 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/connector_builder/test_message_grouper.py @@ -0,0 +1,830 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
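Before `test_message_grouper.py` continues below, one note on the secret handling exercised by `test_read_stream_exception_with_secrets` above: it relies on `update_secrets` and `filter_secrets`, which that test module imports from `airbyte_cdk.utils.airbyte_secrets_utils`. A minimal, illustrative sketch of the flow (the exact mask string is a CDK implementation detail, so the sketch only asserts that the raw value is gone):

```python
# Illustrative sketch only: how the helpers exercised in the test above mask secrets.
from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets, update_secrets

# Register the sensitive value, exactly as the test does with config["api_key"].
update_secrets(["super_secret_key"])

# Registered secrets are replaced with a masked placeholder in any string passed
# through filter_secrets, so the raw value never reaches logs or trace messages.
masked = filter_secrets("Test exception with secret key: super_secret_key")
assert "super_secret_key" not in masked
```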
+# + +import json +from typing import Any, Iterator, List, Mapping +from unittest.mock import MagicMock, Mock, patch + +import pytest +from airbyte_cdk.connector_builder.message_grouper import MessageGrouper +from airbyte_cdk.connector_builder.models import HttpRequest, HttpResponse, LogMessage, StreamRead, StreamReadPages +from airbyte_cdk.models import ( + AirbyteControlConnectorConfigMessage, + AirbyteControlMessage, + AirbyteLogMessage, + AirbyteMessage, + AirbyteRecordMessage, + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStreamState, + Level, + OrchestratorType, + StreamDescriptor, +) +from airbyte_cdk.models import Type as MessageType +from orjson import orjson +from unit_tests.connector_builder.utils import create_configured_catalog + +_NO_PK = [[]] +_NO_CURSOR_FIELD = [] + +MAX_PAGES_PER_SLICE = 4 +MAX_SLICES = 3 + +_NO_STATE = [] + +MANIFEST = { + "version": "0.30.0", + "type": "DeclarativeSource", + "definitions": { + "selector": {"extractor": {"field_path": ["items"], "type": "DpathExtractor"}, "type": "RecordSelector"}, + "requester": {"url_base": "https://demonslayers.com/api/v1/", "http_method": "GET", "type": "DeclarativeSource"}, + "retriever": { + "type": "DeclarativeSource", + "record_selector": {"extractor": {"field_path": ["items"], "type": "DpathExtractor"}, "type": "RecordSelector"}, + "paginator": {"type": "NoPagination"}, + "requester": {"url_base": "https://demonslayers.com/api/v1/", "http_method": "GET", "type": "HttpRequester"}, + }, + "hashiras_stream": { + "retriever": { + "type": "DeclarativeSource", + "record_selector": {"extractor": {"field_path": ["items"], "type": "DpathExtractor"}, "type": "RecordSelector"}, + "paginator": {"type": "NoPagination"}, + "requester": {"url_base": "https://demonslayers.com/api/v1/", "http_method": "GET", "type": "HttpRequester"}, + }, + "$parameters": {"name": "hashiras", "path": "/hashiras"}, + }, + "breathing_techniques_stream": { + "retriever": { + "type": "DeclarativeSource", + "record_selector": {"extractor": {"field_path": ["items"], "type": "DpathExtractor"}, "type": "RecordSelector"}, + "paginator": {"type": "NoPagination"}, + "requester": {"url_base": "https://demonslayers.com/api/v1/", "http_method": "GET", "type": "HttpRequester"}, + }, + "$parameters": {"name": "breathing-techniques", "path": "/breathing_techniques"}, + }, + }, + "streams": [ + { + "type": "DeclarativeStream", + "retriever": { + "type": "SimpleRetriever", + "record_selector": {"extractor": {"field_path": ["items"], "type": "DpathExtractor"}, "type": "RecordSelector"}, + "paginator": {"type": "NoPagination"}, + "requester": {"url_base": "https://demonslayers.com/api/v1/", "http_method": "GET", "type": "HttpRequester"}, + }, + "$parameters": {"name": "hashiras", "path": "/hashiras"}, + }, + { + "type": "DeclarativeStream", + "retriever": { + "type": "SimpleRetriever", + "record_selector": {"extractor": {"field_path": ["items"], "type": "DpathExtractor"}, "type": "RecordSelector"}, + "paginator": {"type": "NoPagination"}, + "requester": {"url_base": "https://demonslayers.com/api/v1/", "http_method": "GET", "type": "HttpRequester"}, + }, + "$parameters": {"name": "breathing-techniques", "path": "/breathing_techniques"}, + }, + ], + "check": {"stream_names": ["hashiras"], "type": "CheckStream"}, +} + +CONFIG = {"rank": "upper-six"} + +A_SOURCE = MagicMock() + + +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_get_grouped_messages(mock_entrypoint_read: Mock) -> None: + url = 
"https://demonslayers.com/api/v1/hashiras?era=taisho" + request = { + "headers": {"Content-Type": "application/json"}, + "method": "GET", + "body": {"content": '{"custom": "field"}'}, + } + response = {"status_code": 200, "headers": {"field": "value"}, "body": {"content": '{"name": "field"}'}} + expected_schema = { + "$schema": "http://json-schema.org/schema#", + "properties": {"name": {"type": ["string", "null"]}, "date": {"type": ["string", "null"]}}, + "type": "object", + } + expected_datetime_fields = {"date": "%Y-%m-%d"} + expected_pages = [ + StreamReadPages( + request=HttpRequest( + url="https://demonslayers.com/api/v1/hashiras?era=taisho", + headers={"Content-Type": "application/json"}, + body='{"custom": "field"}', + http_method="GET", + ), + response=HttpResponse(status=200, headers={"field": "value"}, body='{"name": "field"}'), + records=[{"name": "Shinobu Kocho", "date": "2023-03-03"}, {"name": "Muichiro Tokito", "date": "2023-03-04"}], + ), + StreamReadPages( + request=HttpRequest( + url="https://demonslayers.com/api/v1/hashiras?era=taisho", + headers={"Content-Type": "application/json"}, + body='{"custom": "field"}', + http_method="GET", + ), + response=HttpResponse(status=200, headers={"field": "value"}, body='{"name": "field"}'), + records=[{"name": "Mitsuri Kanroji", "date": "2023-03-05"}], + ), + ] + + mock_source = make_mock_source( + mock_entrypoint_read, + iter( + [ + request_response_log_message(request, response, url), + record_message("hashiras", {"name": "Shinobu Kocho", "date": "2023-03-03"}), + record_message("hashiras", {"name": "Muichiro Tokito", "date": "2023-03-04"}), + request_response_log_message(request, response, url), + record_message("hashiras", {"name": "Mitsuri Kanroji", "date": "2023-03-05"}), + ] + ), + ) + + connector_builder_handler = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + actual_response: StreamRead = connector_builder_handler.get_message_groups( + source=mock_source, + config=CONFIG, + configured_catalog=create_configured_catalog("hashiras"), + state=_NO_STATE, + ) + + assert actual_response.inferred_schema == expected_schema + assert actual_response.inferred_datetime_formats == expected_datetime_fields + + single_slice = actual_response.slices[0] + for i, actual_page in enumerate(single_slice.pages): + assert actual_page == expected_pages[i] + + +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_get_grouped_messages_with_logs(mock_entrypoint_read: Mock) -> None: + url = "https://demonslayers.com/api/v1/hashiras?era=taisho" + request = { + "headers": {"Content-Type": "application/json"}, + "method": "GET", + "body": {"content": '{"custom": "field"}'}, + } + response = {"status_code": 200, "headers": {"field": "value"}, "body": {"content": '{"name": "field"}'}} + expected_pages = [ + StreamReadPages( + request=HttpRequest( + url="https://demonslayers.com/api/v1/hashiras?era=taisho", + headers={"Content-Type": "application/json"}, + body='{"custom": "field"}', + http_method="GET", + ), + response=HttpResponse(status=200, headers={"field": "value"}, body='{"name": "field"}'), + records=[{"name": "Shinobu Kocho"}, {"name": "Muichiro Tokito"}], + ), + StreamReadPages( + request=HttpRequest( + url="https://demonslayers.com/api/v1/hashiras?era=taisho", + headers={"Content-Type": "application/json"}, + body='{"custom": "field"}', + http_method="GET", + ), + response=HttpResponse(status=200, headers={"field": "value"}, body='{"name": "field"}'), + records=[{"name": "Mitsuri Kanroji"}], + ), + ] + 
expected_logs = [ + LogMessage(**{"message": "log message before the request", "level": "INFO"}), + LogMessage(**{"message": "log message during the page", "level": "INFO"}), + LogMessage(**{"message": "log message after the response", "level": "INFO"}), + ] + + mock_source = make_mock_source( + mock_entrypoint_read, + iter( + [ + AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=Level.INFO, message="log message before the request")), + request_response_log_message(request, response, url), + record_message("hashiras", {"name": "Shinobu Kocho"}), + AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=Level.INFO, message="log message during the page")), + record_message("hashiras", {"name": "Muichiro Tokito"}), + AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=Level.INFO, message="log message after the response")), + ] + ), + ) + + connector_builder_handler = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + + actual_response: StreamRead = connector_builder_handler.get_message_groups( + source=mock_source, + config=CONFIG, + configured_catalog=create_configured_catalog("hashiras"), + state=_NO_STATE, + ) + single_slice = actual_response.slices[0] + for i, actual_page in enumerate(single_slice.pages): + assert actual_page == expected_pages[i] + + for i, actual_log in enumerate(actual_response.logs): + assert actual_log == expected_logs[i] + + +@pytest.mark.parametrize( + "request_record_limit, max_record_limit, should_fail", + [ + pytest.param(1, 3, False, id="test_create_request_with_record_limit"), + pytest.param(3, 1, True, id="test_create_request_record_limit_exceeds_max"), + ], +) +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_get_grouped_messages_record_limit( + mock_entrypoint_read: Mock, request_record_limit: int, max_record_limit: int, should_fail: bool +) -> None: + url = "https://demonslayers.com/api/v1/hashiras?era=taisho" + request = { + "headers": {"Content-Type": "application/json"}, + "method": "GET", + "body": {"content": '{"custom": "field"}'}, + } + response = {"status_code": 200, "headers": {"field": "value"}, "body": {"content": '{"name": "field"}'}} + mock_source = make_mock_source( + mock_entrypoint_read, + iter( + [ + request_response_log_message(request, response, url), + record_message("hashiras", {"name": "Shinobu Kocho"}), + record_message("hashiras", {"name": "Muichiro Tokito"}), + request_response_log_message(request, response, url), + record_message("hashiras", {"name": "Mitsuri Kanroji"}), + ] + ), + ) + n_records = 2 + record_limit = min(request_record_limit, max_record_limit) + + api = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES, max_record_limit=max_record_limit) + # this is the call we expect to raise an exception + if should_fail: + with pytest.raises(ValueError): + api.get_message_groups( + mock_source, + config=CONFIG, + configured_catalog=create_configured_catalog("hashiras"), + state=_NO_STATE, + record_limit=request_record_limit, + ) + else: + actual_response: StreamRead = api.get_message_groups( + mock_source, + config=CONFIG, + configured_catalog=create_configured_catalog("hashiras"), + state=_NO_STATE, + record_limit=request_record_limit, + ) + single_slice = actual_response.slices[0] + total_records = 0 + for i, actual_page in enumerate(single_slice.pages): + total_records += len(actual_page.records) + assert total_records == min([record_limit, n_records]) + + assert (total_records >= max_record_limit) == actual_response.test_read_limit_reached + + 
+@pytest.mark.parametrize( + "max_record_limit", + [ + pytest.param(2, id="test_create_request_no_record_limit"), + pytest.param(1, id="test_create_request_no_record_limit_n_records_exceed_max"), + ], +) +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_get_grouped_messages_default_record_limit(mock_entrypoint_read: Mock, max_record_limit: int) -> None: + url = "https://demonslayers.com/api/v1/hashiras?era=taisho" + request = { + "headers": {"Content-Type": "application/json"}, + "method": "GET", + "body": {"content": '{"custom": "field"}'}, + } + response = {"status_code": 200, "headers": {"field": "value"}, "body": {"content": '{"name": "field"}'}} + mock_source = make_mock_source( + mock_entrypoint_read, + iter( + [ + request_response_log_message(request, response, url), + record_message("hashiras", {"name": "Shinobu Kocho"}), + record_message("hashiras", {"name": "Muichiro Tokito"}), + request_response_log_message(request, response, url), + record_message("hashiras", {"name": "Mitsuri Kanroji"}), + ] + ), + ) + n_records = 2 + + api = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES, max_record_limit=max_record_limit) + actual_response: StreamRead = api.get_message_groups( + source=mock_source, config=CONFIG, configured_catalog=create_configured_catalog("hashiras"), state=_NO_STATE + ) + single_slice = actual_response.slices[0] + total_records = 0 + for i, actual_page in enumerate(single_slice.pages): + total_records += len(actual_page.records) + assert total_records == min([max_record_limit, n_records]) + + +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_get_grouped_messages_limit_0(mock_entrypoint_read: Mock) -> None: + url = "https://demonslayers.com/api/v1/hashiras?era=taisho" + request = { + "headers": {"Content-Type": "application/json"}, + "method": "GET", + "body": {"content": '{"custom": "field"}'}, + } + response = {"status_code": 200, "headers": {"field": "value"}, "body": {"content": '{"name": "field"}'}} + mock_source = make_mock_source( + mock_entrypoint_read, + iter( + [ + request_response_log_message(request, response, url), + record_message("hashiras", {"name": "Shinobu Kocho"}), + record_message("hashiras", {"name": "Muichiro Tokito"}), + request_response_log_message(request, response, url), + record_message("hashiras", {"name": "Mitsuri Kanroji"}), + ] + ), + ) + api = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + + with pytest.raises(ValueError): + api.get_message_groups( + source=mock_source, config=CONFIG, configured_catalog=create_configured_catalog("hashiras"), state=_NO_STATE, record_limit=0 + ) + + +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_get_grouped_messages_no_records(mock_entrypoint_read: Mock) -> None: + url = "https://demonslayers.com/api/v1/hashiras?era=taisho" + request = { + "headers": {"Content-Type": "application/json"}, + "method": "GET", + "body": {"content": '{"custom": "field"}'}, + } + response = {"status_code": 200, "headers": {"field": "value"}, "body": {"content": '{"name": "field"}'}} + expected_pages = [ + StreamReadPages( + request=HttpRequest( + url="https://demonslayers.com/api/v1/hashiras?era=taisho", + headers={"Content-Type": "application/json"}, + body='{"custom": "field"}', + http_method="GET", + ), + response=HttpResponse(status=200, headers={"field": "value"}, body='{"name": "field"}'), + records=[], + ), + StreamReadPages( + request=HttpRequest( + 
url="https://demonslayers.com/api/v1/hashiras?era=taisho", + headers={"Content-Type": "application/json"}, + body='{"custom": "field"}', + http_method="GET", + ), + response=HttpResponse(status=200, headers={"field": "value"}, body='{"name": "field"}'), + records=[], + ), + ] + + mock_source = make_mock_source( + mock_entrypoint_read, + iter( + [ + request_response_log_message(request, response, url), + request_response_log_message(request, response, url), + ] + ), + ) + + message_grouper = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + + actual_response: StreamRead = message_grouper.get_message_groups( + source=mock_source, + config=CONFIG, + configured_catalog=create_configured_catalog("hashiras"), + state=_NO_STATE, + ) + + single_slice = actual_response.slices[0] + for i, actual_page in enumerate(single_slice.pages): + assert actual_page == expected_pages[i] + + +@pytest.mark.parametrize( + "log_message, expected_response", + [ + pytest.param( + { + "http": { + "response": { + "status_code": 200, + "headers": {"field": "name"}, + "body": {"content": '{"id": "fire", "owner": "kyojuro_rengoku"}'}, + } + } + }, + HttpResponse(status=200, headers={"field": "name"}, body='{"id": "fire", "owner": "kyojuro_rengoku"}'), + id="test_create_response_with_all_fields", + ), + pytest.param( + {"http": {"response": {"status_code": 200, "headers": {"field": "name"}}}}, + HttpResponse(status=200, headers={"field": "name"}, body=""), + id="test_create_response_with_no_body", + ), + pytest.param( + {"http": {"response": {"status_code": 200, "body": {"content": '{"id": "fire", "owner": "kyojuro_rengoku"}'}}}}, + HttpResponse(status=200, body='{"id": "fire", "owner": "kyojuro_rengoku"}'), + id="test_create_response_with_no_headers", + ), + pytest.param( + { + "http": { + "response": { + "status_code": 200, + "headers": {"field": "name"}, + "body": {"content": '[{"id": "fire", "owner": "kyojuro_rengoku"}, {"id": "mist", "owner": "muichiro_tokito"}]'}, + } + } + }, + HttpResponse( + status=200, + headers={"field": "name"}, + body='[{"id": "fire", "owner": "kyojuro_rengoku"}, {"id": "mist", "owner": "muichiro_tokito"}]', + ), + id="test_create_response_with_array", + ), + pytest.param( + {"http": {"response": {"status_code": 200, "body": {"content": "tomioka"}}}}, + HttpResponse(status=200, body="tomioka"), + id="test_create_response_with_string", + ), + ], +) +def test_create_response_from_log_message(log_message: str, expected_response: HttpResponse) -> None: + if isinstance(log_message, str): + response_message = json.loads(log_message) + else: + response_message = log_message + + connector_builder_handler = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + actual_response = connector_builder_handler._create_response_from_log_message(response_message) + + assert actual_response == expected_response + + +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_get_grouped_messages_with_many_slices(mock_entrypoint_read: Mock) -> None: + url = "http://a-url.com" + request: Mapping[str, Any] = {} + response = {"status_code": 200} + + mock_source = make_mock_source( + mock_entrypoint_read, + iter( + [ + slice_message('{"descriptor": "first_slice"}'), + request_response_log_message(request, response, url), + record_message("hashiras", {"name": "Muichiro Tokito"}), + slice_message('{"descriptor": "second_slice"}'), + request_response_log_message(request, response, url), + record_message("hashiras", {"name": "Shinobu Kocho"}), + record_message("hashiras", {"name": "Mitsuri 
Kanroji"}), + request_response_log_message(request, response, url), + record_message("hashiras", {"name": "Obanai Iguro"}), + request_response_log_message(request, response, url), + state_message("hashiras", {"a_timestamp": 123}), + ] + ), + ) + + connector_builder_handler = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + + stream_read: StreamRead = connector_builder_handler.get_message_groups( + source=mock_source, + config=CONFIG, + configured_catalog=create_configured_catalog("hashiras"), + state=_NO_STATE, + ) + + assert not stream_read.test_read_limit_reached + assert len(stream_read.slices) == 2 + + assert stream_read.slices[0].slice_descriptor == {"descriptor": "first_slice"} + assert len(stream_read.slices[0].pages) == 1 + assert len(stream_read.slices[0].pages[0].records) == 1 + assert stream_read.slices[0].state == [] + + assert stream_read.slices[1].slice_descriptor == {"descriptor": "second_slice"} + assert len(stream_read.slices[1].pages) == 3 + assert len(stream_read.slices[1].pages[0].records) == 2 + assert len(stream_read.slices[1].pages[1].records) == 1 + assert len(stream_read.slices[1].pages[2].records) == 0 + + assert ( + orjson.dumps(stream_read.slices[1].state[0].stream.stream_state).decode() + == orjson.dumps(AirbyteStateBlob(a_timestamp=123)).decode() + ) + + +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_get_grouped_messages_given_maximum_number_of_slices_then_test_read_limit_reached(mock_entrypoint_read: Mock) -> None: + maximum_number_of_slices = 5 + request: Mapping[str, Any] = {} + response = {"status_code": 200} + mock_source = make_mock_source( + mock_entrypoint_read, iter([slice_message(), request_response_log_message(request, response, "a_url")] * maximum_number_of_slices) + ) + + api = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + + stream_read: StreamRead = api.get_message_groups( + source=mock_source, + config=CONFIG, + configured_catalog=create_configured_catalog("hashiras"), + state=_NO_STATE, + ) + + assert stream_read.test_read_limit_reached + + +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_get_grouped_messages_given_maximum_number_of_pages_then_test_read_limit_reached(mock_entrypoint_read: Mock) -> None: + maximum_number_of_pages_per_slice = 5 + request: Mapping[str, Any] = {} + response = {"status_code": 200} + mock_source = make_mock_source( + mock_entrypoint_read, + iter([slice_message()] + [request_response_log_message(request, response, "a_url")] * maximum_number_of_pages_per_slice), + ) + + api = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + + stream_read: StreamRead = api.get_message_groups( + source=mock_source, + config=CONFIG, + configured_catalog=create_configured_catalog("hashiras"), + state=_NO_STATE, + ) + + assert stream_read.test_read_limit_reached + + +def test_read_stream_returns_error_if_stream_does_not_exist() -> None: + mock_source = MagicMock() + mock_source.read.side_effect = ValueError("error") + mock_source.streams.return_value = [make_mock_stream()] + + full_config: Mapping[str, Any] = {**CONFIG, **{"__injected_declarative_manifest": MANIFEST}} + + message_grouper = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + actual_response = message_grouper.get_message_groups( + source=mock_source, + config=full_config, + configured_catalog=create_configured_catalog("not_in_manifest"), + state=_NO_STATE, + ) + + assert len(actual_response.logs) == 1 + assert "Traceback" in actual_response.logs[0].stacktrace + assert "ERROR" in 
actual_response.logs[0].level + + +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_given_control_message_then_stream_read_has_config_update(mock_entrypoint_read: Mock) -> None: + updated_config = {"x": 1} + mock_source = make_mock_source( + mock_entrypoint_read, iter(any_request_and_response_with_a_record() + [connector_configuration_control_message(1, updated_config)]) + ) + connector_builder_handler = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + stream_read: StreamRead = connector_builder_handler.get_message_groups( + source=mock_source, + config=CONFIG, + configured_catalog=create_configured_catalog("hashiras"), + state=_NO_STATE, + ) + + assert stream_read.latest_config_update == updated_config + + +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_given_multiple_control_messages_then_stream_read_has_latest_based_on_emitted_at(mock_entrypoint_read: Mock) -> None: + earliest = 0 + earliest_config = {"earliest": 0} + latest = 1 + latest_config = {"latest": 1} + mock_source = make_mock_source( + mock_entrypoint_read, + iter( + any_request_and_response_with_a_record() + + [ + # here, we test that even if messages are emitted in a different order, we still rely on `emitted_at` + connector_configuration_control_message(latest, latest_config), + connector_configuration_control_message(earliest, earliest_config), + ] + ), + ) + connector_builder_handler = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + stream_read: StreamRead = connector_builder_handler.get_message_groups( + source=mock_source, + config=CONFIG, + configured_catalog=create_configured_catalog("hashiras"), + state=_NO_STATE, + ) + + assert stream_read.latest_config_update == latest_config + + +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_given_multiple_control_messages_with_same_timestamp_then_stream_read_has_latest_based_on_message_order( + mock_entrypoint_read: Mock, +) -> None: + emitted_at = 0 + earliest_config = {"earliest": 0} + latest_config = {"latest": 1} + mock_source = make_mock_source( + mock_entrypoint_read, + iter( + any_request_and_response_with_a_record() + + [ + connector_configuration_control_message(emitted_at, earliest_config), + connector_configuration_control_message(emitted_at, latest_config), + ] + ), + ) + connector_builder_handler = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + stream_read: StreamRead = connector_builder_handler.get_message_groups( + source=mock_source, + config=CONFIG, + configured_catalog=create_configured_catalog("hashiras"), + state=_NO_STATE, + ) + + assert stream_read.latest_config_update == latest_config + + +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_given_auxiliary_requests_then_return_auxiliary_request(mock_entrypoint_read: Mock) -> None: + mock_source = make_mock_source(mock_entrypoint_read, iter(any_request_and_response_with_a_record() + [auxiliary_request_log_message()])) + connector_builder_handler = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + stream_read: StreamRead = connector_builder_handler.get_message_groups( + source=mock_source, config=CONFIG, configured_catalog=create_configured_catalog("hashiras"), state=_NO_STATE + ) + + assert len(stream_read.auxiliary_requests) == 1 + + +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_given_no_slices_then_return_empty_slices(mock_entrypoint_read: Mock) -> None: + mock_source = 
make_mock_source(mock_entrypoint_read, iter([auxiliary_request_log_message()])) + connector_builder_handler = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + stream_read: StreamRead = connector_builder_handler.get_message_groups( + source=mock_source, config=CONFIG, configured_catalog=create_configured_catalog("hashiras"), state=_NO_STATE + ) + + assert len(stream_read.slices) == 0 + + +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_given_pk_then_ensure_pk_is_pass_to_schema_inferrence(mock_entrypoint_read: Mock) -> None: + mock_source = make_mock_source( + mock_entrypoint_read, + iter( + [ + request_response_log_message({"request": 1}, {"response": 2}, "http://any_url.com"), + record_message("hashiras", {"id": "Shinobu Kocho", "date": "2023-03-03"}), + record_message("hashiras", {"id": "Muichiro Tokito", "date": "2023-03-04"}), + ] + ), + ) + mock_source.streams.return_value = [Mock()] + mock_source.streams.return_value[0].primary_key = [["id"]] + mock_source.streams.return_value[0].cursor_field = _NO_CURSOR_FIELD + connector_builder_handler = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + + stream_read: StreamRead = connector_builder_handler.get_message_groups( + source=mock_source, config=CONFIG, configured_catalog=create_configured_catalog("hashiras"), state=_NO_STATE + ) + + assert stream_read.inferred_schema["required"] == ["id"] + + +@patch("airbyte_cdk.connector_builder.message_grouper.AirbyteEntrypoint.read") +def test_given_cursor_field_then_ensure_cursor_field_is_pass_to_schema_inferrence(mock_entrypoint_read: Mock) -> None: + mock_source = make_mock_source( + mock_entrypoint_read, + iter( + [ + request_response_log_message({"request": 1}, {"response": 2}, "http://any_url.com"), + record_message("hashiras", {"id": "Shinobu Kocho", "date": "2023-03-03"}), + record_message("hashiras", {"id": "Muichiro Tokito", "date": "2023-03-04"}), + ] + ), + ) + mock_source.streams.return_value = [Mock()] + mock_source.streams.return_value[0].primary_key = _NO_PK + mock_source.streams.return_value[0].cursor_field = ["date"] + connector_builder_handler = MessageGrouper(MAX_PAGES_PER_SLICE, MAX_SLICES) + + stream_read: StreamRead = connector_builder_handler.get_message_groups( + source=mock_source, config=CONFIG, configured_catalog=create_configured_catalog("hashiras"), state=_NO_STATE + ) + + assert stream_read.inferred_schema["required"] == ["date"] + + +def make_mock_source(mock_entrypoint_read: Mock, return_value: Iterator[AirbyteMessage]) -> MagicMock: + mock_source = MagicMock() + mock_entrypoint_read.return_value = return_value + mock_source.streams.return_value = [make_mock_stream()] + return mock_source + + +def make_mock_stream(): + mock_stream = MagicMock() + mock_stream.primary_key = [] + mock_stream.cursor_field = [] + return mock_stream + + +def request_log_message(request: Mapping[str, Any]) -> AirbyteMessage: + return AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=Level.INFO, message=f"request:{json.dumps(request)}")) + + +def response_log_message(response: Mapping[str, Any]) -> AirbyteMessage: + return AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=Level.INFO, message=f"response:{json.dumps(response)}")) + + +def record_message(stream: str, data: Mapping[str, Any]) -> AirbyteMessage: + return AirbyteMessage(type=MessageType.RECORD, record=AirbyteRecordMessage(stream=stream, data=data, emitted_at=1234)) + + +def state_message(stream: str, data: Mapping[str, Any]) -> AirbyteMessage: + return 
AirbyteMessage( + type=MessageType.STATE, + state=AirbyteStateMessage(stream=AirbyteStreamState(stream_descriptor=StreamDescriptor(name=stream), stream_state=data)), + ) + + +def slice_message(slice_descriptor: str = '{"key": "value"}') -> AirbyteMessage: + return AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=Level.INFO, message="slice:" + slice_descriptor)) + + +def connector_configuration_control_message(emitted_at: float, config: Mapping[str, Any]) -> AirbyteMessage: + return AirbyteMessage( + type=MessageType.CONTROL, + control=AirbyteControlMessage( + type=OrchestratorType.CONNECTOR_CONFIG, + emitted_at=emitted_at, + connectorConfig=AirbyteControlConnectorConfigMessage(config=config), + ), + ) + + +def auxiliary_request_log_message() -> AirbyteMessage: + return AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.INFO, + message=json.dumps( + { + "http": { + "is_auxiliary": True, + "title": "a title", + "description": "a description", + "request": {}, + "response": {}, + }, + "url": {"full": "https://a-url.com"}, + } + ), + ), + ) + + +def request_response_log_message(request: Mapping[str, Any], response: Mapping[str, Any], url: str) -> AirbyteMessage: + return AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.INFO, + message=json.dumps( + { + "airbyte_cdk": {"stream": {"name": "a stream name"}}, + "http": {"title": "a title", "description": "a description", "request": request, "response": response}, + "url": {"full": url}, + } + ), + ), + ) + + +def any_request_and_response_with_a_record() -> List[AirbyteMessage]: + return [ + request_response_log_message({"request": 1}, {"response": 2}, "http://any_url.com"), + record_message("hashiras", {"name": "Shinobu Kocho"}), + ] diff --git a/airbyte-cdk/python/unit_tests/connector_builder/utils.py b/airbyte-cdk/python/unit_tests/connector_builder/utils.py new file mode 100644 index 000000000000..a94a0416437c --- /dev/null +++ b/airbyte-cdk/python/unit_tests/connector_builder/utils.py @@ -0,0 +1,27 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Any, Mapping + +from airbyte_cdk.models import ConfiguredAirbyteCatalog, ConfiguredAirbyteCatalogSerializer + + +def create_configured_catalog_dict(stream_name: str) -> Mapping[str, Any]: + return { + "streams": [ + { + "stream": { + "name": stream_name, + "json_schema": {}, + "supported_sync_modes": ["full_refresh", "incremental"], + }, + "sync_mode": "full_refresh", + "destination_sync_mode": "overwrite", + } + ] + } + + +def create_configured_catalog(stream_name: str) -> ConfiguredAirbyteCatalog: + return ConfiguredAirbyteCatalogSerializer.load(create_configured_catalog_dict(stream_name)) diff --git a/airbyte-cdk/python/unit_tests/destinations/__init__.py b/airbyte-cdk/python/unit_tests/destinations/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/destinations/test_destination.py b/airbyte-cdk/python/unit_tests/destinations/test_destination.py new file mode 100644 index 000000000000..a03d7ffcc6b0 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/destinations/test_destination.py @@ -0,0 +1,277 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import argparse +import io +import json +from os import PathLike +from typing import Any, Dict, Iterable, List, Mapping, Union +from unittest.mock import ANY + +import pytest +from airbyte_cdk.destinations import Destination +from airbyte_cdk.destinations import destination as destination_module +from airbyte_cdk.models import ( + AirbyteCatalog, + AirbyteConnectionStatus, + AirbyteMessage, + AirbyteMessageSerializer, + AirbyteRecordMessage, + AirbyteStateMessage, + AirbyteStream, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteCatalogSerializer, + ConfiguredAirbyteStream, + ConnectorSpecification, + DestinationSyncMode, + Status, + SyncMode, + Type, +) +from orjson import orjson + + +@pytest.fixture(name="destination") +def destination_fixture(mocker) -> Destination: + # Wipe the internal list of abstract methods to allow instantiating the abstract class without implementing its abstract methods + mocker.patch("airbyte_cdk.destinations.Destination.__abstractmethods__", set()) + # Mypy yells at us because we're init'ing an abstract class + return Destination() # type: ignore + + +class TestArgParsing: + @pytest.mark.parametrize( + ("arg_list", "expected_output"), + [ + (["spec"], {"command": "spec"}), + (["check", "--config", "bogus_path/"], {"command": "check", "config": "bogus_path/"}), + ( + ["write", "--config", "config_path1", "--catalog", "catalog_path1"], + {"command": "write", "config": "config_path1", "catalog": "catalog_path1"}, + ), + ], + ) + def test_successful_parse(self, arg_list: List[str], expected_output: Mapping[str, Any], destination: Destination): + parsed_args = vars(destination.parse_args(arg_list)) + assert ( + parsed_args == expected_output + ), f"Expected parsing {arg_list} to return parsed args {expected_output} but instead found {parsed_args}" + + @pytest.mark.parametrize( + ("arg_list"), + [ + # Invalid commands + ([]), + (["not-a-real-command"]), + ([""]), + # Incorrect parameters + (["spec", "--config", "path"]), + (["check"]), + (["check", "--catalog", "path"]), + (["check", "path"]), + ], + ) + def test_failed_parse(self, arg_list: List[str], destination: Destination): + # We use BaseException because it encompasses SystemExit (raised by failed parsing) and other exceptions (raised by additional semantic + # checks) + with pytest.raises(BaseException): + destination.parse_args(arg_list) + + +def _state(state: Dict[str, Any]) -> AirbyteStateMessage: + return AirbyteStateMessage(data=state) + + +def _record(stream: str, data: Dict[str, Any]) -> AirbyteRecordMessage: + return AirbyteRecordMessage(stream=stream, data=data, emitted_at=0) + + +def _spec(schema: Dict[str, Any]) -> ConnectorSpecification: + return ConnectorSpecification(connectionSpecification=schema) + + +def write_file(path: PathLike, content: Union[str, Mapping]): + content = json.dumps(content) if isinstance(content, Mapping) else content + with open(path, "w") as f: + f.write(content) + + +def _wrapped( + msg: Union[AirbyteRecordMessage, AirbyteStateMessage, AirbyteCatalog, ConnectorSpecification, AirbyteConnectionStatus] +) -> AirbyteMessage: + if isinstance(msg, AirbyteRecordMessage): + return AirbyteMessage(type=Type.RECORD, record=msg) + elif isinstance(msg, AirbyteStateMessage): + return AirbyteMessage(type=Type.STATE, state=msg) + elif isinstance(msg, AirbyteCatalog): + return AirbyteMessage(type=Type.CATALOG, catalog=msg) + elif isinstance(msg, AirbyteConnectionStatus): + return AirbyteMessage(type=Type.CONNECTION_STATUS, connectionStatus=msg) + elif isinstance(msg, 
ConnectorSpecification): + return AirbyteMessage(type=Type.SPEC, spec=msg) + else: + raise Exception(f"Invalid Airbyte Message: {msg}") + + +class OrderedIterableMatcher(Iterable): + """ + A class whose purpose is to verify equality of one iterable object against another + in an ordered fashion + """ + + def attempt_consume(self, iterator): + try: + return next(iterator) + except StopIteration: + return None + + def __iter__(self): + return iter(self.iterable) + + def __init__(self, iterable: Iterable): + self.iterable = iterable + + def __eq__(self, other): + if not isinstance(other, Iterable): + return False + + return list(self) == list(other) + + +class TestRun: + def test_run_initializes_exception_handler(self, mocker, destination: Destination): + mocker.patch.object(destination_module, "init_uncaught_exception_handler") + mocker.patch.object(destination, "parse_args") + mocker.patch.object(destination, "run_cmd") + destination.run(["dummy"]) + destination_module.init_uncaught_exception_handler.assert_called_once_with(destination_module.logger) + + def test_run_spec(self, mocker, destination: Destination): + args = {"command": "spec"} + parsed_args = argparse.Namespace(**args) + + expected_spec = ConnectorSpecification(connectionSpecification={"json_schema": {"prop": "value"}}) + mocker.patch.object(destination, "spec", return_value=expected_spec, autospec=True) + + spec_message = next(iter(destination.run_cmd(parsed_args))) + + # Mypy doesn't understand magicmock so it thinks spec doesn't have assert_called_once attr + destination.spec.assert_called_once() # type: ignore + + # verify the output of spec was returned + assert spec_message == _wrapped(expected_spec) + + def test_run_check(self, mocker, destination: Destination, tmp_path): + file_path = tmp_path / "config.json" + dummy_config = {"user": "sherif"} + write_file(file_path, dummy_config) + args = {"command": "check", "config": file_path} + + parsed_args = argparse.Namespace(**args) + destination.run_cmd(parsed_args) + spec_msg = ConnectorSpecification(connectionSpecification={}) + mocker.patch.object(destination, "spec", return_value=spec_msg) + validate_mock = mocker.patch("airbyte_cdk.destinations.destination.check_config_against_spec_or_exit") + expected_check_result = AirbyteConnectionStatus(status=Status.SUCCEEDED) + mocker.patch.object(destination, "check", return_value=expected_check_result, autospec=True) + + returned_check_result = next(iter(destination.run_cmd(parsed_args))) + # verify method call with the correct params + # Affirm to Mypy that this is indeed a method on this mock + destination.check.assert_called_once() # type: ignore + # Affirm to Mypy that this is indeed a method on this mock + destination.check.assert_called_with(logger=ANY, config=dummy_config) # type: ignore + # Check if config validation has been called + validate_mock.assert_called_with(dummy_config, spec_msg) + + # verify output was correct + assert returned_check_result == _wrapped(expected_check_result) + + def test_run_check_with_invalid_config(self, mocker, destination: Destination, tmp_path): + file_path = tmp_path / "config.json" + invalid_config = {"not": "valid"} + write_file(file_path, invalid_config) + args = {"command": "check", "config": file_path} + + parsed_args = argparse.Namespace(**args) + destination.run_cmd(parsed_args) + + spec = {"type": "integer"} + spec_msg = ConnectorSpecification(connectionSpecification=spec) + + mocker.patch.object(destination, "spec", return_value=spec_msg) + + # validation against spec happens 
first, so this should not be reached + mocker.patch.object(destination, "check") + + returned_check_result = next(iter(destination.run_cmd(parsed_args))) + + destination.spec.assert_called_once() # type: ignore + + # config validation against spec happens first, so this should not be reached + destination.check.assert_not_called() # type: ignore + + # verify output was correct + assert isinstance(returned_check_result, AirbyteMessage) + assert returned_check_result.type == Type.CONNECTION_STATUS + assert returned_check_result.connectionStatus.status == Status.FAILED + # the specific phrasing is not relevant, so only check for the keywords + assert "validation error" in returned_check_result.connectionStatus.message + + def test_run_write(self, mocker, destination: Destination, tmp_path, monkeypatch): + config_path, dummy_config = tmp_path / "config.json", {"user": "sherif"} + write_file(config_path, dummy_config) + + dummy_catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name="mystream", json_schema={"type": "object"}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.overwrite, + ) + ] + ) + catalog_path = tmp_path / "catalog.json" + write_file(catalog_path, ConfiguredAirbyteCatalogSerializer.dump(dummy_catalog)) + + args = {"command": "write", "config": config_path, "catalog": catalog_path} + parsed_args = argparse.Namespace(**args) + + expected_write_result = [_wrapped(_state({"k1": "v1"})), _wrapped(_state({"k2": "v2"}))] + mocker.patch.object( + destination, "write", return_value=iter(expected_write_result), autospec=True # convert to iterator to mimic real usage + ) + spec_msg = ConnectorSpecification(connectionSpecification={}) + mocker.patch.object(destination, "spec", return_value=spec_msg) + validate_mock = mocker.patch("airbyte_cdk.destinations.destination.check_config_against_spec_or_exit") + # mock input is a record followed by some state messages + mocked_input: List[AirbyteMessage] = [_wrapped(_record("s1", {"k1": "v1"})), *expected_write_result] + mocked_stdin_string = "\n".join([orjson.dumps(AirbyteMessageSerializer.dump(record)).decode() for record in mocked_input]) + mocked_stdin_string += "\n add this non-serializable string to verify the destination does not break on malformed input" + mocked_stdin = io.TextIOWrapper(io.BytesIO(bytes(mocked_stdin_string, "utf-8"))) + + monkeypatch.setattr("sys.stdin", mocked_stdin) + + returned_write_result = list(destination.run_cmd(parsed_args)) + # verify method call with the correct params + # Affirm to Mypy that call_count is indeed a method on this mock + destination.write.assert_called_once() # type: ignore + # Affirm to Mypy that call_count is indeed a method on this mock + destination.write.assert_called_with( # type: ignore + config=dummy_config, + configured_catalog=dummy_catalog, + # Stdin is internally consumed as a generator so we use a custom matcher + # that iterates over two iterables to check equality + input_messages=OrderedIterableMatcher(mocked_input), + ) + # Check if config validation has been called + validate_mock.assert_called_with(dummy_config, spec_msg) + + # verify output was correct + assert returned_write_result == expected_write_result + + @pytest.mark.parametrize("args", [{}, {"command": "fake"}]) + def test_run_cmd_with_incorrect_args_fails(self, args, destination: Destination): + with pytest.raises(Exception): + list(destination.run_cmd(parsed_args=argparse.Namespace(**args))) 
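
For context on the interface the destination tests above exercise, here is a minimal, illustrative sketch of a concrete `Destination` subclass. It only shows the `spec`/`check`/`write` contract that `test_run_spec`, `test_run_check`, and `test_run_write` mock out; the class name `MyDestination` and its trivial method bodies are hypothetical and are not part of this change.

```python
# Illustrative sketch only: a minimal Destination matching the interface these tests drive.
import logging
from typing import Any, Iterable, Mapping

from airbyte_cdk.destinations import Destination
from airbyte_cdk.models import (
    AirbyteConnectionStatus,
    AirbyteMessage,
    ConfiguredAirbyteCatalog,
    ConnectorSpecification,
    Status,
    Type,
)


class MyDestination(Destination):
    def spec(self, logger: logging.Logger) -> ConnectorSpecification:
        # Advertise the configuration schema that configs are validated against
        # (check_config_against_spec_or_exit in the tests above).
        return ConnectorSpecification(connectionSpecification={"type": "object", "properties": {}})

    def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus:
        # A real connector would attempt a lightweight connection/write here.
        return AirbyteConnectionStatus(status=Status.SUCCEEDED)

    def write(
        self,
        config: Mapping[str, Any],
        configured_catalog: ConfiguredAirbyteCatalog,
        input_messages: Iterable[AirbyteMessage],
    ) -> Iterable[AirbyteMessage]:
        # A real destination would persist RECORD messages here; echoing a STATE
        # message back signals that everything before it was written, which is the
        # contract test_run_write relies on.
        for message in input_messages:
            if message.type == Type.STATE:
                yield message
```

Such a connector would be invoked through the same entrypoint the tests drive, e.g. `MyDestination().run(["write", "--config", "config.json", "--catalog", "catalog.json"])`, which parses the arguments and dispatches to `run_cmd` exactly as `test_run_write` does.
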
diff --git a/airbyte-cdk/python/unit_tests/destinations/vector_db_based/config_test.py b/airbyte-cdk/python/unit_tests/destinations/vector_db_based/config_test.py new file mode 100644 index 000000000000..0eeae37b7f3e --- /dev/null +++ b/airbyte-cdk/python/unit_tests/destinations/vector_db_based/config_test.py @@ -0,0 +1,385 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Union + +import dpath +from airbyte_cdk.destinations.vector_db_based.config import ( + AzureOpenAIEmbeddingConfigModel, + CohereEmbeddingConfigModel, + FakeEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, + OpenAIEmbeddingConfigModel, + ProcessingConfigModel, +) +from airbyte_cdk.utils.spec_schema_transformations import resolve_refs +from pydantic.v1 import BaseModel, Field + + +class IndexingModel(BaseModel): + foo: str = Field( + ..., + title="Foo", + description="Foo", + ) + + +class ConfigModel(BaseModel): + indexing: IndexingModel + + embedding: Union[ + OpenAIEmbeddingConfigModel, + CohereEmbeddingConfigModel, + FakeEmbeddingConfigModel, + AzureOpenAIEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, + ] = Field( + ..., + title="Embedding", + description="Embedding configuration", + discriminator="mode", + group="embedding", + type="object", + ) + processing: ProcessingConfigModel + + class Config: + title = "My Destination Config" + schema_extra = { + "groups": [ + {"id": "processing", "title": "Processing"}, + {"id": "embedding", "title": "Embedding"}, + {"id": "indexing", "title": "Indexing"}, + ] + } + + @staticmethod + def remove_discriminator(schema: dict) -> None: + """pydantic adds "discriminator" to the schema for oneOfs, which is not treated right by the platform as we inline all references""" + dpath.delete(schema, "properties/**/discriminator") + + @classmethod + def schema(cls): + """we're overriding the schema classmethod to enable some post-processing""" + schema = super().schema() + schema = resolve_refs(schema) + cls.remove_discriminator(schema) + return schema + + +def test_json_schema_generation(): + # This is the expected output of the schema generation + expected = { + "title": "My Destination Config", + "type": "object", + "properties": { + "indexing": { + "title": "IndexingModel", + "type": "object", + "properties": {"foo": {"title": "Foo", "description": "Foo", "type": "string"}}, + "required": ["foo"], + }, + "embedding": { + "title": "Embedding", + "description": "Embedding configuration", + "group": "embedding", + "type": "object", + "oneOf": [ + { + "title": "OpenAI", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "openai", + "const": "openai", + "enum": ["openai"], + "type": "string", + }, + "openai_key": { + "title": "OpenAI API key", + "airbyte_secret": True, + "type": "string", + }, + }, + "required": ["openai_key", "mode"], + "description": "Use the OpenAI API to embed text. 
This option is using the text-embedding-ada-002 model with 1536 embedding dimensions.", + }, + { + "title": "Cohere", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "cohere", + "const": "cohere", + "enum": ["cohere"], + "type": "string", + }, + "cohere_key": { + "title": "Cohere API key", + "airbyte_secret": True, + "type": "string", + }, + }, + "required": ["cohere_key", "mode"], + "description": "Use the Cohere API to embed text.", + }, + { + "title": "Fake", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "fake", + "const": "fake", + "enum": ["fake"], + "type": "string", + } + }, + "description": "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs.", + "required": ["mode"], + }, + { + "title": "Azure OpenAI", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "azure_openai", + "const": "azure_openai", + "enum": ["azure_openai"], + "type": "string", + }, + "openai_key": { + "title": "Azure OpenAI API key", + "description": "The API key for your Azure OpenAI resource. You can find this in the Azure portal under your Azure OpenAI resource", + "airbyte_secret": True, + "type": "string", + }, + "api_base": { + "title": "Resource base URL", + "description": "The base URL for your Azure OpenAI resource. You can find this in the Azure portal under your Azure OpenAI resource", + "examples": ["https://your-resource-name.openai.azure.com"], + "type": "string", + }, + "deployment": { + "title": "Deployment", + "description": "The deployment for your Azure OpenAI resource. You can find this in the Azure portal under your Azure OpenAI resource", + "examples": ["your-resource-name"], + "type": "string", + }, + }, + "required": ["openai_key", "api_base", "deployment", "mode"], + "description": "Use the Azure-hosted OpenAI API to embed text. 
This option is using the text-embedding-ada-002 model with 1536 embedding dimensions.", + }, + { + "title": "OpenAI-compatible", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "openai_compatible", + "const": "openai_compatible", + "enum": ["openai_compatible"], + "type": "string", + }, + "api_key": { + "title": "API key", + "default": "", + "airbyte_secret": True, + "type": "string", + }, + "base_url": { + "title": "Base URL", + "description": "The base URL for your OpenAI-compatible service", + "examples": ["https://your-service-name.com"], + "type": "string", + }, + "model_name": { + "title": "Model name", + "description": "The name of the model to use for embedding", + "default": "text-embedding-ada-002", + "examples": ["text-embedding-ada-002"], + "type": "string", + }, + "dimensions": { + "title": "Embedding dimensions", + "description": "The number of dimensions the embedding model is generating", + "examples": [1536, 384], + "type": "integer", + }, + }, + "required": ["base_url", "dimensions", "mode"], + "description": "Use a service that's compatible with the OpenAI API to embed text.", + }, + ], + }, + "processing": { + "title": "ProcessingConfigModel", + "type": "object", + "properties": { + "chunk_size": { + "title": "Chunk size", + "description": "Size of chunks in tokens to store in vector store (make sure it is not too big for the context if your LLM)", + "maximum": 8191, + "minimum": 1, + "type": "integer", + }, + "chunk_overlap": { + "title": "Chunk overlap", + "description": "Size of overlap between chunks in tokens to store in vector store to better capture relevant context", + "default": 0, + "type": "integer", + }, + "text_fields": { + "title": "Text fields to embed", + "description": "List of fields in the record that should be used to calculate the embedding. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered text fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. `user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. `users.*.name` will access all `names` fields in all entries of the `users` array.", + "default": [], + "always_show": True, + "examples": ["text", "user.name", "users.*.name"], + "type": "array", + "items": {"type": "string"}, + }, + "metadata_fields": { + "title": "Fields to store as metadata", + "description": "List of fields in the record that should be stored as metadata. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered metadata fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. `user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. `users.*.name` will access all `names` fields in all entries of the `users` array. 
When specifying nested paths, all matching values are flattened into an array set to a field named by the path.", + "default": [], + "always_show": True, + "examples": ["age", "user", "user.name"], + "type": "array", + "items": {"type": "string"}, + }, + "text_splitter": { + "title": "Text splitter", + "description": "Split text fields into chunks based on the specified method.", + "type": "object", + "oneOf": [ + { + "title": "By Separator", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "separator", + "const": "separator", + "enum": ["separator"], + "type": "string", + }, + "separators": { + "title": "Separators", + "description": 'List of separator strings to split text fields by. The separator itself needs to be wrapped in double quotes, e.g. to split by the dot character, use ".". To split by a newline, use "\\n".', + "default": ['"\\n\\n"', '"\\n"', '" "', '""'], + "type": "array", + "items": {"type": "string"}, + }, + "keep_separator": { + "title": "Keep separator", + "description": "Whether to keep the separator in the resulting chunks", + "default": False, + "type": "boolean", + }, + }, + "description": "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc.", + "required": ["mode"], + }, + { + "title": "By Markdown header", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "markdown", + "const": "markdown", + "enum": ["markdown"], + "type": "string", + }, + "split_level": { + "title": "Split level", + "description": "Level of markdown headers to split text fields by. Headings down to the specified level will be used as split points", + "default": 1, + "minimum": 1, + "maximum": 6, + "type": "integer", + }, + }, + "description": "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk.", + "required": ["mode"], + }, + { + "title": "By Programming Language", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "code", + "const": "code", + "enum": ["code"], + "type": "string", + }, + "language": { + "title": "Language", + "description": "Split code in suitable places based on the programming language", + "enum": [ + "cpp", + "go", + "java", + "js", + "php", + "proto", + "python", + "rst", + "ruby", + "rust", + "scala", + "swift", + "markdown", + "latex", + "html", + "sol", + ], + "type": "string", + }, + }, + "required": ["language", "mode"], + "description": "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks.", + }, + ], + }, + "field_name_mappings": { + "title": "Field name mappings", + "description": "List of fields to rename. 
Not applicable for nested fields, but can be used to rename fields already flattened via dot notation.", + "default": [], + "type": "array", + "items": { + "title": "FieldNameMappingConfigModel", + "type": "object", + "properties": { + "from_field": { + "title": "From field name", + "description": "The field name in the source", + "type": "string", + }, + "to_field": { + "title": "To field name", + "description": "The field name to use in the destination", + "type": "string", + }, + }, + "required": ["from_field", "to_field"], + }, + }, + }, + "required": ["chunk_size"], + "group": "processing", + }, + }, + "required": ["indexing", "embedding", "processing"], + "groups": [ + {"id": "processing", "title": "Processing"}, + {"id": "embedding", "title": "Embedding"}, + {"id": "indexing", "title": "Indexing"}, + ], + } + assert ConfigModel.schema() == expected diff --git a/airbyte-cdk/python/unit_tests/destinations/vector_db_based/document_processor_test.py b/airbyte-cdk/python/unit_tests/destinations/vector_db_based/document_processor_test.py new file mode 100644 index 000000000000..db3ce730c89e --- /dev/null +++ b/airbyte-cdk/python/unit_tests/destinations/vector_db_based/document_processor_test.py @@ -0,0 +1,664 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Any, List, Mapping, Optional +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.destinations.vector_db_based.config import ( + CodeSplitterConfigModel, + FieldNameMappingConfigModel, + MarkdownHeaderSplitterConfigModel, + ProcessingConfigModel, + SeparatorSplitterConfigModel, +) +from airbyte_cdk.destinations.vector_db_based.document_processor import DocumentProcessor +from airbyte_cdk.models import ( + AirbyteRecordMessage, + AirbyteStream, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + DestinationSyncMode, + SyncMode, +) +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + + +def initialize_processor(config=ProcessingConfigModel(chunk_size=48, chunk_overlap=0, text_fields=None, metadata_fields=None)): + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream( + name="stream1", + json_schema={}, + namespace="namespace1", + supported_sync_modes=[SyncMode.full_refresh], + ), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.overwrite, + primary_key=[["id"]], + ), + ConfiguredAirbyteStream( + stream=AirbyteStream( + name="stream2", + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.overwrite, + ), + ] + ) + return DocumentProcessor(config=config, catalog=catalog) + + +@pytest.mark.parametrize( + "metadata_fields, expected_metadata", + [ + ( + None, + { + "_ab_stream": "namespace1_stream1", + "id": 1, + "text": "This is the text", + "complex": {"test": "abc"}, + "arr": [{"test": "abc"}, {"test": "def"}], + }, + ), + (["id"], {"_ab_stream": "namespace1_stream1", "id": 1}), + (["id", "non_existing"], {"_ab_stream": "namespace1_stream1", "id": 1}), + ( + ["id", "complex.test"], + {"_ab_stream": "namespace1_stream1", "id": 1, "complex.test": "abc"}, + ), + ( + ["id", "arr.*.test"], + {"_ab_stream": "namespace1_stream1", "id": 1, "arr.*.test": ["abc", "def"]}, + ), + ], +) +def test_process_single_chunk_with_metadata(metadata_fields, expected_metadata): + processor = initialize_processor() + processor.metadata_fields = metadata_fields + + record = AirbyteRecordMessage( + 
stream="stream1", + namespace="namespace1", + data={ + "id": 1, + "text": "This is the text", + "complex": {"test": "abc"}, + "arr": [{"test": "abc"}, {"test": "def"}], + }, + emitted_at=1234, + ) + + chunks, id_to_delete = processor.process(record) + + assert len(chunks) == 1 + # natural id is only set for dedup mode + assert "_ab_record_id" not in chunks[0].metadata + assert chunks[0].metadata == expected_metadata + assert id_to_delete is None + + +def test_process_single_chunk_limited_metadata(): + processor = initialize_processor() + + record = AirbyteRecordMessage( + stream="stream1", + namespace="namespace1", + data={ + "id": 1, + "text": "This is the text", + }, + emitted_at=1234, + ) + + chunks, id_to_delete = processor.process(record) + + assert len(chunks) == 1 + # natural id is only set for dedup mode + assert "_ab_record_id" not in chunks[0].metadata + assert chunks[0].metadata["_ab_stream"] == "namespace1_stream1" + assert chunks[0].metadata["id"] == 1 + assert chunks[0].metadata["text"] == "This is the text" + assert chunks[0].page_content == "id: 1\ntext: This is the text" + assert id_to_delete is None + + +def test_process_single_chunk_without_namespace(): + config = ProcessingConfigModel(chunk_size=48, chunk_overlap=0, text_fields=None, metadata_fields=None) + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream( + name="stream1", + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.overwrite, + ), + ] + ) + processor = DocumentProcessor(config=config, catalog=catalog) + + record = AirbyteRecordMessage( + stream="stream1", + data={ + "id": 1, + "text": "This is the text", + }, + emitted_at=1234, + ) + + chunks, _ = processor.process(record) + assert chunks[0].metadata["_ab_stream"] == "stream1" + + +def test_complex_text_fields(): + processor = initialize_processor() + + record = AirbyteRecordMessage( + stream="stream1", + namespace="namespace1", + data={ + "id": 1, + "nested": { + "texts": [ + {"text": "This is the text"}, + {"text": "And another"}, + ] + }, + "non_text": "a", + "non_text_2": 1, + "text": "This is the regular text", + "other_nested": {"non_text": {"a": "xyz", "b": "abc"}}, + }, + emitted_at=1234, + ) + + processor.text_fields = [ + "nested.texts.*.text", + "text", + "other_nested.non_text", + "non.*.existing", + ] + processor.metadata_fields = ["non_text", "non_text_2", "id"] + + chunks, _ = processor.process(record) + + assert len(chunks) == 1 + assert ( + chunks[0].page_content + == """nested.texts.*.text: This is the text +And another +text: This is the regular text +other_nested.non_text: \na: xyz +b: abc""" + ) + assert chunks[0].metadata == { + "id": 1, + "non_text": "a", + "non_text_2": 1, + "_ab_stream": "namespace1_stream1", + } + + +def test_no_text_fields(): + processor = initialize_processor() + + record = AirbyteRecordMessage( + stream="stream1", + namespace="namespace1", + data={ + "id": 1, + "text": "This is the regular text", + }, + emitted_at=1234, + ) + + processor.text_fields = ["another_field"] + processor.logger = MagicMock() + + # assert process is throwing with no text fields found + with pytest.raises(AirbyteTracedException): + processor.process(record) + + +def test_process_multiple_chunks_with_relevant_fields(): + processor = initialize_processor() + + record = AirbyteRecordMessage( + stream="stream1", + namespace="namespace1", + data={ + "id": 1, + "name": "John Doe", + "text": "This is 
the text and it is long enough to be split into multiple chunks. This is the text and it is long enough to be split into multiple chunks. This is the text and it is long enough to be split into multiple chunks", + "age": 25, + }, + emitted_at=1234, + ) + + processor.text_fields = ["text"] + + chunks, id_to_delete = processor.process(record) + + assert len(chunks) == 2 + + for chunk in chunks: + assert chunk.metadata["age"] == 25 + assert id_to_delete is None + + +@pytest.mark.parametrize( + "label, text, chunk_size, chunk_overlap, splitter_config, expected_chunks", + [ + ( + "Default splitting", + "By default, splits are done \non multi newlines,\n\n then single newlines, then spaces", + 10, + 0, + None, + [ + "text: By default, splits are done", + "on multi newlines,", + "then single newlines, then spaces", + ], + ), + ( + "Overlap splitting", + "One two three four five six seven eight nine ten eleven twelve thirteen", + 15, + 5, + None, + [ + "text: One two three four five six", + "four five six seven eight nine ten", + "eight nine ten eleven twelve thirteen", + ], + ), + ( + "Special tokens", + "Special tokens like <|endoftext|> are treated like regular text", + 15, + 0, + None, + [ + "text: Special tokens like", + "<|endoftext|> are treated like regular", + "text", + ], + ), + ( + "Custom separator", + "Custom \nseparatorxxxDoes not split on \n\nnewlines", + 10, + 0, + SeparatorSplitterConfigModel(mode="separator", separators=['"xxx"']), + [ + "text: Custom \nseparator", + "Does not split on \n\nnewlines\n", + ], + ), + ( + "Only splits if chunks dont fit", + "Does yyynot usexxxseparators yyyif not needed", + 10, + 0, + SeparatorSplitterConfigModel(mode="separator", separators=['"xxx"', '"yyy"']), + [ + "text: Does yyynot use", + "separators yyyif not needed", + ], + ), + ( + "Use first separator first", + "Does alwaysyyy usexxxmain separators yyyfirst", + 10, + 0, + SeparatorSplitterConfigModel(mode="separator", separators=['"yyy"', '"xxx"']), + [ + "text: Does always", + "usexxxmain separators yyyfirst", + ], + ), + ( + "Basic markdown splitting", + "# Heading 1\nText 1\n\n# Heading 2\nText 2\n\n# Heading 3\nText 3", + 10, + 0, + MarkdownHeaderSplitterConfigModel(mode="markdown", split_level=1), + [ + "text: # Heading 1\nText 1\n", + "# Heading 2\nText 2", + "# Heading 3\nText 3", + ], + ), + ( + "Split multiple levels", + "# Heading 1\nText 1\n\n## Sub-Heading 1\nText 2\n\n# Heading 2\nText 3", + 10, + 0, + MarkdownHeaderSplitterConfigModel(mode="markdown", split_level=2), + [ + "text: # Heading 1\nText 1\n", + "\n## Sub-Heading 1\nText 2\n", + "# Heading 2\nText 3", + ], + ), + ( + "Do not split if split level does not allow", + "## Heading 1\nText 1\n\n## Heading 2\nText 2\n\n## Heading 3\nText 3", + 10, + 0, + MarkdownHeaderSplitterConfigModel(mode="markdown", split_level=1), + [ + "text: ## Heading 1\nText 1\n\n## Heading 2\nText 2\n\n## Heading 3\nText 3\n", + ], + ), + ( + "Do not split if everything fits", + "## Does not split if everything fits. Heading 1\nText 1\n\n## Heading 2\nText 2\n\n## Heading 3\nText 3", + 1000, + 0, + MarkdownHeaderSplitterConfigModel(mode="markdown", split_level=5), + [ + "text: ## Does not split if everything fits. 
Heading 1\nText 1\n\n## Heading 2\nText 2\n\n## Heading 3\nText 3", + ], + ), + ( + "Split Java code, respecting class boundaries", + "class A { /* \n\nthis is the first class */ }\nclass B {}", + 20, + 0, + CodeSplitterConfigModel(mode="code", language="java"), + [ + "text: class A { /* \n\nthis is the first class */ }", + "class B {}", + ], + ), + ( + "Split Java code as proto, not respecting class boundaries", + "class A { /* \n\nthis is the first class */ }\nclass B {}", + 20, + 0, + CodeSplitterConfigModel(mode="code", language="proto"), + [ + "text: class A { /*", + "this is the first class */ }\nclass B {}", + ], + ), + ], +) +def test_text_splitters(label, text, chunk_size, chunk_overlap, splitter_config, expected_chunks): + processor = initialize_processor( + ProcessingConfigModel( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + text_fields=["text"], + metadata_fields=None, + text_splitter=splitter_config, + ) + ) + + record = AirbyteRecordMessage( + stream="stream1", + namespace="namespace1", + data={ + "id": 1, + "name": "John Doe", + "text": text, + "age": 25, + }, + emitted_at=1234, + ) + + processor.text_fields = ["text"] + + chunks, id_to_delete = processor.process(record) + + assert len(chunks) == len(expected_chunks) + + # check that the page_content in each chunk equals the expected chunk + for i, chunk in enumerate(chunks): + print(chunk.page_content) + assert chunk.page_content == expected_chunks[i] + assert id_to_delete is None + + +@pytest.mark.parametrize( + "label, split_config, has_error_message", + [ + ( + "Invalid separator", + SeparatorSplitterConfigModel(mode="separator", separators=['"xxx']), + True, + ), + ( + "Missing quotes", + SeparatorSplitterConfigModel(mode="separator", separators=["xxx"]), + True, + ), + ( + "Non-string separator", + SeparatorSplitterConfigModel(mode="separator", separators=["123"]), + True, + ), + ( + "Object separator", + SeparatorSplitterConfigModel(mode="separator", separators=["{}"]), + True, + ), + ( + "Proper separator", + SeparatorSplitterConfigModel(mode="separator", separators=['"xxx"', '"\\n\\n"']), + False, + ), + ], +) +def test_text_splitter_check(label, split_config, has_error_message): + error = DocumentProcessor.check_config( + ProcessingConfigModel( + chunk_size=48, + chunk_overlap=0, + text_fields=None, + metadata_fields=None, + text_splitter=split_config, + ) + ) + if has_error_message: + assert error is not None + else: + assert error is None + + +@pytest.mark.parametrize( + "mappings, fields, expected_chunk_metadata", + [ + (None, {"abc": "def", "xyz": 123}, {"abc": "def", "xyz": 123}), + ([], {"abc": "def", "xyz": 123}, {"abc": "def", "xyz": 123}), + ( + [FieldNameMappingConfigModel(from_field="abc", to_field="AAA")], + {"abc": "def", "xyz": 123}, + {"AAA": "def", "xyz": 123}, + ), + ( + [FieldNameMappingConfigModel(from_field="non_existing", to_field="AAA")], + {"abc": "def", "xyz": 123}, + {"abc": "def", "xyz": 123}, + ), + ], +) +def test_rename_metadata_fields( + mappings: Optional[List[FieldNameMappingConfigModel]], + fields: Mapping[str, Any], + expected_chunk_metadata: Mapping[str, Any], +): + processor = initialize_processor() + + record = AirbyteRecordMessage( + stream="stream1", + namespace="namespace1", + data={**fields, "text": "abc"}, + emitted_at=1234, + ) + + processor.field_name_mappings = mappings + processor.text_fields = ["text"] + + chunks, id_to_delete = processor.process(record) + + assert len(chunks) == 1 + assert chunks[0].metadata == { + **expected_chunk_metadata, + 
"_ab_stream": "namespace1_stream1", + "text": "abc", + } + + +@pytest.mark.parametrize( + "primary_key_value, stringified_primary_key, primary_key", + [ + ({"id": 99}, "namespace1_stream1_99", [["id"]]), + ( + {"id": 99, "name": "John Doe"}, + "namespace1_stream1_99_John Doe", + [["id"], ["name"]], + ), + ( + {"id": 99, "name": "John Doe", "age": 25}, + "namespace1_stream1_99_John Doe_25", + [["id"], ["name"], ["age"]], + ), + ( + {"nested": {"id": "abc"}, "name": "John Doe"}, + "namespace1_stream1_abc_John Doe", + [["nested", "id"], ["name"]], + ), + ( + {"nested": {"id": "abc"}}, + "namespace1_stream1_abc___not_found__", + [["nested", "id"], ["name"]], + ), + ], +) +def test_process_multiple_chunks_with_dedupe_mode( + primary_key_value: Mapping[str, Any], + stringified_primary_key: str, + primary_key: List[List[str]], +): + processor = initialize_processor() + + record = AirbyteRecordMessage( + stream="stream1", + namespace="namespace1", + data={ + "text": "This is the text and it is long enough to be split into multiple chunks. This is the text and it is long enough to be split into multiple chunks. This is the text and it is long enough to be split into multiple chunks", + "age": 25, + **primary_key_value, + }, + emitted_at=1234, + ) + + processor.text_fields = ["text"] + + processor.streams["namespace1_stream1"].destination_sync_mode = DestinationSyncMode.append_dedup + processor.streams["namespace1_stream1"].primary_key = primary_key + + chunks, id_to_delete = processor.process(record) + + assert len(chunks) > 1 + for chunk in chunks: + assert chunk.metadata["_ab_record_id"] == stringified_primary_key + assert id_to_delete == stringified_primary_key + + +@pytest.mark.parametrize( + "record, sync_mode, has_chunks, raises, expected_id_to_delete", + [ + pytest.param( + AirbyteRecordMessage( + stream="stream1", + namespace="namespace1", + data={"text": "This is the text", "id": "1"}, + emitted_at=1234, + ), + DestinationSyncMode.append_dedup, + True, + False, + "namespace1_stream1_1", + id="update", + ), + pytest.param( + AirbyteRecordMessage( + stream="stream1", + namespace="namespace1", + data={"text": "This is the text", "id": "1"}, + emitted_at=1234, + ), + DestinationSyncMode.append, + True, + False, + None, + id="append", + ), + pytest.param( + AirbyteRecordMessage( + stream="stream1", + namespace="namespace1", + data={"text": "This is the text", "id": "1", "_ab_cdc_deleted_at": 1234}, + emitted_at=1234, + ), + DestinationSyncMode.append_dedup, + False, + False, + "namespace1_stream1_1", + id="cdc_delete", + ), + pytest.param( + AirbyteRecordMessage( + stream="stream1", + namespace="namespace1", + data={"id": "1", "_ab_cdc_deleted_at": 1234}, + emitted_at=1234, + ), + DestinationSyncMode.append_dedup, + False, + False, + "namespace1_stream1_1", + id="cdc_delete_without_text", + ), + pytest.param( + AirbyteRecordMessage( + stream="stream1", + namespace="namespace1", + data={"id": "1"}, + emitted_at=1234, + ), + DestinationSyncMode.append_dedup, + False, + True, + "namespace1_stream1_1", + id="update_without_text", + ), + ], +) +def test_process_cdc_records(record, sync_mode, has_chunks, raises, expected_id_to_delete): + processor = initialize_processor() + + processor.text_fields = ["text"] + + processor.streams["namespace1_stream1"].destination_sync_mode = sync_mode + + if raises: + with pytest.raises(AirbyteTracedException): + processor.process(record) + else: + chunks, id_to_delete = processor.process(record) + if has_chunks: + assert len(chunks) > 0 + assert id_to_delete == 
expected_id_to_delete diff --git a/airbyte-cdk/python/unit_tests/destinations/vector_db_based/embedder_test.py b/airbyte-cdk/python/unit_tests/destinations/vector_db_based/embedder_test.py new file mode 100644 index 000000000000..600a4c0890d3 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/destinations/vector_db_based/embedder_test.py @@ -0,0 +1,123 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from unittest.mock import MagicMock, call + +import pytest +from airbyte_cdk.destinations.vector_db_based.config import ( + AzureOpenAIEmbeddingConfigModel, + CohereEmbeddingConfigModel, + FakeEmbeddingConfigModel, + FromFieldEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, + OpenAIEmbeddingConfigModel, +) +from airbyte_cdk.destinations.vector_db_based.embedder import ( + COHERE_VECTOR_SIZE, + OPEN_AI_VECTOR_SIZE, + AzureOpenAIEmbedder, + CohereEmbedder, + Document, + FakeEmbedder, + FromFieldEmbedder, + OpenAICompatibleEmbedder, + OpenAIEmbedder, +) +from airbyte_cdk.models import AirbyteRecordMessage +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + + +@pytest.mark.parametrize( + "embedder_class, args, dimensions", + ( + (OpenAIEmbedder, [OpenAIEmbeddingConfigModel(**{"mode": "openai", "openai_key": "abc"}), 1000], OPEN_AI_VECTOR_SIZE), + (CohereEmbedder, [CohereEmbeddingConfigModel(**{"mode": "cohere", "cohere_key": "abc"})], COHERE_VECTOR_SIZE), + (FakeEmbedder, [FakeEmbeddingConfigModel(**{"mode": "fake"})], OPEN_AI_VECTOR_SIZE), + ( + AzureOpenAIEmbedder, + [ + AzureOpenAIEmbeddingConfigModel( + **{ + "mode": "azure_openai", + "openai_key": "abc", + "api_base": "https://my-resource.openai.azure.com", + "deployment": "my-deployment", + } + ), + 1000, + ], + OPEN_AI_VECTOR_SIZE, + ), + ( + OpenAICompatibleEmbedder, + [ + OpenAICompatibleEmbeddingConfigModel( + **{ + "mode": "openai_compatible", + "api_key": "abc", + "base_url": "https://my-service.com", + "model_name": "text-embedding-ada-002", + "dimensions": 50, + } + ) + ], + 50, + ), + ), +) +def test_embedder(embedder_class, args, dimensions): + embedder = embedder_class(*args) + mock_embedding_instance = MagicMock() + embedder.embeddings = mock_embedding_instance + + mock_embedding_instance.embed_query.side_effect = Exception("Some error") + assert embedder.check().startswith("Some error") + + mock_embedding_instance.embed_query.side_effect = None + assert embedder.check() is None + + assert embedder.embedding_dimensions == dimensions + + mock_embedding_instance.embed_documents.return_value = [[0] * dimensions] * 2 + + chunks = [ + Document(page_content="a", record=AirbyteRecordMessage(stream="mystream", data={}, emitted_at=0)), + Document(page_content="b", record=AirbyteRecordMessage(stream="mystream", data={}, emitted_at=0)), + ] + assert embedder.embed_documents(chunks) == mock_embedding_instance.embed_documents.return_value + mock_embedding_instance.embed_documents.assert_called_with(["a", "b"]) + + +@pytest.mark.parametrize( + "field_name, dimensions, metadata, expected_embedding, expected_error", + ( + ("a", 2, {"a": [1, 2]}, [1, 2], False), + ("a", 2, {"b": "b"}, None, True), + ("a", 2, {}, None, True), + ("a", 2, {"a": []}, None, True), + ("a", 2, {"a": [1, 2, 3]}, None, True), + ("a", 2, {"a": [1, "2", 3]}, None, True), + ), +) +def test_from_field_embedder(field_name, dimensions, metadata, expected_embedding, expected_error): + embedder = FromFieldEmbedder(FromFieldEmbeddingConfigModel(mode="from_field", dimensions=dimensions, field_name=field_name)) + chunks = 
[Document(page_content="a", record=AirbyteRecordMessage(stream="mystream", data=metadata, emitted_at=0))] + if expected_error: + with pytest.raises(AirbyteTracedException): + embedder.embed_documents(chunks) + else: + assert embedder.embed_documents(chunks) == [expected_embedding] + + +def test_openai_chunking(): + config = OpenAIEmbeddingConfigModel(**{"mode": "openai", "openai_key": "abc"}) + embedder = OpenAIEmbedder(config, 150) + mock_embedding_instance = MagicMock() + embedder.embeddings = mock_embedding_instance + + mock_embedding_instance.embed_documents.side_effect = lambda texts: [[0] * OPEN_AI_VECTOR_SIZE] * len(texts) + + chunks = [Document(page_content="a", record=AirbyteRecordMessage(stream="mystream", data={}, emitted_at=0)) for _ in range(1005)] + assert embedder.embed_documents(chunks) == [[0] * OPEN_AI_VECTOR_SIZE] * 1005 + mock_embedding_instance.embed_documents.assert_has_calls([call(["a"] * 1000), call(["a"] * 5)]) diff --git a/airbyte-cdk/python/unit_tests/destinations/vector_db_based/writer_test.py b/airbyte-cdk/python/unit_tests/destinations/vector_db_based/writer_test.py new file mode 100644 index 000000000000..ac831694c726 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/destinations/vector_db_based/writer_test.py @@ -0,0 +1,173 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Optional +from unittest.mock import ANY, MagicMock, call + +import pytest +from airbyte_cdk.destinations.vector_db_based import ProcessingConfigModel, Writer +from airbyte_cdk.models import ( + AirbyteLogMessage, + AirbyteMessage, + AirbyteRecordMessage, + AirbyteStateMessage, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteCatalogSerializer, + Level, + Type, +) + + +def _generate_record_message(index: int, stream: str = "example_stream", namespace: Optional[str] = None): + return AirbyteMessage( + type=Type.RECORD, + record=AirbyteRecordMessage( + stream=stream, namespace=namespace, emitted_at=1234, data={"column_name": f"value {index}", "id": index} + ), + ) + + +BATCH_SIZE = 32 + + +def generate_stream(name: str = "example_stream", namespace: Optional[str] = None): + return { + "stream": { + "name": name, + "namespace": namespace, + "json_schema": {"$schema": "http://json-schema.org/draft-07/schema#", "type": "object", "properties": {}}, + "supported_sync_modes": ["full_refresh", "incremental"], + "source_defined_cursor": False, + "default_cursor_field": ["column_name"], + }, + "primary_key": [["id"]], + "sync_mode": "incremental", + "destination_sync_mode": "append_dedup", + } + + +def generate_mock_embedder(): + mock_embedder = MagicMock() + mock_embedder.embed_documents.return_value = [[0] * 1536] * (BATCH_SIZE + 5 + 5) + mock_embedder.embed_documents.side_effect = lambda chunks: [[0] * 1536] * len(chunks) + + return mock_embedder + + +@pytest.mark.parametrize("omit_raw_text", [True, False]) +def test_write(omit_raw_text: bool): + """ + Basic test for the write method, batcher and document processor. 
+    """
+    config_model = ProcessingConfigModel(chunk_overlap=0, chunk_size=1000, metadata_fields=None, text_fields=["column_name"])
+
+    configured_catalog: ConfiguredAirbyteCatalog = ConfiguredAirbyteCatalogSerializer.load({"streams": [generate_stream()]})
+    # messages are flushed after 32 records or after a state message, so this will trigger two batches to be processed
+    input_messages = [_generate_record_message(i) for i in range(BATCH_SIZE + 5)]
+    state_message = AirbyteMessage(type=Type.STATE, state=AirbyteStateMessage())
+    input_messages.append(state_message)
+    # messages are also flushed once the input messages are exhausted, so this will trigger another batch
+    input_messages.extend([_generate_record_message(i) for i in range(5)])
+
+    mock_embedder = generate_mock_embedder()
+
+    mock_indexer = MagicMock()
+    post_sync_log_message = AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=Level.INFO, message="post sync"))
+    mock_indexer.post_sync.return_value = [post_sync_log_message]
+
+    # Create the Writer instance
+    writer = Writer(config_model, mock_indexer, mock_embedder, BATCH_SIZE, omit_raw_text)
+
+    output_messages = writer.write(configured_catalog, input_messages)
+    output_message = next(output_messages)
+    # assert the state message is forwarded once the records buffered before it have been flushed
+    assert output_message == state_message
+
+    mock_indexer.pre_sync.assert_called_with(configured_catalog)
+
+    # 1 batch due to max batch size reached and 1 batch due to state message
+    assert mock_indexer.index.call_count == 2
+    assert mock_indexer.delete.call_count == 2
+    assert mock_embedder.embed_documents.call_count == 2
+
+    for call_args in mock_indexer.index.call_args_list:
+        for chunk in call_args[0][0]:
+            if omit_raw_text:
+                assert chunk.page_content is None
+            else:
+                assert chunk.page_content is not None
+
+    output_message = next(output_messages)
+    assert output_message == post_sync_log_message
+
+    try:
+        next(output_messages)
+        assert False, "Expected end of message stream"
+    except StopIteration:
+        pass
+
+    # 1 batch due to end of message stream
+    assert mock_indexer.index.call_count == 3
+    assert mock_indexer.delete.call_count == 3
+    assert mock_embedder.embed_documents.call_count == 3
+
+    mock_indexer.post_sync.assert_called()
+
+
+def test_write_stream_namespace_split():
+    """
+    Test separate handling of streams and namespaces in the writer.
+
+    Generate BATCH_SIZE - 10 records for example_stream, 5 records for example_stream with namespace abc and 10 records for example_stream2.
+    Messages are flushed after 32 records or after a state message, so this will trigger 4 calls to the indexer:
+    * out of the first batch of 32: example_stream, example_stream with namespace abc and the first 5 records for example_stream2
+    * in the second batch: the remaining 5 records for example_stream2
+    """
+    config_model = ProcessingConfigModel(chunk_overlap=0, chunk_size=1000, metadata_fields=None, text_fields=["column_name"])
+
+    configured_catalog: ConfiguredAirbyteCatalog = ConfiguredAirbyteCatalogSerializer.load(
+        {
+            "streams": [
+                generate_stream(),
+                generate_stream(namespace="abc"),
+                generate_stream("example_stream2"),
+            ]
+        }
+    )
+
+    input_messages = [_generate_record_message(i, "example_stream", None) for i in range(BATCH_SIZE - 10)]
+    input_messages.extend([_generate_record_message(i, "example_stream", "abc") for i in range(5)])
+    input_messages.extend([_generate_record_message(i, "example_stream2", None) for i in range(10)])
+    state_message = AirbyteMessage(type=Type.STATE, state=AirbyteStateMessage())
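+    # the trailing state message forces the writer to flush the records that are still buffered at that point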
input_messages.append(state_message) + + mock_embedder = generate_mock_embedder() + + mock_indexer = MagicMock() + mock_indexer.post_sync.return_value = [] + + # Create the DestinationLangchain instance + writer = Writer(config_model, mock_indexer, mock_embedder, BATCH_SIZE, False) + + output_messages = writer.write(configured_catalog, input_messages) + next(output_messages) + + mock_indexer.index.assert_has_calls( + [ + call(ANY, None, "example_stream"), + call(ANY, "abc", "example_stream"), + call(ANY, None, "example_stream2"), + call(ANY, None, "example_stream2"), + ] + ) + mock_indexer.index.assert_has_calls( + [ + call(ANY, None, "example_stream"), + call(ANY, "abc", "example_stream"), + call(ANY, None, "example_stream2"), + call(ANY, None, "example_stream2"), + ] + ) + assert mock_embedder.embed_documents.call_count == 4 diff --git a/airbyte-cdk/python/unit_tests/resource/http/response/test-resource.json b/airbyte-cdk/python/unit_tests/resource/http/response/test-resource.json new file mode 100644 index 000000000000..667ec0669008 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/resource/http/response/test-resource.json @@ -0,0 +1,3 @@ +{ + "test-source template": "this is a template for test-resource" +} diff --git a/airbyte-cdk/python/unit_tests/sources/__init__.py b/airbyte-cdk/python/unit_tests/sources/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/concurrent_source/__init__.py b/airbyte-cdk/python/unit_tests/sources/concurrent_source/__init__.py new file mode 100644 index 000000000000..c941b3045795 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/concurrent_source/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py b/airbyte-cdk/python/unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py new file mode 100644 index 000000000000..22c5d34a6a9c --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py @@ -0,0 +1,169 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import logging +from typing import Any, List, Mapping, Optional, Tuple +from unittest.mock import Mock + +import freezegun +import pytest +from airbyte_cdk.models import ( + AirbyteMessage, + AirbyteRecordMessage, + AirbyteStream, + AirbyteStreamStatus, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + DestinationSyncMode, + FailureType, + SyncMode, + TraceType, +) +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter +from airbyte_cdk.sources.message import InMemoryMessageRepository +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + + +class _MockSource(ConcurrentSourceAdapter): + def __init__(self, concurrent_source, _streams_to_is_concurrent, logger, raise_exception_on_missing_stream=True): + super().__init__(concurrent_source) + self._streams_to_is_concurrent = _streams_to_is_concurrent + self._logger = logger + self._raise_exception_on_missing_stream = raise_exception_on_missing_stream + + message_repository = InMemoryMessageRepository() + + def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]: + raise NotImplementedError + + def streams(self, config: Mapping[str, Any]) -> List[Stream]: + return [ + self.convert_to_concurrent_stream(self._logger, s, Mock()) + if is_concurrent + else s + for s, is_concurrent in self._streams_to_is_concurrent.items() + ] + + @property + def raise_exception_on_missing_stream(self): + """The getter method.""" + return self._raise_exception_on_missing_stream + + @raise_exception_on_missing_stream.setter + def raise_exception_on_missing_stream(self, value): + self._raise_exception_on_missing_stream = value + + +@freezegun.freeze_time("2020-01-01T00:00:00") +def test_concurrent_source_adapter(as_stream_status, remove_stack_trace): + concurrent_source = Mock() + message_from_concurrent_stream = AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream="s2", + data={"data": 2}, + emitted_at=1577836800000, + ), + ) + concurrent_source.read.return_value = iter([message_from_concurrent_stream]) + regular_stream = _mock_stream("s1", [{"data": 1}]) + concurrent_stream = _mock_stream("s2", []) + unavailable_stream = _mock_stream("s3", [{"data": 3}], False) + concurrent_stream.name = "s2" + logger = Mock() + adapter = _MockSource(concurrent_source, {regular_stream: False, concurrent_stream: True}, logger) + with pytest.raises(AirbyteTracedException): + messages = [] + for message in adapter.read(logger, {}, _configured_catalog([regular_stream, concurrent_stream, unavailable_stream])): + messages.append(message) + + records = [m for m in messages if m.type == MessageType.RECORD] + + expected_records = [ + message_from_concurrent_stream, + AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream="s1", + data={"data": 1}, + emitted_at=1577836800000, + ), + ), + ] + + assert records == expected_records + + unavailable_stream_trace_messages = [ + m + for m in messages + if m.type == MessageType.TRACE + and m.trace.type == TraceType.STREAM_STATUS + and m.trace.stream_status.status == AirbyteStreamStatus.INCOMPLETE + ] + expected_status = [as_stream_status("s3", AirbyteStreamStatus.INCOMPLETE)] + + assert len(unavailable_stream_trace_messages) == 1 + assert unavailable_stream_trace_messages[0].trace.stream_status == expected_status[0].trace.stream_status + + +def _mock_stream(name: str, 
data=[], available: bool = True): + s = Mock() + s.name = name + s.namespace = None + s.as_airbyte_stream.return_value = AirbyteStream( + name=name, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + s.check_availability.return_value = (True, None) if available else (False, "not available") + s.get_json_schema.return_value = {} + s.read.return_value = iter(data) + s.primary_key = None + return s + + +def _configured_catalog(streams: List[Stream]): + return ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=stream.as_airbyte_stream(), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.overwrite, + ) + for stream in streams + ] + ) + + +@pytest.mark.parametrize("raise_exception_on_missing_stream", [True, False]) +def test_read_nonexistent_concurrent_stream_emit_incomplete_stream_status( + mocker, remove_stack_trace, as_stream_status, raise_exception_on_missing_stream +): + """ + Tests that attempting to sync a stream which the source does not return from the `streams` method emits incomplete stream status. + """ + logger = Mock() + + s1 = _mock_stream("s1", []) + s2 = _mock_stream("this_stream_doesnt_exist_in_the_source", []) + + concurrent_source = Mock() + concurrent_source.read.return_value = [] + + adapter = _MockSource(concurrent_source, {s1: True}, logger) + expected_status = [as_stream_status("this_stream_doesnt_exist_in_the_source", AirbyteStreamStatus.INCOMPLETE)] + + adapter.raise_exception_on_missing_stream = raise_exception_on_missing_stream + + if not raise_exception_on_missing_stream: + messages = [remove_stack_trace(message) for message in adapter.read(logger, {}, _configured_catalog([s2]))] + assert messages[0].trace.stream_status == expected_status[0].trace.stream_status + else: + with pytest.raises(AirbyteTracedException) as exc_info: + messages = [remove_stack_trace(message) for message in adapter.read(logger, {}, _configured_catalog([s2]))] + assert messages == expected_status + assert exc_info.value.failure_type == FailureType.config_error + assert "not found in the source" in exc_info.value.message diff --git a/airbyte-cdk/python/unit_tests/sources/conftest.py b/airbyte-cdk/python/unit_tests/sources/conftest.py new file mode 100644 index 000000000000..d20d763bb17b --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/conftest.py @@ -0,0 +1,46 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import datetime + +import pytest +from airbyte_cdk.models import ( + AirbyteMessage, + AirbyteStreamStatus, + AirbyteStreamStatusTraceMessage, + AirbyteTraceMessage, + StreamDescriptor, + TraceType, +) +from airbyte_cdk.models import Type as MessageType + + +@pytest.fixture +def remove_stack_trace(): + def _remove_stack_trace(message: AirbyteMessage) -> AirbyteMessage: + """ + Helper method that removes the stack trace from Airbyte trace messages to make asserting against expected records easier + """ + if message.trace and message.trace.error and message.trace.error.stack_trace: + message.trace.error.stack_trace = None + return message + + return _remove_stack_trace + + +@pytest.fixture +def as_stream_status(): + def _as_stream_status(stream: str, status: AirbyteStreamStatus) -> AirbyteMessage: + trace_message = AirbyteTraceMessage( + emitted_at=datetime.datetime.now().timestamp() * 1000.0, + type=TraceType.STREAM_STATUS, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name=stream), + status=status, + ), + ) + + return AirbyteMessage(type=MessageType.TRACE, trace=trace_message) + + return _as_stream_status diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/__init__.py new file mode 100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/async_job/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/async_job/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/async_job/test_integration.py b/airbyte-cdk/python/unit_tests/sources/declarative/async_job/test_integration.py new file mode 100644 index 000000000000..7fbd04b92926 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/async_job/test_integration.py @@ -0,0 +1,121 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
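+# Integration-style test: a DeclarativeStream is wired to an AsyncRetriever whose job orchestrator uses the
+# in-memory MockAsyncJobRepository defined below, so a full read() can be exercised without any HTTP calls.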
+ + +import logging +from typing import Any, Iterable, List, Mapping, Optional, Set, Tuple +from unittest import TestCase, mock + +from airbyte_cdk import AbstractSource, DeclarativeStream, SinglePartitionRouter, Stream, StreamSlice +from airbyte_cdk.models import ConnectorSpecification +from airbyte_cdk.sources.declarative.async_job.job import AsyncJob +from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator +from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker +from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository +from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus +from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor +from airbyte_cdk.sources.declarative.extractors.record_selector import RecordSelector +from airbyte_cdk.sources.declarative.retrievers.async_retriever import AsyncRetriever +from airbyte_cdk.sources.declarative.schema import InlineSchemaLoader +from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicer +from airbyte_cdk.sources.message import NoopMessageRepository +from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer +from airbyte_cdk.test.catalog_builder import CatalogBuilder, ConfiguredAirbyteStreamBuilder +from airbyte_cdk.test.entrypoint_wrapper import read + +_A_STREAM_NAME = "a_stream_name" +_EXTRACTOR_NOT_USED: RecordExtractor = None # type: ignore # the extractor should not be used. If it is the case, there is an issue that needs fixing +_NO_LIMIT = 10000 + + +class MockAsyncJobRepository(AsyncJobRepository): + + def start(self, stream_slice: StreamSlice) -> AsyncJob: + return AsyncJob("a_job_id", StreamSlice(partition={}, cursor_slice={})) + + def update_jobs_status(self, jobs: Set[AsyncJob]) -> None: + for job in jobs: + job.update_status(AsyncJobStatus.COMPLETED) + + def fetch_records(self, job: AsyncJob) -> Iterable[Mapping[str, Any]]: + yield from [{"record_field": 10}] + + def abort(self, job: AsyncJob) -> None: + pass + + def delete(self, job: AsyncJob) -> None: + pass + + +class MockSource(AbstractSource): + + def __init__(self, stream_slicer: Optional[StreamSlicer] = None) -> None: + self._stream_slicer = SinglePartitionRouter({}) if stream_slicer is None else stream_slicer + self._message_repository = NoopMessageRepository() + + def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]: + return True, None + + def spec(self, logger: logging.Logger) -> ConnectorSpecification: + return ConnectorSpecification(connectionSpecification={}) + + def streams(self, config: Mapping[str, Any]) -> List[Stream]: + noop_record_selector = RecordSelector( + extractor=_EXTRACTOR_NOT_USED, + config={}, + parameters={}, + schema_normalization=TypeTransformer(TransformConfig.NoTransform), + record_filter=None, + transformations=[] + ) + return [ + DeclarativeStream( + retriever=AsyncRetriever( + config={}, + parameters={}, + record_selector=noop_record_selector, + stream_slicer=self._stream_slicer, + job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( + MockAsyncJobRepository(), stream_slices, JobTracker(_NO_LIMIT), self._message_repository, + ), + ), + config={}, + parameters={}, + name=_A_STREAM_NAME, + primary_key=["id"], + schema_loader=InlineSchemaLoader({}, {}), + # the interface mentions that this is Optional, + # but I get `'NoneType' object has no attribute 'eval'` by passing None + stream_cursor_field="", + ) + ] 
+ + +class JobDeclarativeStreamTest(TestCase): + _CONFIG: Mapping[str, Any] = {} + + def setUp(self) -> None: + self._stream_slicer = mock.Mock(wraps=SinglePartitionRouter({})) + self._source = MockSource(self._stream_slicer) + self._source.streams({}) + + def test_when_read_then_return_records_from_repository(self) -> None: + output = read( + self._source, + self._CONFIG, + CatalogBuilder().with_stream(ConfiguredAirbyteStreamBuilder().with_name(_A_STREAM_NAME)).build() + ) + + assert len(output.records) == 1 + + def test_when_read_then_call_stream_slices_only_once(self) -> None: + """ + As generating stream slices is very expensive, we want to ensure that during a read, it is only called once. + """ + output = read( + self._source, + self._CONFIG, + CatalogBuilder().with_stream(ConfiguredAirbyteStreamBuilder().with_name(_A_STREAM_NAME)).build() + ) + + assert not output.errors + assert self._stream_slicer.stream_slices.call_count == 1 diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/async_job/test_job.py b/airbyte-cdk/python/unit_tests/sources/declarative/async_job/test_job.py new file mode 100644 index 000000000000..6399433e4413 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/async_job/test_job.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +import time +from datetime import timedelta +from unittest import TestCase + +from airbyte_cdk.sources.declarative.async_job.job import AsyncJob +from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus +from airbyte_cdk.sources.declarative.types import StreamSlice + +_AN_API_JOB_ID = "an api job id" +_ANY_STREAM_SLICE = StreamSlice(partition={}, cursor_slice={}) +_A_VERY_BIG_TIMEOUT = timedelta(days=999999999) +_IMMEDIATELY_TIMED_OUT = timedelta(microseconds=1) + + +class AsyncJobTest(TestCase): + def test_given_timer_is_not_out_when_status_then_return_actual_status(self) -> None: + job = AsyncJob(_AN_API_JOB_ID, _ANY_STREAM_SLICE, _A_VERY_BIG_TIMEOUT) + assert job.status() == AsyncJobStatus.RUNNING + + def test_given_timer_is_out_when_status_then_return_timed_out(self) -> None: + job = AsyncJob(_AN_API_JOB_ID, _ANY_STREAM_SLICE, _IMMEDIATELY_TIMED_OUT) + time.sleep(0.001) + assert job.status() == AsyncJobStatus.TIMED_OUT + + def test_given_status_is_terminal_when_update_status_then_stop_timer(self) -> None: + """ + This test will become important once we will print stats associated with jobs. As for now, we stop the timer but do not return any + metrics regarding the timer so it is not useful. + """ + pass diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/async_job/test_job_orchestrator.py b/airbyte-cdk/python/unit_tests/sources/declarative/async_job/test_job_orchestrator.py new file mode 100644 index 000000000000..7f10bb3ac28a --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/async_job/test_job_orchestrator.py @@ -0,0 +1,303 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
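+# Tests for AsyncPartition status aggregation and for AsyncJobOrchestrator: job creation, status polling,
+# retries on failure and timeout, job-budget handling through JobTracker, and record fetching.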
+ +import logging +import sys +import threading +import time +from typing import Callable, List, Mapping, Optional, Set, Tuple +from unittest import TestCase, mock +from unittest.mock import MagicMock, Mock, call + +import pytest +from airbyte_cdk import AirbyteTracedException, StreamSlice +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.declarative.async_job.job import AsyncJob, AsyncJobStatus +from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator, AsyncPartition +from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker +from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository +from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.streams.http.http_client import MessageRepresentationAirbyteTracedErrors + +_ANY_STREAM_SLICE = Mock() +_A_STREAM_SLICE = Mock() +_ANOTHER_STREAM_SLICE = Mock() +_ANY_RECORD = {"a record field": "a record value"} +_NO_JOB_LIMIT = sys.maxsize +_BUFFER = 10000 # this buffer allows us to be unconcerned with the number of times the update status is called + + +def _create_job(status: AsyncJobStatus = AsyncJobStatus.FAILED) -> AsyncJob: + job = Mock(spec=AsyncJob) + job.status.return_value = status + return job + + +class AsyncPartitionTest(TestCase): + def test_given_one_failed_job_when_status_then_return_failed(self) -> None: + partition = AsyncPartition([_create_job(status) for status in AsyncJobStatus], _ANY_STREAM_SLICE) + assert partition.status == AsyncJobStatus.FAILED + + def test_given_all_status_except_failed_when_status_then_return_timed_out(self) -> None: + statuses = [status for status in AsyncJobStatus if status != AsyncJobStatus.FAILED] + partition = AsyncPartition([_create_job(status) for status in statuses], _ANY_STREAM_SLICE) + assert partition.status == AsyncJobStatus.TIMED_OUT + + def test_given_running_and_completed_jobs_when_status_then_return_running(self) -> None: + partition = AsyncPartition([_create_job(AsyncJobStatus.RUNNING), _create_job(AsyncJobStatus.COMPLETED)], _ANY_STREAM_SLICE) + assert partition.status == AsyncJobStatus.RUNNING + + def test_given_only_completed_jobs_when_status_then_return_running(self) -> None: + partition = AsyncPartition([_create_job(AsyncJobStatus.COMPLETED) for _ in range(10)], _ANY_STREAM_SLICE) + assert partition.status == AsyncJobStatus.COMPLETED + + +def _status_update_per_jobs(status_update_per_jobs: Mapping[AsyncJob, List[AsyncJobStatus]]) -> Callable[[set[AsyncJob]], None]: + status_index_by_job = {job: 0 for job in status_update_per_jobs.keys()} + + def _update_status(jobs: Set[AsyncJob]) -> None: + for job in jobs: + status_index = status_index_by_job[job] + job.update_status(status_update_per_jobs[job][status_index]) + status_index_by_job[job] += 1 + + return _update_status + + +sleep_mock_target = "airbyte_cdk.sources.declarative.async_job.job_orchestrator.time.sleep" +_MAX_NUMBER_OF_ATTEMPTS = 3 + + +class AsyncJobOrchestratorTest(TestCase): + def setUp(self) -> None: + self._job_repository = Mock(spec=AsyncJobRepository) + self._message_repository = Mock(spec=MessageRepository) + self._logger = Mock(spec=logging.Logger) + + self._job_for_a_slice = self._an_async_job("an api job id", _A_STREAM_SLICE) + self._job_for_another_slice = self._an_async_job("another api job id", _ANOTHER_STREAM_SLICE) + + @mock.patch(sleep_mock_target) + def test_when_create_and_get_completed_partitions_then_create_job_and_update_status_until_completed(self, mock_sleep: MagicMock) -> None: 
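+        # the repository reports RUNNING twice before COMPLETED, so the orchestrator has to keep polling job status
+        # until a terminal state is reached (time.sleep is patched so the polling loop does not actually wait)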
+ self._job_repository.start.return_value = self._job_for_a_slice + status_updates = [AsyncJobStatus.RUNNING, AsyncJobStatus.RUNNING, AsyncJobStatus.COMPLETED] + self._job_repository.update_jobs_status.side_effect = _status_update_per_jobs( + { + self._job_for_a_slice: status_updates + } + ) + orchestrator = self._orchestrator([_A_STREAM_SLICE]) + + partitions = list(orchestrator.create_and_get_completed_partitions()) + + assert len(partitions) == 1 + assert partitions[0].status == AsyncJobStatus.COMPLETED + assert self._job_for_a_slice.update_status.mock_calls == [call(status) for status in status_updates] + + @mock.patch(sleep_mock_target) + def test_given_one_job_still_running_when_create_and_get_completed_partitions_then_only_update_running_job_status(self, mock_sleep: MagicMock) -> None: + self._job_repository.start.side_effect = [self._job_for_a_slice, self._job_for_another_slice] + self._job_repository.update_jobs_status.side_effect = _status_update_per_jobs( + { + self._job_for_a_slice: [AsyncJobStatus.COMPLETED], + self._job_for_another_slice: [AsyncJobStatus.RUNNING, AsyncJobStatus.COMPLETED], + } + ) + orchestrator = self._orchestrator([_A_STREAM_SLICE, _ANOTHER_STREAM_SLICE]) + + list(orchestrator.create_and_get_completed_partitions()) + + assert self._job_repository.update_jobs_status.mock_calls == [ + call({self._job_for_a_slice, self._job_for_another_slice}), + call({self._job_for_another_slice}), + ] + + @mock.patch(sleep_mock_target) + def test_given_timeout_when_create_and_get_completed_partitions_then_free_budget_and_raise_exception(self, mock_sleep: MagicMock) -> None: + job_tracker = JobTracker(1) + self._job_repository.start.return_value = self._job_for_a_slice + self._job_repository.update_jobs_status.side_effect = _status_update_per_jobs( + { + self._job_for_a_slice: [AsyncJobStatus.TIMED_OUT] + } + ) + orchestrator = self._orchestrator([_A_STREAM_SLICE], job_tracker) + + with pytest.raises(AirbyteTracedException): + list(orchestrator.create_and_get_completed_partitions()) + assert job_tracker.try_to_get_intent() + assert self._job_repository.start.call_args_list == [call(_A_STREAM_SLICE)] * _MAX_NUMBER_OF_ATTEMPTS + + @mock.patch(sleep_mock_target) + def test_given_failure_when_create_and_get_completed_partitions_then_raise_exception(self, mock_sleep: MagicMock) -> None: + self._job_repository.start.return_value = self._job_for_a_slice + self._job_repository.update_jobs_status.side_effect = _status_update_per_jobs( + { + self._job_for_a_slice: [AsyncJobStatus.FAILED] + } + ) + orchestrator = self._orchestrator([_A_STREAM_SLICE]) + + with pytest.raises(AirbyteTracedException): + list(orchestrator.create_and_get_completed_partitions()) + assert self._job_repository.start.call_args_list == [call(_A_STREAM_SLICE)] * _MAX_NUMBER_OF_ATTEMPTS + + def test_when_fetch_records_then_yield_records_from_each_job(self) -> None: + self._job_repository.fetch_records.return_value = [_ANY_RECORD] + orchestrator = self._orchestrator([_A_STREAM_SLICE]) + first_job = _create_job() + second_job = _create_job() + partition = AsyncPartition([first_job, second_job], _A_STREAM_SLICE) + + records = list(orchestrator.fetch_records(partition)) + + assert len(records) == 2 + assert self._job_repository.fetch_records.mock_calls == [call(first_job), call(second_job)] + assert self._job_repository.delete.mock_calls == [call(first_job), call(second_job)] + + def _orchestrator(self, slices: List[StreamSlice], job_tracker: Optional[JobTracker] = None) -> AsyncJobOrchestrator: + job_tracker = 
job_tracker if job_tracker else JobTracker(_NO_JOB_LIMIT) + return AsyncJobOrchestrator(self._job_repository, slices, job_tracker, self._message_repository) + + def test_given_more_jobs_than_limit_when_create_and_get_completed_partitions_then_still_return_all_slices_and_free_job_budget(self) -> None: + job_tracker = JobTracker(1) + self._job_repository.start.side_effect = [self._job_for_a_slice, self._job_for_another_slice] + self._job_repository.update_jobs_status.side_effect = _status_update_per_jobs( + { + self._job_for_a_slice: [AsyncJobStatus.COMPLETED], + self._job_for_another_slice: [AsyncJobStatus.COMPLETED], + } + ) + orchestrator = self._orchestrator([self._job_for_a_slice.job_parameters(), self._job_for_another_slice.job_parameters()], job_tracker) + + partitions = list(orchestrator.create_and_get_completed_partitions()) + + assert len(partitions) == 2 + assert job_tracker.try_to_get_intent() + + @mock.patch(sleep_mock_target) + def test_given_exception_to_break_when_start_job_and_raise_this_exception_and_abort_jobs(self, mock_sleep: MagicMock) -> None: + orchestrator = AsyncJobOrchestrator( + self._job_repository, + [_A_STREAM_SLICE, _ANOTHER_STREAM_SLICE], + JobTracker(_NO_JOB_LIMIT), + self._message_repository, + exceptions_to_break_on=[ValueError], + ) + self._job_repository.start.side_effect = [self._job_for_a_slice, ValueError("Something went wrong")] + + with pytest.raises(ValueError): + # assert that orchestrator exits on expected error + list(orchestrator.create_and_get_completed_partitions()) + assert len(orchestrator._job_tracker._jobs) == 0 + self._job_repository.abort.assert_called_once_with(self._job_for_a_slice) + + def test_given_traced_config_error_when_start_job_and_raise_this_exception_and_abort_jobs(self) -> None: + """ + Since this is a config error, we assume the other jobs will fail for the same reasons. + """ + job_tracker = JobTracker(1) + self._job_repository.start.side_effect = MessageRepresentationAirbyteTracedErrors("Can't create job", failure_type=FailureType.config_error) + + orchestrator = AsyncJobOrchestrator(self._job_repository, [_A_STREAM_SLICE], job_tracker, self._message_repository, [ValueError]) + + with pytest.raises(AirbyteTracedException): + list(orchestrator.create_and_get_completed_partitions()) + + assert job_tracker.try_to_get_intent() + + @mock.patch(sleep_mock_target) + def test_given_exception_on_single_job_when_create_and_get_completed_partitions_then_return(self, mock_sleep: MagicMock) -> None: + """ + We added this test because the initial logic of breaking the main loop we implemented (when `self._has_started_a_job and self._running_partitions`) was not enough in the case where there was only one slice and it would fail to start. + """ + orchestrator = self._orchestrator([_A_STREAM_SLICE]) + self._job_repository.start.side_effect = ValueError + + with pytest.raises(AirbyteTracedException): + # assert that orchestrator exits on expected error + list(orchestrator.create_and_get_completed_partitions()) + + @mock.patch(sleep_mock_target) + def test_given_exception_when_start_job_and_skip_this_exception(self, mock_sleep: MagicMock) -> None: + self._job_repository.start.side_effect = [ + AirbyteTracedException("Something went wrong. Expected error #1"), + self._job_for_another_slice, + AirbyteTracedException("Something went wrong. Expected error #2"), + AirbyteTracedException("Something went wrong. 
Expected error #3"), + ] + self._job_repository.update_jobs_status.side_effect = _status_update_per_jobs( + { + self._job_for_a_slice: [AsyncJobStatus.COMPLETED], + self._job_for_another_slice: [AsyncJobStatus.RUNNING, AsyncJobStatus.COMPLETED], + } + ) + orchestrator = self._orchestrator([_A_STREAM_SLICE, _ANOTHER_STREAM_SLICE]) + + partitions, exception = self._accumulate_create_and_get_completed_partitions(orchestrator) + + assert len(partitions) == 1 # only _job_for_another_slice has succeeded + assert self._message_repository.emit_message.call_count == 3 # one for each traced message + assert exception.failure_type == FailureType.config_error # type: ignore # exception should be of type AirbyteTracedException + + @mock.patch(sleep_mock_target) + def test_given_jobs_failed_more_than_max_attempts_when_create_and_get_completed_partitions_then_free_job_budget(self, mock_sleep: MagicMock) -> None: + job_tracker = JobTracker(1) + jobs = [self._an_async_job(str(i), _A_STREAM_SLICE) for i in range(_MAX_NUMBER_OF_ATTEMPTS)] + self._job_repository.start.side_effect = jobs + self._job_repository.update_jobs_status.side_effect = _status_update_per_jobs({job: [AsyncJobStatus.FAILED] for job in jobs}) + + orchestrator = self._orchestrator([_A_STREAM_SLICE], job_tracker) + + with pytest.raises(AirbyteTracedException): + list(orchestrator.create_and_get_completed_partitions()) + + assert job_tracker.try_to_get_intent() + + def given_budget_already_taken_before_start_when_create_and_get_completed_partitions_then_wait_for_budget_to_be_freed(self) -> None: + job_tracker = JobTracker(1) + intent_to_free = job_tracker.try_to_get_intent() + + def wait_and_free_intent(_job_tracker: JobTracker, _intent_to_free: str) -> None: + print("Waiting before freeing budget...") + time.sleep(1) + print("Waiting done, freeing budget!") + _job_tracker.remove_job(_intent_to_free) + self._job_repository.start.return_value = self._job_for_a_slice + self._job_repository.update_jobs_status.side_effect = _status_update_per_jobs( + { + self._job_for_a_slice: [AsyncJobStatus.COMPLETED] * _BUFFER + } + ) + orchestrator = self._orchestrator([_A_STREAM_SLICE], job_tracker) + + threading.Thread(target=wait_and_free_intent, args=[job_tracker, intent_to_free]).start() + partitions = list(orchestrator.create_and_get_completed_partitions()) + + assert len(partitions) == 1 + + def test_given_start_job_raise_when_create_and_get_completed_partitions_then_free_budget(self) -> None: + job_tracker = JobTracker(1) + self._job_repository.start.side_effect = ValueError("Can't create job") + + orchestrator = AsyncJobOrchestrator(self._job_repository, [_A_STREAM_SLICE], job_tracker, self._message_repository, [ValueError]) + + with pytest.raises(Exception): + list(orchestrator.create_and_get_completed_partitions()) + + assert job_tracker.try_to_get_intent() + + def _mock_repository(self) -> None: + self._job_repository = Mock(spec=AsyncJobRepository) + + def _an_async_job(self, job_id: str, stream_slice: StreamSlice) -> AsyncJob: + return mock.Mock(wraps=AsyncJob(job_id, stream_slice)) + + def _accumulate_create_and_get_completed_partitions(self, orchestrator: AsyncJobOrchestrator) -> Tuple[List[AsyncPartition], Optional[Exception]]: + result = [] + try: + for i in orchestrator.create_and_get_completed_partitions(): + result.append(i) + except Exception as exception: + return result, exception + + return result, None diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/async_job/test_job_tracker.py 
b/airbyte-cdk/python/unit_tests/sources/declarative/async_job/test_job_tracker.py new file mode 100644 index 000000000000..f3c2744d1fbe --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/async_job/test_job_tracker.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from typing import List +from unittest import TestCase + +import pytest +from airbyte_cdk.sources.declarative.async_job.job_tracker import ConcurrentJobLimitReached, JobTracker + +_LIMIT = 3 + + +class JobTrackerTest(TestCase): + def setUp(self) -> None: + self._tracker = JobTracker(_LIMIT) + + def test_given_limit_reached_when_remove_job_then_can_get_intent_again(self) -> None: + intents = self._reach_limit() + with pytest.raises(ConcurrentJobLimitReached): + self._tracker.try_to_get_intent() + + self._tracker.remove_job(intents[0]) + assert self._tracker.try_to_get_intent() + + def test_given_job_does_not_exist_when_remove_job_then_do_not_raise(self) -> None: + self._tracker.remove_job("non existing job id") + + def test_given_limit_reached_when_add_job_then_limit_is_still_reached(self) -> None: + intents = [self._tracker.try_to_get_intent() for i in range(_LIMIT)] + with pytest.raises(ConcurrentJobLimitReached): + self._tracker.try_to_get_intent() + + self._tracker.add_job(intents[0], "a created job") + with pytest.raises(ConcurrentJobLimitReached): + self._tracker.try_to_get_intent() + + def _reach_limit(self) -> List[str]: + return [self._tracker.try_to_get_intent() for i in range(_LIMIT)] diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/auth/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/auth/__init__.py new file mode 100644 index 000000000000..c941b3045795 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/auth/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_jwt.py b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_jwt.py new file mode 100644 index 000000000000..51bef48230c9 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_jwt.py @@ -0,0 +1,166 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import base64 +import logging +from datetime import datetime + +import freezegun +import jwt +import pytest +from airbyte_cdk.sources.declarative.auth.jwt import JwtAuthenticator + +LOGGER = logging.getLogger(__name__) + + +class TestJwtAuthenticator: + """ + Test class for JWT Authenticator. 
+ """ + + @pytest.mark.parametrize( + "algorithm, kid, typ, cty, additional_jwt_headers, expected", + [ + ( + "ALGORITHM", + "test_kid", + "test_typ", + "test_cty", + {"test": "test"}, + {"kid": "test_kid", "typ": "test_typ", "cty": "test_cty", "test": "test", "alg": "ALGORITHM"}, + ), + ("ALGORITHM", None, None, None, None, {"alg": "ALGORITHM"}), + ], + ) + def test_get_jwt_headers(self, algorithm, kid, typ, cty, additional_jwt_headers, expected): + authenticator = JwtAuthenticator( + config={}, + parameters={}, + algorithm=algorithm, + secret_key="test_key", + token_duration=1200, + kid=kid, + typ=typ, + cty=cty, + additional_jwt_headers=additional_jwt_headers, + ) + assert authenticator._get_jwt_headers() == expected + + def test_given_overriden_reserverd_properties_get_jwt_headers_throws_error(self): + authenticator = JwtAuthenticator( + config={}, + parameters={}, + algorithm="ALGORITHM", + secret_key="test_key", + token_duration=1200, + additional_jwt_headers={"kid": "test_kid"}, + ) + with pytest.raises(ValueError): + authenticator._get_jwt_headers() + + @pytest.mark.parametrize( + "iss, sub, aud, additional_jwt_payload, expected", + [ + ( + "test_iss", + "test_sub", + "test_aud", + {"test": "test"}, + {"iss": "test_iss", "sub": "test_sub", "aud": "test_aud", "test": "test"}, + ), + (None, None, None, None, {}), + ], + ) + def test_get_jwt_payload(self, iss, sub, aud, additional_jwt_payload, expected): + authenticator = JwtAuthenticator( + config={}, + parameters={}, + algorithm="ALGORITHM", + secret_key="test_key", + token_duration=1000, + iss=iss, + sub=sub, + aud=aud, + additional_jwt_payload=additional_jwt_payload, + ) + with freezegun.freeze_time("2022-01-01 00:00:00"): + expected["iat"] = int(datetime.now().timestamp()) + expected["exp"] = expected["iat"] + 1000 + expected["nbf"] = expected["iat"] + assert authenticator._get_jwt_payload() == expected + + def test_given_overriden_reserverd_properties_get_jwt_payload_throws_error(self): + authenticator = JwtAuthenticator( + config={}, + parameters={}, + algorithm="ALGORITHM", + secret_key="test_key", + token_duration=0, + additional_jwt_payload={"exp": 1234}, + ) + with pytest.raises(ValueError): + authenticator._get_jwt_payload() + + @pytest.mark.parametrize( + "base64_encode_secret_key, secret_key, expected", + [ + (True, "test", base64.b64encode("test".encode()).decode()), + (False, "test", "test"), + ], + ) + def test_get_secret_key(self, base64_encode_secret_key, secret_key, expected): + authenticator = JwtAuthenticator( + config={}, + parameters={}, + secret_key=secret_key, + algorithm="test_algo", + token_duration=1200, + base64_encode_secret_key=base64_encode_secret_key, + ) + assert authenticator._get_secret_key() == expected + + def test_get_signed_token(self): + authenticator = JwtAuthenticator( + config={}, + parameters={}, + secret_key="test", + algorithm="HS256", + token_duration=1000, + typ="JWT", + iss="iss", + ) + assert authenticator._get_signed_token() == jwt.encode( + payload=authenticator._get_jwt_payload(), + key=authenticator._get_secret_key(), + algorithm=authenticator._algorithm, + headers=authenticator._get_jwt_headers(), + ) + + def test_given_invalid_algorithm_get_signed_token_throws_error(self): + authenticator = JwtAuthenticator( + config={}, + parameters={}, + secret_key="test", + algorithm="invalid algorithm type", + token_duration=1000, + base64_encode_secret_key=False, + header_prefix="Bearer", + typ="JWT", + iss="iss", + additional_jwt_headers={}, + additional_jwt_payload={}, + ) + with 
pytest.raises(ValueError): + authenticator._get_signed_token() + + @pytest.mark.parametrize("header_prefix, expected", [("test", "test"), (None, None)]) + def test_get_header_prefix(self, header_prefix, expected): + authenticator = JwtAuthenticator( + config={}, + parameters={}, + secret_key="key", + algorithm="test_algo", + token_duration=1200, + header_prefix=header_prefix, + ) + assert authenticator._get_header_prefix() == expected diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_oauth.py b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_oauth.py new file mode 100644 index 000000000000..78dd0b591ec3 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_oauth.py @@ -0,0 +1,331 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import base64 +import logging +from unittest.mock import Mock + +import freezegun +import pendulum +import pytest +import requests +from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator +from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets +from requests import Response + +LOGGER = logging.getLogger(__name__) + +resp = Response() + +config = { + "refresh_endpoint": "refresh_end", + "client_id": "some_client_id", + "client_secret": "some_client_secret", + "token_expiry_date": pendulum.now().subtract(days=2).to_rfc3339_string(), + "custom_field": "in_outbound_request", + "another_field": "exists_in_body", + "grant_type": "some_grant_type", +} +parameters = {"refresh_token": "some_refresh_token"} + + +class TestOauth2Authenticator: + """ + Test class for OAuth2Authenticator. + """ + + def test_refresh_request_body(self): + """ + Request body should match given configuration. + """ + scopes = ["scope1", "scope2"] + oauth = DeclarativeOauth2Authenticator( + token_refresh_endpoint="{{ config['refresh_endpoint'] }}", + client_id="{{ config['client_id'] }}", + client_secret="{{ config['client_secret'] }}", + refresh_token="{{ parameters['refresh_token'] }}", + config=config, + scopes=["scope1", "scope2"], + token_expiry_date="{{ config['token_expiry_date'] }}", + refresh_request_body={ + "custom_field": "{{ config['custom_field'] }}", + "another_field": "{{ config['another_field'] }}", + "scopes": ["no_override"], + }, + parameters=parameters, + grant_type="{{ config['grant_type'] }}", + ) + body = oauth.build_refresh_request_body() + expected = { + "grant_type": "some_grant_type", + "client_id": "some_client_id", + "client_secret": "some_client_secret", + "refresh_token": "some_refresh_token", + "scopes": scopes, + "custom_field": "in_outbound_request", + "another_field": "exists_in_body", + } + assert body == expected + + def test_refresh_with_encode_config_params(self): + oauth = DeclarativeOauth2Authenticator( + token_refresh_endpoint="{{ config['refresh_endpoint'] }}", + client_id="{{ config['client_id'] | base64encode }}", + client_secret="{{ config['client_secret'] | base64encode }}", + config=config, + parameters={}, + grant_type="client_credentials", + ) + body = oauth.build_refresh_request_body() + expected = { + "grant_type": "client_credentials", + "client_id": base64.b64encode(config["client_id"].encode("utf-8")).decode(), + "client_secret": base64.b64encode(config["client_secret"].encode("utf-8")).decode(), + "refresh_token": None, + } + assert body == expected + + def test_refresh_with_decode_config_params(self): + updated_config_fields = { + "client_id": base64.b64encode(config["client_id"].encode("utf-8")).decode(), + "client_secret": 
base64.b64encode(config["client_secret"].encode("utf-8")).decode(), + } + oauth = DeclarativeOauth2Authenticator( + token_refresh_endpoint="{{ config['refresh_endpoint'] }}", + client_id="{{ config['client_id'] | base64decode }}", + client_secret="{{ config['client_secret'] | base64decode }}", + config=config | updated_config_fields, + parameters={}, + grant_type="client_credentials", + ) + body = oauth.build_refresh_request_body() + expected = { + "grant_type": "client_credentials", + "client_id": "some_client_id", + "client_secret": "some_client_secret", + "refresh_token": None, + } + assert body == expected + + def test_refresh_without_refresh_token(self): + """ + Should work fine for grant_type client_credentials. + """ + oauth = DeclarativeOauth2Authenticator( + token_refresh_endpoint="{{ config['refresh_endpoint'] }}", + client_id="{{ config['client_id'] }}", + client_secret="{{ config['client_secret'] }}", + config=config, + parameters={}, + grant_type="client_credentials", + ) + body = oauth.build_refresh_request_body() + expected = { + "grant_type": "client_credentials", + "client_id": "some_client_id", + "client_secret": "some_client_secret", + "refresh_token": None, + } + assert body == expected + + def test_error_on_refresh_token_grant_without_refresh_token(self): + """ + Should throw an error if grant_type refresh_token is configured without refresh_token. + """ + with pytest.raises(ValueError): + DeclarativeOauth2Authenticator( + token_refresh_endpoint="{{ config['refresh_endpoint'] }}", + client_id="{{ config['client_id'] }}", + client_secret="{{ config['client_secret'] }}", + config=config, + parameters={}, + grant_type="refresh_token", + ) + + def test_refresh_access_token(self, mocker): + oauth = DeclarativeOauth2Authenticator( + token_refresh_endpoint="{{ config['refresh_endpoint'] }}", + client_id="{{ config['client_id'] }}", + client_secret="{{ config['client_secret'] }}", + refresh_token="{{ config['refresh_token'] }}", + config=config, + scopes=["scope1", "scope2"], + token_expiry_date="{{ config['token_expiry_date'] }}", + refresh_request_body={ + "custom_field": "{{ config['custom_field'] }}", + "another_field": "{{ config['another_field'] }}", + "scopes": ["no_override"], + }, + parameters={}, + ) + + resp.status_code = 200 + mocker.patch.object(resp, "json", return_value={"access_token": "access_token", "expires_in": 1000}) + mocker.patch.object(requests, "request", side_effect=mock_request, autospec=True) + token = oauth.refresh_access_token() + + assert ("access_token", 1000) == token + + filtered = filter_secrets("access_token") + assert filtered == "****" + + def test_refresh_access_token_missing_access_token(self, mocker): + oauth = DeclarativeOauth2Authenticator( + token_refresh_endpoint="{{ config['refresh_endpoint'] }}", + client_id="{{ config['client_id'] }}", + client_secret="{{ config['client_secret'] }}", + refresh_token="{{ config['refresh_token'] }}", + config=config, + scopes=["scope1", "scope2"], + token_expiry_date="{{ config['token_expiry_date'] }}", + refresh_request_body={ + "custom_field": "{{ config['custom_field'] }}", + "another_field": "{{ config['another_field'] }}", + "scopes": ["no_override"], + }, + parameters={}, + ) + + resp.status_code = 200 + mocker.patch.object(resp, "json", return_value={"expires_in": 1000}) + mocker.patch.object(requests, "request", side_effect=mock_request, autospec=True) + with pytest.raises(Exception): + oauth.refresh_access_token() + + @pytest.mark.parametrize( + "timestamp, expected_date", + [ + 
(1640995200, "2022-01-01T00:00:00Z"), + ("1650758400", "2022-04-24T00:00:00Z"), + ], + ids=["timestamp_as_integer", "timestamp_as_integer_inside_string"], + ) + def test_initialize_declarative_oauth_with_token_expiry_date_as_timestamp(self, timestamp, expected_date): + # TODO: should be fixed inside DeclarativeOauth2Authenticator, remove next line after fixing + with pytest.raises(TypeError): + oauth = DeclarativeOauth2Authenticator( + token_refresh_endpoint="{{ config['refresh_endpoint'] }}", + client_id="{{ config['client_id'] }}", + client_secret="{{ config['client_secret'] }}", + refresh_token="{{ parameters['refresh_token'] }}", + config=config | {"token_expiry_date": timestamp}, + scopes=["scope1", "scope2"], + token_expiry_date="{{ config['token_expiry_date'] }}", + refresh_request_body={ + "custom_field": "{{ config['custom_field'] }}", + "another_field": "{{ config['another_field'] }}", + "scopes": ["no_override"], + }, + parameters={}, + ) + + assert oauth.get_token_expiry_date() == pendulum.parse(expected_date) + + @pytest.mark.parametrize( + "expires_in_response, token_expiry_date_format", + [ + ("2020-01-02T00:00:00Z", "YYYY-MM-DDTHH:mm:ss[Z]"), + ("2020-01-02T00:00:00.000000+00:00", "YYYY-MM-DDTHH:mm:ss.SSSSSSZ"), + ("2020-01-02", "YYYY-MM-DD"), + ], + ids=["rfc3339", "iso8601", "simple_date"], + ) + @freezegun.freeze_time("2020-01-01") + def test_refresh_access_token_expire_format(self, mocker, expires_in_response, token_expiry_date_format): + next_day = "2020-01-02T00:00:00Z" + config.update({"token_expiry_date": pendulum.parse(next_day).subtract(days=2).to_rfc3339_string()}) + message_repository = Mock() + oauth = DeclarativeOauth2Authenticator( + token_refresh_endpoint="{{ config['refresh_endpoint'] }}", + client_id="{{ config['client_id'] }}", + client_secret="{{ config['client_secret'] }}", + refresh_token="{{ config['refresh_token'] }}", + config=config, + scopes=["scope1", "scope2"], + token_expiry_date="{{ config['token_expiry_date'] }}", + token_expiry_date_format=token_expiry_date_format, + token_expiry_is_time_of_expiration=True, + refresh_request_body={ + "custom_field": "{{ config['custom_field'] }}", + "another_field": "{{ config['another_field'] }}", + "scopes": ["no_override"], + }, + message_repository=message_repository, + parameters={}, + ) + + resp.status_code = 200 + mocker.patch.object(resp, "json", return_value={"access_token": "access_token", "expires_in": expires_in_response}) + mocker.patch.object(requests, "request", side_effect=mock_request, autospec=True) + token = oauth.get_access_token() + assert "access_token" == token + assert oauth.get_token_expiry_date() == pendulum.parse(next_day) + assert message_repository.log_message.call_count == 1 + + @pytest.mark.parametrize( + "expires_in_response, next_day, raises", + [ + (86400, "2020-01-02T00:00:00Z", False), + (86400.1, "2020-01-02T00:00:00Z", False), + ("86400", "2020-01-02T00:00:00Z", False), + ("86400.1", "2020-01-02T00:00:00Z", False), + ("2020-01-02T00:00:00Z", "2020-01-02T00:00:00Z", True), + ], + ids=["time_in_seconds", "time_in_seconds_float", "time_in_seconds_str", "time_in_seconds_str_float", "invalid"], + ) + @freezegun.freeze_time("2020-01-01") + def test_set_token_expiry_date_no_format(self, mocker, expires_in_response, next_day, raises): + config.update({"token_expiry_date": pendulum.parse(next_day).subtract(days=2).to_rfc3339_string()}) + oauth = DeclarativeOauth2Authenticator( + token_refresh_endpoint="{{ config['refresh_endpoint'] }}", + client_id="{{ config['client_id'] }}", + 
client_secret="{{ config['client_secret'] }}", + refresh_token="{{ config['refresh_token'] }}", + config=config, + scopes=["scope1", "scope2"], + refresh_request_body={ + "custom_field": "{{ config['custom_field'] }}", + "another_field": "{{ config['another_field'] }}", + "scopes": ["no_override"], + }, + parameters={}, + ) + + resp.status_code = 200 + mocker.patch.object(resp, "json", return_value={"access_token": "access_token", "expires_in": expires_in_response}) + mocker.patch.object(requests, "request", side_effect=mock_request, autospec=True) + if raises: + with pytest.raises(ValueError): + oauth.get_access_token() + else: + token = oauth.get_access_token() + assert "access_token" == token + assert oauth.get_token_expiry_date() == pendulum.parse(next_day) + + def test_error_handling(self, mocker): + oauth = DeclarativeOauth2Authenticator( + token_refresh_endpoint="{{ config['refresh_endpoint'] }}", + client_id="{{ config['client_id'] }}", + client_secret="{{ config['client_secret'] }}", + refresh_token="{{ config['refresh_token'] }}", + config=config, + scopes=["scope1", "scope2"], + refresh_request_body={ + "custom_field": "{{ config['custom_field'] }}", + "another_field": "{{ config['another_field'] }}", + "scopes": ["no_override"], + }, + parameters={}, + ) + resp.status_code = 400 + mocker.patch.object(resp, "json", return_value={"access_token": "access_token", "expires_in": 123}) + mocker.patch.object(requests, "request", side_effect=mock_request, autospec=True) + with pytest.raises(requests.exceptions.HTTPError) as e: + oauth.refresh_access_token() + assert e.value.errno == 400 + + +def mock_request(method, url, data): + if url == "refresh_end": + return resp + raise Exception(f"Error while refreshing access token with request: {method}, {url}, {data}") diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_selective_authenticator.py b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_selective_authenticator.py new file mode 100644 index 000000000000..346b284c3786 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_selective_authenticator.py @@ -0,0 +1,39 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
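+# Tests for SelectiveAuthenticator: the authenticator is chosen from the config via authenticator_selection_path, and a ValueError is raised when the path is missing from the config or the selected key has no matching authenticator.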
+# + +import pytest +from airbyte_cdk.sources.declarative.auth.selective_authenticator import SelectiveAuthenticator + + +def test_authenticator_selected(mocker): + authenticators = {"one": mocker.Mock(), "two": mocker.Mock()} + auth = SelectiveAuthenticator( + config={"auth": {"type": "one"}}, + authenticators=authenticators, + authenticator_selection_path=["auth", "type"], + ) + + assert auth is authenticators["one"] + + +def test_selection_path_not_found(mocker): + authenticators = {"one": mocker.Mock(), "two": mocker.Mock()} + + with pytest.raises(ValueError, match="The path from `authenticator_selection_path` is not found in the config"): + _ = SelectiveAuthenticator( + config={"auth": {"type": "one"}}, + authenticators=authenticators, + authenticator_selection_path=["auth_type"], + ) + + +def test_selected_auth_not_found(mocker): + authenticators = {"one": mocker.Mock(), "two": mocker.Mock()} + + with pytest.raises(ValueError, match="The authenticator `unknown` is not found"): + _ = SelectiveAuthenticator( + config={"auth": {"type": "unknown"}}, + authenticators=authenticators, + authenticator_selection_path=["auth", "type"], + ) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_session_token_auth.py b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_session_token_auth.py new file mode 100644 index 000000000000..5f99fabf00b3 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_session_token_auth.py @@ -0,0 +1,182 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import pytest +from airbyte_cdk.sources.declarative.auth.token import LegacySessionTokenAuthenticator, get_new_session_token +from requests.exceptions import HTTPError + +parameters = {"hello": "world"} +instance_api_url = "https://airbyte.metabaseapp.com/api/" +username = "username" +password = "password" +session_token = "session_token" +header = "X-App-Session" +session_token_response_key = "id" +login_url = "session" +validate_session_url = "user/current" + +input_instance_api_url = "{{ config['instance_api_url'] }}" +input_username = "{{ config['username'] }}" +input_password = "{{ config['password'] }}" +input_session_token = "{{ config['session_token'] }}" + +config = { + "instance_api_url": instance_api_url, + "username": username, + "password": password, + "session_token": session_token, + "header": header, + "session_token_response_key": session_token_response_key, + "login_url": login_url, + "validate_session_url": validate_session_url, +} + +config_session_token = { + "instance_api_url": instance_api_url, + "username": "", + "password": "", + "session_token": session_token, + "header": header, + "session_token_response_key": session_token_response_key, + "login_url": login_url, + "validate_session_url": validate_session_url, +} + +config_username_password = { + "instance_api_url": instance_api_url, + "username": username, + "password": password, + "session_token": "", + "header": header, + "session_token_response_key": session_token_response_key, + "login_url": login_url, + "validate_session_url": validate_session_url, +} + + +def test_auth_header(): + auth_header = LegacySessionTokenAuthenticator( + config=config, + parameters=parameters, + api_url=input_instance_api_url, + username=input_username, + password=input_password, + session_token=input_session_token, + header=header, + session_token_response_key=session_token_response_key, + login_url=login_url, + validate_session_url=validate_session_url, + ).auth_header + assert 
auth_header == "X-App-Session" + + +def test_get_token_valid_session(requests_mock): + requests_mock.get( + f"{config_session_token['instance_api_url']}user/current", json={"common_name": "common_name", "last_login": "last_login"} + ) + + token = LegacySessionTokenAuthenticator( + config=config_session_token, + parameters=parameters, + api_url=input_instance_api_url, + username=input_username, + password=input_password, + session_token=input_session_token, + header=header, + session_token_response_key=session_token_response_key, + login_url=login_url, + validate_session_url=validate_session_url, + ).token + assert token == "session_token" + + +def test_get_token_invalid_session_unauthorized(): + with pytest.raises(ConnectionError): + _ = LegacySessionTokenAuthenticator( + config=config_session_token, + parameters=parameters, + api_url=input_instance_api_url, + username=input_username, + password=input_password, + session_token=input_session_token, + header=header, + session_token_response_key=session_token_response_key, + login_url=login_url, + validate_session_url=validate_session_url, + ).token + + +def test_get_token_invalid_username_password_unauthorized(): + with pytest.raises(HTTPError): + _ = LegacySessionTokenAuthenticator( + config=config_username_password, + parameters=parameters, + api_url=input_instance_api_url, + username=input_username, + password=input_password, + session_token=input_session_token, + header=header, + session_token_response_key=session_token_response_key, + validate_session_url=validate_session_url, + login_url=login_url, + ).token + + +def test_get_token_username_password(requests_mock): + requests_mock.post(f"{config['instance_api_url']}session", json={"id": "some session id"}) + + token = LegacySessionTokenAuthenticator( + config=config_username_password, + parameters=parameters, + api_url=input_instance_api_url, + username=input_username, + password=input_password, + session_token=input_session_token, + header=header, + session_token_response_key=session_token_response_key, + login_url=login_url, + validate_session_url=validate_session_url, + ).token + assert token == "some session id" + + +def test_check_is_valid_session_token(requests_mock): + requests_mock.get(f"{config['instance_api_url']}user/current", json={"common_name": "common_name", "last_login": "last_login"}) + + assert LegacySessionTokenAuthenticator( + config=config, + parameters=parameters, + api_url=input_instance_api_url, + username=input_username, + password=input_password, + session_token=input_session_token, + header=header, + session_token_response_key=session_token_response_key, + validate_session_url=validate_session_url, + login_url=login_url, + ).is_valid_session_token() + + +def test_check_is_valid_session_token_unauthorized(): + assert not LegacySessionTokenAuthenticator( + config=config, + parameters=parameters, + api_url=input_instance_api_url, + username=input_username, + password=input_password, + session_token=input_session_token, + header=header, + session_token_response_key=session_token_response_key, + login_url=login_url, + validate_session_url=validate_session_url, + ).is_valid_session_token() + + +def test_get_new_session_token(requests_mock): + requests_mock.post(f"{config['instance_api_url']}session", headers={"Content-Type": "application/json"}, json={"id": "some session id"}) + + session_token = get_new_session_token( + f'{config["instance_api_url"]}session', config["username"], config["password"], config["session_token_response_key"] + ) + assert session_token == 
"some session id" diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_token_auth.py b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_token_auth.py new file mode 100644 index 000000000000..599667c42f9b --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_token_auth.py @@ -0,0 +1,200 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging + +import pytest +import requests +from airbyte_cdk.sources.declarative.auth.token import ApiKeyAuthenticator, BasicHttpAuthenticator, BearerAuthenticator +from airbyte_cdk.sources.declarative.auth.token_provider import InterpolatedStringTokenProvider +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from requests import Response + +LOGGER = logging.getLogger(__name__) + +resp = Response() +config = {"username": "user", "password": "password", "header": "header"} +parameters = {"username": "user", "password": "password", "header": "header"} + + +@pytest.mark.parametrize( + "test_name, token, expected_header_value", + [ + ("test_static_token", "test-token", "Bearer test-token"), + ("test_token_from_config", "{{ config.username }}", "Bearer user"), + ("test_token_from_parameters", "{{ parameters.username }}", "Bearer user"), + ], +) +def test_bearer_token_authenticator(test_name, token, expected_header_value): + """ + Should match passed in token, no matter how many times token is retrieved. + """ + token_provider = InterpolatedStringTokenProvider(config=config, api_token=token, parameters=parameters) + token_auth = BearerAuthenticator(token_provider, config, parameters=parameters) + header1 = token_auth.get_auth_header() + header2 = token_auth.get_auth_header() + + prepared_request = requests.PreparedRequest() + prepared_request.headers = {} + token_auth(prepared_request) + + assert {"Authorization": expected_header_value} == prepared_request.headers + assert {"Authorization": expected_header_value} == header1 + assert {"Authorization": expected_header_value} == header2 + + +@pytest.mark.parametrize( + "test_name, username, password, expected_header_value", + [ + ("test_static_creds", "user", "password", "Basic dXNlcjpwYXNzd29yZA=="), + ("test_creds_from_config", "{{ config.username }}", "{{ config.password }}", "Basic dXNlcjpwYXNzd29yZA=="), + ("test_creds_from_parameters", "{{ parameters.username }}", "{{ parameters.password }}", "Basic dXNlcjpwYXNzd29yZA=="), + ], +) +def test_basic_authenticator(test_name, username, password, expected_header_value): + """ + Should match passed in token, no matter how many times token is retrieved. 
+ """ + token_auth = BasicHttpAuthenticator(username=username, password=password, config=config, parameters=parameters) + header1 = token_auth.get_auth_header() + header2 = token_auth.get_auth_header() + + prepared_request = requests.PreparedRequest() + prepared_request.headers = {} + token_auth(prepared_request) + + assert {"Authorization": expected_header_value} == prepared_request.headers + assert {"Authorization": expected_header_value} == header1 + assert {"Authorization": expected_header_value} == header2 + + +@pytest.mark.parametrize( + "test_name, header, token, expected_header, expected_header_value", + [ + ("test_static_token", "Authorization", "test-token", "Authorization", "test-token"), + ("test_token_from_config", "{{ config.header }}", "{{ config.username }}", "header", "user"), + ("test_token_from_parameters", "{{ parameters.header }}", "{{ parameters.username }}", "header", "user"), + ], +) +def test_api_key_authenticator(test_name, header, token, expected_header, expected_header_value): + """ + Should match passed in token, no matter how many times token is retrieved. + """ + token_provider = InterpolatedStringTokenProvider(config=config, api_token=token, parameters=parameters) + token_auth = ApiKeyAuthenticator( + request_option=RequestOption(inject_into=RequestOptionType.header, field_name=header, parameters=parameters), + token_provider=token_provider, + config=config, + parameters=parameters, + ) + header1 = token_auth.get_auth_header() + header2 = token_auth.get_auth_header() + + prepared_request = requests.PreparedRequest() + prepared_request.headers = {} + token_auth(prepared_request) + + assert {expected_header: expected_header_value} == prepared_request.headers + assert {expected_header: expected_header_value} == header1 + assert {expected_header: expected_header_value} == header2 + + +@pytest.mark.parametrize( + "test_name, field_name, token, expected_field_name, expected_field_value, inject_type, validation_fn", + [ + ( + "test_static_token", + "Authorization", + "test-token", + "Authorization", + "test-token", + RequestOptionType.request_parameter, + "get_request_params", + ), + ( + "test_token_from_config", + "{{ config.header }}", + "{{ config.username }}", + "header", + "user", + RequestOptionType.request_parameter, + "get_request_params", + ), + ( + "test_token_from_parameters", + "{{ parameters.header }}", + "{{ parameters.username }}", + "header", + "user", + RequestOptionType.request_parameter, + "get_request_params", + ), + ( + "test_static_token", + "Authorization", + "test-token", + "Authorization", + "test-token", + RequestOptionType.body_data, + "get_request_body_data", + ), + ( + "test_token_from_config", + "{{ config.header }}", + "{{ config.username }}", + "header", + "user", + RequestOptionType.body_data, + "get_request_body_data", + ), + ( + "test_token_from_parameters", + "{{ parameters.header }}", + "{{ parameters.username }}", + "header", + "user", + RequestOptionType.body_data, + "get_request_body_data", + ), + ( + "test_static_token", + "Authorization", + "test-token", + "Authorization", + "test-token", + RequestOptionType.body_json, + "get_request_body_json", + ), + ( + "test_token_from_config", + "{{ config.header }}", + "{{ config.username }}", + "header", + "user", + RequestOptionType.body_json, + "get_request_body_json", + ), + ( + "test_token_from_parameters", + "{{ parameters.header }}", + "{{ parameters.username }}", + "header", + "user", + RequestOptionType.body_json, + "get_request_body_json", + ), + ], +) +def 
test_api_key_authenticator_inject(test_name, field_name, token, expected_field_name, expected_field_value, inject_type, validation_fn): + """ + Should match passed in token, no matter how many times token is retrieved. + """ + token_provider = InterpolatedStringTokenProvider(config=config, api_token=token, parameters=parameters) + token_auth = ApiKeyAuthenticator( + request_option=RequestOption(inject_into=inject_type, field_name=field_name, parameters=parameters), + token_provider=token_provider, + config=config, + parameters=parameters, + ) + assert {expected_field_name: expected_field_value} == getattr(token_auth, validation_fn)() diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_token_provider.py b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_token_provider.py new file mode 100644 index 000000000000..e73e5eef0838 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_token_provider.py @@ -0,0 +1,73 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from unittest.mock import MagicMock + +import pendulum +import pytest +from airbyte_cdk.sources.declarative.auth.token_provider import InterpolatedStringTokenProvider, SessionTokenProvider +from airbyte_cdk.sources.declarative.exceptions import ReadException +from isodate import parse_duration + + +def create_session_token_provider(): + login_requester = MagicMock() + login_response = MagicMock() + login_response.json.return_value = {"nested": {"token": "my_token"}} + login_requester.send_request.return_value = login_response + + return SessionTokenProvider( + login_requester=login_requester, + session_token_path=["nested", "token"], + expiration_duration=parse_duration("PT1H"), + parameters={"test": "test"}, + ) + + +def test_interpolated_string_token_provider(): + provider = InterpolatedStringTokenProvider( + config={"config_key": "val"}, api_token="{{ config.config_key }}-{{ parameters.test }}", parameters={"test": "test"} + ) + assert provider.get_token() == "val-test" + + +def test_session_token_provider(): + provider = create_session_token_provider() + assert provider.get_token() == "my_token" + + +def test_session_token_provider_cache(): + provider = create_session_token_provider() + provider.get_token() + assert provider.get_token() == "my_token" + assert provider.login_requester.send_request.call_count == 1 + + +def test_session_token_provider_cache_expiration(): + with pendulum.test(pendulum.datetime(2001, 5, 21, 12)): + provider = create_session_token_provider() + provider.get_token() + + provider.login_requester.send_request.return_value.json.return_value = {"nested": {"token": "updated_token"}} + + with pendulum.test(pendulum.datetime(2001, 5, 21, 14)): + assert provider.get_token() == "updated_token" + + assert provider.login_requester.send_request.call_count == 2 + + +def test_session_token_provider_no_cache(): + provider = create_session_token_provider() + provider.expiration_duration = None + provider.get_token() + assert provider.login_requester.send_request.call_count == 1 + provider.get_token() + assert provider.login_requester.send_request.call_count == 2 + + +def test_session_token_provider_ignored_response(): + provider = create_session_token_provider() + provider.login_requester.send_request.return_value = None + with pytest.raises(ReadException): + provider.get_token() diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/checks/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/checks/__init__.py new file mode 
100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/checks/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/checks/test_check_stream.py b/airbyte-cdk/python/unit_tests/sources/declarative/checks/test_check_stream.py new file mode 100644 index 000000000000..4ebe449dcd69 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/checks/test_check_stream.py @@ -0,0 +1,139 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from typing import Any, Iterable, Mapping, Optional +from unittest.mock import MagicMock + +import pytest +import requests +from airbyte_cdk.sources.declarative.checks.check_stream import CheckStream +from airbyte_cdk.sources.streams.http import HttpStream + +logger = logging.getLogger("test") +config = dict() + +stream_names = ["s1"] +record = MagicMock() + + +@pytest.mark.parametrize( + "test_name, record, streams_to_check, stream_slice, expectation", + [ + ("test_success_check", record, stream_names, {}, (True, None)), + ("test_success_check_stream_slice", record, stream_names, {"slice": "slice_value"}, (True, None)), + ("test_fail_check", None, stream_names, {}, (True, None)), + ("test_try_to_check_invalid stream", record, ["invalid_stream_name"], {}, None), + ], +) +@pytest.mark.parametrize("slices_as_list", [True, False]) +def test_check_stream_with_slices_as_list(test_name, record, streams_to_check, stream_slice, expectation, slices_as_list): + stream = MagicMock() + stream.name = "s1" + stream.availability_strategy = None + if slices_as_list: + stream.stream_slices.return_value = [stream_slice] + else: + stream.stream_slices.return_value = iter([stream_slice]) + + stream.read_records.side_effect = mock_read_records({frozenset(stream_slice): iter([record])}) + + source = MagicMock() + source.streams.return_value = [stream] + + check_stream = CheckStream(streams_to_check, parameters={}) + + if expectation: + actual = check_stream.check_connection(source, logger, config) + assert actual == expectation + else: + with pytest.raises(ValueError): + check_stream.check_connection(source, logger, config) + + +def mock_read_records(responses, default_response=None, **kwargs): + return lambda stream_slice, sync_mode: responses[frozenset(stream_slice)] if frozenset(stream_slice) in responses else default_response + + +def test_check_empty_stream(): + stream = MagicMock() + stream.name = "s1" + stream.read_records.return_value = iter([]) + stream.stream_slices.return_value = iter([None]) + + source = MagicMock() + source.streams.return_value = [stream] + + check_stream = CheckStream(["s1"], parameters={}) + stream_is_available, reason = check_stream.check_connection(source, logger, config) + assert stream_is_available + + +def test_check_stream_with_no_stream_slices_aborts(): + stream = MagicMock() + stream.name = "s1" + stream.stream_slices.return_value = iter([]) + + source = MagicMock() + source.streams.return_value = [stream] + + check_stream = CheckStream(["s1"], parameters={}) + stream_is_available, reason = check_stream.check_connection(source, logger, config) + assert not stream_is_available + assert "no stream slices were found, likely because the parent stream is empty" in reason + + +@pytest.mark.parametrize( + "test_name, response_code, available_expectation, expected_messages", + [ + ("test_stream_unavailable_unhandled_error", 404, False, ["Not found. 
The requested resource was not found on the server."]), + ( + "test_stream_unavailable_handled_error", + 403, + False, + ["Forbidden. You don't have permission to access this resource."], + ), + ("test_stream_available", 200, True, []), + ], +) +def test_check_http_stream_via_availability_strategy(mocker, test_name, response_code, available_expectation, expected_messages): + class MockHttpStream(HttpStream): + url_base = "https://test_base_url.com" + primary_key = "" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.resp_counter = 1 + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + return None + + def path(self, **kwargs) -> str: + return "" + + def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: + stub_resp = {"data": self.resp_counter} + self.resp_counter += 1 + yield stub_resp + + pass + + http_stream = MockHttpStream() + assert isinstance(http_stream, HttpStream) + + source = MagicMock() + source.streams.return_value = [http_stream] + + check_stream = CheckStream(stream_names=["mock_http_stream"], parameters={}) + + req = requests.Response() + req.status_code = response_code + mocker.patch.object(requests.Session, "send", return_value=req) + + logger = logging.getLogger(f"airbyte.{getattr(source, 'name', '')}") + stream_is_available, reason = check_stream.check_connection(source, logger, config) + + assert stream_is_available == available_expectation + for message in expected_messages: + assert message in reason diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/concurrency_level/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/concurrency_level/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/concurrency_level/test_concurrency_level.py b/airbyte-cdk/python/unit_tests/sources/declarative/concurrency_level/test_concurrency_level.py new file mode 100644 index 000000000000..0195a71c1968 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/concurrency_level/test_concurrency_level.py @@ -0,0 +1,71 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
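+# Tests for ConcurrencyLevel: default concurrency as an int or interpolated string, capping at max_concurrency, and validation errors for non-numeric values. For example, with config {"num_workers": 50} and default_concurrency "{{ config['num_workers'] or 40 }}", get_concurrency_level() resolves to 50 (see test_stream_slices below).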
+ +from typing import Any, Mapping, Optional, Type, Union + +import pytest +from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel + + +@pytest.mark.parametrize( + "default_concurrency, max_concurrency, expected_concurrency", + [ + pytest.param(20, 75, 20, id="test_default_concurrency_as_int"), + pytest.param(20, 75, 20, id="test_default_concurrency_as_int_ignores_max_concurrency"), + pytest.param("{{ config['num_workers'] or 40 }}", 75, 50, id="test_default_concurrency_using_interpolation"), + pytest.param("{{ config['missing'] or 40 }}", 75, 40, id="test_default_concurrency_using_interpolation_no_value"), + pytest.param("{{ config['num_workers'] or 40 }}", 10, 10, id="test_use_max_concurrency_if_default_is_too_high"), + ], +) +def test_stream_slices(default_concurrency: Union[int, str], max_concurrency: int, expected_concurrency: int) -> None: + config = {"num_workers": 50} + concurrency_level = ConcurrencyLevel( + default_concurrency=default_concurrency, + max_concurrency=max_concurrency, + config=config, + parameters={} + ) + + actual_concurrency = concurrency_level.get_concurrency_level() + + assert actual_concurrency == expected_concurrency + + +@pytest.mark.parametrize( + "config, expected_concurrency, expected_error", + [ + pytest.param({"num_workers": "fifty five"}, None, ValueError, id="test_invalid_default_concurrency_as_string"), + pytest.param({"num_workers": "55"}, 55, None, id="test_default_concurrency_as_string_int"), + pytest.param({"num_workers": 60}, 60, None, id="test_default_concurrency_as_int"), + ], +) +def test_default_concurrency_input_types_and_errors( + config: Mapping[str, Any], + expected_concurrency: Optional[int], + expected_error: Optional[Type[Exception]], +) -> None: + concurrency_level = ConcurrencyLevel( + default_concurrency="{{ config['num_workers'] or 30 }}", + max_concurrency=65, + config=config, + parameters={} + ) + + if expected_error: + with pytest.raises(expected_error): + concurrency_level.get_concurrency_level() + else: + actual_concurrency = concurrency_level.get_concurrency_level() + + assert actual_concurrency == expected_concurrency + + +def test_max_concurrency_is_required_for_default_concurrency_using_config() -> None: + config = {"num_workers": "50"} + + with pytest.raises(ValueError): + ConcurrencyLevel( + default_concurrency="{{ config['num_workers'] or 40 }}", + max_concurrency=None, + config=config, + parameters={} + ) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/datetime/test_datetime_parser.py b/airbyte-cdk/python/unit_tests/sources/declarative/datetime/test_datetime_parser.py new file mode 100644 index 000000000000..1a7d45f7a78f --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/datetime/test_datetime_parser.py @@ -0,0 +1,71 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
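+# Tests for DatetimeParser: parsing and formatting datetimes using epoch seconds (%s), milliseconds (%ms), %s_as_float, and standard strftime patterns.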
+# + +import datetime + +import pytest +from airbyte_cdk.sources.declarative.datetime.datetime_parser import DatetimeParser + + +@pytest.mark.parametrize( + "test_name, input_date, date_format, expected_output_date", + [ + ( + "test_parse_date_iso", + "2021-01-01T00:00:00.000000+0000", + "%Y-%m-%dT%H:%M:%S.%f%z", + datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), + ), + ( + "test_parse_date_iso_with_timezone_not_utc", + "2021-01-01T00:00:00.000000+0400", + "%Y-%m-%dT%H:%M:%S.%f%z", + datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone(datetime.timedelta(seconds=14400))), + ), + ( + "test_parse_timestamp", + "1609459200", + "%s", + datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), + ), + ( + "test_parse_timestamp_as_float", + "1675092508.873709", + "%s_as_float", + datetime.datetime(2023, 1, 30, 15, 28, 28, 873709, tzinfo=datetime.timezone.utc), + ), + ( + "test_parse_ms_timestamp", + "1609459200001", + "%ms", + datetime.datetime(2021, 1, 1, 0, 0, 0, 1000, tzinfo=datetime.timezone.utc), + ), + ("test_parse_date_ms", "20210101", "%Y%m%d", datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)), + ], +) +def test_parse_date(test_name, input_date, date_format, expected_output_date): + parser = DatetimeParser() + output_date = parser.parse(input_date, date_format) + assert output_date == expected_output_date + + +@pytest.mark.parametrize( + "test_name, input_dt, datetimeformat, expected_output", + [ + ("test_format_timestamp", datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), "%s", "1609459200"), + ("test_format_timestamp_ms", datetime.datetime(2021, 1, 1, 0, 0, 0, 1000, tzinfo=datetime.timezone.utc), "%ms", "1609459200001"), + ( + "test_format_timestamp_as_float", + datetime.datetime(2023, 1, 30, 15, 28, 28, 873709, tzinfo=datetime.timezone.utc), + "%s_as_float", + "1675092508.873709", + ), + ("test_format_string", datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), "%Y-%m-%d", "2021-01-01"), + ("test_format_to_number", datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), "%Y%m%d", "20210101"), + ], +) +def test_format_datetime(test_name, input_dt, datetimeformat, expected_output): + parser = DatetimeParser() + output_date = parser.format(input_dt, datetimeformat) + assert output_date == expected_output diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/datetime/test_min_max_datetime.py b/airbyte-cdk/python/unit_tests/sources/declarative/datetime/test_min_max_datetime.py new file mode 100644 index 000000000000..ff9aedf0752a --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/datetime/test_min_max_datetime.py @@ -0,0 +1,128 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
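+# Tests for MinMaxDatetime: clamping a datetime between optional min and max bounds, custom datetime formats, and lazy evaluation of bounds from parameters.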
+# + +import datetime + +import pytest +from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString + +date_format = "%Y-%m-%dT%H:%M:%S.%f%z" + +old_date = "2021-01-01T20:12:19.597854Z" +middle_date = "2022-01-01T20:12:19.597854Z" +new_date = "2022-06-24T20:12:19.597854Z" + + +@pytest.mark.parametrize( + "test_name, date, min_date, max_date, expected_date", + [ + ("test_time_is_greater_than_min", "{{ config['older'] }}", "{{ stream_state['newer'] }}", "", new_date), + ("test_time_is_less_than_min", "{{ stream_state['newer'] }}", "{{ config['older'] }}", "", new_date), + ("test_time_is_equal_to_min", "{{ config['older'] }}", "{{ config['older'] }}", "", old_date), + ("test_time_is_greater_than_max", "{{ stream_state['newer'] }}", "", "{{ config['older'] }}", old_date), + ("test_time_is_less_than_max", "{{ config['older'] }}", "", "{{ stream_state['newer'] }}", old_date), + ("test_time_is_equal_to_min", "{{ stream_state['newer'] }}", "{{ stream_state['newer'] }}", "", new_date), + ( + "test_time_is_between_min_and_max", + "{{ config['middle'] }}", + "{{ config['older'] }}", + "{{ stream_state['newer'] }}", + middle_date, + ), + ("test_min_newer_time_from_parameters", "{{ config['older'] }}", "{{ parameters['newer'] }}", "", new_date), + ("test_max_newer_time_from_parameters", "{{ stream_state['newer'] }}", "", "{{ parameters['older'] }}", old_date), + ], +) +def test_min_max_datetime(test_name, date, min_date, max_date, expected_date): + config = {"older": old_date, "middle": middle_date} + stream_state = {"newer": new_date} + parameters = {"newer": new_date, "older": old_date} + + min_max_date = MinMaxDatetime(datetime=date, min_datetime=min_date, max_datetime=max_date, parameters=parameters) + actual_date = min_max_date.get_datetime(config, **{"stream_state": stream_state}) + + assert actual_date == datetime.datetime.strptime(expected_date, date_format) + + +def test_custom_datetime_format(): + config = {"older": "2021-01-01T20:12:19", "middle": "2022-01-01T20:12:19"} + stream_state = {"newer": "2022-06-24T20:12:19"} + + min_max_date = MinMaxDatetime( + datetime="{{ config['middle'] }}", + datetime_format="%Y-%m-%dT%H:%M:%S", + min_datetime="{{ config['older'] }}", + max_datetime="{{ stream_state['newer'] }}", + parameters={}, + ) + actual_date = min_max_date.get_datetime(config, **{"stream_state": stream_state}) + + assert actual_date == datetime.datetime.strptime("2022-01-01T20:12:19", "%Y-%m-%dT%H:%M:%S").replace(tzinfo=datetime.timezone.utc) + + +def test_format_is_a_number(): + config = {"older": "20210101", "middle": "20220101"} + stream_state = {"newer": "20220624"} + + min_max_date = MinMaxDatetime( + datetime="{{ config['middle'] }}", + datetime_format="%Y%m%d", + min_datetime="{{ config['older'] }}", + max_datetime="{{ stream_state['newer'] }}", + parameters={}, + ) + actual_date = min_max_date.get_datetime(config, **{"stream_state": stream_state}) + + assert actual_date == datetime.datetime.strptime("20220101", "%Y%m%d").replace(tzinfo=datetime.timezone.utc) + + +def test_set_datetime_format(): + min_max_date = MinMaxDatetime(datetime="{{ config['middle'] }}", min_datetime="{{ config['older'] }}", parameters={}) + + # Retrieve datetime using the default datetime formatting + default_fmt_config = {"older": "2021-01-01T20:12:19.597854Z", "middle": "2022-01-01T20:12:19.597854Z"} + actual_date = min_max_date.get_datetime(default_fmt_config) + + assert 
actual_date == datetime.datetime.strptime("2022-01-01T20:12:19.597854Z", "%Y-%m-%dT%H:%M:%S.%f%z") + + # Set a different datetime format and attempt to retrieve datetime using an updated format + min_max_date.datetime_format = "%Y-%m-%dT%H:%M:%S" + + custom_fmt_config = {"older": "2021-01-01T20:12:19", "middle": "2022-01-01T20:12:19"} + actual_date = min_max_date.get_datetime(custom_fmt_config) + + assert actual_date == datetime.datetime.strptime("2022-01-01T20:12:19", "%Y-%m-%dT%H:%M:%S").replace(tzinfo=datetime.timezone.utc) + + +def test_min_max_datetime_lazy_eval(): + kwargs = { + "datetime": "2022-01-10T00:00:00", + "datetime_format": "%Y-%m-%dT%H:%M:%S", + "min_datetime": "{{ parameters.min_datetime }}", + "max_datetime": "{{ parameters.max_datetime }}", + } + + assert datetime.datetime(2022, 1, 10, 0, 0, tzinfo=datetime.timezone.utc) == MinMaxDatetime(**kwargs, parameters={}).get_datetime({}) + assert datetime.datetime(2022, 1, 20, 0, 0, tzinfo=datetime.timezone.utc) == MinMaxDatetime( + **kwargs, parameters={"min_datetime": "2022-01-20T00:00:00"} + ).get_datetime({}) + assert datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc) == MinMaxDatetime( + **kwargs, parameters={"max_datetime": "2021-01-01T00:00:00"} + ).get_datetime({}) + + +@pytest.mark.parametrize( + "input_datetime", + [ + pytest.param("2022-01-01T00:00:00", id="test_create_min_max_datetime_from_string"), + pytest.param(InterpolatedString.create("2022-01-01T00:00:00", parameters={}), id="test_create_min_max_datetime_from_string"), + pytest.param(MinMaxDatetime("2022-01-01T00:00:00", parameters={}), id="test_create_min_max_datetime_from_minmaxdatetime"), + ], +) +def test_create_min_max_datetime(input_datetime): + minMaxDatetime = MinMaxDatetime.create(input_datetime, parameters={}) + expected_value = "2022-01-01T00:00:00" + + assert minMaxDatetime.datetime.eval(config={}) == expected_value diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/decoders/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/decoders/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/decoders/test_json_decoder.py b/airbyte-cdk/python/unit_tests/sources/declarative/decoders/test_json_decoder.py new file mode 100644 index 000000000000..1b9a552d6ed5 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/decoders/test_json_decoder.py @@ -0,0 +1,111 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
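+# Tests for JsonDecoder and JsonlDecoder: decoding empty, single-object, and multi-line JSON responses, plus a memory-usage check against a ~58 MB JSON Lines response.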
+# +import json +import os + +import pytest +import requests +from airbyte_cdk import YamlDeclarativeSource +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder, JsonlDecoder +from airbyte_cdk.sources.declarative.models import DeclarativeStream as DeclarativeStreamModel +from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ModelToComponentFactory + + +@pytest.mark.parametrize( + "response_body, first_element", + [("", {}), ("[]", {}), ('{"healthcheck": {"status": "ok"}}', {"healthcheck": {"status": "ok"}})], +) +def test_json_decoder(requests_mock, response_body, first_element): + requests_mock.register_uri("GET", "https://airbyte.io/", text=response_body) + response = requests.get("https://airbyte.io/") + assert next(JsonDecoder(parameters={}).decode(response)) == first_element + + +@pytest.mark.parametrize( + "response_body, expected_json", + [ + ("", []), + ('{"id": 1, "name": "test1"}', [{"id": 1, "name": "test1"}]), + ('{"id": 1, "name": "test1"}\n{"id": 2, "name": "test2"}', [{"id": 1, "name": "test1"}, {"id": 2, "name": "test2"}]), + ], + ids=["empty_response", "one_line_json", "multi_line_json"], +) +def test_jsonl_decoder(requests_mock, response_body, expected_json): + requests_mock.register_uri("GET", "https://airbyte.io/", text=response_body) + response = requests.get("https://airbyte.io/") + assert list(JsonlDecoder(parameters={}).decode(response)) == expected_json + + +@pytest.fixture(name="large_events_response") +def large_event_response_fixture(): + data = {"email": "email1@example.com"} + jsonl_string = f"{json.dumps(data)}\n" + lines_in_response = 2_000_000 # ≈ 58 MB of response + dir_path = os.path.dirname(os.path.realpath(__file__)) + file_path = f"{dir_path}/test_response.txt" + with open(file_path, "w") as file: + for _ in range(lines_in_response): + file.write(jsonl_string) + yield (lines_in_response, file_path) + os.remove(file_path) + + +@pytest.mark.slow +@pytest.mark.limit_memory("20 MB") +def test_jsonl_decoder_memory_usage(requests_mock, large_events_response): + lines_in_response, file_path = large_events_response + content = """ + name: users + type: DeclarativeStream + retriever: + type: SimpleRetriever + decoder: + type: JsonlDecoder + paginator: + type: "NoPagination" + requester: + path: "users/{{ stream_slice.slice }}" + type: HttpRequester + url_base: "https://for-all-mankind.nasa.com/api/v1" + http_method: GET + authenticator: + type: NoAuth + request_headers: {} + request_body_json: {} + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + partition_router: + type: ListPartitionRouter + cursor_field: "slice" + values: + - users1 + - users2 + - users3 + - users4 + primary_key: [] + """ + + factory = ModelToComponentFactory() + stream_manifest = YamlDeclarativeSource._parse(content) + stream = factory.create_component(model_type=DeclarativeStreamModel, component_definition=stream_manifest, config={}) + + def get_body(): + return open(file_path, "rb", buffering=30) + + counter = 0 + requests_mock.get("https://for-all-mankind.nasa.com/api/v1/users/users1", body=get_body()) + requests_mock.get("https://for-all-mankind.nasa.com/api/v1/users/users2", body=get_body()) + requests_mock.get("https://for-all-mankind.nasa.com/api/v1/users/users3", body=get_body()) + requests_mock.get("https://for-all-mankind.nasa.com/api/v1/users/users4", body=get_body()) + + stream_slices = 
list(stream.stream_slices(sync_mode=SyncMode.full_refresh)) + for stream_slice in stream_slices: + for _ in stream.retriever.read_records(records_schema={}, stream_slice=stream_slice): + counter += 1 + + assert counter == lines_in_response * len(stream_slices) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/decoders/test_pagination_decoder_decorator.py b/airbyte-cdk/python/unit_tests/sources/declarative/decoders/test_pagination_decoder_decorator.py new file mode 100644 index 000000000000..f440a2b12c9f --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/decoders/test_pagination_decoder_decorator.py @@ -0,0 +1,26 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# +import pytest +import requests +from airbyte_cdk.sources.declarative.decoders import JsonDecoder, PaginationDecoderDecorator + + +class StreamingJsonDecoder(JsonDecoder): + def is_stream_response(self) -> bool: + return True + + +@pytest.mark.parametrize( + "decoder_class, expected", + [ + (StreamingJsonDecoder, {}), + (JsonDecoder, {"data": [{"id": 1}, {"id": 2}]}) + ] +) +def test_pagination_decoder_decorator(requests_mock, decoder_class, expected): + decoder = PaginationDecoderDecorator(decoder=decoder_class(parameters={})) + response_body = "{\"data\": [{\"id\": 1}, {\"id\": 2}]}" + requests_mock.register_uri("GET", "https://airbyte.io/", text=response_body) + response = requests.get("https://airbyte.io/") + assert next(decoder.decode(response)) == expected diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/decoders/test_xml_decoder.py b/airbyte-cdk/python/unit_tests/sources/declarative/decoders/test_xml_decoder.py new file mode 100644 index 000000000000..87c78dae4fda --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/decoders/test_xml_decoder.py @@ -0,0 +1,38 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# +import pytest +import requests +from airbyte_cdk.sources.declarative.decoders import XmlDecoder + + +@pytest.mark.parametrize( + "response_body, expected", + [ + ( + "", + {"item": {"@name": "item_1"}} + ), + ( + "Item 1Item 2", + {"data": {"item": [{"@name": "item_1", "#text": "Item 1"}, {"@name": "item_2", "#text": "Item 2"}]}} + ), + ( + None, + {} + ), + ( + "", + {} + ), + ( + "1Item 1", + {'item': {'@xmlns:ns': 'https://airbyte.io', 'ns:id': '1', 'ns:name': 'Item 1'}} + ) + ], + ids=["one_element_response", "multi_element_response", "empty_response", "malformed_xml_response", "xml_with_namespace_response"] +) +def test_xml_decoder(requests_mock, response_body, expected): + requests_mock.register_uri("GET", "https://airbyte.io/", text=response_body) + response = requests.get("https://airbyte.io/") + assert next(XmlDecoder(parameters={}).decode(response)) == expected diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/external_component.py b/airbyte-cdk/python/unit_tests/sources/declarative/external_component.py new file mode 100644 index 000000000000..d9f0ca8cae10 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/external_component.py @@ -0,0 +1,13 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from airbyte_cdk.sources.declarative.requesters import HttpRequester + + +class SampleCustomComponent(HttpRequester): + """ + A test class used to validate manifests that rely on custom defined Python components + """ + + pass diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/extractors/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/__init__.py new file mode 100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/extractors/compressed_response b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/compressed_response new file mode 100644 index 000000000000..da79e347f053 Binary files /dev/null and b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/compressed_response differ diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/extractors/decompressed_response.csv b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/decompressed_response.csv new file mode 100644 index 000000000000..ebef74b8ec70 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/decompressed_response.csv @@ -0,0 +1,25 @@ +"EMAIL","FIRST_NAME","LAST_NAME","ADDRESS_LINE_1","ADDRESS_LINE_2","CITY","STATE_PROVINCE_REGION","POSTAL_CODE","COUNTRY","ALTERNATE_EMAILS","PHONE_NUMBER","WHATSAPP","LINE","FACEBOOK","UNIQUE_NAME","CREATED_AT","UPDATED_AT","CONTACT_ID" +"fake_email_10@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:22Z","2021-02-01T12:35:51Z","eae8c5c8-f97e-40a8-8945-72acca457f5a" +"fake_email_1@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:08Z","2021-02-01T12:35:38Z","198f959f-f441-4d15-a280-9e8f65a90ba5" +"fake_email_12@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:24Z","2021-02-01T12:35:53Z","6975b74c-bb1e-4d54-a251-b934c4193ed4" +"fake_email_8@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:19Z","2021-02-01T12:35:49Z","36ef1a2d-3cc4-4515-9c00-1615c5f860d0" +"fake_email_18@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:30Z","2021-02-01T12:36:00Z","19163421-bb29-495d-950f-edede6218081" +"fake_email_3@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:14Z","2021-02-01T12:35:43Z","d1211b88-e116-4a0b-a823-0361bf059a06" +"fake_email_9@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:21Z","2021-02-01T12:35:50Z","ef4225b0-dff9-4756-af87-c4228d836d53" +"fake_email_4@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:15Z","2021-02-01T12:35:44Z","9adef36c-fe51-421a-9653-6bd010962e98" +"fake_email_2@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:13Z","2021-02-01T12:35:42Z","210d8004-d12a-4f01-815a-f90cfa9e4360" +"fake_email_6@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:17Z","2021-02-01T12:35:46Z","76330f89-5645-4432-b3bb-9e33a9195273" +"fake_email_14@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:26Z","2021-02-01T12:35:55Z","77200269-0b69-462c-bed1-9e6f912d4b83" +"fake_email_13@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:25Z","2021-02-01T12:35:54Z","c91c993b-1dfa-4686-bcf0-31e4aeb2a1a9" +"joepogbm@ggma.co",,,,,,,,,,,,,,,"2021-02-03T19:26:52Z","2021-02-03T19:27:21Z","a2a1f3f4-0170-4fbd-9152-ffe8cbcdb93d" 
+"fake_email_17@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:29Z","2021-02-01T12:35:59Z","e45af829-de4e-44d6-9c89-bb0c7ce47925" +"fake_email_15@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:27Z","2021-02-01T12:35:56Z","50b36a31-daf8-45c4-bc48-13e150f6746e" +"fake_email_7@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:18Z","2021-02-01T12:35:47Z","353113b9-b41e-480a-bf98-72213350194c" +"y.kurochkin@zazmic.com",,,,,,,,,,,,,,,"2021-02-03T19:34:41Z","2021-02-03T19:35:47Z","0b62947e-de93-419e-8c96-83572bf15ed1" +"fake_email_19@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:31Z","2021-02-01T12:36:01Z","9932d677-1128-47e4-9d97-667c6155bfee" +"joepogbum@ggma.co",,,,,,,,,,,,,,,"2021-02-03T19:22:41Z","2021-02-03T19:23:10Z","ba3c48d5-b63b-48e6-8687-c5034ed0a8dd" +"fake_email_0@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:34:49Z","2021-02-01T12:35:19Z","44ec451f-d401-40d2-831d-3e3ce8a94f66" +"avida.d3@gmail.com","dima","dima",,,,,,,,,,,,,"2021-09-08T09:02:22Z","2021-09-08T09:04:58Z","2f7b13f2-60d2-462a-bfb0-d30bb8eabed8" +"fake_email_16@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:28Z","2021-02-01T12:35:57Z","c6cfd936-e327-48da-aa76-824076461d80" +"fake_email_11@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:23Z","2021-02-01T12:35:52Z","4101feb2-2b07-4aef-8eb5-62878b612fcd" +"fake_email_5@lmail.c","Fake contact","Lastname",,,,,"22341",,,,,,,,"2021-02-01T12:35:16Z","2021-02-01T12:35:45Z","32deb20d-9f8f-44b4-aed2-dc15d5bf45ba" diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_dpath_extractor.py b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_dpath_extractor.py new file mode 100644 index 000000000000..92b4ffbb4804 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_dpath_extractor.py @@ -0,0 +1,96 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# +import io +import json +from typing import Dict, List, Union + +import pytest +import requests +from airbyte_cdk import Decoder +from airbyte_cdk.sources.declarative.decoders.json_decoder import IterableDecoder, JsonDecoder, JsonlDecoder +from airbyte_cdk.sources.declarative.extractors.dpath_extractor import DpathExtractor + +config = {"field": "record_array"} +parameters = {"parameters_field": "record_array"} + +decoder_json = JsonDecoder(parameters={}) +decoder_jsonl = JsonlDecoder(parameters={}) +decoder_iterable = IterableDecoder(parameters={}) + + +def create_response(body: Union[Dict, bytes]): + response = requests.Response() + response.raw = io.BytesIO(body if isinstance(body, bytes) else json.dumps(body).encode("utf-8")) + return response + + +@pytest.mark.parametrize( + "field_path, decoder, body, expected_records", + [ + (["data"], decoder_json, {"data": [{"id": 1}, {"id": 2}]}, [{"id": 1}, {"id": 2}]), + (["data"], decoder_json, {"data": {"id": 1}}, [{"id": 1}]), + ([], decoder_json, {"id": 1}, [{"id": 1}]), + ([], decoder_json, [{"id": 1}, {"id": 2}], [{"id": 1}, {"id": 2}]), + (["data", "records"], decoder_json, {"data": {"records": [{"id": 1}, {"id": 2}]}}, [{"id": 1}, {"id": 2}]), + (["{{ config['field'] }}"], decoder_json, {"record_array": [{"id": 1}, {"id": 2}]}, [{"id": 1}, {"id": 2}]), + (["{{ parameters['parameters_field'] }}"], decoder_json, {"record_array": [{"id": 1}, {"id": 2}]}, [{"id": 1}, {"id": 2}]), + (["record"], decoder_json, {"id": 1}, []), + (["list", "*", "item"], decoder_json, {"list": [{"item": {"id": "1"}}]}, [{"id": "1"}]), + ( + ["data", "*", "list", "data2", "*"], + decoder_json, + {"data": [{"list": {"data2": [{"id": 1}, {"id": 2}]}}, {"list": {"data2": [{"id": 3}, {"id": 4}]}}]}, + [{"id": 1}, {"id": 2}, {"id": 3}, {"id": 4}], + ), + ([], decoder_jsonl, {"id": 1}, [{"id": 1}]), + ([], decoder_jsonl, [{"id": 1}, {"id": 2}], [{"id": 1}, {"id": 2}]), + (["data"], decoder_jsonl, b'{"data": [{"id": 1}, {"id": 2}]}', [{"id": 1}, {"id": 2}]), + ( + ["data"], + decoder_jsonl, + b'{"data": [{"id": 1}, {"id": 2}]}\n{"data": [{"id": 3}, {"id": 4}]}', + [{"id": 1}, {"id": 2}, {"id": 3}, {"id": 4}], + ), + ( + ["data"], + decoder_jsonl, + b'{"data": [{"id": 1, "text_field": "This is a text\\n. New paragraph start here."}]}\n{"data": [{"id": 2, "text_field": "This is another text\\n. New paragraph start here."}]}', + [ + {"id": 1, "text_field": "This is a text\n. New paragraph start here."}, + {"id": 2, "text_field": "This is another text\n. 
New paragraph start here."}, + ], + ), + ( + [], + decoder_iterable, + b"user1@example.com\nuser2@example.com", + [{"record": "user1@example.com"}, {"record": "user2@example.com"}], + ), + ], + ids=[ + "test_extract_from_array", + "test_extract_single_record", + "test_extract_single_record_from_root", + "test_extract_from_root_array", + "test_nested_field", + "test_field_in_config", + "test_field_in_parameters", + "test_field_does_not_exist", + "test_nested_list", + "test_complex_nested_list", + "test_extract_single_record_from_root_jsonl", + "test_extract_from_root_jsonl", + "test_extract_from_array_jsonl", + "test_extract_from_array_multiline_jsonl", + "test_extract_from_array_multiline_with_escape_character_jsonl", + "test_extract_from_string_per_line_iterable", + ], +) +def test_dpath_extractor(field_path: List, decoder: Decoder, body, expected_records: List): + extractor = DpathExtractor(field_path=field_path, config=config, decoder=decoder, parameters=parameters) + + response = create_response(body) + actual_records = list(extractor.extract_records(response)) + + assert actual_records == expected_records diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_record_filter.py b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_record_filter.py new file mode 100644 index 000000000000..498f61b714bd --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_record_filter.py @@ -0,0 +1,373 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +from typing import List, Mapping, Optional + +import pytest +from airbyte_cdk.sources.declarative.datetime import MinMaxDatetime +from airbyte_cdk.sources.declarative.extractors.record_filter import ClientSideIncrementalRecordFilterDecorator, RecordFilter +from airbyte_cdk.sources.declarative.incremental import ( + CursorFactory, + DatetimeBasedCursor, + GlobalSubstreamCursor, + PerPartitionWithGlobalCursor, +) +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.declarative.models import CustomRetriever, DeclarativeStream, ParentStreamConfig +from airbyte_cdk.sources.declarative.partition_routers import SubstreamPartitionRouter +from airbyte_cdk.sources.declarative.types import StreamSlice + +DATE_FORMAT = "%Y-%m-%d" +RECORDS_TO_FILTER_DATE_FORMAT = [ + {"id": 1, "created_at": "2020-01-03"}, + {"id": 2, "created_at": "2021-01-03"}, + {"id": 3, "created_at": "2021-01-04"}, + {"id": 4, "created_at": "2021-02-01"}, + {"id": 5, "created_at": "2021-01-02"}, +] + +DATE_TIME_WITH_TZ_FORMAT = "%Y-%m-%dT%H:%M:%S%z" +RECORDS_TO_FILTER_DATE_TIME_WITH_TZ_FORMAT = [ + {"id": 1, "created_at": "2020-01-03T00:00:00+00:00"}, + {"id": 2, "created_at": "2021-01-03T00:00:00+00:00"}, + {"id": 3, "created_at": "2021-01-04T00:00:00+00:00"}, + {"id": 4, "created_at": "2021-02-01T00:00:00+00:00"}, +] + +DATE_TIME_WITHOUT_TZ_FORMAT = "%Y-%m-%dT%H:%M:%S" +RECORDS_TO_FILTER_DATE_TIME_WITHOUT_TZ_FORMAT = [ + {"id": 1, "created_at": "2020-01-03T00:00:00"}, + {"id": 2, "created_at": "2021-01-03T00:00:00"}, + {"id": 3, "created_at": "2021-01-04T00:00:00"}, + {"id": 4, "created_at": "2021-02-01T00:00:00"}, +] + + +@pytest.mark.parametrize( + "filter_template, records, expected_records", + [ + ( + "{{ record['created_at'] > stream_state['created_at'] }}", + [{"id": 1, "created_at": "06-06-21"}, {"id": 2, "created_at": "06-07-21"}, {"id": 3, "created_at": "06-08-21"}], + [{"id": 2, "created_at": "06-07-21"}, {"id": 3, "created_at": "06-08-21"}], + ), + ( 
+ "{{ record['last_seen'] >= stream_slice['last_seen'] }}", + [{"id": 1, "last_seen": "06-06-21"}, {"id": 2, "last_seen": "06-07-21"}, {"id": 3, "last_seen": "06-10-21"}], + [{"id": 3, "last_seen": "06-10-21"}], + ), + ( + "{{ record['id'] >= next_page_token['last_seen_id'] }}", + [{"id": 11}, {"id": 12}, {"id": 13}, {"id": 14}, {"id": 15}], + [{"id": 14}, {"id": 15}], + ), + ( + "{{ record['id'] >= next_page_token['path_to_nowhere'] }}", + [{"id": 11}, {"id": 12}, {"id": 13}, {"id": 14}, {"id": 15}], + [], + ), + ( + "{{ record['created_at'] > parameters['created_at'] }}", + [{"id": 1, "created_at": "06-06-21"}, {"id": 2, "created_at": "06-07-21"}, {"id": 3, "created_at": "06-08-21"}], + [{"id": 3, "created_at": "06-08-21"}], + ), + ( + "{{ record['created_at'] > stream_slice.extra_fields['created_at'] }}", + [{"id": 1, "created_at": "06-06-21"}, {"id": 2, "created_at": "06-07-21"}, {"id": 3, "created_at": "06-08-21"}], + [{"id": 3, "created_at": "06-08-21"}], + ), + ], + ids=[ + "test_using_state_filter", + "test_with_slice_filter", + "test_with_next_page_token_filter", + "test_missing_filter_fields_return_no_results", + "test_using_parameters_filter", + "test_using_extra_fields_filter", + ], +) +def test_record_filter(filter_template: str, records: List[Mapping], expected_records: List[Mapping]): + config = {"response_override": "stop_if_you_see_me"} + parameters = {"created_at": "06-07-21"} + stream_state = {"created_at": "06-06-21"} + stream_slice = StreamSlice(partition={}, cursor_slice={"last_seen": "06-10-21"}, extra_fields={"created_at": "06-07-21"}) + next_page_token = {"last_seen_id": 14} + record_filter = RecordFilter(config=config, condition=filter_template, parameters=parameters) + + actual_records = list( + record_filter.filter_records(records, stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + ) + assert actual_records == expected_records + + +@pytest.mark.parametrize( + "datetime_format, stream_state, record_filter_expression, end_datetime, records_to_filter, expected_record_ids", + [ + (DATE_FORMAT, {}, None, "2021-01-05", RECORDS_TO_FILTER_DATE_FORMAT, [2, 3, 5]), + (DATE_FORMAT, {}, None, None, RECORDS_TO_FILTER_DATE_FORMAT, [2, 3, 4, 5]), + (DATE_FORMAT, {"created_at": "2021-01-04"}, None, "2021-01-05", RECORDS_TO_FILTER_DATE_FORMAT, [3]), + (DATE_FORMAT, {"created_at": "2021-01-04"}, None, None, RECORDS_TO_FILTER_DATE_FORMAT, [3, 4]), + (DATE_FORMAT, {}, "{{ record['id'] % 2 == 1 }}", "2021-01-05", RECORDS_TO_FILTER_DATE_FORMAT, [3, 5]), + (DATE_TIME_WITH_TZ_FORMAT, {}, None, "2021-01-05T00:00:00+00:00", RECORDS_TO_FILTER_DATE_TIME_WITH_TZ_FORMAT, [2, 3]), + (DATE_TIME_WITH_TZ_FORMAT, {}, None, None, RECORDS_TO_FILTER_DATE_TIME_WITH_TZ_FORMAT, [2, 3, 4]), + ( + DATE_TIME_WITH_TZ_FORMAT, + {"created_at": "2021-01-04T00:00:00+00:00"}, + None, + "2021-01-05T00:00:00+00:00", + RECORDS_TO_FILTER_DATE_TIME_WITH_TZ_FORMAT, + [3], + ), + ( + DATE_TIME_WITH_TZ_FORMAT, + {"created_at": "2021-01-04T00:00:00+00:00"}, + None, + None, + RECORDS_TO_FILTER_DATE_TIME_WITH_TZ_FORMAT, + [3, 4], + ), + ( + DATE_TIME_WITH_TZ_FORMAT, + {}, + "{{ record['id'] % 2 == 1 }}", + "2021-01-05T00:00:00+00:00", + RECORDS_TO_FILTER_DATE_TIME_WITH_TZ_FORMAT, + [3], + ), + (DATE_TIME_WITHOUT_TZ_FORMAT, {}, None, "2021-01-05T00:00:00", RECORDS_TO_FILTER_DATE_TIME_WITHOUT_TZ_FORMAT, [2, 3]), + (DATE_TIME_WITHOUT_TZ_FORMAT, {}, None, None, RECORDS_TO_FILTER_DATE_TIME_WITHOUT_TZ_FORMAT, [2, 3, 4]), + ( + DATE_TIME_WITHOUT_TZ_FORMAT, + {"created_at": 
"2021-01-04T00:00:00"}, + None, + "2021-01-05T00:00:00", + RECORDS_TO_FILTER_DATE_TIME_WITHOUT_TZ_FORMAT, + [3], + ), + ( + DATE_TIME_WITHOUT_TZ_FORMAT, + {"created_at": "2021-01-04T00:00:00"}, + None, + None, + RECORDS_TO_FILTER_DATE_TIME_WITHOUT_TZ_FORMAT, + [3, 4], + ), + ( + DATE_TIME_WITHOUT_TZ_FORMAT, + {}, + "{{ record['id'] % 2 == 1 }}", + "2021-01-05T00:00:00", + RECORDS_TO_FILTER_DATE_TIME_WITHOUT_TZ_FORMAT, + [3], + ), + ], + ids=[ + "date_format_no_stream_state_no_record_filter", + "date_format_no_stream_state_no_end_date_no_record_filter", + "date_format_with_stream_state_no_record_filter", + "date_format_with_stream_state_no_end_date_no_record_filter", + "date_format_no_stream_state_with_record_filter", + "date_time_with_tz_format_no_stream_state_no_record_filter", + "date_time_with_tz_format_no_stream_state_no_end_date_no_record_filter", + "date_time_with_tz_format_with_stream_state_no_record_filter", + "date_time_with_tz_format_with_stream_state_no_end_date_no_record_filter", + "date_time_with_tz_format_no_stream_state_with_record_filter", + "date_time_without_tz_format_no_stream_state_no_record_filter", + "date_time_without_tz_format_no_stream_state_no_end_date_no_record_filter", + "date_time_without_tz_format_with_stream_state_no_record_filter", + "date_time_without_tz_format_with_stream_state_no_end_date_no_record_filter", + "date_time_without_tz_format_no_stream_state_with_record_filter", + ], +) +def test_client_side_record_filter_decorator_no_parent_stream( + datetime_format: str, + stream_state: Optional[Mapping], + record_filter_expression: str, + end_datetime: Optional[str], + records_to_filter: List[Mapping], + expected_record_ids: List[int], +): + date_time_based_cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="2021-01-01", datetime_format=DATE_FORMAT, parameters={}), + end_datetime=MinMaxDatetime(datetime=end_datetime, parameters={}) if end_datetime else None, + step="P10Y", + cursor_field=InterpolatedString.create("created_at", parameters={}), + datetime_format=datetime_format, + cursor_granularity="P1D", + config={}, + parameters={}, + ) + date_time_based_cursor.set_initial_state(stream_state) + + record_filter_decorator = ClientSideIncrementalRecordFilterDecorator( + config={}, + condition=record_filter_expression, + parameters={}, + date_time_based_cursor=date_time_based_cursor, + substream_cursor=None, + ) + + filtered_records = list( + record_filter_decorator.filter_records(records=records_to_filter, stream_state=stream_state, stream_slice={}, next_page_token=None) + ) + + assert [x.get("id") for x in filtered_records] == expected_record_ids + + +@pytest.mark.parametrize( + "stream_state, cursor_type, expected_record_ids", + [ + # Use only DatetimeBasedCursor + ({}, 'datetime', [2, 3, 5]), + # Use GlobalSubstreamCursor with no state + ({}, 'global_substream', [2, 3, 5]), + # Use GlobalSubstreamCursor with global state + ( + { + 'state': {'created_at': '2021-01-03'} + }, + 'global_substream', + [2, 3] + ), + # Use PerPartitionWithGlobalCursor with partition state + ( + { + 'use_global_cursor': False, + 'state': {'created_at': '2021-01-10'}, + 'states': [ + { + 'partition': {'id': 'some_parent_id', 'parent_slice': {}}, + 'cursor': {'created_at': '2021-01-03'} + } + ] + }, + 'per_partition_with_global', + [2, 3] + ), + # Use PerPartitionWithGlobalCursor with global state + ( + { + 'use_global_cursor': True, + 'state': {'created_at': '2021-01-03'}, + 'states': [ + { + 'partition': {'id': 'some_parent_id', 'parent_slice': {}}, + 'cursor': 
{'created_at': '2021-01-13'} + } + ] + }, + 'per_partition_with_global', + [2, 3] + ), + # Use PerPartitionWithGlobalCursor with partition state missing, global cursor used + ( + { + 'use_global_cursor': True, + 'state': {'created_at': '2021-01-03'} + }, + 'per_partition_with_global', + [2, 3] + ), + # Use PerPartitionWithGlobalCursor with partition state missing, global cursor not used + ( + { + 'use_global_cursor': False, + 'state': {'created_at': '2021-01-03'} + }, + 'per_partition_with_global', + [2, 3, 5] # Global cursor not used, start date used + ), + ], + ids=[ + 'datetime_cursor_only', + 'global_substream_no_state', + 'global_substream_with_state', + 'per_partition_with_partition_state', + 'per_partition_with_global_state', + 'per_partition_partition_missing_global_cursor_used', + 'per_partition_partition_missing_global_cursor_not_used', + ] +) +def test_client_side_record_filter_decorator_with_cursor_types( + stream_state: Optional[Mapping], + cursor_type: str, + expected_record_ids: List[int] +): + def date_time_based_cursor_factory() -> DatetimeBasedCursor: + return DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="2021-01-01", datetime_format=DATE_FORMAT, parameters={}), + end_datetime=MinMaxDatetime(datetime="2021-01-05", datetime_format=DATE_FORMAT, parameters={}), + step="P10Y", + cursor_field=InterpolatedString.create("created_at", parameters={}), + datetime_format=DATE_FORMAT, + cursor_granularity="P1D", + config={}, + parameters={}, + ) + + date_time_based_cursor = date_time_based_cursor_factory() + + substream_cursor = None + partition_router = SubstreamPartitionRouter( + config={}, + parameters={}, + parent_stream_configs=[ + ParentStreamConfig( + type="ParentStreamConfig", + parent_key="id", + partition_field="id", + stream=DeclarativeStream( + type="DeclarativeStream", + retriever=CustomRetriever(type="CustomRetriever", class_name="a_class_name") + ), + ) + ], + ) + + if cursor_type == 'datetime': + # Use only DatetimeBasedCursor + pass # No additional cursor needed + elif cursor_type == 'global_substream': + # Create GlobalSubstreamCursor instance + substream_cursor = GlobalSubstreamCursor( + stream_cursor=date_time_based_cursor, + partition_router=partition_router, + ) + if stream_state: + substream_cursor.set_initial_state(stream_state) + elif cursor_type == 'per_partition_with_global': + # Create PerPartitionWithGlobalCursor instance + substream_cursor = PerPartitionWithGlobalCursor( + cursor_factory=CursorFactory(date_time_based_cursor_factory), + partition_router=partition_router, + stream_cursor=date_time_based_cursor, + ) + else: + raise ValueError(f"Unsupported cursor type: {cursor_type}") + + if substream_cursor and stream_state: + substream_cursor.set_initial_state(stream_state) + elif stream_state: + date_time_based_cursor.set_initial_state(stream_state) + + # Create the record_filter_decorator with appropriate cursor + record_filter_decorator = ClientSideIncrementalRecordFilterDecorator( + config={}, + parameters={}, + date_time_based_cursor=date_time_based_cursor, + substream_cursor=substream_cursor, + ) + + # The partition we're testing + stream_slice = StreamSlice(partition={"id": "some_parent_id", "parent_slice": {}}, cursor_slice={}) + + filtered_records = list( + record_filter_decorator.filter_records( + records=RECORDS_TO_FILTER_DATE_FORMAT, + stream_state=stream_state, + stream_slice=stream_slice, + next_page_token=None, + ) + ) + + assert [x.get("id") for x in filtered_records] == expected_record_ids diff --git 
a/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_record_selector.py b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_record_selector.py new file mode 100644 index 000000000000..fc2bcd6dd51e --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_record_selector.py @@ -0,0 +1,183 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json +from unittest.mock import Mock, call + +import pytest +import requests +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder +from airbyte_cdk.sources.declarative.extractors.dpath_extractor import DpathExtractor +from airbyte_cdk.sources.declarative.extractors.record_filter import RecordFilter +from airbyte_cdk.sources.declarative.extractors.record_selector import RecordSelector +from airbyte_cdk.sources.declarative.transformations import RecordTransformation +from airbyte_cdk.sources.types import Record, StreamSlice +from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer + + +@pytest.mark.parametrize( + "test_name, field_path, filter_template, body, expected_data", + [ + ( + "test_with_extractor_and_filter", + ["data"], + "{{ record['created_at'] > stream_state['created_at'] }}", + {"data": [{"id": 1, "created_at": "06-06-21"}, {"id": 2, "created_at": "06-07-21"}, {"id": 3, "created_at": "06-08-21"}]}, + [{"id": 2, "created_at": "06-07-21"}, {"id": 3, "created_at": "06-08-21"}], + ), + ( + "test_no_record_filter_returns_all_records", + ["data"], + None, + {"data": [{"id": 1, "created_at": "06-06-21"}, {"id": 2, "created_at": "06-07-21"}]}, + [{"id": 1, "created_at": "06-06-21"}, {"id": 2, "created_at": "06-07-21"}], + ), + ( + "test_with_extractor_and_filter_with_parameters", + ["{{ parameters['parameters_field'] }}"], + "{{ record['created_at'] > parameters['created_at'] }}", + {"data": [{"id": 1, "created_at": "06-06-21"}, {"id": 2, "created_at": "06-07-21"}, {"id": 3, "created_at": "06-08-21"}]}, + [{"id": 3, "created_at": "06-08-21"}], + ), + ( + "test_read_single_record", + ["data"], + None, + {"data": {"id": 1, "created_at": "06-06-21"}}, + [{"id": 1, "created_at": "06-06-21"}], + ), + ( + "test_no_record", + ["data"], + None, + {"data": []}, + [], + ), + ( + "test_no_record_from_root", + [], + None, + [], + [], + ), + ], +) +def test_record_filter(test_name, field_path, filter_template, body, expected_data): + config = {"response_override": "stop_if_you_see_me"} + parameters = {"parameters_field": "data", "created_at": "06-07-21"} + stream_state = {"created_at": "06-06-21"} + stream_slice = StreamSlice(partition={}, cursor_slice={"last_seen": "06-10-21"}) + next_page_token = {"last_seen_id": 14} + schema = create_schema() + first_transformation = Mock(spec=RecordTransformation) + second_transformation = Mock(spec=RecordTransformation) + transformations = [first_transformation, second_transformation] + + response = create_response(body) + decoder = JsonDecoder(parameters={}) + extractor = DpathExtractor(field_path=field_path, decoder=decoder, config=config, parameters=parameters) + if filter_template is None: + record_filter = None + else: + record_filter = RecordFilter(config=config, condition=filter_template, parameters=parameters) + record_selector = RecordSelector( + extractor=extractor, + record_filter=record_filter, + transformations=transformations, + config=config, + parameters=parameters, + schema_normalization=TypeTransformer(TransformConfig.NoTransform), + ) + + actual_records = list( + 
record_selector.select_records( + response=response, records_schema=schema, stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ) + ) + assert actual_records == [Record(data, stream_slice) for data in expected_data] + + calls = [] + for record in expected_data: + calls.append(call(record, config=config, stream_state=stream_state, stream_slice=stream_slice)) + for transformation in transformations: + assert transformation.transform.call_count == len(expected_data) + transformation.transform.assert_has_calls(calls) + + +@pytest.mark.parametrize( + "test_name, schema, schema_transformation, body, expected_data", + [ + ( + "test_with_empty_schema", + {}, + TransformConfig.NoTransform, + {"data": [{"id": 1, "created_at": "06-06-21", "field_int": "100", "field_float": "123.3"}]}, + [{"id": 1, "created_at": "06-06-21", "field_int": "100", "field_float": "123.3"}], + ), + ( + "test_with_schema_none_normalizer", + {}, + TransformConfig.NoTransform, + {"data": [{"id": 1, "created_at": "06-06-21", "field_int": "100", "field_float": "123.3"}]}, + [{"id": 1, "created_at": "06-06-21", "field_int": "100", "field_float": "123.3"}], + ), + ( + "test_with_schema_and_default_normalizer", + {}, + TransformConfig.DefaultSchemaNormalization, + {"data": [{"id": 1, "created_at": "06-06-21", "field_int": "100", "field_float": "123.3"}]}, + [{"id": "1", "created_at": "06-06-21", "field_int": 100, "field_float": 123.3}], + ), + ], +) +def test_schema_normalization(test_name, schema, schema_transformation, body, expected_data): + config = {"response_override": "stop_if_you_see_me"} + parameters = {"parameters_field": "data", "created_at": "06-07-21"} + stream_state = {"created_at": "06-06-21"} + stream_slice = {"last_seen": "06-10-21"} + next_page_token = {"last_seen_id": 14} + + response = create_response(body) + schema = create_schema() + decoder = JsonDecoder(parameters={}) + extractor = DpathExtractor(field_path=["data"], decoder=decoder, config=config, parameters=parameters) + record_selector = RecordSelector( + extractor=extractor, + record_filter=None, + transformations=[], + config=config, + parameters=parameters, + schema_normalization=TypeTransformer(schema_transformation), + ) + + actual_records = list( + record_selector.select_records( + response=response, + stream_state=stream_state, + stream_slice=stream_slice, + next_page_token=next_page_token, + records_schema=schema, + ) + ) + + assert actual_records == [Record(data, stream_slice) for data in expected_data] + + +def create_response(body): + response = requests.Response() + response._content = json.dumps(body).encode("utf-8") + return response + + +def create_schema(): + return { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "id": {"type": "string"}, + "created_at": {"type": "string"}, + "field_int": {"type": "integer"}, + "field_float": {"type": "number"}, + }, + } diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_response_to_file_extractor.py b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_response_to_file_extractor.py new file mode 100644 index 000000000000..8771a70290d7 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_response_to_file_extractor.py @@ -0,0 +1,87 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+import csv +import os +from io import BytesIO +from pathlib import Path +from unittest import TestCase + +import pytest +import requests +import requests_mock +from airbyte_cdk.sources.declarative.extractors import ResponseToFileExtractor + + +class ResponseToFileExtractorTest(TestCase): + def setUp(self) -> None: + self._extractor = ResponseToFileExtractor() + self._http_mocker = requests_mock.Mocker() + self._http_mocker.__enter__() + + def tearDown(self) -> None: + self._http_mocker.__exit__(None, None, None) + + def test_compressed_response(self) -> None: + response = self._mock_streamed_response_from_file(self._compressed_response_path()) + extracted_records = list(self._extractor.extract_records(response)) + assert len(extracted_records) == 24 + + def test_text_response(self) -> None: + response = self._mock_streamed_response_from_file(self._decompressed_response_path()) + extracted_records = list(self._extractor.extract_records(response)) + assert len(extracted_records) == 24 + + def test_text_response_with_null_bytes(self) -> None: + csv_with_null_bytes = '"FIRST_\x00NAME","LAST_NAME"\n"a first n\x00ame","a last na\x00me"\n' + response = self._mock_streamed_response(BytesIO(csv_with_null_bytes.encode("utf-8"))) + + extracted_records = list(self._extractor.extract_records(response)) + + assert extracted_records == [{"FIRST_NAME": "a first name", "LAST_NAME": "a last name"}] + + def _test_folder_path(self) -> Path: + return Path(__file__).parent.resolve() + + def _compressed_response_path(self) -> Path: + return self._test_folder_path() / "compressed_response" + + def _decompressed_response_path(self) -> Path: + return self._test_folder_path() / "decompressed_response.csv" + + def _mock_streamed_response_from_file(self, path: Path) -> requests.Response: + with path.open("rb") as f: + return self._mock_streamed_response(f) # type: ignore # Could not find the right typing for file io + + def _mock_streamed_response(self, io: BytesIO) -> requests.Response: + any_url = "https://anyurl.com" + self._http_mocker.register_uri("GET", any_url, [{"body": io, "status_code": 200}]) + return requests.get(any_url) + + +@pytest.fixture(name="large_events_response") +def large_event_response_fixture(): + lines_in_response = 2_000_000 # ≈ 62 MB of response + dir_path = os.path.dirname(os.path.realpath(__file__)) + file_path = f"{dir_path}/test_response.csv" + with open(file_path, "w") as csvfile: + csv_writer = csv.writer(csvfile) + csv_writer.writerow(["username", "email"]) # headers + for _ in range(lines_in_response): + csv_writer.writerow(["a_username","email1@example.com"]) + yield (lines_in_response, file_path) + os.remove(file_path) + + +@pytest.mark.slow +@pytest.mark.limit_memory("20 MB") +def test_response_to_file_extractor_memory_usage(requests_mock, large_events_response): + lines_in_response, file_path = large_events_response + extractor = ResponseToFileExtractor() + + url = "https://for-all-mankind.nasa.com/api/v1/users/users1" + requests_mock.get(url, body=open(file_path, "rb")) + + counter = 0 + for _ in extractor.extract_records(requests.get(url, stream=True)): + counter += 1 + + assert counter == lines_in_response diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/incremental/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/incremental/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py 
b/airbyte-cdk/python/unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py new file mode 100644 index 000000000000..5b89e04fe640 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py @@ -0,0 +1,892 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import datetime +import unittest + +import pytest +from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime +from airbyte_cdk.sources.declarative.incremental import DatetimeBasedCursor +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.types import Record, StreamSlice + +datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z" +cursor_granularity = "PT0.000001S" +FAKE_NOW = datetime.datetime(2022, 1, 1, tzinfo=datetime.timezone.utc) + +config = {"start_date": "2021-01-01T00:00:00.000000+0000", "start_date_ymd": "2021-01-01"} +end_date_now = InterpolatedString(string="{{ today_utc() }}", parameters={}) +cursor_field = "created" +timezone = datetime.timezone.utc +NO_STATE = {} +ANY_SLICE = {} + + +class MockedNowDatetime(datetime.datetime): + @classmethod + def now(cls, tz=None): + return FAKE_NOW + + +@pytest.fixture() +def mock_datetime_now(monkeypatch): + monkeypatch.setattr(datetime, "datetime", MockedNowDatetime) + + +@pytest.mark.parametrize( + "test_name, stream_state, start, end, step, cursor_field, lookback_window, datetime_format, cursor_granularity, is_compare_strictly, expected_slices", + [ + ( + "test_1_day", + NO_STATE, + MinMaxDatetime(datetime="{{ config['start_date'] }}", parameters={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", parameters={}), + "P1D", + cursor_field, + None, + datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-01T23:59:59.999999+0000"}, + {"start_time": "2021-01-02T00:00:00.000000+0000", "end_time": "2021-01-02T23:59:59.999999+0000"}, + {"start_time": "2021-01-03T00:00:00.000000+0000", "end_time": "2021-01-03T23:59:59.999999+0000"}, + {"start_time": "2021-01-04T00:00:00.000000+0000", "end_time": "2021-01-04T23:59:59.999999+0000"}, + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T23:59:59.999999+0000"}, + {"start_time": "2021-01-06T00:00:00.000000+0000", "end_time": "2021-01-06T23:59:59.999999+0000"}, + {"start_time": "2021-01-07T00:00:00.000000+0000", "end_time": "2021-01-07T23:59:59.999999+0000"}, + {"start_time": "2021-01-08T00:00:00.000000+0000", "end_time": "2021-01-08T23:59:59.999999+0000"}, + {"start_time": "2021-01-09T00:00:00.000000+0000", "end_time": "2021-01-09T23:59:59.999999+0000"}, + {"start_time": "2021-01-10T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_2_day", + NO_STATE, + MinMaxDatetime(datetime="{{ config['start_date'] }}", parameters={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", parameters={}), + "P2D", + cursor_field, + None, + datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-02T23:59:59.999999+0000"}, + {"start_time": "2021-01-03T00:00:00.000000+0000", "end_time": "2021-01-04T23:59:59.999999+0000"}, + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-06T23:59:59.999999+0000"}, + {"start_time": "2021-01-07T00:00:00.000000+0000", 
"end_time": "2021-01-08T23:59:59.999999+0000"}, + {"start_time": "2021-01-09T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_1_week", + NO_STATE, + MinMaxDatetime(datetime="{{ config['start_date'] }}", parameters={}), + MinMaxDatetime(datetime="2021-02-10T00:00:00.000000+0000", parameters={}), + "P1W", + cursor_field, + None, + datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-07T23:59:59.999999+0000"}, + {"start_time": "2021-01-08T00:00:00.000000+0000", "end_time": "2021-01-14T23:59:59.999999+0000"}, + {"start_time": "2021-01-15T00:00:00.000000+0000", "end_time": "2021-01-21T23:59:59.999999+0000"}, + {"start_time": "2021-01-22T00:00:00.000000+0000", "end_time": "2021-01-28T23:59:59.999999+0000"}, + {"start_time": "2021-01-29T00:00:00.000000+0000", "end_time": "2021-02-04T23:59:59.999999+0000"}, + {"start_time": "2021-02-05T00:00:00.000000+0000", "end_time": "2021-02-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_1_month", + NO_STATE, + MinMaxDatetime(datetime="{{ config['start_date'] }}", parameters={}), + MinMaxDatetime(datetime="2021-06-10T00:00:00.000000+0000", parameters={}), + "P1M", + cursor_field, + None, + datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-31T23:59:59.999999+0000"}, + {"start_time": "2021-02-01T00:00:00.000000+0000", "end_time": "2021-02-28T23:59:59.999999+0000"}, + {"start_time": "2021-03-01T00:00:00.000000+0000", "end_time": "2021-03-31T23:59:59.999999+0000"}, + {"start_time": "2021-04-01T00:00:00.000000+0000", "end_time": "2021-04-30T23:59:59.999999+0000"}, + {"start_time": "2021-05-01T00:00:00.000000+0000", "end_time": "2021-05-31T23:59:59.999999+0000"}, + {"start_time": "2021-06-01T00:00:00.000000+0000", "end_time": "2021-06-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_1_year", + NO_STATE, + MinMaxDatetime(datetime="{{ config['start_date'] }}", parameters={}), + MinMaxDatetime(datetime="2022-06-10T00:00:00.000000+0000", parameters={}), + "P1Y", + cursor_field, + None, + datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-12-31T23:59:59.999999+0000"}, + {"start_time": "2022-01-01T00:00:00.000000+0000", "end_time": "2022-01-01T00:00:00.000000+0000"}, + ], + ), + ( + "test_from_stream_state", + {cursor_field: "2021-01-05T00:00:00.000000+0000"}, + MinMaxDatetime(datetime="2020-01-05T00:00:00.000000+0000", parameters={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", parameters={}), + "P1D", + cursor_field, + None, + datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T23:59:59.999999+0000"}, + {"start_time": "2021-01-06T00:00:00.000000+0000", "end_time": "2021-01-06T23:59:59.999999+0000"}, + {"start_time": "2021-01-07T00:00:00.000000+0000", "end_time": "2021-01-07T23:59:59.999999+0000"}, + {"start_time": "2021-01-08T00:00:00.000000+0000", "end_time": "2021-01-08T23:59:59.999999+0000"}, + {"start_time": "2021-01-09T00:00:00.000000+0000", "end_time": "2021-01-09T23:59:59.999999+0000"}, + {"start_time": "2021-01-10T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_12_day", + NO_STATE, + MinMaxDatetime(datetime="{{ config['start_date'] }}", parameters={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", parameters={}), + "P12D", + cursor_field, + None, + 
datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_end_time_greater_than_now", + NO_STATE, + MinMaxDatetime(datetime="2021-12-28T00:00:00.000000+0000", parameters={}), + MinMaxDatetime(datetime=f"{(FAKE_NOW + datetime.timedelta(days=1)).strftime(datetime_format)}", parameters={}), + "P1D", + cursor_field, + None, + datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-12-28T00:00:00.000000+0000", "end_time": "2021-12-28T23:59:59.999999+0000"}, + {"start_time": "2021-12-29T00:00:00.000000+0000", "end_time": "2021-12-29T23:59:59.999999+0000"}, + {"start_time": "2021-12-30T00:00:00.000000+0000", "end_time": "2021-12-30T23:59:59.999999+0000"}, + {"start_time": "2021-12-31T00:00:00.000000+0000", "end_time": "2021-12-31T23:59:59.999999+0000"}, + {"start_time": "2022-01-01T00:00:00.000000+0000", "end_time": "2022-01-01T00:00:00.000000+0000"}, + ], + ), + ( + "test_start_date_greater_than_end_time", + NO_STATE, + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", parameters={}), + MinMaxDatetime(datetime="2021-01-05T00:00:00.000000+0000", parameters={}), + "P1D", + cursor_field, + None, + datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T00:00:00.000000+0000"}, + ], + ), + ( + "test_cursor_date_greater_than_start_date", + {cursor_field: "2021-01-05T00:00:00.000000+0000"}, + MinMaxDatetime(datetime="2021-01-01T00:00:00.000000+0000", parameters={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", parameters={}), + "P1D", + cursor_field, + None, + datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T23:59:59.999999+0000"}, + {"start_time": "2021-01-06T00:00:00.000000+0000", "end_time": "2021-01-06T23:59:59.999999+0000"}, + {"start_time": "2021-01-07T00:00:00.000000+0000", "end_time": "2021-01-07T23:59:59.999999+0000"}, + {"start_time": "2021-01-08T00:00:00.000000+0000", "end_time": "2021-01-08T23:59:59.999999+0000"}, + {"start_time": "2021-01-09T00:00:00.000000+0000", "end_time": "2021-01-09T23:59:59.999999+0000"}, + {"start_time": "2021-01-10T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_cursor_date_greater_than_start_date_multiday_step", + {cursor_field: "2021-01-05T00:00:00.000000+0000"}, + MinMaxDatetime(datetime="2021-01-03T00:00:00.000000+0000", parameters={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", parameters={}), + "P2D", + cursor_field, + None, + datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-06T23:59:59.999999+0000"}, + {"start_time": "2021-01-07T00:00:00.000000+0000", "end_time": "2021-01-08T23:59:59.999999+0000"}, + {"start_time": "2021-01-09T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_with_lookback_window_from_start_date", + NO_STATE, + MinMaxDatetime(datetime="2021-01-05", datetime_format="%Y-%m-%d", parameters={}), + MinMaxDatetime(datetime="2021-01-08", datetime_format="%Y-%m-%d", parameters={}), + "P1D", + cursor_field, + "P3D", + datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T23:59:59.999999+0000"}, + {"start_time": "2021-01-06T00:00:00.000000+0000", "end_time": "2021-01-06T23:59:59.999999+0000"}, + 
{"start_time": "2021-01-07T00:00:00.000000+0000", "end_time": "2021-01-07T23:59:59.999999+0000"}, + {"start_time": "2021-01-08T00:00:00.000000+0000", "end_time": "2021-01-08T00:00:00.000000+0000"}, + ], + ), + ( + "test_with_lookback_window_from_cursor", + {cursor_field: "2021-01-05T00:00:00.000000+0000"}, + MinMaxDatetime(datetime="2021-01-01T00:00:00.000000+0000", parameters={}), + MinMaxDatetime(datetime="2021-01-06T00:00:00.000000+0000", parameters={}), + "P1D", + cursor_field, + "P3D", + datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-01-02T00:00:00.000000+0000", "end_time": "2021-01-02T23:59:59.999999+0000"}, + {"start_time": "2021-01-03T00:00:00.000000+0000", "end_time": "2021-01-03T23:59:59.999999+0000"}, + {"start_time": "2021-01-04T00:00:00.000000+0000", "end_time": "2021-01-04T23:59:59.999999+0000"}, + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T23:59:59.999999+0000"}, + {"start_time": "2021-01-06T00:00:00.000000+0000", "end_time": "2021-01-06T00:00:00.000000+0000"}, + ], + ), + ( + "test_with_lookback_window_defaults_to_0d", + {}, + MinMaxDatetime(datetime="2021-01-01", datetime_format="%Y-%m-%d", parameters={}), + MinMaxDatetime(datetime="2021-01-05", datetime_format="%Y-%m-%d", parameters={}), + "P1D", + cursor_field, + "{{ config['does_not_exist'] }}", + datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-01T23:59:59.999999+0000"}, + {"start_time": "2021-01-02T00:00:00.000000+0000", "end_time": "2021-01-02T23:59:59.999999+0000"}, + {"start_time": "2021-01-03T00:00:00.000000+0000", "end_time": "2021-01-03T23:59:59.999999+0000"}, + {"start_time": "2021-01-04T00:00:00.000000+0000", "end_time": "2021-01-04T23:59:59.999999+0000"}, + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T00:00:00.000000+0000"}, + ], + ), + ( + "test_start_is_after_stream_state", + {cursor_field: "2021-01-05T00:00:00.000000+0000"}, + MinMaxDatetime(datetime="2021-01-01T00:00:00.000000+0000", parameters={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", parameters={}), + "P1D", + cursor_field, + None, + datetime_format, + cursor_granularity, + None, + [ + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T23:59:59.999999+0000"}, + {"start_time": "2021-01-06T00:00:00.000000+0000", "end_time": "2021-01-06T23:59:59.999999+0000"}, + {"start_time": "2021-01-07T00:00:00.000000+0000", "end_time": "2021-01-07T23:59:59.999999+0000"}, + {"start_time": "2021-01-08T00:00:00.000000+0000", "end_time": "2021-01-08T23:59:59.999999+0000"}, + {"start_time": "2021-01-09T00:00:00.000000+0000", "end_time": "2021-01-09T23:59:59.999999+0000"}, + {"start_time": "2021-01-10T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_slices_without_intersections", + NO_STATE, + MinMaxDatetime(datetime="{{ config['start_date'] }}", parameters={}), + MinMaxDatetime(datetime="2021-02-01T00:00:00.000000+0000", parameters={}), + "P1M", + cursor_field, + None, + datetime_format, + cursor_granularity, + True, + [ + {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-31T23:59:59.999999+0000"}, + ], + ), + ], +) +def test_stream_slices( + mock_datetime_now, + test_name, + stream_state, + start, + end, + step, + cursor_field, + lookback_window, + datetime_format, + cursor_granularity, + is_compare_strictly, + expected_slices, +): + lookback_window = InterpolatedString(string=lookback_window, 
parameters={}) if lookback_window else None + cursor = DatetimeBasedCursor( + start_datetime=start, + end_datetime=end, + step=step, + cursor_field=cursor_field, + datetime_format=datetime_format, + cursor_granularity=cursor_granularity, + lookback_window=lookback_window, + is_compare_strictly=is_compare_strictly, + config=config, + parameters={}, + ) + cursor.set_initial_state(stream_state) + stream_slices = cursor.stream_slices() + + assert stream_slices == expected_slices + + +@pytest.mark.parametrize( + "test_name, previous_cursor, stream_slice, observed_records, expected_state", + [ + ( + "test_close_slice_previous_cursor_is_highest", + "2023-01-01", + StreamSlice(partition={}, cursor_slice={"start_time": "2021-01-01", "end_time": "2022-01-01"}), + [{cursor_field: "2021-01-01"}], + {cursor_field: "2023-01-01"}, + ), + ( + "test_close_slice_stream_slice_partition_end_is_highest", + "2020-01-01", + StreamSlice(partition={}, cursor_slice={"start_time": "2021-01-01", "end_time": "2023-01-01"}), + [{cursor_field: "2021-01-01"}], + {cursor_field: "2021-01-01"}, + ), + ( + "test_close_slice_latest_record_cursor_value_is_higher_than_slice_end", + "2021-01-01", + StreamSlice(partition={}, cursor_slice={"start_time": "2021-01-01", "end_time": "2022-01-01"}), + [{cursor_field: "2023-01-01"}], + {cursor_field: "2021-01-01"}, + ), + ( + "test_close_slice_with_no_records_observed", + "2021-01-01", + StreamSlice(partition={}, cursor_slice={"start_time": "2021-01-01", "end_time": "2022-01-01"}), + [], + {cursor_field: "2021-01-01"}, + ), + ( + "test_close_slice_with_no_records_observed_and_no_previous_state", + None, + StreamSlice(partition={}, cursor_slice={"start_time": "2021-01-01", "end_time": "2022-01-01"}), + [], + {}, + ), + ( + "test_close_slice_without_previous_cursor", + None, + StreamSlice(partition={}, cursor_slice={"start_time": "2021-01-01", "end_time": "2023-01-01"}), + [{cursor_field: "2022-01-01"}], + {cursor_field: "2022-01-01"}, + ), + ( + "test_close_slice_with_out_of_order_records", + "2021-01-01", + StreamSlice(partition={}, cursor_slice={"start_time": "2021-01-01", "end_time": "2022-01-01"}), + [{cursor_field: "2021-04-01"}, {cursor_field: "2021-02-01"}, {cursor_field: "2021-03-01"}], + {cursor_field: "2021-04-01"}, + ), + ( + "test_close_slice_with_some_records_out_of_slice_boundaries", + "2021-01-01", + StreamSlice(partition={}, cursor_slice={"start_time": "2021-01-01", "end_time": "2022-01-01"}), + [{cursor_field: "2021-02-01"}, {cursor_field: "2021-03-01"}, {cursor_field: "2023-01-01"}], + {cursor_field: "2021-03-01"}, + ), + ( + "test_close_slice_with_all_records_out_of_slice_boundaries", + "2021-01-01", + StreamSlice(partition={}, cursor_slice={"start_time": "2021-01-01", "end_time": "2022-01-01"}), + [{cursor_field: "2023-01-01"}], + {cursor_field: "2021-01-01"}, + ), + ( + "test_close_slice_with_all_records_out_of_slice_and_no_previous_cursor", + None, + StreamSlice(partition={}, cursor_slice={"start_time": "2021-01-01", "end_time": "2022-01-01"}), + [{cursor_field: "2023-01-01"}], + {}, + ), + ], +) +def test_close_slice(test_name, previous_cursor, stream_slice, observed_records, expected_state): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="2021-01-01T00:00:00.000000+0000", parameters={}), + cursor_field=InterpolatedString(string=cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + partition_field_start="start_time", + partition_field_end="end_time", + ) + cursor.set_initial_state({cursor_field: 
previous_cursor}) + for record_data in observed_records: + record = Record(record_data, stream_slice) + cursor.observe(stream_slice, record) + cursor.close_slice(stream_slice) + updated_state = cursor.get_stream_state() + assert updated_state == expected_state + + +def test_close_slice_fails_if_slice_has_a_partition(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="2021-01-01T00:00:00.000000+0000", parameters={}), + cursor_field=InterpolatedString(string=cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + stream_slice = StreamSlice(partition={"key": "value"}, cursor_slice={"end_time": "2022-01-01"}) + with pytest.raises(ValueError): + cursor.close_slice(stream_slice) + + +def test_compares_cursor_values_by_chronological_order(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="2021-01-01T00:00:00.000000+0000", parameters={}), + cursor_field=cursor_field, + datetime_format="%d-%m-%Y", + config=config, + parameters={}, + ) + + _slice = StreamSlice(partition={}, cursor_slice={"start_time": "01-01-2023", "end_time": "01-04-2023"}) + first_record = Record({cursor_field: "21-02-2023"}, _slice) + cursor.observe(_slice, first_record) + second_record = Record({cursor_field: "01-03-2023"}, _slice) + cursor.observe(_slice, second_record) + cursor.close_slice(_slice) + + assert cursor.get_stream_state()[cursor_field] == "01-03-2023" + + +def test_given_different_format_and_slice_is_highest_when_close_slice_then_state_uses_record_format(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="2021-01-01T00:00:00.000000+0000", parameters={}), + cursor_field=cursor_field, + datetime_format="%Y-%m-%dT%H:%M:%S.%fZ", + cursor_datetime_formats=["%Y-%m-%d"], + config=config, + parameters={}, + ) + + _slice = StreamSlice(partition={}, cursor_slice={"start_time": "2023-01-01T17:30:19.000Z", "end_time": "2023-01-04T17:30:19.000Z"}) + record_cursor_value = "2023-01-03" + record = Record({cursor_field: record_cursor_value}, _slice) + cursor.observe(_slice, record) + cursor.close_slice(_slice) + + assert cursor.get_stream_state()[cursor_field] == "2023-01-03" + + +@pytest.mark.parametrize( + "test_name, inject_into, field_name, expected_req_params, expected_headers, expected_body_json, expected_body_data", + [ + ("test_start_time_inject_into_none", None, None, {}, {}, {}, {}), + ( + "test_start_time_passed_by_req_param", + RequestOptionType.request_parameter, + "start_time", + {"start_time": "2021-01-01T00:00:00.000000+0000", "endtime": "2021-01-04T00:00:00.000000+0000"}, + {}, + {}, + {}, + ), + ( + "test_start_time_inject_into_header", + RequestOptionType.header, + "start_time", + {}, + {"start_time": "2021-01-01T00:00:00.000000+0000", "endtime": "2021-01-04T00:00:00.000000+0000"}, + {}, + {}, + ), + ( + "test_start_time_inject_into_body_json", + RequestOptionType.body_json, + "start_time", + {}, + {}, + {"start_time": "2021-01-01T00:00:00.000000+0000", "endtime": "2021-01-04T00:00:00.000000+0000"}, + {}, + ), + ( + "test_start_time_inject_into_body_data", + RequestOptionType.body_data, + "start_time", + {}, + {}, + {}, + {"start_time": "2021-01-01T00:00:00.000000+0000", "endtime": "2021-01-04T00:00:00.000000+0000"}, + ), + ], +) +def test_request_option(test_name, inject_into, field_name, expected_req_params, expected_headers, expected_body_json, expected_body_data): + start_request_option = RequestOption(inject_into=inject_into, parameters={}, field_name=field_name) if inject_into else None +
end_request_option = RequestOption(inject_into=inject_into, parameters={}, field_name="endtime") if inject_into else None + slicer = DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="2021-01-01T00:00:00.000000+0000", parameters={}), + end_datetime=MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", parameters={}), + step="P1D", + cursor_field=InterpolatedString(string=cursor_field, parameters={}), + datetime_format=datetime_format, + cursor_granularity=cursor_granularity, + lookback_window=InterpolatedString(string="P0D", parameters={}), + start_time_option=start_request_option, + end_time_option=end_request_option, + config=config, + parameters={}, + ) + stream_slice = {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-04T00:00:00.000000+0000"} + assert slicer.get_request_params(stream_slice=stream_slice) == expected_req_params + assert slicer.get_request_headers(stream_slice=stream_slice) == expected_headers + assert slicer.get_request_body_json(stream_slice=stream_slice) == expected_body_json + assert slicer.get_request_body_data(stream_slice=stream_slice) == expected_body_data + + +@pytest.mark.parametrize( + "stream_slice", + [ + pytest.param(None, id="test_none_stream_slice"), + pytest.param({}, id="test_empty_stream_slice"), + ], +) +def test_request_option_with_empty_stream_slice(stream_slice): + start_request_option = RequestOption(inject_into=RequestOptionType.request_parameter, parameters={}, field_name="starttime") + end_request_option = RequestOption(inject_into=RequestOptionType.request_parameter, parameters={}, field_name="endtime") + slicer = DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="2021-01-01T00:00:00.000000+0000", parameters={}), + end_datetime=MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", parameters={}), + step="P1D", + cursor_field=InterpolatedString(string=cursor_field, parameters={}), + datetime_format=datetime_format, + cursor_granularity=cursor_granularity, + lookback_window=InterpolatedString(string="P0D", parameters={}), + start_time_option=start_request_option, + end_time_option=end_request_option, + config=config, + parameters={}, + ) + assert {} == slicer.get_request_params(stream_slice=stream_slice) + + +@pytest.mark.parametrize( + "test_name, input_date, date_format, date_format_granularity, expected_output_date", + [ + ( + "test_parse_date_iso", + "2021-01-01T00:00:00.000000+0000", + "%Y-%m-%dT%H:%M:%S.%f%z", + "PT0.000001S", + datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), + ), + ( + "test_parse_timestamp", + "1609459200", + "%s", + "PT1S", + datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), + ), + ("test_parse_date_number", "20210101", "%Y%m%d", "P1D", datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)), + ], +) +def test_parse_date_legacy_merge_datetime_format_in_cursor_datetime_format( + test_name, input_date, date_format, date_format_granularity, expected_output_date +): + slicer = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("2021-01-01T00:00:00.000000+0000", parameters={}), + end_datetime=MinMaxDatetime("2021-01-10T00:00:00.000000+0000", parameters={}), + step="P1D", + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format=date_format, + cursor_granularity=date_format_granularity, + lookback_window=InterpolatedString("P0D", parameters={}), + config=config, + parameters={}, + ) + output_date = slicer.parse_date(input_date) + assert output_date == expected_output_date + +
+@pytest.mark.parametrize( + "test_name, input_date, date_formats, expected_output_date", + [ + ( + "test_match_first_format", + "2021-01-01T00:00:00.000000+0000", + ["%Y-%m-%dT%H:%M:%S.%f%z", "%s"], + datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), + ), + ( + "test_match_second_format", + "1609459200", + ["%Y-%m-%dT%H:%M:%S.%f%z", "%s"], + datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), + ), + ], +) +def test_parse_date(test_name, input_date, date_formats, expected_output_date): + slicer = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("2021-01-01T00:00:00.000000+0000", parameters={}), + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + cursor_datetime_formats=date_formats, + config=config, + parameters={}, + ) + assert slicer.parse_date(input_date) == expected_output_date + + +def test_given_unknown_format_when_parse_date_then_raise_error(): + slicer = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("2021-01-01T00:00:00.000000+0000", parameters={}), + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + cursor_datetime_formats=["%Y-%m-%d", "%s"], + config=config, + parameters={}, + ) + with pytest.raises(ValueError): + slicer.parse_date("2021-01-01T00:00:00.000000+0000") + + +@pytest.mark.parametrize( + "test_name, input_dt, datetimeformat, datetimeformat_granularity, expected_output", + [ + ("test_format_timestamp", datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), "%s", "PT1S", "1609459200"), + ("test_format_string", datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), "%Y-%m-%d", "P1D", "2021-01-01"), + ("test_format_to_number", datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), "%Y%m%d", "P1D", "20210101"), + ], +) +def test_format_datetime(test_name, input_dt, datetimeformat, datetimeformat_granularity, expected_output): + slicer = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("2021-01-01T00:00:00.000000+0000", parameters={}), + end_datetime=MinMaxDatetime("2021-01-10T00:00:00.000000+0000", parameters={}), + step="P1D", + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format=datetimeformat, + cursor_granularity=datetimeformat_granularity, + lookback_window=InterpolatedString("P0D", parameters={}), + config=config, + parameters={}, + ) + + output_date = slicer._format_datetime(input_dt) + assert output_date == expected_output + + +def test_step_but_no_cursor_granularity(): + with pytest.raises(ValueError): + DatetimeBasedCursor( + start_datetime=MinMaxDatetime("2021-01-01T00:00:00.000000+0000", parameters={}), + end_datetime=MinMaxDatetime("2021-01-10T00:00:00.000000+0000", parameters={}), + step="P1D", + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + + +def test_cursor_granularity_but_no_step(): + with pytest.raises(ValueError): + DatetimeBasedCursor( + start_datetime=MinMaxDatetime("2021-01-01T00:00:00.000000+0000", parameters={}), + end_datetime=MinMaxDatetime("2021-01-10T00:00:00.000000+0000", parameters={}), + cursor_granularity="P1D", + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + + +def test_given_multiple_cursor_datetime_format_then_slice_using_first_format(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("2021-01-01", parameters={}), + end_datetime=MinMaxDatetime("2023-01-10", 
parameters={}), + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + cursor_datetime_formats=["%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"], + config=config, + parameters={}, + ) + stream_slices = cursor.stream_slices() + assert stream_slices == [{"start_time": "2021-01-01", "end_time": "2023-01-10"}] + + +def test_no_cursor_granularity_and_no_step_then_only_return_one_slice(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("2021-01-01", parameters={}), + end_datetime=MinMaxDatetime("2023-01-01", parameters={}), + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + stream_slices = cursor.stream_slices() + assert stream_slices == [{"start_time": "2021-01-01", "end_time": "2023-01-01"}] + + +def test_no_end_datetime(mock_datetime_now): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("2021-01-01", parameters={}), + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + stream_slices = cursor.stream_slices() + assert stream_slices == [{"start_time": "2021-01-01", "end_time": FAKE_NOW.strftime("%Y-%m-%d")}] + + +def test_given_no_state_and_start_before_cursor_value_when_should_be_synced_then_return_true(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("2021-01-01", parameters={}), + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + assert cursor.should_be_synced(Record({cursor_field: "2022-01-01"}, ANY_SLICE)) + + +def test_given_no_state_and_start_after_cursor_value_when_should_be_synced_then_return_false(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("2022-01-01", parameters={}), + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + assert not cursor.should_be_synced(Record({cursor_field: "2021-01-01"}, ANY_SLICE)) + + +def test_given_state_earliest_to_start_datetime_when_should_be_synced_then_use_state_as_earliest_boundary(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("2021-01-01", parameters={}), + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + cursor.set_initial_state({cursor_field: "2023-01-01"}) + assert not cursor.should_be_synced(Record({cursor_field: "2022-01-01"}, ANY_SLICE)) + + +def test_given_start_datetime_earliest_to_state_when_should_be_synced_then_use_start_datetime_as_earliest_boundary(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("2023-01-01", parameters={}), + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + cursor.set_initial_state({cursor_field: "2021-01-01"}) + assert not cursor.should_be_synced(Record({cursor_field: "2022-01-01"}, ANY_SLICE)) + + +def test_given_end_datetime_before_cursor_value_when_should_be_synced_then_return_false(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("2023-01-01", parameters={}), + end_datetime=MinMaxDatetime("2025-01-01", parameters={}), + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + assert not cursor.should_be_synced(Record({cursor_field: "2030-01-01"}, ANY_SLICE)) + + +def 
test_given_record_without_cursor_value_when_should_be_synced_then_return_true(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("3000-01-01", parameters={}), + cursor_field=InterpolatedString(cursor_field, parameters={}), + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + assert cursor.should_be_synced(Record({"record without cursor value": "any"}, ANY_SLICE)) + + +def test_given_first_greater_than_second_then_return_true(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("3000-01-01", parameters={}), + cursor_field="cursor_field", + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + assert cursor.is_greater_than_or_equal(Record({"cursor_field": "2023-01-01"}, {}), Record({"cursor_field": "2021-01-01"}, {})) + + +def test_given_first_lesser_than_second_then_return_false(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("3000-01-01", parameters={}), + cursor_field="cursor_field", + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + assert not cursor.is_greater_than_or_equal(Record({"cursor_field": "2021-01-01"}, {}), Record({"cursor_field": "2023-01-01"}, {})) + + +def test_given_no_cursor_value_for_second_than_second_then_return_true(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("3000-01-01", parameters={}), + cursor_field="cursor_field", + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + assert cursor.is_greater_than_or_equal(Record({"cursor_field": "2021-01-01"}, {}), Record({}, {})) + + +def test_given_no_cursor_value_for_first_than_second_then_return_false(): + cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime("3000-01-01", parameters={}), + cursor_field="cursor_field", + datetime_format="%Y-%m-%d", + config=config, + parameters={}, + ) + assert not cursor.is_greater_than_or_equal(Record({}, {}), Record({"cursor_field": "2021-01-01"}, {})) + + +if __name__ == "__main__": + unittest.main() diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/incremental/test_per_partition_cursor.py b/airbyte-cdk/python/unit_tests/sources/declarative/incremental/test_per_partition_cursor.py new file mode 100644 index 000000000000..823405cb5152 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/incremental/test_per_partition_cursor.py @@ -0,0 +1,553 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from collections import OrderedDict +from unittest.mock import Mock + +import pytest +from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor +from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import PerPartitionCursor, PerPartitionKeySerializer, StreamSlice +from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter +from airbyte_cdk.sources.types import Record + +PARTITION = { + "partition_key string": "partition value", + "partition_key int": 1, + "partition_key list str": ["list item 1", "list item 2"], + "partition_key list dict": [ + {"dict within list key 1-1": "dict within list value 1-1", "dict within list key 1-2": "dict within list value 1-2"}, + {"dict within list key 2": "dict within list value 2"}, + ], + "partition_key nested dict": { + "nested_partition_key 1": "a nested value", + "nested_partition_key 2": "another nested value", + }, +} + +CURSOR_SLICE_FIELD = "cursor slice field" +CURSOR_STATE_KEY = "cursor state" +CURSOR_STATE = {CURSOR_STATE_KEY: "a state value"} +NOT_CONSIDERED_BECAUSE_MOCKED_CURSOR_HAS_NO_STATE = "any" +STATE = { + "states": [ + { + "partition": { + "partition_router_field_1": "X1", + "partition_router_field_2": "Y1", + }, + "cursor": {"cursor state field": 1}, + }, + { + "partition": { + "partition_router_field_1": "X2", + "partition_router_field_2": "Y2", + }, + "cursor": {"cursor state field": 2}, + }, + ] +} + + +def test_partition_serialization(): + serializer = PerPartitionKeySerializer() + assert serializer.to_partition(serializer.to_partition_key(PARTITION)) == PARTITION + + +def test_partition_with_different_key_orders(): + ordered_dict = OrderedDict({"1": 1, "2": 2}) + same_dict_with_different_order = OrderedDict({"2": 2, "1": 1}) + serializer = PerPartitionKeySerializer() + + assert serializer.to_partition_key(ordered_dict) == serializer.to_partition_key(same_dict_with_different_order) + + +def test_given_tuples_in_json_then_deserialization_convert_to_list(): + """ + This is a known issue with the current implementation. 
However, the assumption is that this wouldn't be a problem as we only use the + immutability and we expect stream slices to be immutable anyway + """ + serializer = PerPartitionKeySerializer() + partition_with_tuple = {"key": (1, 2, 3)} + + assert partition_with_tuple != serializer.to_partition(serializer.to_partition_key(partition_with_tuple)) + + +def test_stream_slice_merge_dictionaries(): + stream_slice = StreamSlice(partition={"partition key": "partition value"}, cursor_slice={"cursor key": "cursor value"}) + assert stream_slice == {"partition key": "partition value", "cursor key": "cursor value"} + + +def test_overlapping_slice_keys_raise_error(): + with pytest.raises(ValueError): + StreamSlice(partition={"overlapping key": "partition value"}, cursor_slice={"overlapping key": "cursor value"}) + + +class MockedCursorBuilder: + def __init__(self): + self._stream_slices = [] + self._stream_state = {} + + def with_stream_slices(self, stream_slices): + self._stream_slices = stream_slices + return self + + def with_stream_state(self, stream_state): + self._stream_state = stream_state + return self + + def build(self): + cursor = Mock(spec=DeclarativeCursor) + cursor.get_stream_state.return_value = self._stream_state + cursor.stream_slices.return_value = self._stream_slices + return cursor + + +@pytest.fixture() +def mocked_partition_router(): + return Mock(spec=PartitionRouter) + + +@pytest.fixture() +def mocked_cursor_factory(): + cursor_factory = Mock() + cursor_factory.create.return_value = MockedCursorBuilder().build() + return cursor_factory + + +def test_given_no_partition_when_stream_slices_then_no_slices(mocked_cursor_factory, mocked_partition_router): + mocked_partition_router.stream_slices.return_value = [] + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + + slices = cursor.stream_slices() + + assert not next(slices, None) + + +def test_given_partition_router_without_state_has_one_partition_then_return_one_slice_per_cursor_slice( + mocked_cursor_factory, mocked_partition_router +): + partition = StreamSlice(partition={"partition_field_1": "a value", "partition_field_2": "another value"}, cursor_slice={}) + mocked_partition_router.stream_slices.return_value = [partition] + cursor_slices = [{"start_datetime": 1}, {"start_datetime": 2}] + mocked_cursor_factory.create.return_value = MockedCursorBuilder().with_stream_slices(cursor_slices).build() + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + + slices = cursor.stream_slices() + + assert list(slices) == [StreamSlice(partition=partition, cursor_slice=cursor_slice) for cursor_slice in cursor_slices] + + +def test_given_partition_associated_with_state_when_stream_slices_then_do_not_recreate_cursor( + mocked_cursor_factory, mocked_partition_router +): + partition = StreamSlice(partition={"partition_field_1": "a value", "partition_field_2": "another value"}, cursor_slice={}) + mocked_partition_router.stream_slices.return_value = [partition] + cursor_slices = [{"start_datetime": 1}] + mocked_cursor_factory.create.return_value = MockedCursorBuilder().with_stream_slices(cursor_slices).build() + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + + cursor.set_initial_state({"states": [{"partition": partition.partition, "cursor": CURSOR_STATE}]}) + mocked_cursor_factory.create.assert_called_once() + slices = list(cursor.stream_slices()) + + mocked_cursor_factory.create.assert_called_once() + assert len(slices) == 1 + + +def 
test_given_multiple_partitions_then_each_have_their_state(mocked_cursor_factory, mocked_partition_router): + first_partition = {"first_partition_key": "first_partition_value"} + mocked_partition_router.stream_slices.return_value = [ + StreamSlice(partition=first_partition, cursor_slice={}), + StreamSlice(partition={"second_partition_key": "second_partition_value"}, cursor_slice={}), + ] + first_cursor = MockedCursorBuilder().with_stream_slices([{CURSOR_SLICE_FIELD: "first slice cursor value"}]).build() + second_cursor = MockedCursorBuilder().with_stream_slices([{CURSOR_SLICE_FIELD: "second slice cursor value"}]).build() + mocked_cursor_factory.create.side_effect = [first_cursor, second_cursor] + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + + cursor.set_initial_state({"states": [{"partition": first_partition, "cursor": CURSOR_STATE}]}) + slices = list(cursor.stream_slices()) + + first_cursor.stream_slices.assert_called_once() + second_cursor.stream_slices.assert_called_once() + assert slices == [ + StreamSlice( + partition={"first_partition_key": "first_partition_value"}, cursor_slice={CURSOR_SLICE_FIELD: "first slice cursor value"} + ), + StreamSlice( + partition={"second_partition_key": "second_partition_value"}, cursor_slice={CURSOR_SLICE_FIELD: "second slice cursor value"} + ), + ] + + +def test_given_stream_slices_when_get_stream_state_then_return_updated_state(mocked_cursor_factory, mocked_partition_router): + mocked_cursor_factory.create.side_effect = [ + MockedCursorBuilder().with_stream_state({CURSOR_STATE_KEY: "first slice cursor value"}).build(), + MockedCursorBuilder().with_stream_state({CURSOR_STATE_KEY: "second slice cursor value"}).build(), + ] + mocked_partition_router.stream_slices.return_value = [ + StreamSlice(partition={"partition key": "first partition"}, cursor_slice={}), + StreamSlice(partition={"partition key": "second partition"}, cursor_slice={}), + ] + + # Mock the get_parent_state method to return the parent state + mocked_partition_router.get_stream_state.return_value = {} + + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + list(cursor.stream_slices()) + assert cursor.get_stream_state() == { + "states": [ + {"partition": {"partition key": "first partition"}, "cursor": {CURSOR_STATE_KEY: "first slice cursor value"}}, + {"partition": {"partition key": "second partition"}, "cursor": {CURSOR_STATE_KEY: "second slice cursor value"}}, + ] + } + + +def test_when_get_stream_state_then_delegate_to_underlying_cursor(mocked_cursor_factory, mocked_partition_router): + underlying_cursor = MockedCursorBuilder().with_stream_slices([{CURSOR_SLICE_FIELD: "first slice cursor value"}]).build() + mocked_cursor_factory.create.side_effect = [underlying_cursor] + mocked_partition_router.stream_slices.return_value = [StreamSlice(partition={"partition key": "first partition"}, cursor_slice={})] + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + first_slice = list(cursor.stream_slices())[0] + + cursor.should_be_synced(Record({}, first_slice)) + + underlying_cursor.should_be_synced.assert_called_once_with(Record({}, first_slice.cursor_slice)) + + +def test_close_slice(mocked_cursor_factory, mocked_partition_router): + underlying_cursor = MockedCursorBuilder().with_stream_slices([{CURSOR_SLICE_FIELD: "first slice cursor value"}]).build() + mocked_cursor_factory.create.side_effect = [underlying_cursor] + stream_slice = StreamSlice(partition={"partition key": "first partition"}, cursor_slice={}) + 
mocked_partition_router.stream_slices.return_value = [stream_slice] + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + list(cursor.stream_slices()) # generate internal state + + cursor.close_slice(stream_slice) + + underlying_cursor.close_slice.assert_called_once_with(stream_slice.cursor_slice) + + +def test_given_no_last_record_when_close_slice_then_do_not_raise_error(mocked_cursor_factory, mocked_partition_router): + underlying_cursor = MockedCursorBuilder().with_stream_slices([{CURSOR_SLICE_FIELD: "first slice cursor value"}]).build() + mocked_cursor_factory.create.side_effect = [underlying_cursor] + stream_slice = StreamSlice(partition={"partition key": "first partition"}, cursor_slice={}) + mocked_partition_router.stream_slices.return_value = [stream_slice] + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + list(cursor.stream_slices()) # generate internal state + + cursor.close_slice(stream_slice) + + underlying_cursor.close_slice.assert_called_once_with(stream_slice.cursor_slice) + + +def test_given_unknown_partition_when_close_slice_then_raise_error(): + any_cursor_factory = Mock() + any_partition_router = Mock() + cursor = PerPartitionCursor(any_cursor_factory, any_partition_router) + stream_slice = StreamSlice(partition={"unknown_partition": "unknown"}, cursor_slice={}) + with pytest.raises(ValueError): + cursor.close_slice(stream_slice) + + +def test_given_unknown_partition_when_should_be_synced_then_raise_error(): + any_cursor_factory = Mock() + any_partition_router = Mock() + cursor = PerPartitionCursor(any_cursor_factory, any_partition_router) + with pytest.raises(ValueError): + cursor.should_be_synced(Record({}, StreamSlice(partition={"unknown_partition": "unknown"}, cursor_slice={}))) + + +def test_given_records_with_different_slice_when_is_greater_than_or_equal_then_raise_error(): + any_cursor_factory = Mock() + any_partition_router = Mock() + cursor = PerPartitionCursor(any_cursor_factory, any_partition_router) + with pytest.raises(ValueError): + cursor.is_greater_than_or_equal( + Record({}, StreamSlice(partition={"a slice": "value"}, cursor_slice={})), + Record({}, StreamSlice(partition={"another slice": "value"}, cursor_slice={})), + ) + + +@pytest.mark.parametrize( + "first_record_slice, second_record_slice", + [ + pytest.param(StreamSlice(partition={"a slice": "value"}, cursor_slice={}), None, id="second record does not have a slice"), + pytest.param(None, StreamSlice(partition={"a slice": "value"}, cursor_slice={}), id="first record does not have a slice"), + ], +) +def test_given_records_without_a_slice_when_is_greater_than_or_equal_then_raise_error(first_record_slice, second_record_slice): + any_cursor_factory = Mock() + any_partition_router = Mock() + cursor = PerPartitionCursor(any_cursor_factory, any_partition_router) + with pytest.raises(ValueError): + cursor.is_greater_than_or_equal(Record({}, first_record_slice), Record({}, second_record_slice)) + + +def test_given_slice_is_unknown_when_is_greater_than_or_equal_then_raise_error(): + any_cursor_factory = Mock() + any_partition_router = Mock() + cursor = PerPartitionCursor(any_cursor_factory, any_partition_router) + with pytest.raises(ValueError): + cursor.is_greater_than_or_equal( + Record({}, StreamSlice(partition={"a slice": "value"}, cursor_slice={})), + Record({}, StreamSlice(partition={"a slice": "value"}, cursor_slice={})), + ) + + +def test_when_is_greater_than_or_equal_then_return_underlying_cursor_response(mocked_cursor_factory, 
mocked_partition_router): + underlying_cursor = MockedCursorBuilder().with_stream_slices([{CURSOR_SLICE_FIELD: "first slice cursor value"}]).build() + mocked_cursor_factory.create.side_effect = [underlying_cursor] + stream_slice = StreamSlice(partition={"partition key": "first partition"}, cursor_slice={}) + mocked_partition_router.stream_slices.return_value = [stream_slice] + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + first_record = Record({"first": "value"}, stream_slice) + second_record = Record({"second": "value"}, stream_slice) + list(cursor.stream_slices()) # generate internal state + + result = cursor.is_greater_than_or_equal(first_record, second_record) + + assert result == underlying_cursor.is_greater_than_or_equal.return_value + underlying_cursor.is_greater_than_or_equal.assert_called_once_with(first_record, second_record) + + +@pytest.mark.parametrize( + "stream_slice, expected_output", + [ + pytest.param( + StreamSlice(partition={"partition key": "first partition"}, cursor_slice={}), + {"cursor": "params", "router": "params"}, + id="first partition", + ), + pytest.param(None, None, id="first partition"), + ], +) +def test_get_request_params(mocked_cursor_factory, mocked_partition_router, stream_slice, expected_output): + underlying_cursor = MockedCursorBuilder().with_stream_slices([{CURSOR_SLICE_FIELD: "first slice cursor value"}]).build() + underlying_cursor.get_request_params.return_value = {"cursor": "params"} + mocked_cursor_factory.create.side_effect = [underlying_cursor] + mocked_partition_router.stream_slices.return_value = [stream_slice] + mocked_partition_router.get_request_params.return_value = {"router": "params"} + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + if stream_slice: + cursor.set_initial_state({"states": [{"partition": stream_slice.partition, "cursor": CURSOR_STATE}]}) + params = cursor.get_request_params(stream_slice=stream_slice) + assert params == expected_output + mocked_partition_router.get_request_params.assert_called_once_with( + stream_state=None, stream_slice=stream_slice, next_page_token=None + ) + underlying_cursor.get_request_params.assert_called_once_with(stream_state=None, stream_slice={}, next_page_token=None) + else: + with pytest.raises(ValueError): + cursor.get_request_params(stream_slice=stream_slice) + + +@pytest.mark.parametrize( + "stream_slice, expected_output", + [ + pytest.param( + StreamSlice(partition={"partition key": "first partition"}, cursor_slice={}), + {"cursor": "params", "router": "params"}, + id="first partition", + ), + pytest.param(None, None, id="first partition"), + ], +) +def test_get_request_headers(mocked_cursor_factory, mocked_partition_router, stream_slice, expected_output): + underlying_cursor = MockedCursorBuilder().with_stream_slices([{CURSOR_SLICE_FIELD: "first slice cursor value"}]).build() + underlying_cursor.get_request_headers.return_value = {"cursor": "params"} + mocked_cursor_factory.create.side_effect = [underlying_cursor] + mocked_partition_router.stream_slices.return_value = [stream_slice] + mocked_partition_router.get_request_headers.return_value = {"router": "params"} + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + if stream_slice: + cursor.set_initial_state({"states": [{"partition": stream_slice.partition, "cursor": CURSOR_STATE}]}) + params = cursor.get_request_headers(stream_slice=stream_slice) + assert params == expected_output + 
mocked_partition_router.get_request_headers.assert_called_once_with( + stream_state=None, stream_slice=stream_slice, next_page_token=None + ) + underlying_cursor.get_request_headers.assert_called_once_with(stream_state=None, stream_slice={}, next_page_token=None) + else: + with pytest.raises(ValueError): + cursor.get_request_headers(stream_slice=stream_slice) + + +@pytest.mark.parametrize( + "stream_slice, expected_output", + [ + pytest.param( + StreamSlice(partition={"partition key": "first partition"}, cursor_slice={}), + {"cursor": "params", "router": "params"}, + id="first partition", + ), + pytest.param(None, None, id="first partition"), + ], +) +def test_get_request_body_data(mocked_cursor_factory, mocked_partition_router, stream_slice, expected_output): + underlying_cursor = MockedCursorBuilder().with_stream_slices([{CURSOR_SLICE_FIELD: "first slice cursor value"}]).build() + underlying_cursor.get_request_body_data.return_value = {"cursor": "params"} + mocked_cursor_factory.create.side_effect = [underlying_cursor] + mocked_partition_router.stream_slices.return_value = [stream_slice] + mocked_partition_router.get_request_body_data.return_value = {"router": "params"} + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + if stream_slice: + cursor.set_initial_state({"states": [{"partition": stream_slice.partition, "cursor": CURSOR_STATE}]}) + params = cursor.get_request_body_data(stream_slice=stream_slice) + assert params == expected_output + mocked_partition_router.get_request_body_data.assert_called_once_with( + stream_state=None, stream_slice=stream_slice, next_page_token=None + ) + underlying_cursor.get_request_body_data.assert_called_once_with(stream_state=None, stream_slice={}, next_page_token=None) + else: + with pytest.raises(ValueError): + cursor.get_request_body_data(stream_slice=stream_slice) + + +@pytest.mark.parametrize( + "stream_slice, expected_output", + [ + pytest.param( + StreamSlice(partition={"partition key": "first partition"}, cursor_slice={}), + {"cursor": "params", "router": "params"}, + id="first partition", + ), + pytest.param(None, None, id="first partition"), + ], +) +def test_get_request_body_json(mocked_cursor_factory, mocked_partition_router, stream_slice, expected_output): + underlying_cursor = MockedCursorBuilder().with_stream_slices([{CURSOR_SLICE_FIELD: "first slice cursor value"}]).build() + underlying_cursor.get_request_body_json.return_value = {"cursor": "params"} + mocked_cursor_factory.create.side_effect = [underlying_cursor] + mocked_partition_router.stream_slices.return_value = [stream_slice] + mocked_partition_router.get_request_body_json.return_value = {"router": "params"} + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + if stream_slice: + cursor.set_initial_state({"states": [{"partition": stream_slice.partition, "cursor": CURSOR_STATE}]}) + params = cursor.get_request_body_json(stream_slice=stream_slice) + assert params == expected_output + mocked_partition_router.get_request_body_json.assert_called_once_with( + stream_state=None, stream_slice=stream_slice, next_page_token=None + ) + underlying_cursor.get_request_body_json.assert_called_once_with(stream_state=None, stream_slice={}, next_page_token=None) + else: + with pytest.raises(ValueError): + cursor.get_request_body_json(stream_slice=stream_slice) + + +def test_parent_state_is_set_for_per_partition_cursor(mocked_cursor_factory, mocked_partition_router): + # Define the parent state to be used in the test + parent_state = 
{"parent_cursor": "parent_state_value"} + + # Mock the partition router to return a stream slice + partition = StreamSlice(partition={"partition_field_1": "a value", "partition_field_2": "another value"}, cursor_slice={}) + mocked_partition_router.stream_slices.return_value = [partition] + + # Mock the cursor factory to create cursors with specific states + mocked_cursor_factory.create.side_effect = [ + MockedCursorBuilder() + .with_stream_slices([{CURSOR_SLICE_FIELD: "first slice cursor value"}]) + .with_stream_state(CURSOR_STATE) + .build(), + ] + + # Mock the get_parent_state method to return the parent state + mocked_partition_router.get_stream_state.return_value = parent_state + + # Initialize the PerPartitionCursor with the mocked cursor factory and partition router + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + + # Set the initial state, including the parent state + initial_state = { + "states": [{"partition": partition.partition, "cursor": CURSOR_STATE}], + "parent_state": parent_state, + } + cursor.set_initial_state(initial_state) + + # Verify that the parent state has been set correctly + assert cursor.get_stream_state()["parent_state"] == parent_state + + # Verify that set_parent_state was called on the partition router with the initial state + mocked_partition_router.set_initial_state.assert_called_once_with(initial_state) + + +def test_get_stream_state_includes_parent_state(mocked_cursor_factory, mocked_partition_router): + # Define the parent state to be used in the test + parent_state = {"parent_cursor": "parent_state_value"} + + # Define the expected cursor states + cursor_state_1 = {CURSOR_STATE_KEY: "first slice cursor value"} + cursor_state_2 = {CURSOR_STATE_KEY: "second slice cursor value"} + + # Mock the partition router to return stream slices + partition_1 = {"partition_field_1": "a value", "partition_field_2": "another value"} + partition_2 = {"partition_field_1": "another value", "partition_field_2": "yet another value"} + mocked_partition_router.stream_slices.return_value = [ + StreamSlice(partition=partition_1, cursor_slice={}), + StreamSlice(partition=partition_2, cursor_slice={}), + ] + + # Mock the cursor factory to create cursors with specific states + mocked_cursor_factory.create.side_effect = [ + MockedCursorBuilder().with_stream_state(cursor_state_1).build(), + MockedCursorBuilder().with_stream_state(cursor_state_2).build(), + ] + + # Mock the get_parent_state method to return the parent state + mocked_partition_router.get_stream_state.return_value = parent_state + + # Initialize the PerPartitionCursor with the mocked cursor factory and partition router + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + + # Simulate reading the records to initialize the internal state + list(cursor.stream_slices()) + + # Get the combined stream state + stream_state = cursor.get_stream_state() + + # Verify that the combined state includes both partition states and the parent state + expected_state = { + "states": [ + {"partition": partition_1, "cursor": cursor_state_1}, + {"partition": partition_2, "cursor": cursor_state_2}, + ], + "parent_state": parent_state, + } + assert stream_state == expected_state + + +def test_per_partition_state_when_set_initial_global_state(mocked_cursor_factory, mocked_partition_router) -> None: + first_partition = {"first_partition_key": "first_partition_value"} + second_partition = {"second_partition_key": "second_partition_value"} + global_state = {"global_state_format_key": 
"global_state_format_value"} + + mocked_partition_router.stream_slices.return_value = [ + StreamSlice(partition=first_partition, cursor_slice={}), + StreamSlice(partition=second_partition, cursor_slice={}), + ] + mocked_cursor_factory.create.side_effect = [ + MockedCursorBuilder().with_stream_state(global_state).build(), + MockedCursorBuilder().with_stream_state(global_state).build(), + ] + cursor = PerPartitionCursor(mocked_cursor_factory, mocked_partition_router) + global_state = {"global_state_format_key": "global_state_format_value"} + cursor.set_initial_state(global_state) + assert cursor._state_to_migrate_from == global_state + list(cursor.stream_slices()) + assert cursor._cursor_per_partition['{"first_partition_key":"first_partition_value"}'].set_initial_state.call_count == 1 + assert cursor._cursor_per_partition['{"first_partition_key":"first_partition_value"}'].set_initial_state.call_args[0] == ( + {"global_state_format_key": "global_state_format_value"}, + ) + assert cursor._cursor_per_partition['{"second_partition_key":"second_partition_value"}'].set_initial_state.call_count == 1 + assert cursor._cursor_per_partition['{"second_partition_key":"second_partition_value"}'].set_initial_state.call_args[0] == ( + {"global_state_format_key": "global_state_format_value"}, + ) + expected_state = [ + {"cursor": {"global_state_format_key": "global_state_format_value"}, "partition": {"first_partition_key": "first_partition_value"}}, + { + "cursor": {"global_state_format_key": "global_state_format_value"}, + "partition": {"second_partition_key": "second_partition_value"}, + }, + ] + assert cursor.get_stream_state()["states"] == expected_state diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py b/airbyte-cdk/python/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py new file mode 100644 index 000000000000..4fff298b99aa --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py @@ -0,0 +1,571 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import logging +from unittest.mock import MagicMock, patch + +from airbyte_cdk.models import ( + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStateType, + AirbyteStream, + AirbyteStreamState, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + DestinationSyncMode, + StreamDescriptor, + SyncMode, +) +from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import PerPartitionCursor, StreamSlice +from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource +from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever +from airbyte_cdk.sources.types import Record +from orjson import orjson + +CURSOR_FIELD = "cursor_field" +SYNC_MODE = SyncMode.incremental + + +class ManifestBuilder: + def __init__(self): + self._incremental_sync = {} + self._partition_router = {} + self._substream_partition_router = {} + + def with_list_partition_router(self, stream_name, cursor_field, partitions): + self._partition_router[stream_name] = { + "type": "ListPartitionRouter", + "cursor_field": cursor_field, + "values": partitions, + } + return self + + def with_substream_partition_router(self, stream_name): + self._substream_partition_router[stream_name] = { + "type": "SubstreamPartitionRouter", + "parent_stream_configs": [ + { + "type": "ParentStreamConfig", + "stream": "#/definitions/Rates", + "parent_key": "id", + "partition_field": "parent_id", + } + ], + } + return self + + def with_incremental_sync(self, stream_name, start_datetime, end_datetime, datetime_format, cursor_field, step, cursor_granularity): + self._incremental_sync[stream_name] = { + "type": "DatetimeBasedCursor", + "start_datetime": start_datetime, + "end_datetime": end_datetime, + "datetime_format": datetime_format, + "cursor_field": cursor_field, + "step": step, + "cursor_granularity": cursor_granularity, + } + return self + + def build(self): + manifest = { + "version": "0.34.2", + "type": "DeclarativeSource", + "check": {"type": "CheckStream", "stream_names": ["Rates"]}, + "definitions": { + "AnotherStream": { + "type": "DeclarativeStream", + "name": "AnotherStream", + "primary_key": [], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$schema": "http://json-schema.org/schema#", "properties": {"id": {"type": "string"}}, "type": "object"}, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.apilayer.com", + "path": "/exchangerates_data/latest", + "http_method": "GET", + }, + "record_selector": {"type": "RecordSelector", "extractor": {"type": "DpathExtractor", "field_path": []}}, + }, + }, + "Rates": { + "type": "DeclarativeStream", + "name": "Rates", + "primary_key": [], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$schema": "http://json-schema.org/schema#", "properties": {}, "type": "object"}, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.apilayer.com", + "path": "/exchangerates_data/latest", + "http_method": "GET", + }, + "record_selector": {"type": "RecordSelector", "extractor": {"type": "DpathExtractor", "field_path": []}}, + }, + }, + }, + "streams": [{"$ref": "#/definitions/Rates"}, {"$ref": "#/definitions/AnotherStream"}], + "spec": { + "connection_specification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": [], + "properties": {}, + "additionalProperties": True, + }, + "documentation_url": "https://example.org", 
+ "type": "Spec", + }, + } + for stream_name, incremental_sync_definition in self._incremental_sync.items(): + manifest["definitions"][stream_name]["incremental_sync"] = incremental_sync_definition + for stream_name, partition_router_definition in self._partition_router.items(): + manifest["definitions"][stream_name]["retriever"]["partition_router"] = partition_router_definition + for stream_name, partition_router_definition in self._substream_partition_router.items(): + manifest["definitions"][stream_name]["retriever"]["partition_router"] = partition_router_definition + return manifest + + +def test_given_state_for_only_some_partition_when_stream_slices_then_create_slices_using_state_or_start_from_start_datetime(): + source = ManifestDeclarativeSource( + source_config=ManifestBuilder() + .with_list_partition_router("Rates", "partition_field", ["1", "2"]) + .with_incremental_sync( + "Rates", + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build() + ) + stream_instance = source.streams({})[0] + stream_instance.state = { + "states": [ + { + "partition": {"partition_field": "1"}, + "cursor": {CURSOR_FIELD: "2022-02-01"}, + } + ] + } + + slices = stream_instance.stream_slices( + sync_mode=SYNC_MODE, + stream_state={}, + ) + + assert list(slices) == [ + {"partition_field": "1", "start_time": "2022-02-01", "end_time": "2022-02-28"}, + {"partition_field": "2", "start_time": "2022-01-01", "end_time": "2022-01-31"}, + {"partition_field": "2", "start_time": "2022-02-01", "end_time": "2022-02-28"}, + ] + + +def test_given_record_for_partition_when_read_then_update_state(): + source = ManifestDeclarativeSource( + source_config=ManifestBuilder() + .with_list_partition_router("Rates", "partition_field", ["1", "2"]) + .with_incremental_sync( + "Rates", + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build() + ) + stream_instance = source.streams({})[0] + list(stream_instance.stream_slices(sync_mode=SYNC_MODE)) + + stream_slice = StreamSlice(partition={"partition_field": "1"}, cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}) + with patch.object( + SimpleRetriever, "_read_pages", side_effect=[[Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, stream_slice)]] + ): + list( + stream_instance.read_records( + sync_mode=SYNC_MODE, + stream_slice=stream_slice, + stream_state={"states": []}, + cursor_field=CURSOR_FIELD, + ) + ) + + assert stream_instance.state == { + "state": {}, + "use_global_cursor": False, + "states": [ + { + "partition": {"partition_field": "1"}, + "cursor": {CURSOR_FIELD: "2022-01-15"}, + } + ], + } + + +def test_substream_without_input_state(): + test_source = ManifestDeclarativeSource( + source_config=ManifestBuilder() + .with_substream_partition_router("AnotherStream") + .with_incremental_sync( + "Rates", + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .with_incremental_sync( + "AnotherStream", + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build() + ) + + stream_instance = test_source.streams({})[1] + + parent_stream_slice = StreamSlice(partition={}, cursor_slice={"start_time": 
"2022-01-01", "end_time": "2022-01-31"}) + + # This mocks the resulting records of the Rates stream which acts as the parent stream of the SubstreamPartitionRouter being tested + with patch.object( + SimpleRetriever, + "_read_pages", + side_effect=[ + [Record({"id": "1", CURSOR_FIELD: "2022-01-15"}, parent_stream_slice)], + [Record({"id": "2", CURSOR_FIELD: "2022-01-15"}, parent_stream_slice)], + ], + ): + slices = list(stream_instance.stream_slices(sync_mode=SYNC_MODE)) + assert list(slices) == [ + StreamSlice( + partition={ + "parent_id": "1", + "parent_slice": {}, + }, + cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}, + ), + StreamSlice( + partition={ + "parent_id": "1", + "parent_slice": {}, + }, + cursor_slice={"start_time": "2022-02-01", "end_time": "2022-02-28"}, + ), + StreamSlice( + partition={ + "parent_id": "2", + "parent_slice": {}, + }, + cursor_slice={"start_time": "2022-01-01", "end_time": "2022-01-31"}, + ), + StreamSlice( + partition={ + "parent_id": "2", + "parent_slice": {}, + }, + cursor_slice={"start_time": "2022-02-01", "end_time": "2022-02-28"}, + ), + ] + + +def test_partition_limitation(caplog): + """ + Test that when the number of partitions exceeds the maximum allowed limit in PerPartitionCursor, + the oldest partitions are dropped, and the state is updated accordingly. + + In this test, we set the maximum number of partitions to 2 and provide 3 partitions. + We verify that the state only retains information for the two most recent partitions. + """ + source = ManifestDeclarativeSource( + source_config=ManifestBuilder() + .with_list_partition_router("Rates", "partition_field", ["1", "2", "3"]) + .with_incremental_sync( + "Rates", + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build() + ) + + partition_slices = [ + StreamSlice(partition={"partition_field": "1"}, cursor_slice={}), + StreamSlice(partition={"partition_field": "2"}, cursor_slice={}), + StreamSlice(partition={"partition_field": "3"}, cursor_slice={}), + ] + + records_list = [ + [ + Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, partition_slices[0]), + Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, partition_slices[0]), + ], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-02-15"}, partition_slices[0])], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, partition_slices[1])], + [], + [], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-02-17"}, partition_slices[2])], + ] + + configured_stream = ConfiguredAirbyteStream( + stream=AirbyteStream(name="Rates", json_schema={}, supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental]), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ) + catalog = ConfiguredAirbyteCatalog(streams=[configured_stream]) + + initial_state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="post_comment_votes", namespace=None), + stream_state=AirbyteStateBlob( + { + "states": [ + { + "partition": {"partition_field": "1"}, + "cursor": {CURSOR_FIELD: "2022-01-01"}, + }, + { + "partition": {"partition_field": "2"}, + "cursor": {CURSOR_FIELD: "2022-01-02"}, + }, + { + "partition": {"partition_field": "3"}, + "cursor": {CURSOR_FIELD: "2022-01-03"}, + }, + ] + } + ), + ), + ) + ] + logger = MagicMock() + + # Use caplog 
to capture logs + with caplog.at_level(logging.WARNING, logger="airbyte"): + with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): + with patch.object(PerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): + output = list(source.read(logger, {}, catalog, initial_state)) + + # Check if the warning was logged + logged_messages = [record.message for record in caplog.records if record.levelname == "WARNING"] + warning_message = ( + 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"1"}. Over limit: 1.' + ) + assert warning_message in logged_messages + + final_state = [orjson.loads(orjson.dumps(message.state.stream.stream_state)) for message in output if message.state] + assert final_state[-1] == { + "lookback_window": 1, + "state": {"cursor_field": "2022-02-17"}, + "use_global_cursor": False, + "states": [ + { + "partition": {"partition_field": "2"}, + "cursor": {CURSOR_FIELD: "2022-01-16"}, + }, + { + "partition": {"partition_field": "3"}, + "cursor": {CURSOR_FIELD: "2022-02-17"}, + }, + ], + } + + +def test_perpartition_with_fallback(caplog): + """ + Test that when the number of partitions exceeds the limit in PerPartitionCursor, + the cursor falls back to using the global cursor for state management. + + This test also checks that the appropriate warning logs are emitted when the partition limit is exceeded. + """ + source = ManifestDeclarativeSource( + source_config=ManifestBuilder() + .with_list_partition_router("Rates", "partition_field", ["1", "2", "3", "4", "5", "6"]) + .with_incremental_sync( + "Rates", + start_datetime="2022-01-01", + end_datetime="2022-02-28", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build() + ) + + partition_slices = [StreamSlice(partition={"partition_field": str(i)}, cursor_slice={}) for i in range(1, 7)] + + records_list = [ + [ + Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, partition_slices[0]), + Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, partition_slices[0]), + ], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-02-15"}, partition_slices[0])], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, partition_slices[1])], + [], + [], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-02-17"}, partition_slices[2])], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-17"}, partition_slices[3])], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-02-19"}, partition_slices[3])], + [], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, partition_slices[4])], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-13"}, partition_slices[3])], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, partition_slices[3])], + ] + + configured_stream = ConfiguredAirbyteStream( + stream=AirbyteStream(name="Rates", json_schema={}, supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental]), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ) + catalog = ConfiguredAirbyteCatalog(streams=[configured_stream]) + + initial_state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="Rates", namespace=None), + stream_state=AirbyteStateBlob( + { + "states": [ + { + "partition": {"partition_field": "1"}, + "cursor": {CURSOR_FIELD: 
"2022-01-01"}, + }, + { + "partition": {"partition_field": "2"}, + "cursor": {CURSOR_FIELD: "2022-01-02"}, + }, + { + "partition": {"partition_field": "3"}, + "cursor": {CURSOR_FIELD: "2022-01-03"}, + }, + ] + } + ), + ), + ) + ] + logger = MagicMock() + + # Use caplog to capture logs + with caplog.at_level(logging.WARNING, logger="airbyte"): + with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): + with patch.object(PerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 2): + output = list(source.read(logger, {}, catalog, initial_state)) + + # Check if the warnings were logged + expected_warning_messages = [ + 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"1"}. Over limit: 1.', + 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"2"}. Over limit: 2.', + 'The maximum number of partitions has been reached. Dropping the oldest partition: {"partition_field":"3"}. Over limit: 3.', + ] + + logged_messages = [record.message for record in caplog.records if record.levelname == "WARNING"] + + for expected_message in expected_warning_messages: + assert expected_message in logged_messages + + # Proceed with existing assertions + final_state = [orjson.loads(orjson.dumps(message.state.stream.stream_state)) for message in output if message.state] + assert final_state[-1] == {"use_global_cursor": True, "state": {"cursor_field": "2022-02-19"}, "lookback_window": 1} + + +def test_per_partition_cursor_within_limit(caplog): + """ + Test that the PerPartitionCursor correctly updates the state for each partition + when the number of partitions is within the allowed limit. + + This test also checks that no warning logs are emitted when the partition limit is not exceeded. 
+ """ + source = ManifestDeclarativeSource( + source_config=ManifestBuilder() + .with_list_partition_router("Rates", "partition_field", ["1", "2", "3"]) + .with_incremental_sync( + "Rates", + start_datetime="2022-01-01", + end_datetime="2022-03-31", + datetime_format="%Y-%m-%d", + cursor_field=CURSOR_FIELD, + step="P1M", + cursor_granularity="P1D", + ) + .build() + ) + + partition_slices = [StreamSlice(partition={"partition_field": str(i)}, cursor_slice={}) for i in range(1, 4)] + + records_list = [ + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-15"}, partition_slices[0])], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-02-20"}, partition_slices[0])], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-03-25"}, partition_slices[0])], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-16"}, partition_slices[1])], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-02-18"}, partition_slices[1])], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-03-28"}, partition_slices[1])], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-01-17"}, partition_slices[2])], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-02-19"}, partition_slices[2])], + [Record({"a record key": "a record value", CURSOR_FIELD: "2022-03-29"}, partition_slices[2])], + ] + + configured_stream = ConfiguredAirbyteStream( + stream=AirbyteStream(name="Rates", json_schema={}, supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental]), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ) + catalog = ConfiguredAirbyteCatalog(streams=[configured_stream]) + + initial_state = {} + logger = MagicMock() + + # Use caplog to capture logs + with caplog.at_level(logging.WARNING, logger="airbyte"): + with patch.object(SimpleRetriever, "_read_pages", side_effect=records_list): + with patch.object(PerPartitionCursor, "DEFAULT_MAX_PARTITIONS_NUMBER", 5): + output = list(source.read(logger, {}, catalog, initial_state)) + + # Since the partition limit is not exceeded, we expect no warnings + logged_warnings = [record.message for record in caplog.records if record.levelname == "WARNING"] + assert len(logged_warnings) == 0 + + # Proceed with existing assertions + final_state = [orjson.loads(orjson.dumps(message.state.stream.stream_state)) for message in output if message.state] + assert final_state[-1] == { + "lookback_window": 1, + "state": {"cursor_field": "2022-03-29"}, + "use_global_cursor": False, + "states": [ + { + "partition": {"partition_field": "1"}, + "cursor": {CURSOR_FIELD: "2022-03-25"}, + }, + { + "partition": {"partition_field": "2"}, + "cursor": {CURSOR_FIELD: "2022-03-28"}, + }, + { + "partition": {"partition_field": "3"}, + "cursor": {CURSOR_FIELD: "2022-03-29"}, + }, + ], + } diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/incremental/test_resumable_full_refresh_cursor.py b/airbyte-cdk/python/unit_tests/sources/declarative/incremental/test_resumable_full_refresh_cursor.py new file mode 100644 index 000000000000..b45973283aad --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/incremental/test_resumable_full_refresh_cursor.py @@ -0,0 +1,41 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ +import pytest +from airbyte_cdk.sources.declarative.incremental import ChildPartitionResumableFullRefreshCursor, ResumableFullRefreshCursor +from airbyte_cdk.sources.types import StreamSlice + + +@pytest.mark.parametrize( + "stream_state, cursor, expected_slice", + [ + pytest.param( + {"updated_at": "2024-04-30"}, + ResumableFullRefreshCursor, + StreamSlice(cursor_slice={"updated_at": "2024-04-30"}, partition={}), + id="test_set_resumable_full_refresh_incoming_stream_state", + ), + pytest.param( + {"updated_at": "2024-05-32"}, + ChildPartitionResumableFullRefreshCursor, + StreamSlice(cursor_slice={"updated_at": "2024-05-32"}, partition={}), + id="test_set_substream_resumable_full_refresh_incoming_stream_state", + ), + pytest.param( + {}, + ResumableFullRefreshCursor, + StreamSlice(cursor_slice={}, partition={}), + id="test_empty_resumable_full_refresh_stream_state", + ), + pytest.param( + {}, + ChildPartitionResumableFullRefreshCursor, + StreamSlice(cursor_slice={}, partition={}), + id="test_empty_substream_resumable_full_refresh_stream_state", + ), + ], +) +def test_stream_slices(stream_state, cursor, expected_slice): + cursor = cursor(parameters={}) + cursor.set_initial_state(stream_state=stream_state) + actual_slices = [stream_slice for stream_slice in cursor.stream_slices()] + assert actual_slices == [expected_slice] diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/__init__.py new file mode 100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_filters.py b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_filters.py new file mode 100644 index 000000000000..912abf3d2d0c --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_filters.py @@ -0,0 +1,104 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# +import base64 +import hashlib + +import pytest +from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation + +interpolation = JinjaInterpolation() + + +def test_hash_md5_no_salt() -> None: + input_string = "abcd" + s = "{{ '%s' | hash('md5') }}" % input_string + filter_hash = interpolation.eval(s, config={}) + + # compute expected hash calling hashlib directly + hash_obj = hashlib.md5() + hash_obj.update(str(input_string).encode("utf-8")) + hashlib_computed_hash = hash_obj.hexdigest() + + assert filter_hash == hashlib_computed_hash + + +def test_hash_md5_on_numeric_value() -> None: + input_value = 123.456 + s = "{{ %f | hash('md5') }}" % input_value + filter_hash = interpolation.eval(s, config={}) + + # compute expected hash calling hashlib directly + hash_obj = hashlib.md5() + hash_obj.update(str(input_value).encode("utf-8")) + hashlib_computed_hash = hash_obj.hexdigest() + + assert filter_hash == hashlib_computed_hash + + +def test_hash_md5_with_salt() -> None: + input_string = "test_input_string" + input_salt = "test_input_salt" + + s = "{{ '%s' | hash('md5', '%s' ) }}" % (input_string, input_salt) + filter_hash = interpolation.eval(s, config={}) + + # compute expected value calling hashlib directly + hash_obj = hashlib.md5() + hash_obj.update(str(input_string + input_salt).encode("utf-8")) + hashlib_computed_hash = hash_obj.hexdigest() + + assert filter_hash == hashlib_computed_hash + + +@pytest.mark.parametrize( + "input_string", + ["test_input_client_id", "some_client_secret_1", "12345", "775.78"], +) +def test_base64encode(input_string: str) -> None: + s = "{{ '%s' | base64encode }}" % input_string + filter_base64encode = interpolation.eval(s, config={}) + + # compute expected base64encode calling base64 library directly + base64_obj = base64.b64encode(input_string.encode("utf-8")).decode() + + assert filter_base64encode == base64_obj + + +@pytest.mark.parametrize( + "input_string, expected_string", + [ + ("aW5wdXRfc3RyaW5n", "input_string"), + ("YWlyYnl0ZQ==", "airbyte"), + ("cGFzc3dvcmQ=", "password"), + ], +) +def test_base64decode(input_string: str, expected_string: str) -> None: + s = "{{ '%s' | base64decode }}" % input_string + filter_base64decode = interpolation.eval(s, config={}) + + assert filter_base64decode == expected_string + + +def test_regex_search_valid() -> None: + expression_with_regex = "{{ '; rel=\"next\"' | regex_search('<(.*)>; rel=.*') }}" + + val = interpolation.eval(expression_with_regex, {}) + assert val == "https://this-is-test-link.com/?page=2" + + +def test_regex_search_no_match_group() -> None: + # If no group is set in the regular expression, the result will be an empty string + expression_with_regex = "{{ '; rel=\"next\"' | regex_search('<.*>; rel=.*') }}" + + val = interpolation.eval(expression_with_regex, {}) + assert val is None + + +def test_regex_search_no_match() -> None: + # If no group is set in the regular expression, the result will be an empty string + expression_with_regex = "{{ '; rel=\"next\"' | regex_search('WATWATWAT') }}" + + val = interpolation.eval(expression_with_regex, {}) + + assert val is None diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py new file mode 100644 index 000000000000..6b4ba5349247 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py @@ -0,0 +1,40 @@ +# +# Copyright (c) 2023 
Airbyte, Inc., all rights reserved. +# + +import pytest +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean + +config = { + "parent": {"key_with_true": True}, + "string_key": "compare_me", + "zero_value": 0, + "empty_array": [], + "non_empty_array": [1], + "empty_dict": {}, + "empty_tuple": (), +} + + +@pytest.mark.parametrize( + "test_name, template, expected_result", + [ + ("test_interpolated_true_value", "{{ config['parent']['key_with_true'] }}", True), + ("test_interpolated_true_comparison", "{{ config['string_key'] == \"compare_me\" }}", True), + ("test_interpolated_false_condition", "{{ config['string_key'] == \"witness_me\" }}", False), + ("test_path_has_value_returns_true", "{{ config['string_key'] }}", True), + ("test_zero_is_false", "{{ config['zero_value'] }}", False), + ("test_empty_array_is_false", "{{ config['empty_array'] }}", False), + ("test_empty_dict_is_false", "{{ config['empty_dict'] }}", False), + ("test_empty_tuple_is_false", "{{ config['empty_tuple'] }}", False), + ("test_lowercase_false", '{{ "false" }}', False), + ("test_False", "{{ False }}", False), + ("test_True", "{{ True }}", True), + ("test_value_in_array", "{{ 1 in config['non_empty_array'] }}", True), + ("test_value_not_in_array", "{{ 2 in config['non_empty_array'] }}", False), + ("test_interpolation_using_parameters", "{{ parameters['from_parameters'] == \"come_find_me\" }}", True), + ], +) +def test_interpolated_boolean(test_name, template, expected_result): + interpolated_bool = InterpolatedBoolean(condition=template, parameters={"from_parameters": "come_find_me"}) + assert interpolated_bool.eval(config) == expected_result diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py new file mode 100644 index 000000000000..f4c77c3cec4d --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py @@ -0,0 +1,35 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import pytest +from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping + + +@pytest.mark.parametrize( + "test_name, key, expected_value", + [ + ("test_field_value", "field", "value"), + ("test_number", "number", 100), + ("test_field_to_interpolate_from_config", "field_to_interpolate_from_config", "VALUE_FROM_CONFIG"), + ("test_field_to_interpolate_from_kwargs", "field_to_interpolate_from_kwargs", "VALUE_FROM_KWARGS"), + ("test_field_to_interpolate_from_parameters", "field_to_interpolate_from_parameters", "VALUE_FROM_PARAMETERS"), + ("test_key_is_interpolated", "key", "VALUE"), + ], +) +def test(test_name, key, expected_value): + d = { + "field": "value", + "number": 100, + "field_to_interpolate_from_config": "{{ config['c'] }}", + "field_to_interpolate_from_kwargs": "{{ kwargs['a'] }}", + "field_to_interpolate_from_parameters": "{{ parameters['b'] }}", + "{{ parameters.k }}": "VALUE", + } + config = {"c": "VALUE_FROM_CONFIG"} + kwargs = {"a": "VALUE_FROM_KWARGS"} + mapping = InterpolatedMapping(mapping=d, parameters={"b": "VALUE_FROM_PARAMETERS", "k": "key"}) + + interpolated = mapping.eval(config, **{"kwargs": kwargs}) + + assert interpolated[key] == expected_value diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py new file mode 100644 index 000000000000..c1dea1d62fb2 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py @@ -0,0 +1,45 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import dpath +import pytest +from airbyte_cdk.sources.declarative.interpolation.interpolated_nested_mapping import InterpolatedNestedMapping + + +@pytest.mark.parametrize( + "test_name, path, expected_value", + [ + ("test_field_value", "nested/field", "value"), + ("test_number", "nested/number", 100), + ("test_interpolated_number", "nested/nested_array/1/value", 5), + ("test_interpolated_boolean", "nested/nested_array/2/value", True), + ("test_field_to_interpolate_from_config", "nested/config_value", "VALUE_FROM_CONFIG"), + ("test_field_to_interpolate_from_kwargs", "nested/kwargs_value", "VALUE_FROM_KWARGS"), + ("test_field_to_interpolate_from_parameters", "nested/parameters_value", "VALUE_FROM_PARAMETERS"), + ("test_key_is_interpolated", "nested/nested_array/0/key", "VALUE"), + ], +) +def test(test_name, path, expected_value): + d = { + "nested": { + "field": "value", + "number": 100, + "nested_array": [ + {"{{ parameters.k }}": "VALUE"}, + {"value": "{{ config['num_value'] | int + 2 }}"}, + {"value": "{{ True }}"}, + ], + "config_value": "{{ config['c'] }}", + "parameters_value": "{{ parameters['b'] }}", + "kwargs_value": "{{ kwargs['a'] }}", + } + } + + config = {"c": "VALUE_FROM_CONFIG", "num_value": 3} + kwargs = {"a": "VALUE_FROM_KWARGS"} + mapping = InterpolatedNestedMapping(mapping=d, parameters={"b": "VALUE_FROM_PARAMETERS", "k": "key"}) + + interpolated = mapping.eval(config, **{"kwargs": kwargs}) + + assert dpath.get(interpolated, path) == expected_value diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_string.py b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_string.py new file mode 100644 index 000000000000..f0f1a9952506 --- /dev/null +++ 
b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_string.py @@ -0,0 +1,25 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import pytest +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString + +config = {"field": "value"} +parameters = {"hello": "world"} +kwargs = {"c": "airbyte"} + + +@pytest.mark.parametrize( + "test_name, input_string, expected_value", + [ + ("test_static_value", "HELLO WORLD", "HELLO WORLD"), + ("test_eval_from_parameters", "{{ parameters['hello'] }}", "world"), + ("test_eval_from_config", "{{ config['field'] }}", "value"), + ("test_eval_from_kwargs", "{{ kwargs['c'] }}", "airbyte"), + ("test_eval_from_kwargs", "{{ kwargs['c'] }}", "airbyte"), + ], +) +def test_interpolated_string(test_name, input_string, expected_value): + s = InterpolatedString.create(input_string, parameters=parameters) + assert s.eval(config, **{"kwargs": kwargs}) == expected_value diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_jinja.py b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_jinja.py new file mode 100644 index 000000000000..207e6fae75ee --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_jinja.py @@ -0,0 +1,303 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import datetime + +import pytest +from airbyte_cdk import StreamSlice +from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation +from freezegun import freeze_time +from jinja2.exceptions import TemplateSyntaxError + +interpolation = JinjaInterpolation() + + +def test_get_value_from_config(): + s = "{{ config['date'] }}" + config = {"date": "2022-01-01"} + val = interpolation.eval(s, config) + assert val == "2022-01-01" + + +def test_get_missing_value_from_config(): + s = "{{ config['date'] }}" + config = {} + val = interpolation.eval(s, config) + assert val is None + + +@pytest.mark.parametrize( + "valid_types, expected_value", + [ + pytest.param((str,), "1234J", id="test_value_is_a_string_if_valid_types_is_str"), + pytest.param(None, 1234j, id="test_value_is_interpreted_as_complex_number_by_default"), + ], +) +def test_get_value_with_complex_number(valid_types, expected_value): + s = "{{ config['value'] }}" + config = {"value": "1234J"} + val = interpolation.eval(s, config, valid_types=valid_types) + assert val == expected_value + + +def test_get_value_from_stream_slice(): + s = "{{ stream_slice['date'] }}" + config = {"date": "2022-01-01"} + stream_slice = {"date": "2020-09-09"} + val = interpolation.eval(s, config, **{"stream_slice": stream_slice}) + assert val == "2020-09-09" + + +def test_get_missing_value_from_stream_slice(): + s = "{{ stream_slice['date'] }}" + config = {"date": "2022-01-01"} + stream_slice = {} + val = interpolation.eval(s, config, **{"stream_slice": stream_slice}) + assert val is None + + +def test_get_value_from_a_list_of_mappings(): + s = "{{ records[0]['date'] }}" + config = {"date": "2022-01-01"} + records = [{"date": "2020-09-09"}] + val = interpolation.eval(s, config, **{"records": records}) + assert val == "2020-09-09" + + +@pytest.mark.parametrize( + "s, value", + [ + pytest.param("{{1}}", 1, id="test_number"), + pytest.param("{{1}}", 1, id="test_number"), + pytest.param("{{[1,2]}}", [1, 2], id="test_list"), + pytest.param("{{ {1:2} }}", {1: 2}, id="test_dict"), + pytest.param("{{ 1+2 }}", 3, id="test_addition"), + ], +) +def test_literals(s, value): + val = 
interpolation.eval(s, None) + assert val == value + + +@pytest.mark.parametrize( + "context, input_string, expected_value", + [ + pytest.param( + {"stream_slice": {"stream_slice_key": "hello"}}, + "{{ stream_slice['stream_slice_key'] }}", + "hello", + id="test_get_value_from_stream_slice", + ), + pytest.param( + {"stream_slice": {"stream_slice_key": "hello"}}, + "{{ stream_partition['stream_slice_key'] }}", + "hello", + id="test_get_value_from_stream_slicer", + ), + pytest.param( + {"stream_slice": {"stream_slice_key": "hello"}}, + "{{ stream_interval['stream_slice_key'] }}", + "hello", + id="test_get_value_from_stream_interval", + ), + ], +) +def test_stream_slice_alias(context, input_string, expected_value): + config = {} + val = interpolation.eval(input_string, config, **context) + assert val == expected_value + + +@pytest.mark.parametrize( + "alias", + [ + pytest.param("stream_interval", id="test_error_is_raised_if_stream_interval_in_context"), + pytest.param("stream_partition", id="test_error_is_raised_if_stream_partition_in_context"), + ], +) +def test_error_is_raised_if_alias_is_already_in_context(alias): + config = {} + context = {alias: "a_value"} + with pytest.raises(ValueError): + interpolation.eval("a_key", config, **context) + + +def test_positive_day_delta(): + delta_template = "{{ day_delta(25) }}" + interpolation = JinjaInterpolation() + val = interpolation.eval(delta_template, {}) + + # We need to assert against an earlier delta since the interpolation function runs datetime.now() a few milliseconds earlier + assert val > (datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(days=24, hours=23)).strftime("%Y-%m-%dT%H:%M:%S.%f%z") + + +def test_positive_day_delta_with_format(): + delta_template = "{{ day_delta(25,format='%Y-%m-%d') }}" + + with freeze_time("2021-01-01 03:04:05"): + val = interpolation.eval(delta_template, {}) + assert val == "2021-01-26" + + +def test_negative_day_delta(): + delta_template = "{{ day_delta(-25) }}" + val = interpolation.eval(delta_template, {}) + + assert val <= (datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=25)).strftime("%Y-%m-%dT%H:%M:%S.%f%z") + + +@pytest.mark.parametrize( + "test_name, input_value, expected_output", + [ + ("test_string_to_string", "hello world", "hello world"), + ("test_int_to_string", 1, "1"), + ("test_number_to_string", 1.52, "1.52"), + ("test_true_to_string", True, "true"), + ("test_false_to_string", False, "false"), + ("test_array_to_string", ["hello", "world"], '["hello", "world"]'), + ("test_object_to_array", {"hello": "world"}, '{"hello": "world"}'), + ], +) +def test_to_string(test_name, input_value, expected_output): + interpolation = JinjaInterpolation() + config = {"key": input_value} + template = "{{ config['key'] | string }}" + actual_output = interpolation.eval(template, config, {}) + assert isinstance(actual_output, str) + assert actual_output == expected_output + + +@pytest.mark.parametrize( + "s, expected_value", + [ + pytest.param("{{ timestamp(1621439283) }}", 1621439283, id="test_timestamp_from_timestamp"), + pytest.param("{{ timestamp('2021-05-19') }}", 1621382400, id="test_timestamp_from_string"), + pytest.param("{{ timestamp('2017-01-01T00:00:00.0Z') }}", 1483228800, id="test_timestamp_from_rfc3339"), + pytest.param("{{ max(1,2) }}", 2, id="test_max"), + ], +) +def test_macros(s, expected_value): + config = {} + val = interpolation.eval(s, config) + assert val == expected_value + + +@pytest.mark.parametrize( + "template_string", + [ + pytest.param("{{ import 
os) }}", id="test_jinja_with_import"), + pytest.param("{{ [a for a in range(1000000000)] }}", id="test_jinja_with_list_comprehension"), + ], +) +def test_invalid_jinja_statements(template_string): + config = {"key": "value"} + with pytest.raises(TemplateSyntaxError): + interpolation.eval(template_string, config=config) + + +@pytest.mark.parametrize( + "template_string", + [ + # This test stalls if range is removed from JinjaInterpolation.RESTRICTED_BUILTIN_FUNCTIONS + pytest.param( + """ + {% set a = 1 %} + {% set b = 1 %} + {% for i in range(1000000000) %} + {% endfor %} + {{ a }}""", + id="test_jinja_with_very_long_running_compute", + ), + pytest.param("{{ eval ('2+2') }}", id="test_jinja_with_eval"), + pytest.param("{{ getattr(config, 'key') }}", id="test_getattr"), + pytest.param("{{ setattr(config, 'password', 'hunter2') }}", id="test_setattr"), + pytest.param("{{ globals() }}", id="test_jinja_with_globals"), + pytest.param("{{ locals() }}", id="test_jinja_with_globals"), + pytest.param("{{ eval ('2+2') }}", id="test_jinja_with_eval"), + pytest.param("{{ eval }}", id="test_jinja_with_eval"), + ], +) +def test_restricted_builtin_functions_are_not_executed(template_string): + config = {"key": JinjaInterpolation} + with pytest.raises(ValueError): + interpolation.eval(template_string, config=config) + + +@pytest.mark.parametrize( + "template_string, expected_value, expected_error", + [ + pytest.param("{{ to_be }}", "that_is_the_question", None, id="valid_template_variable"), + pytest.param("{{ missingno }}", None, ValueError, id="undeclared_template_variable"), + pytest.param("{{ to_be and or_not_to_be }}", None, ValueError, id="one_undeclared_template_variable"), + ], +) +def test_undeclared_variables(template_string, expected_error, expected_value): + config = {"key": JinjaInterpolation} + + if expected_error: + with pytest.raises(expected_error): + interpolation.eval(template_string, config=config, **{"to_be": "that_is_the_question"}) + else: + actual_value = interpolation.eval(template_string, config=config, **{"to_be": "that_is_the_question"}) + assert actual_value == expected_value + + +@freeze_time("2021-09-01") +@pytest.mark.parametrize( + "template_string, expected_value", + [ + pytest.param("{{ now_utc() }}", "2021-09-01 00:00:00+00:00", id="test_now_utc"), + pytest.param("{{ now_utc().strftime('%Y-%m-%d') }}", "2021-09-01", id="test_now_utc_strftime"), + pytest.param("{{ today_utc() }}", "2021-09-01", id="test_today_utc"), + pytest.param("{{ today_utc().strftime('%Y/%m/%d') }}", "2021/09/01", id="test_todat_utc_stftime"), + pytest.param("{{ timestamp(1646006400) }}", 1646006400, id="test_timestamp_from_timestamp"), + pytest.param("{{ timestamp('2022-02-28') }}", 1646006400, id="test_timestamp_from_timestamp"), + pytest.param("{{ timestamp('2022-02-28T00:00:00Z') }}", 1646006400, id="test_timestamp_from_timestamp"), + pytest.param("{{ timestamp('2022-02-28 00:00:00Z') }}", 1646006400, id="test_timestamp_from_timestamp"), + pytest.param("{{ timestamp('2022-02-28T00:00:00-08:00') }}", 1646035200, id="test_timestamp_from_date_with_tz"), + pytest.param("{{ max(2, 3) }}", 3, id="test_max_with_arguments"), + pytest.param("{{ max([2, 3]) }}", 3, id="test_max_with_list"), + pytest.param("{{ day_delta(1) }}", "2021-09-02T00:00:00.000000+0000", id="test_day_delta"), + pytest.param("{{ day_delta(-1) }}", "2021-08-31T00:00:00.000000+0000", id="test_day_delta_negative"), + pytest.param("{{ day_delta(1, format='%Y-%m-%d') }}", "2021-09-02", id="test_day_delta_with_format"), + pytest.param("{{ 
duration('P1D') }}", "1 day, 0:00:00", id="test_duration_one_day"), + pytest.param("{{ duration('P6DT23H') }}", "6 days, 23:00:00", id="test_duration_six_days_and_23_hours"), + pytest.param( + "{{ (now_utc() - duration('P1D')).strftime('%Y-%m-%dT%H:%M:%SZ') }}", + "2021-08-31T00:00:00Z", + id="test_now_utc_with_duration_and_format", + ), + pytest.param("{{ 1 | string }}", "1", id="test_int_to_string"), + pytest.param('{{ ["hello", "world"] | string }}', '["hello", "world"]', id="test_array_to_string"), + ], +) +def test_macros_examples(template_string, expected_value): + # The outputs of this test are referenced in declarative_component_schema.yaml + # If you change the expected output, you must also change the expected output in declarative_component_schema.yaml + now_utc = interpolation.eval(template_string, {}) + assert now_utc == expected_value + + +@freeze_time("2021-09-01 12:00:00", tz_offset=5) +@pytest.mark.parametrize( + "template_string, expected_value", + [ + pytest.param("{{ today_with_timezone('Pacific/Kiritimati') }}", "2021-09-02", id="test_today_timezone_pacific"), + ], +) +def test_macros_timezone(template_string: str, expected_value: str): + interpolated_string = interpolation.eval(template_string, {}) + assert interpolated_string == expected_value + + +def test_interpolation_private_partition_attribute(): + inner_partition = StreamSlice(partition={}, cursor_slice={}) + expected_output = "value" + setattr(inner_partition, "parent_stream_fields", expected_output) + stream_slice = StreamSlice(partition=inner_partition, cursor_slice={}) + template = "{{ stream_slice._partition.parent_stream_fields }}" + + actual_output = JinjaInterpolation().eval(template, {}, **{"stream_slice": stream_slice}) + + assert actual_output == expected_output diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_macros.py b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_macros.py new file mode 100644 index 000000000000..d2a2a291ec1a --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_macros.py @@ -0,0 +1,81 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import datetime + +import pytest +from airbyte_cdk.sources.declarative.interpolation.macros import macros + + +@pytest.mark.parametrize( + "test_name, fn_name, found_in_macros", + [ + ("test_now_utc", "now_utc", True), + ("test_today_utc", "today_utc", True), + ("test_max", "max", True), + ("test_day_delta", "day_delta", True), + ("test_format_datetime", "format_datetime", True), + ("test_duration", "duration", True), + ("test_not_a_macro", "thisisnotavalidmacro", False), + ], +) +def test_macros_export(test_name, fn_name, found_in_macros): + if found_in_macros: + assert fn_name in macros + else: + assert fn_name not in macros + + +@pytest.mark.parametrize( + "test_name, input_value, format, input_format, expected_output", + [ + ("test_datetime_string_to_date", "2022-01-01T01:01:01Z", "%Y-%m-%d", None, "2022-01-01"), + ("test_date_string_to_date", "2022-01-01", "%Y-%m-%d", None, "2022-01-01"), + ("test_datetime_string_to_date", "2022-01-01T00:00:00Z", "%Y-%m-%d", None, "2022-01-01"), + ("test_datetime_with_tz_string_to_date", "2022-01-01T00:00:00Z", "%Y-%m-%d", None, "2022-01-01"), + ("test_datetime_string_to_datetime", "2022-01-01T01:01:01Z", "%Y-%m-%dT%H:%M:%SZ", None, "2022-01-01T01:01:01Z"), + ("test_datetime_string_with_tz_to_datetime", "2022-01-01T01:01:01-0800", "%Y-%m-%dT%H:%M:%SZ", None, "2022-01-01T09:01:01Z"), + ("test_datetime_object_tz_to_date", datetime.datetime(2022, 1, 1, 1, 1, 1), "%Y-%m-%d", None, "2022-01-01"), + ("test_datetime_object_tz_to_datetime", datetime.datetime(2022, 1, 1, 1, 1, 1), "%Y-%m-%dT%H:%M:%SZ", None, "2022-01-01T01:01:01Z"), + ("test_datetime_string_to_rfc2822_date", "Sat, 01 Jan 2022 01:01:01 +0000", "%Y-%m-%d", "%a, %d %b %Y %H:%M:%S %z", "2022-01-01"), + ], +) +def test_format_datetime(test_name, input_value, format, input_format, expected_output): + format_datetime = macros["format_datetime"] + assert format_datetime(input_value, format, input_format) == expected_output + + +@pytest.mark.parametrize( + "test_name, input_value, expected_output", + [("test_one_day", "P1D", datetime.timedelta(days=1)), ("test_6_days_23_hours", "P6DT23H", datetime.timedelta(days=6, hours=23))], +) +def test_duration(test_name, input_value, expected_output): + duration_fn = macros["duration"] + assert duration_fn(input_value) == expected_output + + +@pytest.mark.parametrize( + "test_name, input_value, expected_output", + [ + ("test_int_input", 1646006400, 1646006400), + ("test_float_input", 100.0, 100), + ("test_float_input_is_floored", 100.9, 100), + ("test_string_date_iso8601", "2022-02-28", 1646006400), + ("test_string_datetime_midnight_iso8601", "2022-02-28T00:00:00Z", 1646006400), + ("test_string_datetime_midnight_iso8601_with_tz", "2022-02-28T00:00:00-08:00", 1646035200), + ("test_string_datetime_midnight_iso8601_no_t", "2022-02-28 00:00:00Z", 1646006400), + ("test_string_datetime_iso8601", "2022-02-28T10:11:12", 1646043072), + ], +) +def test_timestamp(test_name, input_value, expected_output): + timestamp_function = macros["timestamp"] + actual_output = timestamp_function(input_value) + assert actual_output == expected_output + + +def test_utc_datetime_to_local_timestamp_conversion(): + """ + This test ensures correct timezone handling independent of the timezone of the system on which the sync is running. 
+ """ + assert macros["format_datetime"](dt="2020-10-01T00:00:00Z", format="%s") == "1601510400" diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/migrations/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/migrations/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/migrations/test_legacy_to_per_partition_migration.py b/airbyte-cdk/python/unit_tests/sources/declarative/migrations/test_legacy_to_per_partition_migration.py new file mode 100644 index 000000000000..97e5efd69f97 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/migrations/test_legacy_to_per_partition_migration.py @@ -0,0 +1,277 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import LegacyToPerPartitionStateMigration +from airbyte_cdk.sources.declarative.models import CustomPartitionRouter, CustomRetriever, DatetimeBasedCursor, DeclarativeStream +from airbyte_cdk.sources.declarative.models import LegacyToPerPartitionStateMigration as LegacyToPerPartitionStateMigrationModel +from airbyte_cdk.sources.declarative.models import ParentStreamConfig, SimpleRetriever, SubstreamPartitionRouter +from airbyte_cdk.sources.declarative.parsers.manifest_component_transformer import ManifestComponentTransformer +from airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver import ManifestReferenceResolver +from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ModelToComponentFactory +from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource + +factory = ModelToComponentFactory() + +resolver = ManifestReferenceResolver() + +transformer = ManifestComponentTransformer() + + +def test_migrate_a_valid_legacy_state_to_per_partition(): + input_state = { + "13506132": {"last_changed": "2022-12-27T08:34:39+00:00"}, + "14351124": {"last_changed": "2022-12-27T08:35:39+00:00"}, + } + + migrator = _migrator() + + assert migrator.should_migrate(input_state) + + expected_state = { + "states": [ + {"partition": {"parent_id": "13506132"}, "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"}}, + {"partition": {"parent_id": "14351124"}, "cursor": {"last_changed": "2022-12-27T08:35:39+00:00"}}, + ] + } + + assert migrator.migrate(input_state) == expected_state + + +@pytest.mark.parametrize( + "input_state", + [ + pytest.param( + { + "states": [ + {"partition": {"id": "13506132"}, "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"}}, + {"partition": {"id": "14351124"}, "cursor": {"last_changed": "2022-12-27T08:35:39+00:00"}}, + ] + }, + id="test_should_not_migrate_a_per_partition_state", + ), + pytest.param( + { + "states": [ + {"partition": {"id": "13506132"}, "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"}}, + { + "partition": {"id": "14351124"}, + }, + ] + }, + id="test_should_not_migrate_state_without_a_cursor_component", + ), + pytest.param( + { + "states": [ + {"partition": {"id": "13506132"}, "cursor": {"updated_at": "2022-12-27T08:34:39+00:00"}}, + {"partition": {"id": "14351124"}, "cursor": {"updated_at": "2022-12-27T08:35:39+00:00"}}, + ] + }, + id="test_should_not_migrate_a_per_partition_state_with_wrong_cursor_field", + ), + pytest.param( + { + "states": [ + {"partition": {"id": "13506132"}, "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"}}, + {"partition": {"id": "14351124"}, 
"cursor": {"last_changed": "2022-12-27T08:35:39+00:00", "updated_at": "2021-01-01"}}, + ] + }, + id="test_should_not_migrate_a_per_partition_state_with_multiple_cursor_fields", + ), + pytest.param( + { + "states": [ + {"partition": {"id": "13506132"}, "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"}}, + {"cursor": {"last_changed": "2022-12-27T08:34:39+00:00"}}, + ] + }, + id="test_should_not_migrate_state_without_a_partition_component", + ), + pytest.param( + { + "states": [ + {"partition": {"id": "13506132", "another_id": "A"}, "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"}}, + {"partition": {"id": "13506134"}, "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"}}, + ] + }, + id="test_should_not_migrate_state_if_multiple_partition_keys", + ), + pytest.param( + { + "states": [ + {"partition": {"identifier": "13506132"}, "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"}}, + {"partition": {"id": "13506134"}, "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"}}, + ] + }, + id="test_should_not_migrate_state_if_invalid_partition_key", + ), + pytest.param( + { + "13506132": {"last_changed": "2022-12-27T08:34:39+00:00"}, + "14351124": {"last_changed": "2022-12-27T08:35:39+00:00", "another_key": "2022-12-27T08:35:39+00:00"}, + }, + id="test_should_not_migrate_if_the_partitioned_state_has_more_than_one_key", + ), + pytest.param( + { + "13506132": {"last_changed": "2022-12-27T08:34:39+00:00"}, + "14351124": {"another_key": "2022-12-27T08:35:39+00:00"}, + }, + id="test_should_not_migrate_if_the_partitioned_state_key_is_not_the_cursor_field", + ), + ], +) +def test_should_not_migrate(input_state): + migrator = _migrator() + assert not migrator.should_migrate(input_state) + + +def test_should_not_migrate_stream_with_multiple_parent_streams(): + input_state = { + "13506132": {"last_changed": "2022-12-27T08:34:39+00:00"}, + "14351124": {"last_changed": "2022-12-27T08:35:39+00:00"}, + } + + migrator = _migrator_with_multiple_parent_streams() + + assert not migrator.should_migrate(input_state) + + +def _migrator(): + partition_router = SubstreamPartitionRouter( + type="SubstreamPartitionRouter", + parent_stream_configs=[ + ParentStreamConfig( + type="ParentStreamConfig", + parent_key="{{ parameters['parent_key_id'] }}", + partition_field="parent_id", + stream=DeclarativeStream( + type="DeclarativeStream", retriever=CustomRetriever(type="CustomRetriever", class_name="a_class_name") + ), + ) + ], + ) + cursor = DatetimeBasedCursor( + type="DatetimeBasedCursor", + cursor_field="{{ parameters['cursor_field'] }}", + datetime_format="%Y-%m-%dT%H:%M:%S.%fZ", + start_datetime="1970-01-01T00:00:00.0Z", + ) + config = {} + parameters = {"cursor_field": "last_changed", "parent_key_id": "id"} + return LegacyToPerPartitionStateMigration(partition_router, cursor, config, parameters) + + +def _migrator_with_multiple_parent_streams(): + partition_router = SubstreamPartitionRouter( + type="SubstreamPartitionRouter", + parent_stream_configs=[ + ParentStreamConfig( + type="ParentStreamConfig", + parent_key="id", + partition_field="parent_id", + stream=DeclarativeStream( + type="DeclarativeStream", retriever=CustomRetriever(type="CustomRetriever", class_name="a_class_name") + ), + ), + ParentStreamConfig( + type="ParentStreamConfig", + parent_key="id", + partition_field="parent_id", + stream=DeclarativeStream( + type="DeclarativeStream", retriever=CustomRetriever(type="CustomRetriever", class_name="a_class_name") + ), + ), + ], + ) + cursor = DatetimeBasedCursor( + type="DatetimeBasedCursor", + 
cursor_field="{{ parameters['cursor_field'] }}", + datetime_format="%Y-%m-%dT%H:%M:%S.%fZ", + start_datetime="1970-01-01T00:00:00.0Z", + ) + config = {} + parameters = {} + return LegacyToPerPartitionStateMigration(partition_router, cursor, config, parameters) + + +@pytest.mark.parametrize( + "retriever_type, partition_router_class, is_parent_stream_config, expected_exception, expected_error_message", + [ + (SimpleRetriever, CustomPartitionRouter, True, None, None), + ( + None, + CustomPartitionRouter, + True, + ValueError, + "LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever. Got ", + ), + ( + SimpleRetriever, + None, + False, + ValueError, + "LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got ", + ), + ( + SimpleRetriever, + CustomPartitionRouter, + False, + ValueError, + "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration.", + ), + ], +) +def test_create_legacy_to_per_partition_state_migration( + retriever_type, + partition_router_class, + is_parent_stream_config, + expected_exception, + expected_error_message, +): + partition_router = partition_router_class(type="CustomPartitionRouter", class_name="a_class_namer") if partition_router_class else None + + stream = MagicMock() + stream.retriever = MagicMock(spec=retriever_type) + stream.retriever.partition_router = partition_router + + content = """ + state_migrations: + - type: LegacyToPerPartitionStateMigration + """ + + resolved_manifest = resolver.preprocess_manifest(YamlDeclarativeSource._parse(content)) + state_migrations_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["state_migrations"][0], {}) + + if is_parent_stream_config: + parent_stream_config = ParentStreamConfig( + type="ParentStreamConfig", + parent_key="id", + partition_field="parent_id", + stream=DeclarativeStream( + type="DeclarativeStream", retriever=CustomRetriever(type="CustomRetriever", class_name="a_class_name") + ), + ) + partition_router.parent_stream_configs = [parent_stream_config] + + if expected_exception: + with pytest.raises(expected_exception) as excinfo: + factory.create_component( + model_type=LegacyToPerPartitionStateMigrationModel, + component_definition=state_migrations_manifest, + config={}, + declarative_stream=stream, + ) + assert str(excinfo.value) == expected_error_message + else: + migration_instance = factory.create_component( + model_type=LegacyToPerPartitionStateMigrationModel, + component_definition=state_migrations_manifest, + config={}, + declarative_stream=stream, + ) + assert migration_instance is not None diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/parsers/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/parsers/__init__.py new file mode 100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/parsers/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py b/airbyte-cdk/python/unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py new file mode 100644 index 000000000000..63efac6688d5 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py @@ -0,0 +1,406 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import pytest +from airbyte_cdk.sources.declarative.parsers.manifest_component_transformer import ManifestComponentTransformer + + +@pytest.mark.parametrize( + "component, expected_component", + [ + pytest.param( + {"type": "DeclarativeSource", "streams": [{"type": "DeclarativeStream", "retriever": {}, "schema_loader": {}}]}, + { + "type": "DeclarativeSource", + "streams": [ + { + "type": "DeclarativeStream", + "retriever": {"type": "SimpleRetriever"}, + "schema_loader": {"type": "JsonFileSchemaLoader"}, + } + ], + }, + id="test_declarative_stream", + ), + pytest.param( + { + "type": "DeclarativeStream", + "retriever": {"type": "SimpleRetriever", "paginator": {}, "record_selector": {}, "requester": {}}, + }, + { + "type": "DeclarativeStream", + "retriever": { + "type": "SimpleRetriever", + "paginator": {"type": "NoPagination"}, + "record_selector": {"type": "RecordSelector"}, + "requester": {"type": "HttpRequester"}, + }, + }, + id="test_simple_retriever", + ), + pytest.param( + {"type": "DeclarativeStream", "requester": {"type": "HttpRequester", "error_handler": {}}}, + { + "type": "DeclarativeStream", + "requester": { + "type": "HttpRequester", + "error_handler": {"type": "DefaultErrorHandler"}, + }, + }, + id="test_http_requester", + ), + pytest.param( + {"type": "SimpleRetriever", "paginator": {"type": "DefaultPaginator", "page_size_option": {}, "page_token_option": {}}}, + { + "type": "SimpleRetriever", + "paginator": { + "type": "DefaultPaginator", + "page_size_option": {"type": "RequestOption"}, + "page_token_option": {}, + }, + }, + id="test_default_paginator", + ), + pytest.param( + {"type": "SimpleRetriever", "partition_router": {"type": "SubstreamPartitionRouter", "parent_stream_configs": [{}, {}, {}]}}, + { + "type": "SimpleRetriever", + "partition_router": { + "type": "SubstreamPartitionRouter", + "parent_stream_configs": [ + {"type": "ParentStreamConfig"}, + {"type": "ParentStreamConfig"}, + {"type": "ParentStreamConfig"}, + ], + }, + }, + id="test_substream_slicer", + ), + ], +) +def test_find_default_types(component, expected_component): + transformer = ManifestComponentTransformer() + actual_component = transformer.propagate_types_and_parameters("", component, {}) + + assert actual_component == expected_component + + +@pytest.mark.parametrize( + "component, expected_component", + [ + pytest.param( + { + "type": "SimpleRetriever", + "requester": {"type": "HttpRequester", "authenticator": {"class_name": "source_greenhouse.components.NewAuthenticator"}}, + }, + { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "authenticator": {"type": "CustomAuthenticator", "class_name": "source_greenhouse.components.NewAuthenticator"}, + }, + }, + id="test_custom_authenticator", + ), + pytest.param( + { + "type": "SimpleRetriever", + "record_selector": { + "type": "RecordSelector", + "extractor": {"class_name": "source_greenhouse.components.NewRecordExtractor"}, + }, + }, + { + "type": "SimpleRetriever", + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "CustomRecordExtractor", "class_name": "source_greenhouse.components.NewRecordExtractor"}, + }, + }, + id="test_custom_extractor", + ), + ], +) +def test_transform_custom_components(component, expected_component): + transformer = ManifestComponentTransformer() + actual_component = transformer.propagate_types_and_parameters("", component, {}) + + assert actual_component == expected_component + + +def test_propagate_parameters_to_all_components(): + component = { + "type": 
"DeclarativeSource", + "streams": [ + { + "type": "DeclarativeStream", + "$parameters": {"name": "roasters", "primary_key": "id"}, + "retriever": { + "type": "SimpleRetriever", + "record_selector": {"type": "RecordSelector", "extractor": {"type": "DpathExtractor", "field_path": []}}, + "requester": { + "type": "HttpRequester", + "name": '{{ parameters["name"] }}', + "url_base": "https://coffee.example.io/v1/", + "http_method": "GET", + }, + }, + } + ], + } + + expected_component = { + "type": "DeclarativeSource", + "streams": [ + { + "type": "DeclarativeStream", + "retriever": { + "type": "SimpleRetriever", + "name": "roasters", + "primary_key": "id", + "record_selector": { + "type": "RecordSelector", + "extractor": { + "type": "DpathExtractor", + "field_path": [], + "name": "roasters", + "primary_key": "id", + "$parameters": {"name": "roasters", "primary_key": "id"}, + }, + "name": "roasters", + "primary_key": "id", + "$parameters": {"name": "roasters", "primary_key": "id"}, + }, + "requester": { + "type": "HttpRequester", + "name": '{{ parameters["name"] }}', + "url_base": "https://coffee.example.io/v1/", + "http_method": "GET", + "primary_key": "id", + "$parameters": {"name": "roasters", "primary_key": "id"}, + }, + "$parameters": {"name": "roasters", "primary_key": "id"}, + }, + "name": "roasters", + "primary_key": "id", + "$parameters": {"name": "roasters", "primary_key": "id"}, + } + ], + } + + transformer = ManifestComponentTransformer() + actual_component = transformer.propagate_types_and_parameters("", component, {}) + + assert actual_component == expected_component + + +def test_component_parameters_take_precedence_over_parent_parameters(): + component = { + "type": "DeclarativeStream", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "name": "high_priority", + "url_base": "https://coffee.example.io/v1/", + "http_method": "GET", + "primary_key": "id", + "$parameters": { + "name": "high_priority", + }, + }, + "$parameters": { + "name": "low_priority", + }, + }, + } + + expected_component = { + "type": "DeclarativeStream", + "retriever": { + "type": "SimpleRetriever", + "name": "low_priority", + "requester": { + "type": "HttpRequester", + "name": "high_priority", + "url_base": "https://coffee.example.io/v1/", + "http_method": "GET", + "primary_key": "id", + "$parameters": { + "name": "high_priority", + }, + }, + "$parameters": { + "name": "low_priority", + }, + }, + } + + transformer = ManifestComponentTransformer() + actual_component = transformer.propagate_types_and_parameters("", component, {}) + + assert actual_component == expected_component + + +def test_do_not_propagate_parameters_that_have_the_same_field_name(): + component = { + "type": "DeclarativeStream", + "streams": [ + { + "type": "DeclarativeStream", + "$parameters": { + "name": "roasters", + "primary_key": "id", + "schema_loader": {"type": "JsonFileSchemaLoader", "file_path": './source_coffee/schemas/{{ parameters["name"] }}.json'}, + }, + } + ], + } + + expected_component = { + "type": "DeclarativeStream", + "streams": [ + { + "type": "DeclarativeStream", + "name": "roasters", + "primary_key": "id", + "schema_loader": { + "type": "JsonFileSchemaLoader", + "file_path": './source_coffee/schemas/{{ parameters["name"] }}.json', + "name": "roasters", + "primary_key": "id", + "$parameters": { + "name": "roasters", + "primary_key": "id", + }, + }, + "$parameters": { + "name": "roasters", + "primary_key": "id", + "schema_loader": {"type": "JsonFileSchemaLoader", "file_path": 
'./source_coffee/schemas/{{ parameters["name"] }}.json'}, + }, + } + ], + } + + transformer = ManifestComponentTransformer() + actual_component = transformer.propagate_types_and_parameters("", component, {}) + + assert actual_component == expected_component + + +def test_ignore_empty_parameters(): + component = { + "type": "DeclarativeStream", + "retriever": { + "type": "SimpleRetriever", + "record_selector": {"type": "RecordSelector", "extractor": {"type": "DpathExtractor", "field_path": []}}, + }, + } + + transformer = ManifestComponentTransformer() + actual_component = transformer.propagate_types_and_parameters("", component, {}) + + assert actual_component == component + + +def test_only_propagate_parameters_to_components(): + component = { + "type": "ParentComponent", + "component_with_object_properties": { + "type": "TestComponent", + "subcomponent": { + "type": "TestSubComponent", + "some_field": "high_priority", + "$parameters": { + "some_option": "already", + }, + }, + "dictionary_field": {"details": "should_not_contain_parameters", "other": "no_parameters_as_fields"}, + "$parameters": { + "included": "not!", + }, + }, + } + + expected_component = { + "type": "ParentComponent", + "component_with_object_properties": { + "type": "TestComponent", + "subcomponent": { + "type": "TestSubComponent", + "some_field": "high_priority", + "some_option": "already", + "included": "not!", + "$parameters": {"some_option": "already", "included": "not!"}, + }, + "dictionary_field": {"details": "should_not_contain_parameters", "other": "no_parameters_as_fields"}, + "included": "not!", + "$parameters": { + "included": "not!", + }, + }, + } + + transformer = ManifestComponentTransformer() + actual_component = transformer.propagate_types_and_parameters("", component, {}) + + assert actual_component == expected_component + + +def test_do_not_propagate_parameters_on_json_schema_object(): + component = { + "type": "DeclarativeStream", + "streams": [ + { + "type": "DeclarativeStream", + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "type": "object", + "$schema": "http://json-schema.org/schema#", + "properties": {"id": {"type": "string"}}, + }, + }, + "$parameters": { + "name": "roasters", + "primary_key": "id", + }, + } + ], + } + + expected_component = { + "type": "DeclarativeStream", + "streams": [ + { + "type": "DeclarativeStream", + "name": "roasters", + "primary_key": "id", + "schema_loader": { + "type": "InlineSchemaLoader", + "name": "roasters", + "primary_key": "id", + "schema": { + "type": "object", + "$schema": "http://json-schema.org/schema#", + "properties": {"id": {"type": "string"}}, + }, + "$parameters": { + "name": "roasters", + "primary_key": "id", + }, + }, + "$parameters": { + "name": "roasters", + "primary_key": "id", + }, + } + ], + } + + transformer = ManifestComponentTransformer() + actual_component = transformer.propagate_types_and_parameters("", component, {}) + + assert actual_component == expected_component diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py b/airbyte-cdk/python/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py new file mode 100644 index 000000000000..75ee51c8c4d2 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py @@ -0,0 +1,139 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import pytest +from airbyte_cdk.sources.declarative.parsers.custom_exceptions import CircularReferenceException, UndefinedReferenceException +from airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver import ManifestReferenceResolver, _parse_path + +resolver = ManifestReferenceResolver() + + +# @ +def test_refer(): + content = {"limit": 50, "limit_ref": "#/limit"} + config = resolver.preprocess_manifest(content) + assert config["limit_ref"] == 50 + + +def test_refer_to_inner(): + content = {"dict": {"limit": 50}, "limit_ref": "#/dict/limit"} + config = resolver.preprocess_manifest(content) + assert config["limit_ref"] == 50 + + +def test_refer_to_non_existant_struct(): + content = {"dict": {"limit": 50}, "limit_ref": "#/not_dict"} + with pytest.raises(UndefinedReferenceException): + resolver.preprocess_manifest(content) + + +def test_refer_in_dict(): + content = {"limit": 50, "offset_request_parameters": {"offset": "{{ next_page_token['offset'] }}", "limit": "#/limit"}} + config = resolver.preprocess_manifest(content) + assert config["offset_request_parameters"]["offset"] == "{{ next_page_token['offset'] }}" + assert config["offset_request_parameters"]["limit"] == 50 + + +def test_refer_to_dict(): + content = { + "limit": 50, + "offset_request_parameters": {"offset": "{{ next_page_token['offset'] }}", "limit": "#/limit"}, + "offset_pagination_request_parameters": { + "class": "InterpolatedRequestParameterProvider", + "request_parameters": "#/offset_request_parameters", + }, + } + config = resolver.preprocess_manifest(content) + assert config["limit"] == 50 + assert config["offset_request_parameters"]["limit"] == 50 + assert len(config["offset_pagination_request_parameters"]) == 2 + assert config["offset_pagination_request_parameters"]["request_parameters"]["limit"] == 50 + assert config["offset_pagination_request_parameters"]["request_parameters"]["offset"] == "{{ next_page_token['offset'] }}" + + +def test_refer_and_overwrite(): + content = { + "limit": 50, + "custom_limit": 25, + "offset_request_parameters": {"offset": "{{ next_page_token['offset'] }}", "limit": "#/limit"}, + "custom_request_parameters": {"$ref": "#/offset_request_parameters", "limit": "#/custom_limit"}, + } + config = resolver.preprocess_manifest(content) + assert config["offset_request_parameters"]["limit"] == 50 + assert config["custom_request_parameters"]["limit"] == 25 + + assert config["offset_request_parameters"]["offset"] == "{{ next_page_token['offset'] }}" + assert config["custom_request_parameters"]["offset"] == "{{ next_page_token['offset'] }}" + + +def test_collision(): + content = { + "example": { + "nested": {"path": "first one", "more_nested": {"value": "found it!"}}, + "nested/path": "uh oh", + }, + "reference_to_nested_path": {"$ref": "#/example/nested/path"}, + "reference_to_nested_nested_value": {"$ref": "#/example/nested/more_nested/value"}, + } + config = resolver.preprocess_manifest(content) + assert config["example"]["nested"]["path"] == "first one" + assert config["example"]["nested/path"] == "uh oh" + assert config["reference_to_nested_path"] == "uh oh" + assert config["example"]["nested"]["more_nested"]["value"] == "found it!" + assert config["reference_to_nested_nested_value"] == "found it!" 
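The resolver tests above all exercise the same behaviour: a `$ref` pulls in the referenced node, and sibling keys override what was pulled in. A minimal standalone sketch of that behaviour (assuming the `airbyte_cdk` package is installed; the `requester_base` and `users_requester` keys below are invented for illustration, not taken from any real manifest):

```python
from airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver import ManifestReferenceResolver

# Hypothetical manifest: "users_requester" reuses "requester_base" via $ref and overrides one field.
manifest = {
    "requester_base": {"url_base": "https://api.example.com/v1/", "http_method": "GET"},
    "users_requester": {"$ref": "#/requester_base", "http_method": "POST"},
}

resolved = ManifestReferenceResolver().preprocess_manifest(manifest)

assert resolved["users_requester"]["url_base"] == "https://api.example.com/v1/"  # inherited from requester_base
assert resolved["users_requester"]["http_method"] == "POST"  # sibling key overrides the referenced value
```

This mirrors what `test_refer_and_overwrite` asserts, just without the extra reference indirection.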
+ + +def test_internal_collision(): + content = { + "example": { + "nested": {"path": {"internal": "uh oh"}, "path/internal": "found it!"}, + }, + "reference": {"$ref": "#/example/nested/path/internal"}, + } + config = resolver.preprocess_manifest(content) + assert config["example"]["nested"]["path"]["internal"] == "uh oh" + assert config["example"]["nested"]["path/internal"] == "found it!" + assert config["reference"] == "found it!" + + +def test_parse_path(): + assert _parse_path("foo/bar") == ("foo", "bar") + assert _parse_path("foo/7/8/bar") == ("foo", "7/8/bar") + assert _parse_path("7/8/bar") == (7, "8/bar") + assert _parse_path("8/bar") == (8, "bar") + assert _parse_path("8foo/bar") == ("8foo", "bar") + + +def test_list(): + content = {"list": ["A", "B"], "elem_ref": "#/list/0"} + config = resolver.preprocess_manifest(content) + elem_ref = config["elem_ref"] + assert elem_ref == "A" + + +def test_nested_list(): + content = {"list": [["A"], ["B"]], "elem_ref": "#/list/1/0"} + config = resolver.preprocess_manifest(content) + elem_ref = config["elem_ref"] + assert elem_ref == "B" + + +def test_list_of_dicts(): + content = {"list": [{"A": "a"}, {"B": "b"}], "elem_ref": "#/list/1/B"} + config = resolver.preprocess_manifest(content) + elem_ref = config["elem_ref"] + assert elem_ref == "b" + + +def test_multiple_levels_of_indexing(): + content = {"list": [{"A": ["a1", "a2"]}, {"B": ["b1", "b2"]}], "elem_ref": "#/list/1/B/0"} + config = resolver.preprocess_manifest(content) + elem_ref = config["elem_ref"] + assert elem_ref == "b1" + + +def test_circular_reference(): + content = {"elem_ref1": "#/elem_ref2", "elem_ref2": "#/elem_ref1"} + with pytest.raises(CircularReferenceException): + resolver.preprocess_manifest(content) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/airbyte-cdk/python/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py new file mode 100644 index 000000000000..f47a890bbcaf --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -0,0 +1,2792 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +# mypy: ignore-errors +import datetime +from typing import Any, Mapping + +import freezegun +import pendulum +import pytest +from airbyte_cdk import AirbyteTracedException +from airbyte_cdk.models import FailureType, Level +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator +from airbyte_cdk.sources.declarative.auth.token import ( + ApiKeyAuthenticator, + BasicHttpAuthenticator, + BearerAuthenticator, + LegacySessionTokenAuthenticator, +) +from airbyte_cdk.sources.declarative.auth.token_provider import SessionTokenProvider +from airbyte_cdk.sources.declarative.checks import CheckStream +from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel +from airbyte_cdk.sources.declarative.datetime import MinMaxDatetime +from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream +from airbyte_cdk.sources.declarative.decoders import JsonDecoder, PaginationDecoderDecorator +from airbyte_cdk.sources.declarative.extractors import DpathExtractor, RecordFilter, RecordSelector +from airbyte_cdk.sources.declarative.extractors.record_filter import ClientSideIncrementalRecordFilterDecorator +from airbyte_cdk.sources.declarative.incremental import ( + CursorFactory, + DatetimeBasedCursor, + PerPartitionCursor, + PerPartitionWithGlobalCursor, + ResumableFullRefreshCursor, +) +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.declarative.models import CheckStream as CheckStreamModel +from airbyte_cdk.sources.declarative.models import CompositeErrorHandler as CompositeErrorHandlerModel +from airbyte_cdk.sources.declarative.models import ConcurrencyLevel as ConcurrencyLevelModel +from airbyte_cdk.sources.declarative.models import CustomErrorHandler as CustomErrorHandlerModel +from airbyte_cdk.sources.declarative.models import CustomPartitionRouter as CustomPartitionRouterModel +from airbyte_cdk.sources.declarative.models import CustomSchemaLoader as CustomSchemaLoaderModel +from airbyte_cdk.sources.declarative.models import DatetimeBasedCursor as DatetimeBasedCursorModel +from airbyte_cdk.sources.declarative.models import DeclarativeStream as DeclarativeStreamModel +from airbyte_cdk.sources.declarative.models import DefaultPaginator as DefaultPaginatorModel +from airbyte_cdk.sources.declarative.models import HttpRequester as HttpRequesterModel +from airbyte_cdk.sources.declarative.models import JwtAuthenticator as JwtAuthenticatorModel +from airbyte_cdk.sources.declarative.models import ListPartitionRouter as ListPartitionRouterModel +from airbyte_cdk.sources.declarative.models import OAuthAuthenticator as OAuthAuthenticatorModel +from airbyte_cdk.sources.declarative.models import RecordSelector as RecordSelectorModel +from airbyte_cdk.sources.declarative.models import SimpleRetriever as SimpleRetrieverModel +from airbyte_cdk.sources.declarative.models import Spec as SpecModel +from airbyte_cdk.sources.declarative.models import SubstreamPartitionRouter as SubstreamPartitionRouterModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import OffsetIncrement as OffsetIncrementModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import PageIncrement as PageIncrementModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import SelectiveAuthenticator +from 
airbyte_cdk.sources.declarative.parsers.manifest_component_transformer import ManifestComponentTransformer +from airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver import ManifestReferenceResolver +from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ModelToComponentFactory +from airbyte_cdk.sources.declarative.partition_routers import ( + CartesianProductStreamSlicer, + ListPartitionRouter, + SinglePartitionRouter, + SubstreamPartitionRouter, +) +from airbyte_cdk.sources.declarative.requesters import HttpRequester +from airbyte_cdk.sources.declarative.requesters.error_handlers import CompositeErrorHandler, DefaultErrorHandler, HttpResponseFilter +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import ( + ConstantBackoffStrategy, + ExponentialBackoffStrategy, + WaitTimeFromHeaderBackoffStrategy, + WaitUntilTimeFromHeaderBackoffStrategy, +) +from airbyte_cdk.sources.declarative.requesters.paginators import DefaultPaginator +from airbyte_cdk.sources.declarative.requesters.paginators.strategies import ( + CursorPaginationStrategy, + OffsetIncrement, + PageIncrement, + StopConditionPaginationStrategyDecorator, +) +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.requesters.request_options import ( + DatetimeBasedRequestOptionsProvider, + DefaultRequestOptionsProvider, + InterpolatedRequestOptionsProvider, +) +from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath +from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod +from airbyte_cdk.sources.declarative.retrievers import SimpleRetriever, SimpleRetrieverTestReadDecorator +from airbyte_cdk.sources.declarative.schema import JsonFileSchemaLoader +from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader +from airbyte_cdk.sources.declarative.spec import Spec +from airbyte_cdk.sources.declarative.transformations import AddFields, RemoveFields +from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition +from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource +from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor +from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( + CustomFormatConcurrentStreamStateConverter, +) +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction +from airbyte_cdk.sources.streams.http.requests_native_auth.oauth import SingleUseRefreshTokenOauth2Authenticator +from unit_tests.sources.declarative.parsers.testing_components import TestingCustomSubstreamPartitionRouter, TestingSomeComponent + +factory = ModelToComponentFactory() + +resolver = ManifestReferenceResolver() + +transformer = ManifestComponentTransformer() + +input_config = {"apikey": "verysecrettoken", "repos": ["airbyte", "airbyte-cloud"]} + + +def test_create_check_stream(): + manifest = {"check": {"type": "CheckStream", "stream_names": ["list_stream"]}} + + check = factory.create_component(CheckStreamModel, manifest["check"], {}) + + assert isinstance(check, CheckStream) + assert check.stream_names == ["list_stream"] + + +def test_create_component_type_mismatch(): + manifest = {"check": {"type": "MismatchType", "stream_names": ["list_stream"]}} + + with pytest.raises(ValueError): + factory.create_component(CheckStreamModel, manifest["check"], {}) + + +def 
test_full_config_stream(): + content = """ +decoder: + type: JsonDecoder +extractor: + type: DpathExtractor +selector: + type: RecordSelector + record_filter: + type: RecordFilter + condition: "{{ record['id'] > stream_state['id'] }}" +metadata_paginator: + type: DefaultPaginator + page_size_option: + type: RequestOption + inject_into: request_parameter + field_name: page_size + page_token_option: + type: RequestPath + pagination_strategy: + type: "CursorPagination" + cursor_value: "{{ response._metadata.next }}" + page_size: 10 +requester: + type: HttpRequester + url_base: "https://api.sendgrid.com/v3/" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['apikey'] }}" + request_parameters: + unit: "day" +retriever: + paginator: + type: NoPagination + decoder: + $ref: "#/decoder" +partial_stream: + type: DeclarativeStream + schema_loader: + type: JsonFileSchemaLoader + file_path: "./source_sendgrid/schemas/{{ parameters.name }}.json" +list_stream: + $ref: "#/partial_stream" + $parameters: + name: "lists" + extractor: + $ref: "#/extractor" + field_path: ["{{ parameters['name'] }}"] + name: "lists" + primary_key: "id" + retriever: + $ref: "#/retriever" + requester: + $ref: "#/requester" + path: "{{ next_page_token['next_page_url'] }}" + paginator: + $ref: "#/metadata_paginator" + record_selector: + $ref: "#/selector" + transformations: + - type: AddFields + fields: + - path: ["extra"] + value: "{{ response.to_add }}" + incremental_sync: + type: DatetimeBasedCursor + start_datetime: "{{ config['start_time'] }}" + end_datetime: "{{ config['end_time'] }}" + step: "P10D" + cursor_field: "created" + cursor_granularity: "PT0.000001S" + start_time_option: + type: RequestOption + inject_into: request_parameter + field_name: after + end_time_option: + type: RequestOption + inject_into: request_parameter + field_name: before + $parameters: + datetime_format: "%Y-%m-%dT%H:%M:%S.%f%z" +check: + type: CheckStream + stream_names: ["list_stream"] +concurrency_level: + type: ConcurrencyLevel + default_concurrency: "{{ config['num_workers'] or 10 }}" + max_concurrency: 25 +spec: + type: Spec + documentation_url: https://airbyte.com/#yaml-from-manifest + connection_specification: + title: Test Spec + type: object + required: + - api_key + additionalProperties: false + properties: + api_key: + type: string + airbyte_secret: true + title: API Key + description: Test API Key + order: 0 + advanced_auth: + auth_flow_type: "oauth2.0" + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + resolved_manifest["type"] = "DeclarativeSource" + manifest = transformer.propagate_types_and_parameters("", resolved_manifest, {}) + + stream_manifest = manifest["list_stream"] + assert stream_manifest["type"] == "DeclarativeStream" + stream = factory.create_component(model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config) + + assert isinstance(stream, DeclarativeStream) + assert stream.primary_key == "id" + assert stream.name == "lists" + assert stream._stream_cursor_field.string == "created" + + assert isinstance(stream.schema_loader, JsonFileSchemaLoader) + assert stream.schema_loader._get_json_filepath() == "./source_sendgrid/schemas/lists.json" + + assert len(stream.retriever.record_selector.transformations) == 1 + add_fields = stream.retriever.record_selector.transformations[0] + assert isinstance(add_fields, AddFields) + assert add_fields.fields[0].path == ["extra"] + 
assert add_fields.fields[0].value.string == "{{ response.to_add }}" + + assert isinstance(stream.retriever, SimpleRetriever) + assert stream.retriever.primary_key == stream.primary_key + assert stream.retriever.name == stream.name + + assert isinstance(stream.retriever.record_selector, RecordSelector) + + assert isinstance(stream.retriever.record_selector.extractor, DpathExtractor) + assert isinstance(stream.retriever.record_selector.extractor.decoder, JsonDecoder) + assert [fp.eval(input_config) for fp in stream.retriever.record_selector.extractor._field_path] == ["lists"] + + assert isinstance(stream.retriever.record_selector.record_filter, RecordFilter) + assert stream.retriever.record_selector.record_filter._filter_interpolator.condition == "{{ record['id'] > stream_state['id'] }}" + + assert isinstance(stream.retriever.paginator, DefaultPaginator) + assert isinstance(stream.retriever.paginator.decoder, PaginationDecoderDecorator) + assert stream.retriever.paginator.page_size_option.field_name.eval(input_config) == "page_size" + assert stream.retriever.paginator.page_size_option.inject_into == RequestOptionType.request_parameter + assert isinstance(stream.retriever.paginator.page_token_option, RequestPath) + assert stream.retriever.paginator.url_base.string == "https://api.sendgrid.com/v3/" + assert stream.retriever.paginator.url_base.default == "https://api.sendgrid.com/v3/" + + assert isinstance(stream.retriever.paginator.pagination_strategy, CursorPaginationStrategy) + assert isinstance(stream.retriever.paginator.pagination_strategy.decoder, PaginationDecoderDecorator) + assert stream.retriever.paginator.pagination_strategy._cursor_value.string == "{{ response._metadata.next }}" + assert stream.retriever.paginator.pagination_strategy._cursor_value.default == "{{ response._metadata.next }}" + assert stream.retriever.paginator.pagination_strategy.page_size == 10 + + assert isinstance(stream.retriever.requester, HttpRequester) + assert stream.retriever.requester.http_method == HttpMethod.GET + assert stream.retriever.requester.name == stream.name + assert stream.retriever.requester._path.string == "{{ next_page_token['next_page_url'] }}" + assert stream.retriever.requester._path.default == "{{ next_page_token['next_page_url'] }}" + + assert isinstance(stream.retriever.request_option_provider, DatetimeBasedRequestOptionsProvider) + assert stream.retriever.request_option_provider.start_time_option.inject_into == RequestOptionType.request_parameter + assert stream.retriever.request_option_provider.start_time_option.field_name.eval(config=input_config) == "after" + assert stream.retriever.request_option_provider.end_time_option.inject_into == RequestOptionType.request_parameter + assert stream.retriever.request_option_provider.end_time_option.field_name.eval(config=input_config) == "before" + assert stream.retriever.request_option_provider._partition_field_start.string == "start_time" + assert stream.retriever.request_option_provider._partition_field_end.string == "end_time" + + assert isinstance(stream.retriever.requester.authenticator, BearerAuthenticator) + assert stream.retriever.requester.authenticator.token_provider.get_token() == "verysecrettoken" + + assert isinstance(stream.retriever.requester.request_options_provider, InterpolatedRequestOptionsProvider) + assert stream.retriever.requester.request_options_provider.request_parameters.get("unit") == "day" + + checker = factory.create_component(model_type=CheckStreamModel, component_definition=manifest["check"], config=input_config) 
+ + assert isinstance(checker, CheckStream) + streams_to_check = checker.stream_names + assert len(streams_to_check) == 1 + assert list(streams_to_check)[0] == "list_stream" + + spec = factory.create_component(model_type=SpecModel, component_definition=manifest["spec"], config=input_config) + + assert isinstance(spec, Spec) + documentation_url = spec.documentation_url + connection_specification = spec.connection_specification + assert documentation_url == "https://airbyte.com/#yaml-from-manifest" + assert connection_specification["title"] == "Test Spec" + assert connection_specification["required"] == ["api_key"] + assert connection_specification["properties"]["api_key"] == { + "type": "string", + "airbyte_secret": True, + "title": "API Key", + "description": "Test API Key", + "order": 0, + } + advanced_auth = spec.advanced_auth + assert advanced_auth.auth_flow_type.value == "oauth2.0" + + concurrency_level = factory.create_component( + model_type=ConcurrencyLevelModel, component_definition=manifest["concurrency_level"], config=input_config + ) + assert isinstance(concurrency_level, ConcurrencyLevel) + assert isinstance(concurrency_level._default_concurrency, InterpolatedString) + assert concurrency_level._default_concurrency.string == "{{ config['num_workers'] or 10 }}" + assert concurrency_level.max_concurrency == 25 + + +def test_interpolate_config(): + content = """ + authenticator: + type: OAuthAuthenticator + client_id: "some_client_id" + client_secret: "some_client_secret" + token_refresh_endpoint: "https://api.sendgrid.com/v3/auth" + refresh_token: "{{ config['apikey'] }}" + refresh_request_body: + body_field: "yoyoyo" + interpolated_body_field: "{{ config['apikey'] }}" + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + authenticator_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["authenticator"], {}) + + authenticator = factory.create_component( + model_type=OAuthAuthenticatorModel, component_definition=authenticator_manifest, config=input_config + ) + + assert isinstance(authenticator, DeclarativeOauth2Authenticator) + assert authenticator._client_id.eval(input_config) == "some_client_id" + assert authenticator._client_secret.string == "some_client_secret" + assert authenticator._token_refresh_endpoint.eval(input_config) == "https://api.sendgrid.com/v3/auth" + assert authenticator._refresh_token.eval(input_config) == "verysecrettoken" + assert authenticator._refresh_request_body.mapping == {"body_field": "yoyoyo", "interpolated_body_field": "{{ config['apikey'] }}"} + assert authenticator.get_refresh_request_body() == {"body_field": "yoyoyo", "interpolated_body_field": "verysecrettoken"} + + +def test_interpolate_config_with_token_expiry_date_format(): + content = """ + authenticator: + type: OAuthAuthenticator + client_id: "some_client_id" + client_secret: "some_client_secret" + token_refresh_endpoint: "https://api.sendgrid.com/v3/auth" + refresh_token: "{{ config['apikey'] }}" + token_expiry_date_format: "%Y-%m-%d %H:%M:%S.%f+00:00" + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + authenticator_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["authenticator"], {}) + + authenticator = factory.create_component( + model_type=OAuthAuthenticatorModel, component_definition=authenticator_manifest, config=input_config + ) + + assert isinstance(authenticator, 
DeclarativeOauth2Authenticator) + assert authenticator.token_expiry_date_format == "%Y-%m-%d %H:%M:%S.%f+00:00" + assert authenticator.token_expiry_is_time_of_expiration + assert authenticator._client_id.eval(input_config) == "some_client_id" + assert authenticator._client_secret.string == "some_client_secret" + assert authenticator._token_refresh_endpoint.eval(input_config) == "https://api.sendgrid.com/v3/auth" + + +def test_single_use_oauth_branch(): + single_use_input_config = { + "apikey": "verysecrettoken", + "repos": ["airbyte", "airbyte-cloud"], + "credentials": {"access_token": "access_token", "token_expiry_date": "1970-01-01"}, + } + + content = """ + authenticator: + type: OAuthAuthenticator + client_id: "some_client_id" + client_secret: "some_client_secret" + token_refresh_endpoint: "https://api.sendgrid.com/v3/auth" + refresh_token: "{{ config['apikey'] }}" + refresh_request_body: + body_field: "yoyoyo" + interpolated_body_field: "{{ config['apikey'] }}" + refresh_token_updater: + refresh_token_name: "the_refresh_token" + refresh_token_error_status_codes: [400] + refresh_token_error_key: "error" + refresh_token_error_values: ["invalid_grant"] + refresh_token_config_path: + - apikey + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + authenticator_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["authenticator"], {}) + + authenticator: SingleUseRefreshTokenOauth2Authenticator = factory.create_component( + model_type=OAuthAuthenticatorModel, component_definition=authenticator_manifest, config=single_use_input_config + ) + + assert isinstance(authenticator, SingleUseRefreshTokenOauth2Authenticator) + assert authenticator._client_id == "some_client_id" + assert authenticator._client_secret == "some_client_secret" + assert authenticator._token_refresh_endpoint == "https://api.sendgrid.com/v3/auth" + assert authenticator._refresh_token == "verysecrettoken" + assert authenticator._refresh_request_body == {"body_field": "yoyoyo", "interpolated_body_field": "verysecrettoken"} + assert authenticator._refresh_token_name == "the_refresh_token" + assert authenticator._refresh_token_config_path == ["apikey"] + # default values + assert authenticator._access_token_config_path == ["credentials", "access_token"] + assert authenticator._token_expiry_date_config_path == ["credentials", "token_expiry_date"] + assert authenticator._refresh_token_error_status_codes == [400] + assert authenticator._refresh_token_error_key == "error" + assert authenticator._refresh_token_error_values == ["invalid_grant"] + + +def test_list_based_stream_slicer_with_values_refd(): + content = """ + repositories: ["airbyte", "airbyte-cloud"] + partition_router: + type: ListPartitionRouter + values: "#/repositories" + cursor_field: repository + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + partition_router_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["partition_router"], {}) + + partition_router = factory.create_component( + model_type=ListPartitionRouterModel, component_definition=partition_router_manifest, config=input_config + ) + + assert isinstance(partition_router, ListPartitionRouter) + assert partition_router.values == ["airbyte", "airbyte-cloud"] + + +def test_list_based_stream_slicer_with_values_defined_in_config(): + content = """ + partition_router: + type: ListPartitionRouter + values: 
"{{config['repos']}}" + cursor_field: repository + request_option: + type: RequestOption + inject_into: header + field_name: repository + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + partition_router_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["partition_router"], {}) + + partition_router = factory.create_component( + model_type=ListPartitionRouterModel, component_definition=partition_router_manifest, config=input_config + ) + + assert isinstance(partition_router, ListPartitionRouter) + assert partition_router.values == ["airbyte", "airbyte-cloud"] + assert partition_router.request_option.inject_into == RequestOptionType.header + assert partition_router.request_option.field_name.eval(config=input_config) == "repository" + + +def test_create_substream_partition_router(): + content = """ + schema_loader: + file_path: "./source_sendgrid/schemas/{{ parameters['name'] }}.yaml" + name: "{{ parameters['stream_name'] }}" + retriever: + requester: + type: "HttpRequester" + path: "kek" + record_selector: + extractor: + field_path: [] + stream_A: + type: DeclarativeStream + name: "A" + primary_key: "id" + $parameters: + retriever: "#/retriever" + url_base: "https://airbyte.io" + schema_loader: "#/schema_loader" + stream_B: + type: DeclarativeStream + name: "B" + primary_key: "id" + $parameters: + retriever: "#/retriever" + url_base: "https://airbyte.io" + schema_loader: "#/schema_loader" + partition_router: + type: SubstreamPartitionRouter + parent_stream_configs: + - stream: "#/stream_A" + parent_key: id + partition_field: repository_id + request_option: + type: RequestOption + inject_into: request_parameter + field_name: repository_id + - stream: "#/stream_B" + parent_key: someid + partition_field: word_id + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + partition_router_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["partition_router"], {}) + + partition_router = factory.create_component( + model_type=SubstreamPartitionRouterModel, component_definition=partition_router_manifest, config=input_config + ) + + assert isinstance(partition_router, SubstreamPartitionRouter) + parent_stream_configs = partition_router.parent_stream_configs + assert len(parent_stream_configs) == 2 + assert isinstance(parent_stream_configs[0].stream, DeclarativeStream) + assert isinstance(parent_stream_configs[1].stream, DeclarativeStream) + + assert partition_router.parent_stream_configs[0].parent_key.eval({}) == "id" + assert partition_router.parent_stream_configs[0].partition_field.eval({}) == "repository_id" + assert partition_router.parent_stream_configs[0].request_option.inject_into == RequestOptionType.request_parameter + assert partition_router.parent_stream_configs[0].request_option.field_name.eval(config=input_config) == "repository_id" + + assert partition_router.parent_stream_configs[1].parent_key.eval({}) == "someid" + assert partition_router.parent_stream_configs[1].partition_field.eval({}) == "word_id" + assert partition_router.parent_stream_configs[1].request_option is None + + +def test_datetime_based_cursor(): + content = """ + incremental: + type: DatetimeBasedCursor + $parameters: + datetime_format: "%Y-%m-%dT%H:%M:%S.%f%z" + start_datetime: + type: MinMaxDatetime + datetime: "{{ config['start_time'] }}" + min_datetime: "{{ config['start_time'] + day_delta(2) }}" + 
end_datetime: "{{ config['end_time'] }}" + step: "P10D" + cursor_field: "created" + cursor_granularity: "PT0.000001S" + lookback_window: "P5D" + start_time_option: + type: RequestOption + inject_into: request_parameter + field_name: "since_{{ config['cursor_field'] }}" + end_time_option: + type: RequestOption + inject_into: body_json + field_name: "before_{{ parameters['cursor_field'] }}" + partition_field_start: star + partition_field_end: en + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + slicer_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["incremental"], {"cursor_field": "created_at"}) + + stream_slicer = factory.create_component(model_type=DatetimeBasedCursorModel, component_definition=slicer_manifest, config=input_config) + + assert isinstance(stream_slicer, DatetimeBasedCursor) + assert stream_slicer._step == datetime.timedelta(days=10) + assert stream_slicer.cursor_field.string == "created" + assert stream_slicer.cursor_granularity == "PT0.000001S" + assert stream_slicer._lookback_window.string == "P5D" + assert stream_slicer.start_time_option.inject_into == RequestOptionType.request_parameter + assert stream_slicer.start_time_option.field_name.eval(config=input_config | {"cursor_field": "updated_at"}) == "since_updated_at" + assert stream_slicer.end_time_option.inject_into == RequestOptionType.body_json + assert stream_slicer.end_time_option.field_name.eval({}) == "before_created_at" + assert stream_slicer._partition_field_start.eval({}) == "star" + assert stream_slicer._partition_field_end.eval({}) == "en" + + assert isinstance(stream_slicer._start_datetime, MinMaxDatetime) + assert stream_slicer.start_datetime._datetime_format == "%Y-%m-%dT%H:%M:%S.%f%z" + assert stream_slicer.start_datetime.datetime.string == "{{ config['start_time'] }}" + assert stream_slicer.start_datetime.min_datetime.string == "{{ config['start_time'] + day_delta(2) }}" + + assert isinstance(stream_slicer._end_datetime, MinMaxDatetime) + assert stream_slicer._end_datetime.datetime.string == "{{ config['end_time'] }}" + + +def test_stream_with_incremental_and_retriever_with_partition_router(): + content = """ +decoder: + type: JsonDecoder +extractor: + type: DpathExtractor +selector: + type: RecordSelector + record_filter: + type: RecordFilter + condition: "{{ record['id'] > stream_state['id'] }}" +requester: + type: HttpRequester + name: "{{ parameters['name'] }}" + url_base: "https://api.sendgrid.com/v3/" + http_method: "GET" + authenticator: + type: SessionTokenAuthenticator + decoder: + type: JsonDecoder + expiration_duration: P10D + login_requester: + path: /session + type: HttpRequester + url_base: 'https://api.sendgrid.com' + http_method: POST + request_body_json: + password: '{{ config.apikey }}' + username: '{{ parameters.name }}' + session_token_path: + - id + request_authentication: + type: ApiKey + inject_into: + type: RequestOption + field_name: X-Metabase-Session + inject_into: header + request_parameters: + unit: "day" +list_stream: + type: DeclarativeStream + schema_loader: + type: JsonFileSchemaLoader + file_path: "./source_sendgrid/schemas/{{ parameters.name }}.json" + incremental_sync: + type: DatetimeBasedCursor + $parameters: + datetime_format: "%Y-%m-%dT%H:%M:%S.%f%z" + start_datetime: "{{ config['start_time'] }}" + end_datetime: "{{ config['end_time'] }}" + step: "P10D" + cursor_field: "created" + cursor_granularity: "PT0.000001S" + lookback_window: "P5D" + 
start_time_option: + inject_into: request_parameter + field_name: created[gte] + end_time_option: + inject_into: body_json + field_name: end_time + partition_field_start: star + partition_field_end: en + retriever: + type: SimpleRetriever + name: "{{ parameters['name'] }}" + decoder: + $ref: "#/decoder" + partition_router: + type: ListPartitionRouter + values: "{{config['repos']}}" + cursor_field: a_key + request_option: + inject_into: header + field_name: a_key + paginator: + type: DefaultPaginator + page_size_option: + inject_into: request_parameter + field_name: page_size + page_token_option: + inject_into: path + type: RequestPath + pagination_strategy: + type: "CursorPagination" + cursor_value: "{{ response._metadata.next }}" + page_size: 10 + requester: + $ref: "#/requester" + path: "{{ next_page_token['next_page_url'] }}" + record_selector: + $ref: "#/selector" + $parameters: + name: "lists" + primary_key: "id" + extractor: + $ref: "#/extractor" + field_path: ["{{ parameters['name'] }}"] + """ + + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + stream_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["list_stream"], {}) + + stream = factory.create_component(model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config) + + assert isinstance(stream, DeclarativeStream) + assert isinstance(stream.retriever, SimpleRetriever) + assert isinstance(stream.retriever.stream_slicer, PerPartitionWithGlobalCursor) + + datetime_stream_slicer = stream.retriever.stream_slicer._per_partition_cursor._cursor_factory.create() + assert isinstance(datetime_stream_slicer, DatetimeBasedCursor) + assert isinstance(datetime_stream_slicer._start_datetime, MinMaxDatetime) + assert datetime_stream_slicer._start_datetime.datetime.string == "{{ config['start_time'] }}" + assert isinstance(datetime_stream_slicer._end_datetime, MinMaxDatetime) + assert datetime_stream_slicer._end_datetime.datetime.string == "{{ config['end_time'] }}" + assert datetime_stream_slicer.step == "P10D" + assert datetime_stream_slicer.cursor_field.string == "created" + + list_stream_slicer = stream.retriever.stream_slicer._partition_router + assert isinstance(list_stream_slicer, ListPartitionRouter) + assert list_stream_slicer.values == ["airbyte", "airbyte-cloud"] + assert list_stream_slicer._cursor_field.string == "a_key" + + +def test_resumable_full_refresh_stream(): + content = """ +decoder: + type: JsonDecoder +extractor: + type: DpathExtractor +selector: + type: RecordSelector + record_filter: + type: RecordFilter + condition: "{{ record['id'] > stream_state['id'] }}" +metadata_paginator: + type: DefaultPaginator + page_size_option: + type: RequestOption + inject_into: request_parameter + field_name: page_size + page_token_option: + type: RequestPath + pagination_strategy: + type: "CursorPagination" + cursor_value: "{{ response._metadata.next }}" + page_size: 10 +requester: + type: HttpRequester + url_base: "https://api.sendgrid.com/v3/" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['apikey'] }}" + request_parameters: + unit: "day" +retriever: + paginator: + type: NoPagination + decoder: + $ref: "#/decoder" +partial_stream: + type: DeclarativeStream + schema_loader: + type: JsonFileSchemaLoader + file_path: "./source_sendgrid/schemas/{{ parameters.name }}.json" +list_stream: + $ref: "#/partial_stream" + $parameters: + name: "lists" + extractor: + $ref: 
"#/extractor" + field_path: ["{{ parameters['name'] }}"] + name: "lists" + primary_key: "id" + retriever: + $ref: "#/retriever" + requester: + $ref: "#/requester" + path: "{{ next_page_token['next_page_url'] }}" + paginator: + $ref: "#/metadata_paginator" + record_selector: + $ref: "#/selector" + transformations: + - type: AddFields + fields: + - path: ["extra"] + value: "{{ response.to_add }}" +check: + type: CheckStream + stream_names: ["list_stream"] +spec: + type: Spec + documentation_url: https://airbyte.com/#yaml-from-manifest + connection_specification: + title: Test Spec + type: object + required: + - api_key + additionalProperties: false + properties: + api_key: + type: string + airbyte_secret: true + title: API Key + description: Test API Key + order: 0 + advanced_auth: + auth_flow_type: "oauth2.0" + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + resolved_manifest["type"] = "DeclarativeSource" + manifest = transformer.propagate_types_and_parameters("", resolved_manifest, {}) + + stream_manifest = manifest["list_stream"] + assert stream_manifest["type"] == "DeclarativeStream" + stream = factory.create_component(model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config) + + assert isinstance(stream, DeclarativeStream) + assert stream.primary_key == "id" + assert stream.name == "lists" + assert stream._stream_cursor_field.string == "" + + assert isinstance(stream.retriever, SimpleRetriever) + assert stream.retriever.primary_key == stream.primary_key + assert stream.retriever.name == stream.name + + assert isinstance(stream.retriever.record_selector, RecordSelector) + + assert isinstance(stream.retriever.stream_slicer, ResumableFullRefreshCursor) + assert isinstance(stream.retriever.cursor, ResumableFullRefreshCursor) + + assert isinstance(stream.retriever.paginator, DefaultPaginator) + assert isinstance(stream.retriever.paginator.decoder, PaginationDecoderDecorator) + assert stream.retriever.paginator.page_size_option.field_name.eval(input_config) == "page_size" + assert stream.retriever.paginator.page_size_option.inject_into == RequestOptionType.request_parameter + assert isinstance(stream.retriever.paginator.page_token_option, RequestPath) + assert stream.retriever.paginator.url_base.string == "https://api.sendgrid.com/v3/" + assert stream.retriever.paginator.url_base.default == "https://api.sendgrid.com/v3/" + + assert isinstance(stream.retriever.paginator.pagination_strategy, CursorPaginationStrategy) + assert isinstance(stream.retriever.paginator.pagination_strategy.decoder, PaginationDecoderDecorator) + assert stream.retriever.paginator.pagination_strategy._cursor_value.string == "{{ response._metadata.next }}" + assert stream.retriever.paginator.pagination_strategy._cursor_value.default == "{{ response._metadata.next }}" + assert stream.retriever.paginator.pagination_strategy.page_size == 10 + + checker = factory.create_component(model_type=CheckStreamModel, component_definition=manifest["check"], config=input_config) + + assert isinstance(checker, CheckStream) + streams_to_check = checker.stream_names + assert len(streams_to_check) == 1 + assert list(streams_to_check)[0] == "list_stream" + + +def test_incremental_data_feed(): + content = """ +selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: ["extractor_path"] + record_filter: + type: RecordFilter + condition: "{{ record['id'] > stream_state['id'] }}" +requester: + type: 
HttpRequester + name: "{{ parameters['name'] }}" + url_base: "https://api.sendgrid.com/v3/" + http_method: "GET" +list_stream: + type: DeclarativeStream + incremental_sync: + type: DatetimeBasedCursor + $parameters: + datetime_format: "%Y-%m-%dT%H:%M:%S.%f%z" + start_datetime: "{{ config['start_time'] }}" + cursor_field: "created" + is_data_feed: true + retriever: + type: SimpleRetriever + name: "{{ parameters['name'] }}" + paginator: + type: DefaultPaginator + pagination_strategy: + type: "CursorPagination" + cursor_value: "{{ response._metadata.next }}" + page_size: 10 + requester: + $ref: "#/requester" + path: "/" + record_selector: + $ref: "#/selector" + $parameters: + name: "lists" + """ + + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + stream_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["list_stream"], {}) + + stream = factory.create_component(model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config) + + assert isinstance(stream.retriever.paginator.pagination_strategy, StopConditionPaginationStrategyDecorator) + + +def test_given_data_feed_and_incremental_then_raise_error(): + content = """ +incremental_sync: + type: DatetimeBasedCursor + $parameters: + datetime_format: "%Y-%m-%dT%H:%M:%S.%f%z" + start_datetime: "{{ config['start_time'] }}" + end_datetime: "2023-01-01" + cursor_field: "created" + is_data_feed: true""" + + parsed_incremental_sync = YamlDeclarativeSource._parse(content) + resolved_incremental_sync = resolver.preprocess_manifest(parsed_incremental_sync) + datetime_based_cursor_definition = transformer.propagate_types_and_parameters("", resolved_incremental_sync["incremental_sync"], {}) + + with pytest.raises(ValueError): + factory.create_component( + model_type=DatetimeBasedCursorModel, component_definition=datetime_based_cursor_definition, config=input_config + ) + + +def test_client_side_incremental(): + content = """ +selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: ["extractor_path"] +requester: + type: HttpRequester + name: "{{ parameters['name'] }}" + url_base: "https://api.sendgrid.com/v3/" + http_method: "GET" +list_stream: + type: DeclarativeStream + incremental_sync: + type: DatetimeBasedCursor + $parameters: + datetime_format: "%Y-%m-%dT%H:%M:%S.%f%z" + start_datetime: + type: MinMaxDatetime + datetime: "{{ config.get('start_date', '1970-01-01T00:00:00.0Z') }}" + datetime_format: "%Y-%m-%dT%H:%M:%S.%fZ" + cursor_field: "created" + is_client_side_incremental: true + retriever: + type: SimpleRetriever + name: "{{ parameters['name'] }}" + paginator: + type: DefaultPaginator + pagination_strategy: + type: "CursorPagination" + cursor_value: "{{ response._metadata.next }}" + page_size: 10 + requester: + $ref: "#/requester" + path: "/" + record_selector: + $ref: "#/selector" + $parameters: + name: "lists" + """ + + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + stream_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["list_stream"], {}) + + stream = factory.create_component(model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config) + + assert isinstance(stream.retriever.record_selector.record_filter, ClientSideIncrementalRecordFilterDecorator) + + +def test_client_side_incremental_with_partition_router(): + content = """ +selector: + type: 
RecordSelector + extractor: + type: DpathExtractor + field_path: ["extractor_path"] +requester: + type: HttpRequester + name: "{{ parameters['name'] }}" + url_base: "https://api.sendgrid.com/v3/" + http_method: "GET" +schema_loader: + file_path: "./source_sendgrid/schemas/{{ parameters['name'] }}.yaml" + name: "{{ parameters['stream_name'] }}" +retriever: + requester: + type: "HttpRequester" + path: "kek" + record_selector: + extractor: + field_path: [] +stream_A: + type: DeclarativeStream + name: "A" + primary_key: "id" + $parameters: + retriever: "#/retriever" + url_base: "https://airbyte.io" + schema_loader: "#/schema_loader" +list_stream: + type: DeclarativeStream + incremental_sync: + type: DatetimeBasedCursor + $parameters: + datetime_format: "%Y-%m-%dT%H:%M:%S.%f%z" + start_datetime: + type: MinMaxDatetime + datetime: "{{ config.get('start_date', '1970-01-01T00:00:00.0Z') }}" + datetime_format: "%Y-%m-%dT%H:%M:%S.%fZ" + cursor_field: "created" + is_client_side_incremental: true + retriever: + type: SimpleRetriever + name: "{{ parameters['name'] }}" + partition_router: + type: SubstreamPartitionRouter + parent_stream_configs: + - stream: "#/stream_A" + parent_key: id + partition_field: id + paginator: + type: DefaultPaginator + pagination_strategy: + type: "CursorPagination" + cursor_value: "{{ response._metadata.next }}" + page_size: 10 + requester: + $ref: "#/requester" + path: "/" + record_selector: + $ref: "#/selector" + $parameters: + name: "lists" + """ + + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + stream_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["list_stream"], {}) + + stream = factory.create_component(model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config) + + assert isinstance(stream.retriever.record_selector.record_filter, ClientSideIncrementalRecordFilterDecorator) + assert isinstance(stream.retriever.record_selector.record_filter._substream_cursor, PerPartitionWithGlobalCursor) + + +def test_given_data_feed_and_client_side_incremental_then_raise_error(): + content = """ +incremental_sync: + type: DatetimeBasedCursor + $parameters: + datetime_format: "%Y-%m-%dT%H:%M:%S.%f%z" + start_datetime: "{{ config['start_time'] }}" + cursor_field: "created" + is_data_feed: true + is_client_side_incremental: true + """ + + parsed_incremental_sync = YamlDeclarativeSource._parse(content) + resolved_incremental_sync = resolver.preprocess_manifest(parsed_incremental_sync) + datetime_based_cursor_definition = transformer.propagate_types_and_parameters("", resolved_incremental_sync["incremental_sync"], {}) + + with pytest.raises(ValueError) as e: + factory.create_component( + model_type=DatetimeBasedCursorModel, component_definition=datetime_based_cursor_definition, config=input_config + ) + assert e.value.args[0] == "`Client side incremental` cannot be applied with `data feed`. Choose only 1 from them." 
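+
+ # The parametrized cases below check that a RecordSelector field_path entry can be either a static string ("result") or an interpolated "{{ parameters['name'] }}" reference resolved from $parameters at runtime.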
+ + +@pytest.mark.parametrize( + "test_name, record_selector, expected_runtime_selector", + [("test_static_record_selector", "result", "result"), ("test_options_record_selector", "{{ parameters['name'] }}", "lists")], +) +def test_create_record_selector(test_name, record_selector, expected_runtime_selector): + content = f""" + extractor: + type: DpathExtractor + selector: + $parameters: + name: "lists" + type: RecordSelector + record_filter: + type: RecordFilter + condition: "{{{{ record['id'] > stream_state['id'] }}}}" + extractor: + $ref: "#/extractor" + field_path: ["{record_selector}"] + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + selector_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["selector"], {}) + + selector = factory.create_component( + model_type=RecordSelectorModel, component_definition=selector_manifest, decoder=None, transformations=[], config=input_config + ) + + assert isinstance(selector, RecordSelector) + assert isinstance(selector.extractor, DpathExtractor) + assert [fp.eval(input_config) for fp in selector.extractor._field_path] == [expected_runtime_selector] + assert isinstance(selector.record_filter, RecordFilter) + assert selector.record_filter.condition == "{{ record['id'] > stream_state['id'] }}" + + +@pytest.mark.parametrize( + "test_name, error_handler, expected_backoff_strategy_type", + [ + ( + "test_create_requester_constant_error_handler", + """ + error_handler: + backoff_strategies: + - type: "ConstantBackoffStrategy" + backoff_time_in_seconds: 5 + """, + ConstantBackoffStrategy, + ), + ( + "test_create_requester_exponential_error_handler", + """ + error_handler: + backoff_strategies: + - type: "ExponentialBackoffStrategy" + factor: 5 + """, + ExponentialBackoffStrategy, + ), + ( + "test_create_requester_wait_time_from_header_error_handler", + """ + error_handler: + backoff_strategies: + - type: "WaitTimeFromHeader" + header: "a_header" + """, + WaitTimeFromHeaderBackoffStrategy, + ), + ( + "test_create_requester_wait_time_until_from_header_error_handler", + """ + error_handler: + backoff_strategies: + - type: "WaitUntilTimeFromHeader" + header: "a_header" + """, + WaitUntilTimeFromHeaderBackoffStrategy, + ), + ("test_create_requester_no_error_handler", """""", None), + ], +) +def test_create_requester(test_name, error_handler, expected_backoff_strategy_type): + content = f""" +requester: + type: HttpRequester + path: "/v3/marketing/lists" + $parameters: + name: 'lists' + url_base: "https://api.sendgrid.com" + authenticator: + type: "BasicHttpAuthenticator" + username: "{{{{ parameters.name}}}}" + password: "{{{{ config.apikey }}}}" + request_parameters: + a_parameter: "something_here" + request_headers: + header: header_value + {error_handler} + """ + name = "name" + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + requester_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["requester"], {}) + + selector = factory.create_component( + model_type=HttpRequesterModel, + component_definition=requester_manifest, + config=input_config, + name=name, + decoder=None, + ) + + assert isinstance(selector, HttpRequester) + assert selector.http_method == HttpMethod.GET + assert selector.name == "name" + assert selector._path.string == "/v3/marketing/lists" + assert selector._url_base.string == "https://api.sendgrid.com" + + assert 
isinstance(selector.error_handler, DefaultErrorHandler) + if expected_backoff_strategy_type: + assert len(selector.error_handler.backoff_strategies) == 1 + assert isinstance(selector.error_handler.backoff_strategies[0], expected_backoff_strategy_type) + + assert isinstance(selector.authenticator, BasicHttpAuthenticator) + assert selector.authenticator._username.eval(input_config) == "lists" + assert selector.authenticator._password.eval(input_config) == "verysecrettoken" + + assert isinstance(selector._request_options_provider, InterpolatedRequestOptionsProvider) + assert selector._request_options_provider._parameter_interpolator._interpolator.mapping["a_parameter"] == "something_here" + assert selector._request_options_provider._headers_interpolator._interpolator.mapping["header"] == "header_value" + + +def test_create_request_with_legacy_session_authenticator(): + content = """ +requester: + type: HttpRequester + path: "/v3/marketing/lists" + $parameters: + name: 'lists' + url_base: "https://api.sendgrid.com" + authenticator: + type: "LegacySessionTokenAuthenticator" + username: "{{ parameters.name}}" + password: "{{ config.apikey }}" + login_url: "login" + header: "token" + session_token_response_key: "session" + validate_session_url: validate + request_parameters: + a_parameter: "something_here" + request_headers: + header: header_value + """ + name = "name" + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + requester_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["requester"], {}) + + selector = factory.create_component( + model_type=HttpRequesterModel, component_definition=requester_manifest, config=input_config, name=name, decoder=None + ) + + assert isinstance(selector, HttpRequester) + assert isinstance(selector.authenticator, LegacySessionTokenAuthenticator) + assert selector.authenticator._username.eval(input_config) == "lists" + assert selector.authenticator._password.eval(input_config) == "verysecrettoken" + assert selector.authenticator._api_url.eval(input_config) == "https://api.sendgrid.com" + + +def test_create_request_with_session_authenticator(): + content = """ +requester: + type: HttpRequester + path: "/v3/marketing/lists" + $parameters: + name: 'lists' + url_base: "https://api.sendgrid.com" + authenticator: + type: SessionTokenAuthenticator + decoder: + type: JsonDecoder + expiration_duration: P10D + login_requester: + path: /session + type: HttpRequester + url_base: 'https://api.sendgrid.com' + http_method: POST + request_body_json: + password: '{{ config.apikey }}' + username: '{{ parameters.name }}' + session_token_path: + - id + request_authentication: + type: ApiKey + inject_into: + type: RequestOption + field_name: X-Metabase-Session + inject_into: header + request_parameters: + a_parameter: "something_here" + request_headers: + header: header_value + """ + name = "name" + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + requester_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["requester"], {}) + + selector = factory.create_component( + model_type=HttpRequesterModel, component_definition=requester_manifest, config=input_config, name=name, decoder=None + ) + + assert isinstance(selector.authenticator, ApiKeyAuthenticator) + assert isinstance(selector.authenticator.token_provider, SessionTokenProvider) + assert 
selector.authenticator.token_provider.session_token_path == ["id"] + assert isinstance(selector.authenticator.token_provider.login_requester, HttpRequester) + assert selector.authenticator.token_provider.session_token_path == ["id"] + assert selector.authenticator.token_provider.login_requester._url_base.eval(input_config) == "https://api.sendgrid.com" + assert selector.authenticator.token_provider.login_requester.get_request_body_json() == { + "username": "lists", + "password": "verysecrettoken", + } + + +def test_given_composite_error_handler_does_not_match_response_then_fallback_on_default_error_handler(requests_mock): + content = """ +requester: + type: HttpRequester + path: "/v3/marketing/lists" + $parameters: + name: 'lists' + url_base: "https://api.sendgrid.com" + error_handler: + type: CompositeErrorHandler + error_handlers: + - type: DefaultErrorHandler + response_filters: + - type: HttpResponseFilter + action: FAIL + http_codes: + - 500 + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + requester_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["requester"], {}) + http_requester = factory.create_component( + model_type=HttpRequesterModel, + component_definition=requester_manifest, + config=input_config, + name="any name", + decoder=JsonDecoder(parameters={}), + ) + requests_mock.get("https://api.sendgrid.com/v3/marketing/lists", status_code=401) + + with pytest.raises(AirbyteTracedException) as exception: + http_requester.send_request() + + # The default behavior when we don't know about an error is to return a system_error. + # Here, we can confirm that we return a config_error which means we picked up the default error mapper + assert exception.value.failure_type == FailureType.config_error + + +@pytest.mark.parametrize( + "input_config, expected_authenticator_class", + [ + pytest.param( + {"auth": {"type": "token"}, "credentials": {"api_key": "some_key"}}, + ApiKeyAuthenticator, + id="test_create_requester_with_selective_authenticator_and_token_selected", + ), + pytest.param( + {"auth": {"type": "oauth"}, "credentials": {"client_id": "ABC"}}, + DeclarativeOauth2Authenticator, + id="test_create_requester_with_selective_authenticator_and_oauth_selected", + ), + ], +) +def test_create_requester_with_selective_authenticator(input_config, expected_authenticator_class): + content = """ +authenticator: + type: SelectiveAuthenticator + authenticator_selection_path: + - auth + - type + authenticators: + token: + type: ApiKeyAuthenticator + header: "Authorization" + api_token: "api_key={{ config['credentials']['api_key'] }}" + oauth: + type: OAuthAuthenticator + token_refresh_endpoint: https://api.url.com + client_id: "{{ config['credentials']['client_id'] }}" + client_secret: some_secret + refresh_token: some_token + """ + name = "name" + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + authenticator_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["authenticator"], {}) + + authenticator = factory.create_component( + model_type=SelectiveAuthenticator, component_definition=authenticator_manifest, config=input_config, name=name + ) + + assert isinstance(authenticator, expected_authenticator_class) + + +def test_create_composite_error_handler(): + content = """ + error_handler: + type: "CompositeErrorHandler" + error_handlers: + - response_filters: + - predicate: "{{ 'code' in 
response }}" + action: RETRY + - response_filters: + - http_codes: [ 403 ] + action: RETRY + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + error_handler_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["error_handler"], {}) + + error_handler = factory.create_component( + model_type=CompositeErrorHandlerModel, component_definition=error_handler_manifest, config=input_config + ) + + assert isinstance(error_handler, CompositeErrorHandler) + assert len(error_handler.error_handlers) == 2 + + error_handler_0 = error_handler.error_handlers[0] + assert isinstance(error_handler_0, DefaultErrorHandler) + assert isinstance(error_handler_0.response_filters[0], HttpResponseFilter) + assert error_handler_0.response_filters[0].predicate.condition == "{{ 'code' in response }}" + assert error_handler_0.response_filters[0].action == ResponseAction.RETRY + + error_handler_1 = error_handler.error_handlers[1] + assert isinstance(error_handler_1, DefaultErrorHandler) + assert isinstance(error_handler_1.response_filters[0], HttpResponseFilter) + assert error_handler_1.response_filters[0].http_codes == {403} + assert error_handler_1.response_filters[0].action == ResponseAction.RETRY + + +# This might be a better test for the manifest transformer but also worth testing end-to-end here as well +def test_config_with_defaults(): + content = """ + lists_stream: + type: "DeclarativeStream" + name: "lists" + primary_key: id + $parameters: + name: "lists" + url_base: "https://api.sendgrid.com" + schema_loader: + name: "{{ parameters.stream_name }}" + file_path: "./source_sendgrid/schemas/{{ parameters.name }}.yaml" + retriever: + paginator: + type: "DefaultPaginator" + page_size_option: + type: RequestOption + inject_into: request_parameter + field_name: page_size + page_token_option: + type: RequestPath + pagination_strategy: + type: "CursorPagination" + cursor_value: "{{ response._metadata.next }}" + page_size: 10 + requester: + path: "/v3/marketing/lists" + authenticator: + type: "BearerAuthenticator" + api_token: "{{ config.apikey }}" + request_parameters: + page_size: 10 + record_selector: + extractor: + field_path: ["result"] + streams: + - "#/lists_stream" + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + resolved_manifest["type"] = "DeclarativeSource" + stream_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["lists_stream"], {}) + + stream = factory.create_component(model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config) + + assert isinstance(stream, DeclarativeStream) + assert stream.primary_key == "id" + assert stream.name == "lists" + assert isinstance(stream.retriever, SimpleRetriever) + assert stream.retriever.name == stream.name + assert stream.retriever.primary_key == stream.primary_key + + assert isinstance(stream.schema_loader, JsonFileSchemaLoader) + assert stream.schema_loader.file_path.string == "./source_sendgrid/schemas/{{ parameters.name }}.yaml" + assert stream.schema_loader.file_path.default == "./source_sendgrid/schemas/{{ parameters.name }}.yaml" + + assert isinstance(stream.retriever.requester, HttpRequester) + assert stream.retriever.requester.http_method == HttpMethod.GET + + assert isinstance(stream.retriever.requester.authenticator, BearerAuthenticator) + assert stream.retriever.requester.authenticator.token_provider.get_token() == 
"verysecrettoken" + + assert isinstance(stream.retriever.record_selector, RecordSelector) + assert isinstance(stream.retriever.record_selector.extractor, DpathExtractor) + assert [fp.eval(input_config) for fp in stream.retriever.record_selector.extractor._field_path] == ["result"] + + assert isinstance(stream.retriever.paginator, DefaultPaginator) + assert stream.retriever.paginator.url_base.string == "https://api.sendgrid.com" + assert stream.retriever.paginator.pagination_strategy.get_page_size() == 10 + + +def test_create_default_paginator(): + content = """ + paginator: + type: "DefaultPaginator" + page_size_option: + type: RequestOption + inject_into: request_parameter + field_name: page_size + page_token_option: + type: RequestPath + pagination_strategy: + type: "CursorPagination" + page_size: 50 + cursor_value: "{{ response._metadata.next }}" + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + paginator_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["paginator"], {}) + + paginator = factory.create_component( + model_type=DefaultPaginatorModel, + component_definition=paginator_manifest, + config=input_config, + url_base="https://airbyte.io", + decoder=JsonDecoder(parameters={}), + ) + + assert isinstance(paginator, DefaultPaginator) + assert paginator.url_base.string == "https://airbyte.io" + + assert isinstance(paginator.pagination_strategy, CursorPaginationStrategy) + assert paginator.pagination_strategy.page_size == 50 + assert paginator.pagination_strategy._cursor_value.string == "{{ response._metadata.next }}" + + assert isinstance(paginator.page_size_option, RequestOption) + assert paginator.page_size_option.inject_into == RequestOptionType.request_parameter + assert paginator.page_size_option.field_name.eval(config=input_config) == "page_size" + + assert isinstance(paginator.page_token_option, RequestPath) + + +@pytest.mark.parametrize( + "manifest, field_name, expected_value, expected_error", + [ + pytest.param( + { + "type": "CustomErrorHandler", + "class_name": "unit_tests.sources.declarative.parsers.testing_components.TestingSomeComponent", + "subcomponent_field_with_hint": {"type": "DpathExtractor", "field_path": [], "decoder": {"type": "JsonDecoder"}}, + }, + "subcomponent_field_with_hint", + DpathExtractor( + field_path=[], + config={"apikey": "verysecrettoken", "repos": ["airbyte", "airbyte-cloud"]}, + decoder=JsonDecoder(parameters={}), + parameters={}, + ), + None, + id="test_create_custom_component_with_subcomponent_that_must_be_parsed", + ), + pytest.param( + { + "type": "CustomErrorHandler", + "class_name": "unit_tests.sources.declarative.parsers.testing_components.TestingSomeComponent", + "subcomponent_field_with_hint": {"field_path": []}, + }, + "subcomponent_field_with_hint", + DpathExtractor(field_path=[], config={"apikey": "verysecrettoken", "repos": ["airbyte", "airbyte-cloud"]}, parameters={}), + None, + id="test_create_custom_component_with_subcomponent_that_must_infer_type_from_explicit_hints", + ), + pytest.param( + { + "type": "CustomErrorHandler", + "class_name": "unit_tests.sources.declarative.parsers.testing_components.TestingSomeComponent", + "basic_field": "expected", + }, + "basic_field", + "expected", + None, + id="test_create_custom_component_with_built_in_type", + ), + pytest.param( + { + "type": "CustomErrorHandler", + "class_name": "unit_tests.sources.declarative.parsers.testing_components.TestingSomeComponent", + 
"optional_subcomponent_field": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "destination"}, + }, + "optional_subcomponent_field", + RequestOption(inject_into=RequestOptionType.request_parameter, field_name="destination", parameters={}), + None, + id="test_create_custom_component_with_subcomponent_wrapped_in_optional", + ), + pytest.param( + { + "type": "CustomErrorHandler", + "class_name": "unit_tests.sources.declarative.parsers.testing_components.TestingSomeComponent", + "list_of_subcomponents": [ + {"inject_into": "header", "field_name": "store_me"}, + {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "destination"}, + ], + }, + "list_of_subcomponents", + [ + RequestOption(inject_into=RequestOptionType.header, field_name="store_me", parameters={}), + RequestOption(inject_into=RequestOptionType.request_parameter, field_name="destination", parameters={}), + ], + None, + id="test_create_custom_component_with_subcomponent_wrapped_in_list", + ), + pytest.param( + { + "type": "CustomErrorHandler", + "class_name": "unit_tests.sources.declarative.parsers.testing_components.TestingSomeComponent", + "without_hint": {"inject_into": "request_parameter", "field_name": "missing_hint"}, + }, + "without_hint", + None, + None, + id="test_create_custom_component_with_subcomponent_without_type_hints", + ), + pytest.param( + { + "type": "CustomErrorHandler", + "class_name": "unit_tests.sources.declarative.parsers.testing_components.TestingSomeComponent", + "paginator": { + "type": "DefaultPaginator", + "pagination_strategy": {"type": "OffsetIncrement", "page_size": 10}, + "$parameters": {"url_base": "https://physical_100.com"}, + }, + }, + "paginator", + DefaultPaginator( + pagination_strategy=OffsetIncrement( + page_size=10, config={"apikey": "verysecrettoken", "repos": ["airbyte", "airbyte-cloud"]}, parameters={} + ), + url_base="https://physical_100.com", + config={"apikey": "verysecrettoken", "repos": ["airbyte", "airbyte-cloud"]}, + parameters={"decoder": {"type": "JsonDecoder"}}, + ), + None, + id="test_create_custom_component_with_subcomponent_that_uses_parameters", + ), + pytest.param( + { + "type": "CustomErrorHandler", + "class_name": "unit_tests.sources.declarative.parsers.testing_components.TestingSomeComponent", + "paginator": { + "type": "DefaultPaginator", + "pagination_strategy": {"type": "OffsetIncrement", "page_size": 10}, + }, + }, + "paginator", + None, + ValueError, + id="test_create_custom_component_missing_required_field_emits_error", + ), + pytest.param( + { + "type": "CustomErrorHandler", + "class_name": "unit_tests.sources.declarative.parsers.testing_components.NonExistingClass", + "paginator": { + "type": "DefaultPaginator", + "pagination_strategy": {"type": "OffsetIncrement", "page_size": 10}, + }, + }, + "paginator", + None, + ValueError, + id="test_create_custom_component_non_existing_class_raises_value_error", + ), + ], +) +def test_create_custom_components(manifest, field_name, expected_value, expected_error): + if expected_error: + with pytest.raises(expected_error): + factory.create_component(CustomErrorHandlerModel, manifest, input_config) + else: + custom_component = factory.create_component(CustomErrorHandlerModel, manifest, input_config) + assert isinstance(custom_component, TestingSomeComponent) + + assert isinstance(getattr(custom_component, field_name), type(expected_value)) + assert getattr(custom_component, field_name) == expected_value + + +def test_custom_components_do_not_contain_extra_fields(): + 
custom_substream_partition_router_manifest = { + "type": "CustomPartitionRouter", + "class_name": "unit_tests.sources.declarative.parsers.testing_components.TestingCustomSubstreamPartitionRouter", + "custom_field": "here", + "extra_field_to_exclude": "should_not_pass_as_parameter", + "custom_pagination_strategy": {"type": "PageIncrement", "page_size": 100}, + "parent_stream_configs": [ + { + "type": "ParentStreamConfig", + "stream": { + "type": "DeclarativeStream", + "name": "a_parent", + "primary_key": "id", + "retriever": { + "type": "SimpleRetriever", + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "requester": {"type": "HttpRequester", "url_base": "https://airbyte.io", "path": "some"}, + }, + "schema_loader": { + "type": "JsonFileSchemaLoader", + "file_path": "./source_sendgrid/schemas/{{ parameters['name'] }}.yaml", + }, + }, + "parent_key": "id", + "partition_field": "repository_id", + "request_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "repository_id"}, + } + ], + } + + custom_substream_partition_router = factory.create_component( + CustomPartitionRouterModel, custom_substream_partition_router_manifest, input_config + ) + assert isinstance(custom_substream_partition_router, TestingCustomSubstreamPartitionRouter) + + assert len(custom_substream_partition_router.parent_stream_configs) == 1 + assert custom_substream_partition_router.parent_stream_configs[0].parent_key.eval({}) == "id" + assert custom_substream_partition_router.parent_stream_configs[0].partition_field.eval({}) == "repository_id" + assert custom_substream_partition_router.parent_stream_configs[0].request_option.inject_into == RequestOptionType.request_parameter + assert custom_substream_partition_router.parent_stream_configs[0].request_option.field_name.eval(config=input_config) == "repository_id" + + assert isinstance(custom_substream_partition_router.custom_pagination_strategy, PageIncrement) + assert custom_substream_partition_router.custom_pagination_strategy.page_size == 100 + + +def test_parse_custom_component_fields_if_subcomponent(): + custom_substream_partition_router_manifest = { + "type": "CustomPartitionRouter", + "class_name": "unit_tests.sources.declarative.parsers.testing_components.TestingCustomSubstreamPartitionRouter", + "custom_field": "here", + "custom_pagination_strategy": {"type": "PageIncrement", "page_size": 100}, + "parent_stream_configs": [ + { + "type": "ParentStreamConfig", + "stream": { + "type": "DeclarativeStream", + "name": "a_parent", + "primary_key": "id", + "retriever": { + "type": "SimpleRetriever", + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "requester": {"type": "HttpRequester", "url_base": "https://airbyte.io", "path": "some"}, + }, + "schema_loader": { + "type": "JsonFileSchemaLoader", + "file_path": "./source_sendgrid/schemas/{{ parameters['name'] }}.yaml", + }, + }, + "parent_key": "id", + "partition_field": "repository_id", + "request_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "repository_id"}, + } + ], + } + + custom_substream_partition_router = factory.create_component( + CustomPartitionRouterModel, custom_substream_partition_router_manifest, input_config + ) + assert isinstance(custom_substream_partition_router, TestingCustomSubstreamPartitionRouter) + assert custom_substream_partition_router.custom_field == "here" + + assert 
len(custom_substream_partition_router.parent_stream_configs) == 1 + assert custom_substream_partition_router.parent_stream_configs[0].parent_key.eval({}) == "id" + assert custom_substream_partition_router.parent_stream_configs[0].partition_field.eval({}) == "repository_id" + assert custom_substream_partition_router.parent_stream_configs[0].request_option.inject_into == RequestOptionType.request_parameter + assert custom_substream_partition_router.parent_stream_configs[0].request_option.field_name.eval(config=input_config) == "repository_id" + + assert isinstance(custom_substream_partition_router.custom_pagination_strategy, PageIncrement) + assert custom_substream_partition_router.custom_pagination_strategy.page_size == 100 + + +class TestCreateTransformations: + # the tabbing matters + base_parameters = """ + name: "lists" + primary_key: id + url_base: "https://api.sendgrid.com" + schema_loader: + name: "{{ parameters.name }}" + file_path: "./source_sendgrid/schemas/{{ parameters.name }}.yaml" + retriever: + requester: + name: "{{ parameters.name }}" + path: "/v3/marketing/lists" + request_parameters: + page_size: 10 + record_selector: + extractor: + field_path: ["result"] + """ + + def test_no_transformations(self): + content = f""" + the_stream: + type: DeclarativeStream + $parameters: + {self.base_parameters} + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + resolved_manifest["type"] = "DeclarativeSource" + stream_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["the_stream"], {}) + + stream = factory.create_component(model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config) + + assert isinstance(stream, DeclarativeStream) + assert [] == stream.retriever.record_selector.transformations + + def test_remove_fields(self): + content = f""" + the_stream: + type: DeclarativeStream + $parameters: + {self.base_parameters} + transformations: + - type: RemoveFields + field_pointers: + - ["path", "to", "field1"] + - ["path2"] + """ + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + resolved_manifest["type"] = "DeclarativeSource" + stream_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["the_stream"], {}) + + stream = factory.create_component(model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config) + + assert isinstance(stream, DeclarativeStream) + expected = [RemoveFields(field_pointers=[["path", "to", "field1"], ["path2"]], parameters={})] + assert stream.retriever.record_selector.transformations == expected + + def test_add_fields_no_value_type(self): + content = f""" + the_stream: + type: DeclarativeStream + $parameters: + {self.base_parameters} + transformations: + - type: AddFields + fields: + - path: ["field1"] + value: "static_value" + """ + expected = [ + AddFields( + fields=[ + AddedFieldDefinition( + path=["field1"], + value=InterpolatedString(string="static_value", default="static_value", parameters={}), + value_type=None, + parameters={}, + ) + ], + parameters={}, + ) + ] + self._test_add_fields(content, expected) + + def test_add_fields_value_type_is_string(self): + content = f""" + the_stream: + type: DeclarativeStream + $parameters: + {self.base_parameters} + transformations: + - type: AddFields + fields: + - path: ["field1"] + value: "static_value" + value_type: string + """ + expected = [ + 
AddFields( + fields=[ + AddedFieldDefinition( + path=["field1"], + value=InterpolatedString(string="static_value", default="static_value", parameters={}), + value_type=str, + parameters={}, + ) + ], + parameters={}, + ) + ] + self._test_add_fields(content, expected) + + def test_add_fields_value_type_is_number(self): + content = f""" + the_stream: + type: DeclarativeStream + $parameters: + {self.base_parameters} + transformations: + - type: AddFields + fields: + - path: ["field1"] + value: "1" + value_type: number + """ + expected = [ + AddFields( + fields=[ + AddedFieldDefinition( + path=["field1"], + value=InterpolatedString(string="1", default="1", parameters={}), + value_type=float, + parameters={}, + ) + ], + parameters={}, + ) + ] + self._test_add_fields(content, expected) + + def test_add_fields_value_type_is_integer(self): + content = f""" + the_stream: + type: DeclarativeStream + $parameters: + {self.base_parameters} + transformations: + - type: AddFields + fields: + - path: ["field1"] + value: "1" + value_type: integer + """ + expected = [ + AddFields( + fields=[ + AddedFieldDefinition( + path=["field1"], + value=InterpolatedString(string="1", default="1", parameters={}), + value_type=int, + parameters={}, + ) + ], + parameters={}, + ) + ] + self._test_add_fields(content, expected) + + def test_add_fields_value_type_is_boolean(self): + content = f""" + the_stream: + type: DeclarativeStream + $parameters: + {self.base_parameters} + transformations: + - type: AddFields + fields: + - path: ["field1"] + value: False + value_type: boolean + """ + expected = [ + AddFields( + fields=[ + AddedFieldDefinition( + path=["field1"], + value=InterpolatedString(string="False", default="False", parameters={}), + value_type=bool, + parameters={}, + ) + ], + parameters={}, + ) + ] + self._test_add_fields(content, expected) + + def _test_add_fields(self, content, expected): + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + resolved_manifest["type"] = "DeclarativeSource" + stream_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["the_stream"], {}) + + stream = factory.create_component(model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=input_config) + + assert isinstance(stream, DeclarativeStream) + assert stream.retriever.record_selector.transformations == expected + + def test_default_schema_loader(self): + component_definition = { + "type": "DeclarativeStream", + "name": "test", + "primary_key": [], + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "http://localhost:6767/", + "path": "items/", + "request_options_provider": { + "request_parameters": {}, + "request_headers": {}, + "request_body_json": {}, + "type": "InterpolatedRequestOptionsProvider", + }, + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config['api_key'] }}"}, + }, + "record_selector": {"type": "RecordSelector", "extractor": {"type": "DpathExtractor", "field_path": ["items"]}}, + "paginator": {"type": "NoPagination"}, + }, + } + resolved_manifest = resolver.preprocess_manifest(component_definition) + ws = ManifestComponentTransformer() + propagated_source_config = ws.propagate_types_and_parameters("", resolved_manifest, {}) + stream = factory.create_component( + model_type=DeclarativeStreamModel, component_definition=propagated_source_config, config=input_config + ) + schema_loader = stream.schema_loader + assert 
schema_loader.default_loader._get_json_filepath().split("/")[-1] == f"{stream.name}.json" + + +@pytest.mark.parametrize( + "incremental, partition_router, expected_type", + [ + pytest.param( + { + "type": "DatetimeBasedCursor", + "datetime_format": "%Y-%m-%dT%H:%M:%S.%f%z", + "start_datetime": "{{ config['start_time'] }}", + "end_datetime": "{{ config['end_time'] }}", + "step": "P10D", + "cursor_field": "created", + "cursor_granularity": "PT0.000001S", + }, + None, + DatetimeBasedCursor, + id="test_create_simple_retriever_with_incremental", + ), + pytest.param( + None, + { + "type": "ListPartitionRouter", + "values": "{{config['repos']}}", + "cursor_field": "a_key", + }, + PerPartitionCursor, + id="test_create_simple_retriever_with_partition_router", + ), + pytest.param( + { + "type": "DatetimeBasedCursor", + "datetime_format": "%Y-%m-%dT%H:%M:%S.%f%z", + "start_datetime": "{{ config['start_time'] }}", + "end_datetime": "{{ config['end_time'] }}", + "step": "P10D", + "cursor_field": "created", + "cursor_granularity": "PT0.000001S", + }, + { + "type": "ListPartitionRouter", + "values": "{{config['repos']}}", + "cursor_field": "a_key", + }, + PerPartitionWithGlobalCursor, + id="test_create_simple_retriever_with_incremental_and_partition_router", + ), + pytest.param( + { + "type": "DatetimeBasedCursor", + "datetime_format": "%Y-%m-%dT%H:%M:%S.%f%z", + "start_datetime": "{{ config['start_time'] }}", + "end_datetime": "{{ config['end_time'] }}", + "step": "P10D", + "cursor_field": "created", + "cursor_granularity": "PT0.000001S", + }, + [ + { + "type": "ListPartitionRouter", + "values": "{{config['repos']}}", + "cursor_field": "a_key", + }, + { + "type": "ListPartitionRouter", + "values": "{{config['repos']}}", + "cursor_field": "b_key", + }, + ], + PerPartitionWithGlobalCursor, + id="test_create_simple_retriever_with_partition_routers_multiple_components", + ), + pytest.param(None, None, SinglePartitionRouter, id="test_create_simple_retriever_with_no_incremental_or_partition_router"), + ], +) +def test_merge_incremental_and_partition_router(incremental, partition_router, expected_type): + stream_model = { + "type": "DeclarativeStream", + "retriever": { + "type": "SimpleRetriever", + "record_selector": { + "type": "RecordSelector", + "extractor": { + "type": "DpathExtractor", + "field_path": [], + }, + }, + "requester": { + "type": "HttpRequester", + "name": "list", + "url_base": "orange.com", + "path": "/v1/api", + }, + }, + } + + if incremental: + stream_model["incremental_sync"] = incremental + + if partition_router: + stream_model["retriever"]["partition_router"] = partition_router + + stream = factory.create_component(model_type=DeclarativeStreamModel, component_definition=stream_model, config=input_config) + + assert isinstance(stream, DeclarativeStream) + assert isinstance(stream.retriever, SimpleRetriever) + print(stream.retriever.stream_slicer) + assert isinstance(stream.retriever.stream_slicer, expected_type) + + if incremental and partition_router: + assert isinstance(stream.retriever.stream_slicer, PerPartitionWithGlobalCursor) + if isinstance(partition_router, list) and len(partition_router) > 1: + assert isinstance(stream.retriever.stream_slicer._partition_router, CartesianProductStreamSlicer) + assert len(stream.retriever.stream_slicer._partition_router.stream_slicers) == len(partition_router) + elif partition_router and isinstance(partition_router, list) and len(partition_router) > 1: + assert isinstance(stream.retriever.stream_slicer, PerPartitionWithGlobalCursor) + assert 
len(stream.retriever.stream_slicer.stream_slicerS) == len(partition_router) + + +def test_simple_retriever_emit_log_messages(): + simple_retriever_model = { + "type": "SimpleRetriever", + "record_selector": { + "type": "RecordSelector", + "extractor": { + "type": "DpathExtractor", + "field_path": [], + }, + }, + "requester": {"type": "HttpRequester", "name": "list", "url_base": "orange.com", "path": "/v1/api"}, + } + + connector_builder_factory = ModelToComponentFactory(emit_connector_builder_messages=True) + retriever = connector_builder_factory.create_component( + model_type=SimpleRetrieverModel, + component_definition=simple_retriever_model, + config={}, + name="Test", + primary_key="id", + stream_slicer=None, + transformations=[], + ) + + assert isinstance(retriever, SimpleRetrieverTestReadDecorator) + assert connector_builder_factory._message_repository._log_level == Level.DEBUG + + +def test_create_page_increment(): + model = PageIncrementModel( + type="PageIncrement", + page_size=10, + start_from_page=1, + inject_on_first_request=True, + ) + expected_strategy = PageIncrement(page_size=10, start_from_page=1, inject_on_first_request=True, parameters={}, config=input_config) + + strategy = factory.create_page_increment(model, input_config) + + assert strategy.page_size == expected_strategy.page_size + assert strategy.start_from_page == expected_strategy.start_from_page + assert strategy.inject_on_first_request == expected_strategy.inject_on_first_request + + +def test_create_page_increment_with_interpolated_page_size(): + model = PageIncrementModel( + type="PageIncrement", + page_size="{{ config['page_size'] }}", + start_from_page=1, + inject_on_first_request=True, + ) + config = {**input_config, "page_size": 5} + expected_strategy = PageIncrement(page_size=5, start_from_page=1, inject_on_first_request=True, parameters={}, config=config) + + strategy = factory.create_page_increment(model, config) + + assert strategy.get_page_size() == expected_strategy.get_page_size() + assert strategy.start_from_page == expected_strategy.start_from_page + assert strategy.inject_on_first_request == expected_strategy.inject_on_first_request + + +def test_create_offset_increment(): + model = OffsetIncrementModel( + type="OffsetIncrement", + page_size=10, + inject_on_first_request=True, + ) + expected_strategy = OffsetIncrement(page_size=10, inject_on_first_request=True, parameters={}, config=input_config) + + strategy = factory.create_offset_increment(model, input_config, decoder=JsonDecoder(parameters={})) + + assert strategy.page_size == expected_strategy.page_size + assert strategy.inject_on_first_request == expected_strategy.inject_on_first_request + assert strategy.config == input_config + + +class MyCustomSchemaLoader(SchemaLoader): + def get_json_schema(self) -> Mapping[str, Any]: + """Returns a mapping describing the stream's schema""" + return {} + + +def test_create_custom_schema_loader(): + + definition = { + "type": "CustomSchemaLoader", + "class_name": "unit_tests.sources.declarative.parsers.test_model_to_component_factory.MyCustomSchemaLoader", + } + component = factory.create_component(CustomSchemaLoaderModel, definition, {}) + assert isinstance(component, MyCustomSchemaLoader) + + +@freezegun.freeze_time("2021-01-01 00:00:00") +@pytest.mark.parametrize( + "config, manifest, expected", + [ + ( + { + "secret_key": "secret_key", + }, + """ + authenticator: + type: JwtAuthenticator + secret_key: "{{ config['secret_key'] }}" + algorithm: HS256 + """, + { + "secret_key": "secret_key", + 
"algorithm": "HS256", + "base64_encode_secret_key": False, + "token_duration": 1200, + "jwt_headers": {"typ": "JWT", "alg": "HS256"}, + "jwt_payload": {}, + }, + ), + ( + { + "secret_key": "secret_key", + "kid": "test kid", + "iss": "test iss", + "test": "test custom header", + }, + """ + authenticator: + type: JwtAuthenticator + secret_key: "{{ config['secret_key'] }}" + base64_encode_secret_key: True + algorithm: RS256 + token_duration: 3600 + header_prefix: Bearer + jwt_headers: + kid: "{{ config['kid'] }}" + cty: "JWT" + typ: "Alt" + additional_jwt_headers: + test: "{{ config['test']}}" + jwt_payload: + iss: "{{ config['iss'] }}" + sub: "test sub" + aud: "test aud" + additional_jwt_payload: + test: "test custom payload" + """, + { + "secret_key": "secret_key", + "algorithm": "RS256", + "base64_encode_secret_key": True, + "token_duration": 3600, + "header_prefix": "Bearer", + "jwt_headers": { + "kid": "test kid", + "typ": "Alt", + "alg": "RS256", + "cty": "JWT", + "test": "test custom header", + }, + "jwt_payload": { + "iss": "test iss", + "sub": "test sub", + "aud": "test aud", + "test": "test custom payload", + }, + }, + ), + ( + { + "secret_key": "secret_key", + }, + """ + authenticator: + type: JwtAuthenticator + secret_key: "{{ config['secret_key'] }}" + algorithm: HS256 + additional_jwt_headers: + custom_header: "custom header value" + additional_jwt_payload: + custom_payload: "custom payload value" + """, + { + "secret_key": "secret_key", + "algorithm": "HS256", + "base64_encode_secret_key": False, + "token_duration": 1200, + "jwt_headers": { + "typ": "JWT", + "alg": "HS256", + "custom_header": "custom header value", + }, + "jwt_payload": { + "custom_payload": "custom payload value", + }, + }, + ), + ( + { + "secret_key": "secret_key", + }, + """ + authenticator: + type: JwtAuthenticator + secret_key: "{{ config['secret_key'] }}" + algorithm: invalid_algorithm + """, + { + "expect_error": True, + }, + ), + ], +) +def test_create_jwt_authenticator(config, manifest, expected): + parsed_manifest = YamlDeclarativeSource._parse(manifest) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + + authenticator_manifest = transformer.propagate_types_and_parameters("", resolved_manifest["authenticator"], {}) + + if expected.get("expect_error"): + with pytest.raises(ValueError): + authenticator = factory.create_component( + model_type=JwtAuthenticatorModel, component_definition=authenticator_manifest, config=config + ) + return + + authenticator = factory.create_component(model_type=JwtAuthenticatorModel, component_definition=authenticator_manifest, config=config) + + assert isinstance(authenticator, JwtAuthenticator) + assert authenticator._secret_key.eval(config) == expected["secret_key"] + assert authenticator._algorithm == expected["algorithm"] + assert authenticator._base64_encode_secret_key == expected["base64_encode_secret_key"] + assert authenticator._token_duration == expected["token_duration"] + if "header_prefix" in expected: + assert authenticator._header_prefix.eval(config) == expected["header_prefix"] + assert authenticator._get_jwt_headers() == expected["jwt_headers"] + jwt_payload = expected["jwt_payload"] + jwt_payload.update( + { + "iat": int(datetime.datetime.now().timestamp()), + "nbf": int(datetime.datetime.now().timestamp()), + "exp": int(datetime.datetime.now().timestamp()) + expected["token_duration"], + } + ) + assert authenticator._get_jwt_payload() == jwt_payload + + +def test_use_request_options_provider_for_datetime_based_cursor(): + config = { + 
"start_time": "2024-01-01T00:00:00.000000+0000", + } + + simple_retriever_model = { + "type": "SimpleRetriever", + "record_selector": { + "type": "RecordSelector", + "extractor": { + "type": "DpathExtractor", + "field_path": [], + }, + }, + "requester": {"type": "HttpRequester", "name": "list", "url_base": "orange.com", "path": "/v1/api"}, + } + + datetime_based_cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="{{ config.start_time }}", parameters={}), + step="P5D", + cursor_field="updated_at", + datetime_format="%Y-%m-%dT%H:%M:%S.%f%z", + cursor_granularity="PT1S", + is_compare_strictly=True, + config=config, + parameters={}, + ) + + datetime_based_request_options_provider = DatetimeBasedRequestOptionsProvider( + start_time_option=RequestOption( + inject_into=RequestOptionType.request_parameter, + field_name="after", + parameters={}, + ), + end_time_option=RequestOption( + inject_into=RequestOptionType.request_parameter, + field_name="before", + parameters={}, + ), + config=config, + parameters={}, + ) + + connector_builder_factory = ModelToComponentFactory(emit_connector_builder_messages=True) + retriever = connector_builder_factory.create_component( + model_type=SimpleRetrieverModel, + component_definition=simple_retriever_model, + config={}, + name="Test", + primary_key="id", + stream_slicer=datetime_based_cursor, + request_options_provider=datetime_based_request_options_provider, + transformations=[], + ) + + assert isinstance(retriever, SimpleRetriever) + assert retriever.primary_key == "id" + assert retriever.name == "Test" + + assert isinstance(retriever.cursor, DatetimeBasedCursor) + assert isinstance(retriever.stream_slicer, DatetimeBasedCursor) + + assert isinstance(retriever.request_option_provider, DatetimeBasedRequestOptionsProvider) + assert retriever.request_option_provider.start_time_option.inject_into == RequestOptionType.request_parameter + assert retriever.request_option_provider.start_time_option.field_name.eval(config=input_config) == "after" + assert retriever.request_option_provider.end_time_option.inject_into == RequestOptionType.request_parameter + assert retriever.request_option_provider.end_time_option.field_name.eval(config=input_config) == "before" + assert retriever.request_option_provider._partition_field_start.string == "start_time" + assert retriever.request_option_provider._partition_field_end.string == "end_time" + + +def test_do_not_separate_request_options_provider_for_non_datetime_based_cursor(): + # This test validates that we're only using the dedicated RequestOptionsProvider for DatetimeBasedCursor and using the + # existing StreamSlicer for other types of cursors and partition routing. 
Once everything is migrated this test can be deleted + + config = { + "start_time": "2024-01-01T00:00:00.000000+0000", + } + + simple_retriever_model = { + "type": "SimpleRetriever", + "record_selector": { + "type": "RecordSelector", + "extractor": { + "type": "DpathExtractor", + "field_path": [], + }, + }, + "requester": {"type": "HttpRequester", "name": "list", "url_base": "orange.com", "path": "/v1/api"}, + } + + datetime_based_cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="{{ config.start_time }}", parameters={}), + step="P5D", + cursor_field="updated_at", + datetime_format="%Y-%m-%dT%H:%M:%S.%f%z", + cursor_granularity="PT1S", + is_compare_strictly=True, + config=config, + parameters={}, + ) + + list_partition_router = ListPartitionRouter( + cursor_field="id", + values=["four", "oh", "eight"], + config=config, + parameters={}, + ) + + per_partition_cursor = PerPartitionCursor( + cursor_factory=CursorFactory(lambda: datetime_based_cursor), + partition_router=list_partition_router, + ) + + connector_builder_factory = ModelToComponentFactory(emit_connector_builder_messages=True) + retriever = connector_builder_factory.create_component( + model_type=SimpleRetrieverModel, + component_definition=simple_retriever_model, + config={}, + name="Test", + primary_key="id", + stream_slicer=per_partition_cursor, + request_options_provider=None, + transformations=[], + ) + + assert isinstance(retriever, SimpleRetriever) + assert retriever.primary_key == "id" + assert retriever.name == "Test" + + assert isinstance(retriever.cursor, PerPartitionCursor) + assert isinstance(retriever.stream_slicer, PerPartitionCursor) + + assert isinstance(retriever.request_option_provider, PerPartitionCursor) + assert isinstance(retriever.request_option_provider._cursor_factory, CursorFactory) + assert retriever.request_option_provider._partition_router == list_partition_router + + +def test_use_default_request_options_provider(): + simple_retriever_model = { + "type": "SimpleRetriever", + "record_selector": { + "type": "RecordSelector", + "extractor": { + "type": "DpathExtractor", + "field_path": [], + }, + }, + "requester": {"type": "HttpRequester", "name": "list", "url_base": "orange.com", "path": "/v1/api"}, + } + + connector_builder_factory = ModelToComponentFactory(emit_connector_builder_messages=True) + retriever = connector_builder_factory.create_component( + model_type=SimpleRetrieverModel, + component_definition=simple_retriever_model, + config={}, + name="Test", + primary_key="id", + stream_slicer=None, + request_options_provider=None, + transformations=[], + ) + + assert isinstance(retriever, SimpleRetriever) + assert retriever.primary_key == "id" + assert retriever.name == "Test" + + assert isinstance(retriever.stream_slicer, SinglePartitionRouter) + assert isinstance(retriever.request_option_provider, DefaultRequestOptionsProvider) + + +@pytest.mark.parametrize( + "stream_state,expected_start", + [ + pytest.param({}, "2024-08-01T00:00:00.000000Z", id="test_create_concurrent_cursor_without_state"), + pytest.param({"updated_at": "2024-10-01T00:00:00.000000Z"}, "2024-10-01T00:00:00.000000Z", id="test_create_concurrent_cursor_with_state"), + ] +) +def test_create_concurrent_cursor_from_datetime_based_cursor_all_fields(stream_state, expected_start): + config = { + "start_time": "2024-08-01T00:00:00.000000Z", + "end_time": "2024-10-15T00:00:00.000000Z" + } + + expected_cursor_field = "updated_at" + expected_start_boundary = "custom_start" + expected_end_boundary = "custom_end" + 
expected_step = datetime.timedelta(days=10) + expected_lookback_window = datetime.timedelta(days=3) + expected_datetime_format = "%Y-%m-%dT%H:%M:%S.%fZ" + expected_cursor_granularity = datetime.timedelta(microseconds=1) + + expected_start = pendulum.parse(expected_start) + expected_end = datetime.datetime(year=2024, month=10, day=15, second=0, microsecond=0, tzinfo=datetime.timezone.utc) + if stream_state: + # Using incoming state, the resulting already completed partition is the start_time up to the last successful + # partition indicated by the legacy sequential state + expected_concurrent_state = { + "slices": [ + { + "start": pendulum.parse(config["start_time"]), + "end": pendulum.parse(stream_state["updated_at"]), + }, + ], + "state_type": "date-range", + "legacy": {"updated_at": "2024-10-01T00:00:00.000000Z"}, + } + else: + expected_concurrent_state = { + "slices": [ + { + "start": pendulum.parse(config["start_time"]), + "end": pendulum.parse(config["start_time"]), + }, + ], + "state_type": "date-range", + "legacy": {}, + } + + connector_state_manager = ConnectorStateManager() + + connector_builder_factory = ModelToComponentFactory(emit_connector_builder_messages=True) + + stream_name = "test" + + cursor_component_definition = { + "type": "DatetimeBasedCursor", + "cursor_field": "updated_at", + "datetime_format": "%Y-%m-%dT%H:%M:%S.%fZ", + "start_datetime": "{{ config['start_time'] }}", + "end_datetime": "{{ config['end_time'] }}", + "partition_field_start": "custom_start", + "partition_field_end": "custom_end", + "step": "P10D", + "cursor_granularity": "PT0.000001S", + "lookback_window": "P3D" + } + + concurrent_cursor, stream_state_converter = connector_builder_factory.create_concurrent_cursor_from_datetime_based_cursor( + state_manager=connector_state_manager, + model_type=DatetimeBasedCursorModel, + component_definition=cursor_component_definition, + stream_name=stream_name, + stream_namespace=None, + config=config, + stream_state=stream_state, + ) + + assert concurrent_cursor._stream_name == stream_name + assert not concurrent_cursor._stream_namespace + assert concurrent_cursor._connector_state_manager == connector_state_manager + assert concurrent_cursor.cursor_field.cursor_field_key == expected_cursor_field + assert concurrent_cursor._slice_range == expected_step + assert concurrent_cursor._lookback_window == expected_lookback_window + + assert concurrent_cursor.slice_boundary_fields[ConcurrentCursor._START_BOUNDARY] == expected_start_boundary + assert concurrent_cursor.slice_boundary_fields[ConcurrentCursor._END_BOUNDARY] == expected_end_boundary + + assert concurrent_cursor.start == expected_start + assert concurrent_cursor._end_provider() == expected_end + assert concurrent_cursor._concurrent_state == expected_concurrent_state + + assert isinstance(stream_state_converter, CustomFormatConcurrentStreamStateConverter) + assert stream_state_converter._datetime_format == expected_datetime_format + assert stream_state_converter._is_sequential_state + assert stream_state_converter._cursor_granularity == expected_cursor_granularity + + +@pytest.mark.parametrize( + "cursor_fields_to_replace,assertion_field,expected_value,expected_error", + [ + pytest.param({"partition_field_start": None}, "slice_boundary_fields", ('start_time', 'custom_end'), None, id="test_no_partition_field_start"), + pytest.param({"partition_field_end": None}, "slice_boundary_fields", ('custom_start', 'end_time'), None, id="test_no_partition_field_end"), + pytest.param({"lookback_window": None}, 
"_lookback_window", None, None, id="test_no_lookback_window"), + pytest.param({"lookback_window": "{{ config.does_not_exist }}"}, "_lookback_window", None, None, id="test_no_lookback_window"), + pytest.param({"step": None}, None, None, ValueError, id="test_no_step_raises_exception"), + pytest.param({"cursor_granularity": None}, None, None, ValueError, id="test_no_cursor_granularity_exception"), + pytest.param({ + "end_time": None, + "cursor_granularity": None, + "step": None, + }, "_slice_range", datetime.timedelta.max, None, id="test_uses_a_single_time_interval_when_no_specified_step_and_granularity"), + ] +) +@freezegun.freeze_time("2024-10-01T00:00:00") +def test_create_concurrent_cursor_from_datetime_based_cursor(cursor_fields_to_replace, assertion_field, expected_value, expected_error): + connector_state_manager = ConnectorStateManager() + + config = { + "start_time": "2024-08-01T00:00:00.000000Z", + "end_time": "2024-09-01T00:00:00.000000Z" + } + + stream_name = "test" + + cursor_component_definition = { + "type": "DatetimeBasedCursor", + "cursor_field": "updated_at", + "datetime_format": "%Y-%m-%dT%H:%M:%S.%fZ", + "start_datetime": "{{ config['start_time'] }}", + "end_datetime": "{{ config['end_time'] }}", + "partition_field_start": "custom_start", + "partition_field_end": "custom_end", + "step": "P10D", + "cursor_granularity": "PT0.000001S", + "lookback_window": "P3D", + } + + for cursor_field_to_replace, value in cursor_fields_to_replace.items(): + if value is None: + cursor_component_definition[cursor_field_to_replace] = value + else: + del cursor_component_definition[cursor_field_to_replace] + + connector_builder_factory = ModelToComponentFactory(emit_connector_builder_messages=True) + + if expected_error: + with pytest.raises(expected_error): + connector_builder_factory.create_concurrent_cursor_from_datetime_based_cursor( + state_manager=connector_state_manager, + model_type=DatetimeBasedCursorModel, + component_definition=cursor_component_definition, + stream_name=stream_name, + stream_namespace=None, + config=config, + stream_state={}, + ) + else: + concurrent_cursor, stream_state_converter = connector_builder_factory.create_concurrent_cursor_from_datetime_based_cursor( + state_manager=connector_state_manager, + model_type=DatetimeBasedCursorModel, + component_definition=cursor_component_definition, + stream_name=stream_name, + stream_namespace=None, + config=config, + stream_state={}, + ) + + assert getattr(concurrent_cursor, assertion_field) == expected_value + + +def test_create_concurrent_cursor_uses_min_max_datetime_format_if_defined(): + """ + Validates a special case for when the start_time.datetime_format and end_time.datetime_format are defined, the date to + string parser should not inherit from the parent DatetimeBasedCursor.datetime_format. The parent which uses an incorrect + precision would fail if it were used by the dependent children. 
+ """ + expected_start = datetime.datetime(year=2024, month=8, day=1, second=0, microsecond=0, tzinfo=datetime.timezone.utc) + expected_end = datetime.datetime(year=2024, month=9, day=1, second=0, microsecond=0, tzinfo=datetime.timezone.utc) + + connector_state_manager = ConnectorStateManager() + + config = { + "start_time": "2024-08-01T00:00:00Z", + "end_time": "2024-09-01T00:00:00Z" + } + + connector_builder_factory = ModelToComponentFactory(emit_connector_builder_messages=True) + + stream_name = "test" + + cursor_component_definition = { + "type": "DatetimeBasedCursor", + "cursor_field": "updated_at", + "datetime_format": "%Y-%m-%dT%H:%MZ", + "start_datetime": { + "type": "MinMaxDatetime", + "datetime": "{{ config.start_time }}", + "datetime_format": "%Y-%m-%dT%H:%M:%SZ" + }, + "end_datetime": { + "type": "MinMaxDatetime", + "datetime": "{{ config.end_time }}", + "datetime_format": "%Y-%m-%dT%H:%M:%SZ" + }, + "partition_field_start": "custom_start", + "partition_field_end": "custom_end", + "step": "P10D", + "cursor_granularity": "PT0.000001S", + "lookback_window": "P3D" + } + + concurrent_cursor, stream_state_converter = connector_builder_factory.create_concurrent_cursor_from_datetime_based_cursor( + state_manager=connector_state_manager, + model_type=DatetimeBasedCursorModel, + component_definition=cursor_component_definition, + stream_name=stream_name, + stream_namespace=None, + config=config, + stream_state={}, + ) + + assert concurrent_cursor.start == expected_start + assert concurrent_cursor._end_provider() == expected_end + assert concurrent_cursor._concurrent_state == { + "slices": [ + { + "start": expected_start, + "end": expected_start, + }, + ], + "state_type": "date-range", + "legacy": {}, + } diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/parsers/testing_components.py b/airbyte-cdk/python/unit_tests/sources/declarative/parsers/testing_components.py new file mode 100644 index 000000000000..db85283b7c0f --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/parsers/testing_components.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import dataclass, field +from typing import List, Optional + +from airbyte_cdk.sources.declarative.extractors import DpathExtractor +from airbyte_cdk.sources.declarative.partition_routers import SubstreamPartitionRouter +from airbyte_cdk.sources.declarative.requesters import RequestOption +from airbyte_cdk.sources.declarative.requesters.error_handlers import DefaultErrorHandler +from airbyte_cdk.sources.declarative.requesters.paginators import DefaultPaginator, PaginationStrategy + + +@dataclass +class TestingSomeComponent(DefaultErrorHandler): + """ + A basic test class with various field permutations used to test manifests with custom components + """ + + subcomponent_field_with_hint: DpathExtractor = field(default_factory=lambda: DpathExtractor(field_path=[], config={}, parameters={})) + basic_field: str = "" + optional_subcomponent_field: Optional[RequestOption] = None + list_of_subcomponents: List[RequestOption] = None + without_hint = None + paginator: DefaultPaginator = None + + +@dataclass +class TestingCustomSubstreamPartitionRouter(SubstreamPartitionRouter): + """ + A test class based on a SubstreamPartitionRouter used for testing manifests that use custom components. 
+ """ + + custom_field: str + custom_pagination_strategy: PaginationStrategy diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/__init__.py new file mode 100644 index 000000000000..1100c1c58cf5 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_cartesian_product_partition_router.py b/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_cartesian_product_partition_router.py new file mode 100644 index 000000000000..2b9313b3ebd7 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_cartesian_product_partition_router.py @@ -0,0 +1,232 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import pytest as pytest +from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime +from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.partition_routers import CartesianProductStreamSlicer, ListPartitionRouter +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.types import StreamSlice + + +@pytest.mark.parametrize( + "test_name, stream_slicers, expected_slices", + [ + ( + "test_single_stream_slicer", + [ListPartitionRouter(values=["customer", "store", "subscription"], cursor_field="owner_resource", config={}, parameters={})], + [ + StreamSlice(partition={"owner_resource": "customer"}, cursor_slice={}), + StreamSlice(partition={"owner_resource": "store"}, cursor_slice={}), + StreamSlice(partition={"owner_resource": "subscription"}, cursor_slice={}), + ], + ), + ( + "test_two_stream_slicers", + [ + ListPartitionRouter(values=["customer", "store", "subscription"], cursor_field="owner_resource", config={}, parameters={}), + ListPartitionRouter(values=["A", "B"], cursor_field="letter", config={}, parameters={}), + ], + [ + StreamSlice(partition={"owner_resource": "customer", "letter": "A"}, cursor_slice={}), + StreamSlice(partition={"owner_resource": "customer", "letter": "B"}, cursor_slice={}), + StreamSlice(partition={"owner_resource": "store", "letter": "A"}, cursor_slice={}), + StreamSlice(partition={"owner_resource": "store", "letter": "B"}, cursor_slice={}), + StreamSlice(partition={"owner_resource": "subscription", "letter": "A"}, cursor_slice={}), + StreamSlice(partition={"owner_resource": "subscription", "letter": "B"}, cursor_slice={}), + ], + ), + ( + "test_singledatetime", + [ + DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="2021-01-01", datetime_format="%Y-%m-%d", parameters={}), + end_datetime=MinMaxDatetime(datetime="2021-01-03", datetime_format="%Y-%m-%d", parameters={}), + step="P1D", + cursor_field=InterpolatedString.create("", parameters={}), + datetime_format="%Y-%m-%d", + cursor_granularity="P1D", + config={}, + parameters={}, + ), + ], + [ + StreamSlice(partition={}, cursor_slice={"start_time": "2021-01-01", "end_time": "2021-01-01"}), + StreamSlice(partition={}, cursor_slice={"start_time": "2021-01-02", "end_time": "2021-01-02"}), + StreamSlice(partition={}, cursor_slice={"start_time": "2021-01-03", 
"end_time": "2021-01-03"}), + ], + ), + ( + "test_list_and_datetime", + [ + ListPartitionRouter(values=["customer", "store", "subscription"], cursor_field="owner_resource", config={}, parameters={}), + DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="2021-01-01", datetime_format="%Y-%m-%d", parameters={}), + end_datetime=MinMaxDatetime(datetime="2021-01-03", datetime_format="%Y-%m-%d", parameters={}), + step="P1D", + cursor_field=InterpolatedString.create("", parameters={}), + datetime_format="%Y-%m-%d", + cursor_granularity="P1D", + config={}, + parameters={}, + ), + ], + [ + StreamSlice(partition={"owner_resource": "customer"}, cursor_slice={"start_time": "2021-01-01", "end_time": "2021-01-01"}), + StreamSlice(partition={"owner_resource": "customer"}, cursor_slice={"start_time": "2021-01-02", "end_time": "2021-01-02"}), + StreamSlice(partition={"owner_resource": "customer"}, cursor_slice={"start_time": "2021-01-03", "end_time": "2021-01-03"}), + StreamSlice(partition={"owner_resource": "store"}, cursor_slice={"start_time": "2021-01-01", "end_time": "2021-01-01"}), + StreamSlice(partition={"owner_resource": "store"}, cursor_slice={"start_time": "2021-01-02", "end_time": "2021-01-02"}), + StreamSlice(partition={"owner_resource": "store"}, cursor_slice={"start_time": "2021-01-03", "end_time": "2021-01-03"}), + StreamSlice( + partition={"owner_resource": "subscription"}, cursor_slice={"start_time": "2021-01-01", "end_time": "2021-01-01"} + ), + StreamSlice( + partition={"owner_resource": "subscription"}, cursor_slice={"start_time": "2021-01-02", "end_time": "2021-01-02"} + ), + StreamSlice( + partition={"owner_resource": "subscription"}, cursor_slice={"start_time": "2021-01-03", "end_time": "2021-01-03"} + ), + ], + ), + ], +) +def test_substream_slicer(test_name, stream_slicers, expected_slices): + slicer = CartesianProductStreamSlicer(stream_slicers=stream_slicers, parameters={}) + slices = [s for s in slicer.stream_slices()] + assert slices == expected_slices + + +def test_stream_slices_raises_exception_if_multiple_cursor_slice_components(): + stream_slicers = [ + DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="2021-01-01", datetime_format="%Y-%m-%d", parameters={}), + end_datetime=MinMaxDatetime(datetime="2021-01-03", datetime_format="%Y-%m-%d", parameters={}), + step="P1D", + cursor_field=InterpolatedString.create("", parameters={}), + datetime_format="%Y-%m-%d", + cursor_granularity="P1D", + config={}, + parameters={}, + ), + DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="2021-01-01", datetime_format="%Y-%m-%d", parameters={}), + end_datetime=MinMaxDatetime(datetime="2021-01-03", datetime_format="%Y-%m-%d", parameters={}), + step="P1D", + cursor_field=InterpolatedString.create("", parameters={}), + datetime_format="%Y-%m-%d", + cursor_granularity="P1D", + config={}, + parameters={}, + ), + ] + slicer = CartesianProductStreamSlicer(stream_slicers=stream_slicers, parameters={}) + with pytest.raises(ValueError): + list(slicer.stream_slices()) + + +@pytest.mark.parametrize( + "test_name, stream_1_request_option, stream_2_request_option, expected_req_params, expected_headers,expected_body_json, expected_body_data", + [ + ( + "test_param_header", + RequestOption(inject_into=RequestOptionType.request_parameter, parameters={}, field_name="owner"), + RequestOption(inject_into=RequestOptionType.header, parameters={}, field_name="repo"), + {"owner": "customer"}, + {"repo": "airbyte"}, + {}, + {}, + ), + ( + "test_header_header", + 
RequestOption(inject_into=RequestOptionType.header, parameters={}, field_name="owner"), + RequestOption(inject_into=RequestOptionType.header, parameters={}, field_name="repo"), + {}, + {"owner": "customer", "repo": "airbyte"}, + {}, + {}, + ), + ( + "test_body_data", + RequestOption(inject_into=RequestOptionType.body_data, parameters={}, field_name="owner"), + RequestOption(inject_into=RequestOptionType.body_data, parameters={}, field_name="repo"), + {}, + {}, + {}, + {"owner": "customer", "repo": "airbyte"}, + ), + ( + "test_body_json", + RequestOption(inject_into=RequestOptionType.body_json, parameters={}, field_name="owner"), + RequestOption(inject_into=RequestOptionType.body_json, parameters={}, field_name="repo"), + {}, + {}, + {"owner": "customer", "repo": "airbyte"}, + {}, + ), + ], +) +def test_request_option( + test_name, + stream_1_request_option, + stream_2_request_option, + expected_req_params, + expected_headers, + expected_body_json, + expected_body_data, +): + slicer = CartesianProductStreamSlicer( + stream_slicers=[ + ListPartitionRouter( + values=["customer", "store", "subscription"], + cursor_field="owner_resource", + config={}, + request_option=stream_1_request_option, + parameters={}, + ), + ListPartitionRouter( + values=["airbyte", "airbyte-cloud"], + cursor_field="repository", + config={}, + request_option=stream_2_request_option, + parameters={}, + ), + ], + parameters={}, + ) + stream_slice = {"owner_resource": "customer", "repository": "airbyte"} + + assert slicer.get_request_params(stream_slice=stream_slice) == expected_req_params + assert slicer.get_request_headers(stream_slice=stream_slice) == expected_headers + assert slicer.get_request_body_json(stream_slice=stream_slice) == expected_body_json + assert slicer.get_request_body_data(stream_slice=stream_slice) == expected_body_data + + +def test_request_option_before_updating_cursor(): + stream_1_request_option = RequestOption(inject_into=RequestOptionType.request_parameter, parameters={}, field_name="owner") + stream_2_request_option = RequestOption(inject_into=RequestOptionType.header, parameters={}, field_name="repo") + slicer = CartesianProductStreamSlicer( + stream_slicers=[ + ListPartitionRouter( + values=["customer", "store", "subscription"], + cursor_field="owner_resource", + config={}, + request_option=stream_1_request_option, + parameters={}, + ), + ListPartitionRouter( + values=["airbyte", "airbyte-cloud"], + cursor_field="repository", + config={}, + request_option=stream_2_request_option, + parameters={}, + ), + ], + parameters={}, + ) + assert {} == slicer.get_request_params() + assert {} == slicer.get_request_headers() + assert {} == slicer.get_request_body_json() + assert {} == slicer.get_request_body_data() diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_list_partition_router.py b/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_list_partition_router.py new file mode 100644 index 000000000000..87aa18f5a0b4 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_list_partition_router.py @@ -0,0 +1,161 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import pytest as pytest +from airbyte_cdk.sources.declarative.partition_routers.list_partition_router import ListPartitionRouter +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.types import StreamSlice + +partition_values = ["customer", "store", "subscription"] +cursor_field = "owner_resource" +parameters = {"cursor_field": "owner_resource"} + + +@pytest.mark.parametrize( + "partition_values, cursor_field, expected_slices", + [ + ( + ["customer", "store", "subscription"], + "owner_resource", + [ + StreamSlice(partition={"owner_resource": "customer"}, cursor_slice={}), + StreamSlice(partition={"owner_resource": "store"}, cursor_slice={}), + StreamSlice(partition={"owner_resource": "subscription"}, cursor_slice={}), + ], + ), + ( + '["customer", "store", "subscription"]', + "owner_resource", + [ + StreamSlice(partition={"owner_resource": "customer"}, cursor_slice={}), + StreamSlice(partition={"owner_resource": "store"}, cursor_slice={}), + StreamSlice(partition={"owner_resource": "subscription"}, cursor_slice={}), + ], + ), + ( + '["customer", "store", "subscription"]', + "{{ parameters['cursor_field'] }}", + [ + StreamSlice(partition={"owner_resource": "customer"}, cursor_slice={}), + StreamSlice(partition={"owner_resource": "store"}, cursor_slice={}), + StreamSlice(partition={"owner_resource": "subscription"}, cursor_slice={}), + ], + ), + ], + ids=[ + "test_single_element", + "test_input_list_is_string", + "test_using_cursor_from_parameters", + ], +) +def test_list_partition_router(partition_values, cursor_field, expected_slices): + slicer = ListPartitionRouter(values=partition_values, cursor_field=cursor_field, config={}, parameters=parameters) + slices = [s for s in slicer.stream_slices()] + assert slices == expected_slices + assert all(isinstance(s, StreamSlice) for s in slices) + + +@pytest.mark.parametrize( + "request_option, expected_req_params, expected_headers, expected_body_json, expected_body_data", + [ + ( + RequestOption(inject_into=RequestOptionType.request_parameter, parameters={}, field_name="owner_resource"), + {"owner_resource": "customer"}, + {}, + {}, + {}, + ), + ( + RequestOption(inject_into=RequestOptionType.header, parameters={}, field_name="owner_resource"), + {}, + {"owner_resource": "customer"}, + {}, + {}, + ), + ( + RequestOption(inject_into=RequestOptionType.body_json, parameters={}, field_name="owner_resource"), + {}, + {}, + {"owner_resource": "customer"}, + {}, + ), + ( + RequestOption(inject_into=RequestOptionType.body_data, parameters={}, field_name="owner_resource"), + {}, + {}, + {}, + {"owner_resource": "customer"}, + ), + ], + ids=[ + "test_inject_into_req_param", + "test_pass_by_header", + "test_inject_into_body_json", + "test_inject_into_body_data", + ], +) +def test_request_option(request_option, expected_req_params, expected_headers, expected_body_json, expected_body_data): + partition_router = ListPartitionRouter( + values=partition_values, cursor_field=cursor_field, config={}, request_option=request_option, parameters={} + ) + stream_slice = {cursor_field: "customer"} + + assert partition_router.get_request_params(stream_slice=stream_slice) == expected_req_params + assert partition_router.get_request_headers(stream_slice=stream_slice) == expected_headers + assert partition_router.get_request_body_json(stream_slice=stream_slice) == expected_body_json + assert partition_router.get_request_body_data(stream_slice=stream_slice) == expected_body_data + + 
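+
+# Illustrative sketch (not exercised by the tests above): how a router configured with a
+# request option is typically consumed. Each slice carries the partition value under the
+# cursor field, and the RequestOption decides where that value lands on the outgoing request.
+def _example_list_partition_router_usage():
+    router = ListPartitionRouter(
+        values=["customer", "store"],
+        cursor_field="owner_resource",
+        config={},
+        request_option=RequestOption(
+            inject_into=RequestOptionType.request_parameter, parameters={}, field_name="owner_resource"
+        ),
+        parameters={},
+    )
+    # One set of query params per slice, e.g. [{"owner_resource": "customer"}, {"owner_resource": "store"}]
+    return [router.get_request_params(stream_slice=s) for s in router.stream_slices()]
+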
+@pytest.mark.parametrize( + "stream_slice", + [ + pytest.param({}, id="test_request_option_is_empty_if_empty_stream_slice"), + pytest.param({"not the cursor": "value"}, id="test_request_option_is_empty_if_the_stream_slice_does_not_have_cursor_field"), + pytest.param(None, id="test_request_option_is_empty_if_no_stream_slice"), + ], +) +def test_request_option_is_empty_if_no_stream_slice(stream_slice): + request_option = RequestOption(inject_into=RequestOptionType.body_data, parameters={}, field_name="owner_resource") + partition_router = ListPartitionRouter( + values=partition_values, cursor_field=cursor_field, config={}, request_option=request_option, parameters={} + ) + assert {} == partition_router.get_request_body_data(stream_slice=stream_slice) + + +@pytest.mark.parametrize( + "field_name_interpolation, expected_request_params", + [ + ("{{parameters['partition_name']}}", {"parameters_partition": "customer"}), + ("{{config['partition_name']}}", {"config_partition": "customer"}), + ], + ids=[ + "parameters_interpolation", + "config_interpolation", + ], +) +def test_request_options_interpolation(field_name_interpolation: str, expected_request_params: dict): + config = {"partition_name": "config_partition"} + parameters = {"partition_name": "parameters_partition"} + request_option = RequestOption( + inject_into=RequestOptionType.request_parameter, parameters=parameters, field_name=field_name_interpolation + ) + partition_router = ListPartitionRouter( + values=partition_values, cursor_field=cursor_field, config=config, request_option=request_option, parameters=parameters + ) + stream_slice = {cursor_field: "customer"} + + assert partition_router.get_request_params(stream_slice=stream_slice) == expected_request_params + + +def test_request_option_before_updating_cursor(): + request_option = RequestOption(inject_into=RequestOptionType.request_parameter, parameters={}, field_name="owner_resource") + partition_router = ListPartitionRouter( + values=partition_values, cursor_field=cursor_field, config={}, request_option=request_option, parameters={} + ) + stream_slice = {cursor_field: "customer"} + + assert {} == partition_router.get_request_params(stream_slice) + assert {} == partition_router.get_request_headers() + assert {} == partition_router.get_request_body_json() + assert {} == partition_router.get_request_body_data() diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_parent_state_stream.py b/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_parent_state_stream.py new file mode 100644 index 000000000000..e41f548507c0 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_parent_state_stream.py @@ -0,0 +1,1778 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
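+
+# The tests in this module exercise incremental reads through nested SubstreamPartitionRouters
+# (posts -> post_comments -> post_comment_votes) with incremental_dependency enabled: the child
+# stream's emitted state is expected to carry a "parent_state" entry for each parent stream in
+# addition to its own per-partition cursors (see the expected_state parameters below).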
+ +import copy +from typing import Any, List, Mapping, MutableMapping, Optional, Union +from unittest.mock import MagicMock + +import pytest +import requests_mock +from airbyte_cdk.models import ( + AirbyteMessage, + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStateType, + AirbyteStream, + AirbyteStreamState, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + DestinationSyncMode, + StreamDescriptor, + SyncMode, +) +from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource +from orjson import orjson + +SUBSTREAM_MANIFEST: MutableMapping[str, Any] = { + "version": "0.51.42", + "type": "DeclarativeSource", + "check": {"type": "CheckStream", "stream_names": ["post_comment_votes"]}, + "definitions": { + "basic_authenticator": { + "type": "BasicHttpAuthenticator", + "username": "{{ config['credentials']['email'] + '/token' }}", + "password": "{{ config['credentials']['api_token'] }}", + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.example.com", + "http_method": "GET", + "authenticator": "#/definitions/basic_authenticator", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": { + "type": "DpathExtractor", + "field_path": ["{{ parameters.get('data_path') or parameters['name'] }}"], + }, + "schema_normalization": "Default", + }, + "paginator": { + "type": "DefaultPaginator", + "page_size_option": {"type": "RequestOption", "field_name": "per_page", "inject_into": "request_parameter"}, + "pagination_strategy": { + "type": "CursorPagination", + "page_size": 100, + "cursor_value": "{{ response.get('next_page', {}) }}", + "stop_condition": "{{ not response.get('next_page', {}) }}", + }, + "page_token_option": {"type": "RequestPath"}, + }, + }, + "cursor_incremental_sync": { + "type": "DatetimeBasedCursor", + "cursor_datetime_formats": ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z"], + "datetime_format": "%Y-%m-%dT%H:%M:%SZ", + "cursor_field": "{{ parameters.get('cursor_field', 'updated_at') }}", + "start_datetime": {"datetime": "{{ config.get('start_date')}}"}, + "start_time_option": {"inject_into": "request_parameter", "field_name": "start_time", "type": "RequestOption"}, + }, + "posts_stream": { + "type": "DeclarativeStream", + "name": "posts", + "primary_key": ["id"], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": { + "id": {"type": "integer"}, + "updated_at": {"type": "string", "format": "date-time"}, + "title": {"type": "string"}, + "content": {"type": "string"}, + }, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.example.com", + "path": "/community/posts", + "http_method": "GET", + "authenticator": "#/definitions/basic_authenticator", + }, + "record_selector": "#/definitions/retriever/record_selector", + "paginator": "#/definitions/retriever/paginator", + }, + "incremental_sync": "#/definitions/cursor_incremental_sync", + "$parameters": { + "name": "posts", + "path": "community/posts", + "data_path": "posts", + "cursor_field": "updated_at", + "primary_key": "id", + }, + }, + "post_comments_stream": { + "type": "DeclarativeStream", + "name": "post_comments", + "primary_key": ["id"], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": { + "id": {"type": "integer"}, + "updated_at": {"type": "string", 
"format": "date-time"}, + "post_id": {"type": "integer"}, + "comment": {"type": "string"}, + }, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.example.com", + "path": "/community/posts/{{ stream_slice.id }}/comments", + "http_method": "GET", + "authenticator": "#/definitions/basic_authenticator", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": ["comments"]}, + "record_filter": { + "condition": "{{ record['updated_at'] >= stream_state.get('updated_at', config.get('start_date')) }}" + }, + }, + "paginator": "#/definitions/retriever/paginator", + "partition_router": { + "type": "SubstreamPartitionRouter", + "parent_stream_configs": [ + { + "stream": "#/definitions/posts_stream", + "parent_key": "id", + "partition_field": "id", + "incremental_dependency": True, + } + ], + }, + }, + "incremental_sync": { + "type": "DatetimeBasedCursor", + "cursor_datetime_formats": ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z"], + "datetime_format": "%Y-%m-%dT%H:%M:%SZ", + "cursor_field": "{{ parameters.get('cursor_field', 'updated_at') }}", + "start_datetime": {"datetime": "{{ config.get('start_date') }}"}, + }, + "$parameters": { + "name": "post_comments", + "path": "community/posts/{{ stream_slice.id }}/comments", + "data_path": "comments", + "cursor_field": "updated_at", + "primary_key": "id", + }, + }, + "post_comment_votes_stream": { + "type": "DeclarativeStream", + "name": "post_comment_votes", + "primary_key": ["id"], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": { + "id": {"type": "integer"}, + "created_at": {"type": "string", "format": "date-time"}, + "comment_id": {"type": "integer"}, + "vote": {"type": "number"}, + }, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.example.com", + "path": "/community/posts/{{ stream_slice.parent_slice.id }}/comments/{{ stream_slice.id }}/votes", + "http_method": "GET", + "authenticator": "#/definitions/basic_authenticator", + }, + "record_selector": "#/definitions/retriever/record_selector", + "paginator": "#/definitions/retriever/paginator", + "partition_router": { + "type": "SubstreamPartitionRouter", + "parent_stream_configs": [ + { + "stream": "#/definitions/post_comments_stream", + "parent_key": "id", + "partition_field": "id", + "incremental_dependency": True, + } + ], + }, + }, + "incremental_sync": "#/definitions/cursor_incremental_sync", + "$parameters": { + "name": "post_comment_votes", + "path": "community/posts/{{ stream_slice.parent_slice.id }}/comments/{{ stream_slice.id }}/votes", + "data_path": "votes", + "cursor_field": "created_at", + "primary_key": "id", + }, + }, + }, + "streams": [ + {"$ref": "#/definitions/posts_stream"}, + {"$ref": "#/definitions/post_comments_stream"}, + {"$ref": "#/definitions/post_comment_votes_stream"}, + ], +} + + +def _run_read( + manifest: Mapping[str, Any], + config: Mapping[str, Any], + stream_name: str, + state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None, +) -> List[AirbyteMessage]: + source = ManifestDeclarativeSource(source_config=manifest) + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name=stream_name, json_schema={}, supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental]), + 
sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ) + ] + ) + logger = MagicMock() + return list(source.read(logger, config, catalog, state)) + + +def run_incremental_parent_state_test(manifest, mock_requests, expected_records, initial_state, expected_states): + """ + Run an incremental parent state test for the specified stream. + + This function performs the following steps: + 1. Mocks the API requests as defined in mock_requests. + 2. Executes the read operation using the provided manifest and config. + 3. Asserts that the output records match the expected records. + 4. Collects intermediate states and records, performing additional reads as necessary. + 5. Compares the cumulative records from each state against the expected records. + 6. Asserts that the final state matches one of the expected states for each run. + + Args: + manifest (dict): The manifest configuration for the stream. + mock_requests (list): A list of tuples containing URL and response data for mocking API requests. + expected_records (list): The expected records to compare against the output. + initial_state (list): The initial state to start the read operation. + expected_states (list): A list of expected final states after the read operation. + """ + _stream_name = "post_comment_votes" + config = {"start_date": "2024-01-01T00:00:01Z", "credentials": {"email": "email", "api_token": "api_token"}} + + with requests_mock.Mocker() as m: + for url, response in mock_requests: + m.get(url, json=response) + + # Run the initial read + output = _run_read(manifest, config, _stream_name, initial_state) + output_data = [message.record.data for message in output if message.record] + + # Assert that output_data equals expected_records + assert output_data == expected_records + + # Collect the intermediate states and records produced before each state + cumulative_records = [] + intermediate_states = [] + final_states = [] # To store the final state after each read + + # Store the final state after the initial read + final_state_initial = [orjson.loads(orjson.dumps(message.state.stream.stream_state)) for message in output if message.state] + final_states.append(final_state_initial[-1]) + + for message in output: + if message.type.value == "RECORD": + record_data = message.record.data + cumulative_records.append(record_data) + elif message.type.value == "STATE": + # Record the state and the records produced before this state + state = message.state + records_before_state = cumulative_records.copy() + intermediate_states.append((state, records_before_state)) + + # For each intermediate state, perform another read starting from that state + for state, records_before_state in intermediate_states[:-1]: + output_intermediate = _run_read(manifest, config, _stream_name, [state]) + records_from_state = [message.record.data for message in output_intermediate if message.record] + + # Combine records produced before the state with records from the new read + cumulative_records_state = records_before_state + records_from_state + + # Duplicates may occur because the state matches the cursor of the last record, causing it to be re-emitted in the next sync. 
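+            # The dict comprehension below keys every record by its serialized form, so a record
+            # re-emitted after a checkpoint simply overwrites its earlier copy while the original
+            # ordering is preserved (dicts keep insertion order).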
+ cumulative_records_state_deduped = list({orjson.dumps(record): record for record in cumulative_records_state}.values()) + + # Compare the cumulative records with the expected records + expected_records_set = list({orjson.dumps(record): record for record in expected_records}.values()) + assert sorted(cumulative_records_state_deduped, key=lambda x: orjson.dumps(x)) == sorted( + expected_records_set, key=lambda x: orjson.dumps(x) + ), f"Records mismatch with intermediate state {state}. Expected {expected_records}, got {cumulative_records_state_deduped}" + + # Store the final state after each intermediate read + final_state_intermediate = [ + orjson.loads(orjson.dumps(message.state.stream.stream_state)) for message in output_intermediate if message.state + ] + final_states.append(final_state_intermediate[-1]) + + # Assert that the final state matches the expected state for all runs + for i, final_state in enumerate(final_states): + assert final_state in expected_states, f"Final state mismatch at run {i + 1}. Expected {expected_states}, got {final_state}" + + +@pytest.mark.parametrize( + "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", + [ + ( + "test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z", + { + "posts": [{"id": 1, "updated_at": "2024-01-30T00:00:00Z"}, {"id": 2, "updated_at": "2024-01-29T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", + }, + ), + # Fetch the second page of posts + ( + "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", + {"posts": [{"id": 3, "updated_at": "2024-01-28T00:00:00Z"}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", + { + "comments": [ + {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, + {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, + {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, + ], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-02T00:00:00Z", + { + "votes": [{"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", + }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", + {"votes": [{"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + {"votes": [{"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 12 of post 1 + 
("https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-01T00:00:01Z", {"votes": []}), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", + { + "comments": [{"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-01T00:00:01Z", + {"votes": [{"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-01T00:00:01Z", + {"votes": [{"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + "https://api.example.com/community/posts/3/comments/30/votes?per_page=100", + {"votes": [{"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}]}, + ), + # Requests with intermediate states + # Fetch votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-15T00:00:00Z", + { + "votes": [{"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}], + }, + ), + # Fetch votes for comment 11 of post 1 + ( + "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-13T00:00:00Z", + { + "votes": [{"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}], + }, + ), + # Fetch votes for comment 12 of post 1 + ( + "https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-15T00:00:00Z", + { + "votes": [], + }, + ), + # Fetch votes for comment 20 of post 2 + ( + "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-12T00:00:00Z", + {"votes": [{"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}]}, + ), + # Fetch votes for comment 21 of post 2 + ( + "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-12T00:00:15Z", + {"votes": [{"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}]}, + ), + ], + # Expected records + [ + {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}, + {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}, + {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}, + {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}, + {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}, + {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}, + ], + # Initial state + [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="post_comment_votes", namespace=None), + stream_state=AirbyteStateBlob( + { + "parent_state": { + "post_comments": { + 
"states": [ + {"partition": {"id": 1, "parent_slice": {}}, "cursor": {"updated_at": "2023-01-04T00:00:00Z"}} + ], + "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, + } + }, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-02T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-03T00:00:00Z"}, + }, + ], + } + ), + ), + ) + ], + # Expected state + { + "use_global_cursor": False, + "state": {"created_at": "2024-01-15T00:00:00Z"}, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {"updated_at": "2024-01-25T00:00:00Z"}, + "parent_state": {"posts": {"updated_at": "2024-01-30T00:00:00Z"}}, + "lookback_window": 1, + "states": [ + {"partition": {"id": 1, "parent_slice": {}}, "cursor": {"updated_at": "2024-01-25T00:00:00Z"}}, + {"partition": {"id": 2, "parent_slice": {}}, "cursor": {"updated_at": "2024-01-22T00:00:00Z"}}, + {"partition": {"id": 3, "parent_slice": {}}, "cursor": {"updated_at": "2024-01-09T00:00:00Z"}}, + ], + } + }, + "lookback_window": 1, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-15T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-13T00:00:00Z"}, + }, + { + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-12T00:00:00Z"}, + }, + { + "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-12T00:00:15Z"}, + }, + { + "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-10T00:00:00Z"}, + }, + ], + }, + ), + ], +) +def test_incremental_parent_state(test_name, manifest, mock_requests, expected_records, initial_state, expected_state): + additional_expected_state = copy.deepcopy(expected_state) + # State for empty partition (comment 12), when the global cursor is used for intermediate states + empty_state = {"cursor": {"created_at": "2024-01-15T00:00:00Z"}, "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}} + additional_expected_state["states"].append(empty_state) + run_incremental_parent_state_test(manifest, mock_requests, expected_records, initial_state, [expected_state, additional_expected_state]) + + +@pytest.mark.parametrize( + "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", + [ + ( + "test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + "https://api.example.com/community/posts?per_page=100&start_time=2024-01-02T00:00:00Z", + { + "posts": [{"id": 1, "updated_at": "2024-01-30T00:00:00Z"}, {"id": 2, "updated_at": "2024-01-29T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts?per_page=100&start_time=2024-01-02T00:00:00Z&page=2", + }, + ), + # Fetch the second page of posts + ( + "https://api.example.com/community/posts?per_page=100&start_time=2024-01-02T00:00:00Z&page=2", + {"posts": [{"id": 3, "updated_at": "2024-01-28T00:00:00Z"}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", + { + "comments": [ + {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, + {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, + {"id": 11, "post_id": 1, 
"updated_at": "2024-01-24T00:00:00Z"}, + ], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-02T00:00:00Z", + { + "votes": [{"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-02T00:00:00Z", + }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-02T00:00:00Z", + {"votes": [{"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-02T00:00:00Z", + {"votes": [{"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ("https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-02T00:00:00Z", {"votes": []}), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", + { + "comments": [{"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-02T00:00:00Z", + {"votes": [{"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-02T00:00:00Z", + {"votes": [{"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + "https://api.example.com/community/posts/3/comments/30/votes?per_page=100&start_time=2024-01-02T00:00:00Z", + {"votes": [{"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}]}, + ), + ], + # Expected records + [ + {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}, + {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}, + {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}, + {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}, + {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}, + {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}, + ], + # Initial state + [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + 
stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="post_comment_votes", namespace=None), + stream_state=AirbyteStateBlob( + { + "created_at": "2024-01-02T00:00:00Z" + } + ), + ), + ) + ], + # Expected state + { + "use_global_cursor": False, + "state": {"created_at": "2024-01-15T00:00:00Z"}, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {"updated_at": "2024-01-25T00:00:00Z"}, + "parent_state": {"posts": {"updated_at": "2024-01-30T00:00:00Z"}}, + "lookback_window": 1, + "states": [ + {"partition": {"id": 1, "parent_slice": {}}, "cursor": {"updated_at": "2024-01-25T00:00:00Z"}}, + {"partition": {"id": 2, "parent_slice": {}}, "cursor": {"updated_at": "2024-01-22T00:00:00Z"}}, + {"partition": {"id": 3, "parent_slice": {}}, "cursor": {"updated_at": "2024-01-09T00:00:00Z"}}, + ], + } + }, + "lookback_window": 1, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-15T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-13T00:00:00Z"}, + }, + { + "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-02T00:00:00Z"}, + }, + { + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-12T00:00:00Z"}, + }, + { + "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-12T00:00:15Z"}, + }, + { + "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-10T00:00:00Z"}, + }, + ], + }, + ), + ], +) +def test_incremental_parent_state_migration(test_name, manifest, mock_requests, expected_records, initial_state, expected_state): + """ + Test incremental partition router with parent state migration + """ + _stream_name = "post_comment_votes" + config = {"start_date": "2024-01-01T00:00:01Z", "credentials": {"email": "email", "api_token": "api_token"}} + + with requests_mock.Mocker() as m: + for url, response in mock_requests: + m.get(url, json=response) + + output = _run_read(manifest, config, _stream_name, initial_state) + output_data = [message.record.data for message in output if message.record] + + assert output_data == expected_records + final_state = [orjson.loads(orjson.dumps(message.state.stream.stream_state)) for message in output if message.state] + assert final_state[-1] == expected_state + + +@pytest.mark.parametrize( + "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", + [ + ( + "test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z", + { + "posts": [], + "next_page": "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", + }, + ), + # Fetch the second page of posts + ( + "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", + {"posts": []}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", + { + "comments": [], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": []}, + ), + # Fetch 
the first page of votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-02T00:00:00Z", + { + "votes": [], + "next_page": "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", + }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", + {"votes": []}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + {"votes": []}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ("https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-01T00:00:01Z", {"votes": []}), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", + { + "comments": [], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": []}, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-01T00:00:01Z", + {"votes": []}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-01T00:00:01Z", + {"votes": []}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": []}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + "https://api.example.com/community/posts/3/comments/30/votes?per_page=100", + {"votes": []}, + ), + ], + # Expected records + [], + # Initial state + [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="post_comment_votes", namespace=None), + stream_state=AirbyteStateBlob( + { + "parent_state": { + "post_comments": { + "states": [ + {"partition": {"id": 1, "parent_slice": {}}, "cursor": {"updated_at": "2023-01-04T00:00:00Z"}} + ], + "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, + } + }, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-02T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-03T00:00:00Z"}, + }, + ], + "state": {"created_at": "2024-01-03T00:00:00Z"}, + "lookback_window": 1, + } + ), + ), + ) + ], + # Expected state + { + "lookback_window": 1, + "use_global_cursor": False, + "state": {"created_at": "2024-01-03T00:00:00Z"}, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {}, + "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, + "states": [{"partition": {"id": 1, "parent_slice": {}}, "cursor": {"updated_at": "2023-01-04T00:00:00Z"}}], + } + }, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-02T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": 
"2024-01-03T00:00:00Z"}, + }, + ], + }, + ), + ], +) +def test_incremental_parent_state_no_slices(test_name, manifest, mock_requests, expected_records, initial_state, expected_state): + """ + Test incremental partition router with no parent records + """ + _stream_name = "post_comment_votes" + config = {"start_date": "2024-01-01T00:00:01Z", "credentials": {"email": "email", "api_token": "api_token"}} + + with requests_mock.Mocker() as m: + for url, response in mock_requests: + m.get(url, json=response) + + output = _run_read(manifest, config, _stream_name, initial_state) + output_data = [message.record.data for message in output if message.record] + + assert output_data == expected_records + final_state = [orjson.loads(orjson.dumps(message.state.stream.stream_state)) for message in output if message.state] + assert final_state[-1] == expected_state + + +@pytest.mark.parametrize( + "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", + [ + ( + "test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z", + { + "posts": [{"id": 1, "updated_at": "2024-01-30T00:00:00Z"}, {"id": 2, "updated_at": "2024-01-29T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", + }, + ), + # Fetch the second page of posts + ( + "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", + {"posts": [{"id": 3, "updated_at": "2024-01-28T00:00:00Z"}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", + { + "comments": [ + {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, + {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, + {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, + ], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + { + "votes": [], + "next_page": "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", + }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", + {"votes": []}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + {"votes": []}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ("https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-03T00:00:00Z", {"votes": []}), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", + { + "comments": [{"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 2 + ( + 
"https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + {"votes": []}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + {"votes": []}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + "https://api.example.com/community/posts/3/comments/30/votes?per_page=100", + {"votes": []}, + ), + ], + # Expected records + [], + # Initial state + [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="post_comment_votes", namespace=None), + stream_state=AirbyteStateBlob( + { + "parent_state": { + "post_comments": { + "states": [ + {"partition": {"id": 1, "parent_slice": {}}, "cursor": {"updated_at": "2023-01-04T00:00:00Z"}} + ], + "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, + } + }, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-02T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-03T00:00:00Z"}, + }, + ], + "use_global_cursor": True, + "state": {"created_at": "2024-01-03T00:00:00Z"}, + "lookback_window": 0, + } + ), + ), + ) + ], + # Expected state + { + "lookback_window": 1, + "use_global_cursor": True, + "state": {"created_at": "2024-01-03T00:00:00Z"}, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {"updated_at": "2024-01-25T00:00:00Z"}, + "parent_state": {"posts": {"updated_at": "2024-01-30T00:00:00Z"}}, + "lookback_window": 1, + "states": [ + {"partition": {"id": 1, "parent_slice": {}}, "cursor": {"updated_at": "2024-01-25T00:00:00Z"}}, + {"partition": {"id": 2, "parent_slice": {}}, "cursor": {"updated_at": "2024-01-22T00:00:00Z"}}, + {"partition": {"id": 3, "parent_slice": {}}, "cursor": {"updated_at": "2024-01-09T00:00:00Z"}}, + ], + } + }, + }, + ), + ], +) +def test_incremental_parent_state_no_records(test_name, manifest, mock_requests, expected_records, initial_state, expected_state): + """ + Test incremental partition router with no child records + """ + _stream_name = "post_comment_votes" + config = {"start_date": "2024-01-01T00:00:01Z", "credentials": {"email": "email", "api_token": "api_token"}} + + with requests_mock.Mocker() as m: + for url, response in mock_requests: + m.get(url, json=response) + + output = _run_read(manifest, config, _stream_name, initial_state) + output_data = [message.record.data for message in output if message.record] + + assert output_data == expected_records + final_state = [orjson.loads(orjson.dumps(message.state.stream.stream_state)) for message in output if message.state] + assert final_state[-1] == expected_state + + +@pytest.mark.parametrize( + "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", + [ + ( + "test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + 
"https://api.example.com/community/posts?per_page=100&start_time=2024-01-01T00:00:01Z", + { + "posts": [{"id": 1, "updated_at": "2024-01-30T00:00:00Z"}, {"id": 2, "updated_at": "2024-01-29T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", + }, + ), + # Fetch the second page of posts + ( + "https://api.example.com/community/posts?per_page=100&start_time=2024-01-01T00:00:01Z&page=2", + {"posts": [{"id": 3, "updated_at": "2024-01-28T00:00:00Z"}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", + { + "comments": [ + {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, + {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, + {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, + ], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-02T00:00:00Z", + { + "votes": [{"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", + }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-01T00:00:01Z", + {"votes": [{"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + {"votes": [{"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ("https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-01T00:00:01Z", {"votes": []}), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", + { + "comments": [{"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-01T00:00:01Z", + {"votes": [{"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-01T00:00:01Z", + {"votes": [{"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, + ), 
+ # Fetch the first page of votes for comment 30 of post 3 + ( + "https://api.example.com/community/posts/3/comments/30/votes?per_page=100", + {"votes": [{"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}]}, + ), + ], + # Expected records + [ + {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}, + {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}, + {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}, + {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}, + {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}, + {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}, + ], + # Initial state + [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="post_comment_votes", namespace=None), + stream_state=AirbyteStateBlob( + { + # This should not happen since parent state is disabled, but I've added this to validate that an + # incoming parent_state is ignored when the parent stream's incremental_dependency is disabled + "parent_state": { + "post_comments": { + "states": [ + {"partition": {"id": 1, "parent_slice": {}}, "cursor": {"updated_at": "2023-01-04T00:00:00Z"}} + ], + "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, + } + }, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-02T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-03T00:00:00Z"}, + }, + ], + } + ), + ), + ) + ], + # Expected state + { + "use_global_cursor": False, + "state": {"created_at": "2024-01-15T00:00:00Z"}, + "lookback_window": 1, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-15T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-13T00:00:00Z"}, + }, + { + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-12T00:00:00Z"}, + }, + { + "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-12T00:00:15Z"}, + }, + { + "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-10T00:00:00Z"}, + }, + ], + }, + ), + ], +) +def test_incremental_parent_state_no_incremental_dependency( + test_name, manifest, mock_requests, expected_records, initial_state, expected_state +): + """ + This test syncs a low-code connector stream with three levels of substreams: + - posts: (ids: 1, 2, 3) + - post comments: (parent post 1 with ids: 9, 10, 11, 12; parent post 2 with ids: 20, 21; parent post 3 with id: 30) + - post comment votes: (parent comment 10 with ids: 100, 101; parent comment 11 with id: 102; + parent comment 20 with id: 200; parent comment 21 with id: 201; parent comment 30 with id: 300) + + By setting incremental_dependency to false, parent streams will not use the incoming state and will not update state. + The post_comment_votes substream is incremental and will emit state messages. We verify this by ensuring that mocked + parent stream requests use the incoming config as query parameters and the substream state messages do not + contain parent stream state. 
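+ Concretely, the mocked posts requests above expect start_time=2024-01-01T00:00:01Z (the config start_date) rather than a start_time derived from the stored parent cursor.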
+ """ + + _stream_name = "post_comment_votes" + config = {"start_date": "2024-01-01T00:00:01Z", "credentials": {"email": "email", "api_token": "api_token"}} + + # Disable incremental_dependency + manifest["definitions"]["post_comments_stream"]["retriever"]["partition_router"]["parent_stream_configs"][0][ + "incremental_dependency" + ] = False + manifest["definitions"]["post_comment_votes_stream"]["retriever"]["partition_router"]["parent_stream_configs"][0][ + "incremental_dependency" + ] = False + + with requests_mock.Mocker() as m: + for url, response in mock_requests: + m.get(url, json=response) + + output = _run_read(manifest, config, _stream_name, initial_state) + output_data = [message.record.data for message in output if message.record] + + assert output_data == expected_records + final_state = [orjson.loads(orjson.dumps(message.state.stream.stream_state)) for message in output if message.state] + assert final_state[-1] == expected_state + + +SUBSTREAM_MANIFEST_GLOBAL_PARENT_CURSOR: MutableMapping[str, Any] = { + "version": "0.51.42", + "type": "DeclarativeSource", + "check": {"type": "CheckStream", "stream_names": ["post_comment_votes"]}, + "definitions": { + "basic_authenticator": { + "type": "BasicHttpAuthenticator", + "username": "{{ config['credentials']['email'] + '/token' }}", + "password": "{{ config['credentials']['api_token'] }}", + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.example.com", + "http_method": "GET", + "authenticator": "#/definitions/basic_authenticator", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": { + "type": "DpathExtractor", + "field_path": ["{{ parameters.get('data_path') or parameters['name'] }}"], + }, + "schema_normalization": "Default", + }, + "paginator": { + "type": "DefaultPaginator", + "page_size_option": {"type": "RequestOption", "field_name": "per_page", "inject_into": "request_parameter"}, + "pagination_strategy": { + "type": "CursorPagination", + "page_size": 100, + "cursor_value": "{{ response.get('next_page', {}) }}", + "stop_condition": "{{ not response.get('next_page', {}) }}", + }, + "page_token_option": {"type": "RequestPath"}, + }, + }, + "cursor_incremental_sync": { + "type": "DatetimeBasedCursor", + "cursor_datetime_formats": ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z"], + "datetime_format": "%Y-%m-%dT%H:%M:%SZ", + "cursor_field": "{{ parameters.get('cursor_field', 'updated_at') }}", + "start_datetime": {"datetime": "{{ config.get('start_date')}}"}, + "start_time_option": {"inject_into": "request_parameter", "field_name": "start_time", "type": "RequestOption"}, + }, + "posts_stream": { + "type": "DeclarativeStream", + "name": "posts", + "primary_key": ["id"], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": { + "id": {"type": "integer"}, + "updated_at": {"type": "string", "format": "date-time"}, + "title": {"type": "string"}, + "content": {"type": "string"}, + }, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.example.com", + "path": "/community/posts", + "http_method": "GET", + "authenticator": "#/definitions/basic_authenticator", + }, + "record_selector": "#/definitions/retriever/record_selector", + "paginator": "#/definitions/retriever/paginator", + }, + "incremental_sync": "#/definitions/cursor_incremental_sync", + "$parameters": { + "name": "posts", + 
"path": "community/posts", + "data_path": "posts", + "cursor_field": "updated_at", + "primary_key": "id", + }, + }, + "post_comments_stream": { + "type": "DeclarativeStream", + "name": "post_comments", + "primary_key": ["id"], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": { + "id": {"type": "integer"}, + "updated_at": {"type": "string", "format": "date-time"}, + "post_id": {"type": "integer"}, + "comment": {"type": "string"}, + }, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.example.com", + "path": "/community/posts/{{ stream_slice.id }}/comments", + "http_method": "GET", + "authenticator": "#/definitions/basic_authenticator", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": ["comments"]}, + "record_filter": { + "condition": "{{ record['updated_at'] >= stream_state.get('updated_at', config.get('start_date')) }}" + }, + }, + "paginator": "#/definitions/retriever/paginator", + "partition_router": { + "type": "SubstreamPartitionRouter", + "parent_stream_configs": [ + { + "stream": "#/definitions/posts_stream", + "parent_key": "id", + "partition_field": "id", + "incremental_dependency": True, + } + ], + }, + }, + "incremental_sync": { + "type": "DatetimeBasedCursor", + "cursor_datetime_formats": ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z"], + "datetime_format": "%Y-%m-%dT%H:%M:%SZ", + "cursor_field": "{{ parameters.get('cursor_field', 'updated_at') }}", + "start_datetime": {"datetime": "{{ config.get('start_date') }}"}, + }, + "$parameters": { + "name": "post_comments", + "path": "community/posts/{{ stream_slice.id }}/comments", + "data_path": "comments", + "cursor_field": "updated_at", + "primary_key": "id", + }, + }, + "post_comment_votes_stream": { + "type": "DeclarativeStream", + "name": "post_comment_votes", + "primary_key": ["id"], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": { + "id": {"type": "integer"}, + "created_at": {"type": "string", "format": "date-time"}, + "comment_id": {"type": "integer"}, + "vote": {"type": "number"}, + }, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.example.com", + "path": "/community/posts/{{ stream_slice.parent_slice.id }}/comments/{{ stream_slice.id }}/votes", + "http_method": "GET", + "authenticator": "#/definitions/basic_authenticator", + }, + "record_selector": "#/definitions/retriever/record_selector", + "paginator": "#/definitions/retriever/paginator", + "partition_router": { + "type": "SubstreamPartitionRouter", + "parent_stream_configs": [ + { + "stream": "#/definitions/post_comments_stream", + "parent_key": "id", + "partition_field": "id", + "incremental_dependency": True, + } + ], + }, + }, + "incremental_sync": { + "type": "DatetimeBasedCursor", + "cursor_datetime_formats": ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z"], + "datetime_format": "%Y-%m-%dT%H:%M:%SZ", + "cursor_field": "{{ parameters.get('cursor_field', 'updated_at') }}", + "start_datetime": {"datetime": "{{ config.get('start_date')}}"}, + "start_time_option": {"inject_into": "request_parameter", "field_name": "start_time", "type": "RequestOption"}, + "global_substream_cursor": True, + }, + "$parameters": { + "name": "post_comment_votes", + "path": 
"community/posts/{{ stream_slice.parent_slice.id }}/comments/{{ stream_slice.id }}/votes", + "data_path": "votes", + "cursor_field": "created_at", + "primary_key": "id", + }, + }, + }, + "streams": [ + {"$ref": "#/definitions/posts_stream"}, + {"$ref": "#/definitions/post_comments_stream"}, + {"$ref": "#/definitions/post_comment_votes_stream"}, + ], +} +SUBSTREAM_MANIFEST_GLOBAL_PARENT_CURSOR_NO_DEPENDENCY = copy.deepcopy(SUBSTREAM_MANIFEST_GLOBAL_PARENT_CURSOR) +SUBSTREAM_MANIFEST_GLOBAL_PARENT_CURSOR_NO_DEPENDENCY["definitions"]["post_comment_votes_stream"]["retriever"]["partition_router"][ + "parent_stream_configs" +][0]["incremental_dependency"] = False + + +@pytest.mark.parametrize( + "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", + [ + ( + "test_global_substream_cursor", + SUBSTREAM_MANIFEST_GLOBAL_PARENT_CURSOR, + [ + # Fetch the first page of posts + ( + "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z", + { + "posts": [{"id": 1, "updated_at": "2024-01-30T00:00:00Z"}, {"id": 2, "updated_at": "2024-01-29T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", + }, + ), + # Fetch the second page of posts + ( + "https://api.example.com/community/posts?per_page=100&start_time=2024-01-05T00:00:00Z&page=2", + {"posts": [{"id": 3, "updated_at": "2024-01-28T00:00:00Z"}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", + { + "comments": [ + {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, + {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, + {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, + ], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + { + "votes": [{"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-03T00:00:01Z", + }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-03T00:00:01Z", + {"votes": [{"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + {"votes": [{"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ("https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-03T00:00:00Z", {"votes": []}), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", + { + "comments": [{"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of 
comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + {"votes": [{"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + {"votes": [{"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + "https://api.example.com/community/posts/3/comments/30/votes?per_page=100", + {"votes": [{"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}]}, + ), + # Requests with intermediate states + # Fetch votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-14T23:59:59Z", + { + "votes": [{"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}], + }, + ), + # Fetch votes for comment 11 of post 1 + ( + "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-14T23:59:59Z", + { + "votes": [{"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}], + }, + ), + # Fetch votes for comment 12 of post 1 + ( + "https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-14T23:59:59Z", + { + "votes": [], + }, + ), + # Fetch votes for comment 20 of post 2 + ( + "https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-14T23:59:59Z", + {"votes": [{"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}]}, + ), + # Fetch votes for comment 21 of post 2 + ( + "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-14T23:59:59Z", + {"votes": [{"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}]}, + ), + ], + # Expected records + [ + {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}, + {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}, + {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}, + {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}, + {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}, + {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}, + ], + # Initial state + [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="post_comment_votes", namespace=None), + stream_state=AirbyteStateBlob( + { + "parent_state": { + "post_comments": { + "states": [ + {"partition": {"id": 1, "parent_slice": {}}, "cursor": {"updated_at": "2023-01-04T00:00:00Z"}} + ], + "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, + } + }, + "state": {"created_at": "2024-01-04T02:03:04Z"}, + "lookback_window": 93784, + } + ), + ), + ) + ], + # Expected state + { + "state": {"created_at": "2024-01-15T00:00:00Z"}, + "lookback_window": 1, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + 
"state": {"updated_at": "2024-01-25T00:00:00Z"}, + "parent_state": {"posts": {"updated_at": "2024-01-30T00:00:00Z"}}, + "lookback_window": 1, + "states": [ + {"partition": {"id": 1, "parent_slice": {}}, "cursor": {"updated_at": "2024-01-25T00:00:00Z"}}, + {"partition": {"id": 2, "parent_slice": {}}, "cursor": {"updated_at": "2024-01-22T00:00:00Z"}}, + {"partition": {"id": 3, "parent_slice": {}}, "cursor": {"updated_at": "2024-01-09T00:00:00Z"}}, + ], + } + }, + }, + ), + ( + "test_global_substream_cursor_no_dependency", + SUBSTREAM_MANIFEST_GLOBAL_PARENT_CURSOR_NO_DEPENDENCY, + [ + # Fetch the first page of posts + ( + "https://api.example.com/community/posts?per_page=100&start_time=2024-01-01T00:00:01Z", + { + "posts": [{"id": 1, "updated_at": "2024-01-30T00:00:00Z"}, {"id": 2, "updated_at": "2024-01-29T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts?per_page=100&start_time=2024-01-01T00:00:01Z&page=2", + }, + ), + # Fetch the second page of posts + ( + "https://api.example.com/community/posts?per_page=100&start_time=2024-01-01T00:00:01Z&page=2", + {"posts": [{"id": 3, "updated_at": "2024-01-28T00:00:00Z"}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", + { + "comments": [ + {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, + {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, + {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, + ], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + { + "votes": [{"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-03T00:00:00Z", + }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + "https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time=2024-01-03T00:00:00Z", + {"votes": [{"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + "https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + {"votes": [{"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ("https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time=2024-01-03T00:00:00Z", {"votes": []}), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", + { + "comments": [{"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"}], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + 
"https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + {"votes": [{"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + "https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time=2024-01-03T00:00:00Z", + {"votes": [{"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + "https://api.example.com/community/posts/3/comments/30/votes?per_page=100", + {"votes": [{"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}]}, + ), + ], + # Expected records + [ + {"id": 100, "comment_id": 10, "created_at": "2024-01-15T00:00:00Z"}, + {"id": 101, "comment_id": 10, "created_at": "2024-01-14T00:00:00Z"}, + {"id": 102, "comment_id": 11, "created_at": "2024-01-13T00:00:00Z"}, + {"id": 200, "comment_id": 20, "created_at": "2024-01-12T00:00:00Z"}, + {"id": 201, "comment_id": 21, "created_at": "2024-01-12T00:00:15Z"}, + {"id": 300, "comment_id": 30, "created_at": "2024-01-10T00:00:00Z"}, + ], + # Initial state + [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="post_comment_votes", namespace=None), + stream_state=AirbyteStateBlob( + { + "parent_state": { + "post_comments": { + "states": [ + {"partition": {"id": 1, "parent_slice": {}}, "cursor": {"updated_at": "2023-01-04T00:00:00Z"}} + ], + "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, + } + }, + "state": {"created_at": "2024-01-04T02:03:04Z"}, + "lookback_window": 93784, + } + ), + ), + ) + ], + # Expected state + {"state": {"created_at": "2024-01-15T00:00:00Z"}, "lookback_window": 1}, + ), + ], +) +def test_incremental_global_parent_state(test_name, manifest, mock_requests, expected_records, initial_state, expected_state): + run_incremental_parent_state_test(manifest, mock_requests, expected_records, initial_state, [expected_state]) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_single_partition_router.py b/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_single_partition_router.py new file mode 100644 index 000000000000..b1512dc54f12 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_single_partition_router.py @@ -0,0 +1,14 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import SinglePartitionRouter +from airbyte_cdk.sources.types import StreamSlice + + +def test(): + iterator = SinglePartitionRouter(parameters={}) + + stream_slices = iterator.stream_slices() + next_slice = next(stream_slices) + assert next_slice == StreamSlice(partition={}, cursor_slice={}) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py b/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py new file mode 100644 index 000000000000..f29917abdccf --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py @@ -0,0 +1,970 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from functools import partial +from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union + +import pytest as pytest +from airbyte_cdk.models import AirbyteMessage, AirbyteRecordMessage, SyncMode, Type +from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream +from airbyte_cdk.sources.declarative.incremental import ChildPartitionResumableFullRefreshCursor, ResumableFullRefreshCursor +from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import CursorFactory, PerPartitionCursor, StreamSlice +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.declarative.partition_routers import CartesianProductStreamSlicer, ListPartitionRouter +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ParentStreamConfig, SubstreamPartitionRouter +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.streams.checkpoint import Cursor +from airbyte_cdk.sources.types import Record +from airbyte_cdk.utils import AirbyteTracedException + +parent_records = [{"id": 1, "data": "data1"}, {"id": 2, "data": "data2"}] +more_records = [{"id": 10, "data": "data10", "slice": "second_parent"}, {"id": 20, "data": "data20", "slice": "second_parent"}] + +data_first_parent_slice = [{"id": 0, "slice": "first", "data": "A"}, {"id": 1, "slice": "first", "data": "B"}] +data_second_parent_slice = [{"id": 2, "slice": "second", "data": "C"}] +data_third_parent_slice = [] +all_parent_data = data_first_parent_slice + data_second_parent_slice + data_third_parent_slice +parent_slices = [{"slice": "first"}, {"slice": "second"}, {"slice": "third"}] +second_parent_stream_slice = [StreamSlice(partition={"slice": "second_parent"}, cursor_slice={})] + +data_first_parent_slice_with_cursor = [ + {"id": 0, "slice": "first", "data": "A", "cursor": "first_cursor_0"}, + {"id": 1, "slice": "first", "data": "B", "cursor": "first_cursor_1"}, +] +data_second_parent_slice_with_cursor = [{"id": 2, "slice": "second", "data": "C", "cursor": "second_cursor_2"}] +all_parent_data_with_cursor = data_first_parent_slice_with_cursor + data_second_parent_slice_with_cursor + + +class MockStream(DeclarativeStream): + def __init__(self, slices, records, name, cursor_field="", cursor=None): + self.config = {} + self._slices = slices + self._records = records + self._stream_cursor_field = ( + InterpolatedString.create(cursor_field, parameters={}) if isinstance(cursor_field, str) else cursor_field + ) + self._name = name + self._state = {"states": []} + self._cursor = cursor + + @property + def 
name(self) -> str: + return self._name + + @property + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + return "id" + + @property + def state(self) -> Mapping[str, Any]: + return self._state + + @state.setter + def state(self, value: Mapping[str, Any]) -> None: + self._state = value + + @property + def is_resumable(self) -> bool: + return bool(self._cursor) + + def get_cursor(self) -> Optional[Cursor]: + return self._cursor + + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None + ) -> Iterable[Optional[StreamSlice]]: + for s in self._slices: + if isinstance(s, StreamSlice): + yield s + else: + yield StreamSlice(partition=s, cursor_slice={}) + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: List[str] = None, + stream_slice: Mapping[str, Any] = None, + stream_state: Mapping[str, Any] = None, + ) -> Iterable[Mapping[str, Any]]: + # The parent stream's records should always be read as full refresh + assert sync_mode == SyncMode.full_refresh + + if not stream_slice: + result = self._records + else: + result = [Record(data=r, associated_slice=stream_slice) for r in self._records if r["slice"] == stream_slice["slice"]] + + yield from result + + # Update the state only after reading the full slice + cursor_field = self._stream_cursor_field.eval(config=self.config) + if stream_slice and cursor_field and result: + self._state["states"].append({cursor_field: result[-1][cursor_field], "partition": stream_slice["slice"]}) + + def get_json_schema(self) -> Mapping[str, Any]: + return {} + + +class MockIncrementalStream(MockStream): + def __init__(self, slices, records, name, cursor_field="", cursor=None, date_ranges=None): + super().__init__(slices, records, name, cursor_field, cursor) + if date_ranges is None: + date_ranges = [] + self._date_ranges = date_ranges + self._state = {} + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: List[str] = None, + stream_slice: Mapping[str, Any] = None, + stream_state: Mapping[str, Any] = None, + ) -> Iterable[Mapping[str, Any]]: + results = [record for record in self._records if stream_slice["start_time"] <= record["updated_at"] <= stream_slice["end_time"]] + print(f"about to emit {results}") + yield from results + print(f"setting state to {stream_slice}") + self._state = stream_slice + + +class MockResumableFullRefreshStream(MockStream): + def __init__(self, slices, name, cursor_field="", cursor=None, record_pages: Optional[List[List[Mapping[str, Any]]]] = None): + super().__init__(slices, [], name, cursor_field, cursor) + if record_pages: + self._record_pages = record_pages + else: + self._record_pages = [] + self._state: MutableMapping[str, Any] = {} + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: List[str] = None, + stream_slice: Mapping[str, Any] = None, + stream_state: Mapping[str, Any] = None, + ) -> Iterable[Mapping[str, Any]]: + page_number = self.state.get("next_page_token") or 1 + yield from self._record_pages[page_number - 1] + + cursor = self.get_cursor() + if page_number < len(self._record_pages): + cursor.close_slice(StreamSlice(cursor_slice={"next_page_token": page_number + 1}, partition={})) + else: + cursor.close_slice(StreamSlice(cursor_slice={"__ab_full_refresh_sync_complete": True}, partition={})) + + @property + def state(self) -> Mapping[str, Any]: + cursor = self.get_cursor() + return cursor.get_stream_state() if cursor else {} + + @state.setter + def state(self, value: Mapping[str, Any]) 
-> None: + self._state = value + + +@pytest.mark.parametrize( + "parent_stream_configs, expected_slices", + [ + ([], None), + ( + [ + ParentStreamConfig( + stream=MockStream([{}], [], "first_stream"), + parent_key="id", + partition_field="first_stream_id", + parameters={}, + config={}, + ) + ], + [], + ), + ( + [ + ParentStreamConfig( + stream=MockStream([{}], parent_records, "first_stream"), + parent_key="id", + partition_field="first_stream_id", + parameters={}, + config={}, + ) + ], + [{"first_stream_id": 1, "parent_slice": {}}, {"first_stream_id": 2, "parent_slice": {}}], + ), + ( + [ + ParentStreamConfig( + stream=MockStream(parent_slices, all_parent_data, "first_stream"), + parent_key="id", + partition_field="first_stream_id", + parameters={}, + config={}, + ) + ], + [ + {"parent_slice": {"slice": "first"}, "first_stream_id": 0}, + {"parent_slice": {"slice": "first"}, "first_stream_id": 1}, + {"parent_slice": {"slice": "second"}, "first_stream_id": 2}, + ], + ), + ( + [ + ParentStreamConfig( + stream=MockStream( + [StreamSlice(partition=p, cursor_slice={"start": 0, "end": 1}) for p in parent_slices], + all_parent_data, + "first_stream", + ), + parent_key="id", + partition_field="first_stream_id", + parameters={}, + config={}, + ) + ], + [ + {"parent_slice": {"slice": "first"}, "first_stream_id": 0}, + {"parent_slice": {"slice": "first"}, "first_stream_id": 1}, + {"parent_slice": {"slice": "second"}, "first_stream_id": 2}, + ], + ), + ( + [ + ParentStreamConfig( + stream=MockStream(parent_slices, data_first_parent_slice + data_second_parent_slice, "first_stream"), + parent_key="id", + partition_field="first_stream_id", + parameters={}, + config={}, + ), + ParentStreamConfig( + stream=MockStream(second_parent_stream_slice, more_records, "second_stream"), + parent_key="id", + partition_field="second_stream_id", + parameters={}, + config={}, + ), + ], + [ + {"parent_slice": {"slice": "first"}, "first_stream_id": 0}, + {"parent_slice": {"slice": "first"}, "first_stream_id": 1}, + {"parent_slice": {"slice": "second"}, "first_stream_id": 2}, + {"parent_slice": {"slice": "second_parent"}, "second_stream_id": 10}, + {"parent_slice": {"slice": "second_parent"}, "second_stream_id": 20}, + ], + ), + ( + [ + ParentStreamConfig( + stream=MockStream([{}], [{"id": 0}, {"id": 1}, {"_id": 2}, {"id": 3}], "first_stream"), + parent_key="id", + partition_field="first_stream_id", + parameters={}, + config={}, + ) + ], + [ + {"first_stream_id": 0, "parent_slice": {}}, + {"first_stream_id": 1, "parent_slice": {}}, + {"first_stream_id": 3, "parent_slice": {}}, + ], + ), + ( + [ + ParentStreamConfig( + stream=MockStream([{}], [{"a": {"b": 0}}, {"a": {"b": 1}}, {"a": {"c": 2}}, {"a": {"b": 3}}], "first_stream"), + parent_key="a/b", + partition_field="first_stream_id", + parameters={}, + config={}, + ) + ], + [ + {"first_stream_id": 0, "parent_slice": {}}, + {"first_stream_id": 1, "parent_slice": {}}, + {"first_stream_id": 3, "parent_slice": {}}, + ], + ), + ], + ids=[ + "test_no_parents", + "test_single_parent_slices_no_records", + "test_single_parent_slices_with_records", + "test_with_parent_slices_and_records", + "test_multiple_parent_streams", + "test_cursor_values_are_removed_from_parent_slices", + "test_missed_parent_key", + "test_dpath_extraction", + ], +) +def test_substream_partition_router(parent_stream_configs, expected_slices): + if expected_slices is None: + try: + SubstreamPartitionRouter(parent_stream_configs=parent_stream_configs, parameters={}, config={}) + assert False + except ValueError: + 
return + partition_router = SubstreamPartitionRouter(parent_stream_configs=parent_stream_configs, parameters={}, config={}) + slices = [s for s in partition_router.stream_slices()] + assert slices == expected_slices + + +def test_substream_partition_router_invalid_parent_record_type(): + partition_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=MockStream([{}], [list()], "first_stream"), + parent_key="id", + partition_field="first_stream_id", + parameters={}, + config={}, + ) + ], + parameters={}, + config={}, + ) + + with pytest.raises(AirbyteTracedException): + _ = [s for s in partition_router.stream_slices()] + + +@pytest.mark.parametrize( + "parent_stream_request_parameters, expected_req_params, expected_headers, expected_body_json, expected_body_data", + [ + ( + [ + RequestOption(inject_into=RequestOptionType.request_parameter, parameters={}, field_name="first_stream"), + RequestOption(inject_into=RequestOptionType.request_parameter, parameters={}, field_name="second_stream"), + ], + {"first_stream": "1234", "second_stream": "4567"}, + {}, + {}, + {}, + ), + ( + [ + RequestOption(inject_into=RequestOptionType.header, parameters={}, field_name="first_stream"), + RequestOption(inject_into=RequestOptionType.header, parameters={}, field_name="second_stream"), + ], + {}, + {"first_stream": "1234", "second_stream": "4567"}, + {}, + {}, + ), + ( + [ + RequestOption(inject_into=RequestOptionType.request_parameter, parameters={}, field_name="first_stream"), + RequestOption(inject_into=RequestOptionType.header, parameters={}, field_name="second_stream"), + ], + {"first_stream": "1234"}, + {"second_stream": "4567"}, + {}, + {}, + ), + ( + [ + RequestOption(inject_into=RequestOptionType.body_json, parameters={}, field_name="first_stream"), + RequestOption(inject_into=RequestOptionType.body_json, parameters={}, field_name="second_stream"), + ], + {}, + {}, + {"first_stream": "1234", "second_stream": "4567"}, + {}, + ), + ( + [ + RequestOption(inject_into=RequestOptionType.body_data, parameters={}, field_name="first_stream"), + RequestOption(inject_into=RequestOptionType.body_data, parameters={}, field_name="second_stream"), + ], + {}, + {}, + {}, + {"first_stream": "1234", "second_stream": "4567"}, + ), + ], + ids=[ + "test_request_option_in_request_param", + "test_request_option_in_header", + "test_request_option_in_param_and_header", + "test_request_option_in_body_json", + "test_request_option_in_body_data", + ], +) +def test_request_option( + parent_stream_request_parameters, + expected_req_params, + expected_headers, + expected_body_json, + expected_body_data, +): + partition_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=MockStream(parent_slices, data_first_parent_slice + data_second_parent_slice, "first_stream"), + parent_key="id", + partition_field="first_stream_id", + parameters={}, + config={}, + request_option=parent_stream_request_parameters[0], + ), + ParentStreamConfig( + stream=MockStream(second_parent_stream_slice, more_records, "second_stream"), + parent_key="id", + partition_field="second_stream_id", + parameters={}, + config={}, + request_option=parent_stream_request_parameters[1], + ), + ], + parameters={}, + config={}, + ) + stream_slice = {"first_stream_id": "1234", "second_stream_id": "4567"} + + assert partition_router.get_request_params(stream_slice=stream_slice) == expected_req_params + assert partition_router.get_request_headers(stream_slice=stream_slice) == expected_headers + assert 
partition_router.get_request_body_json(stream_slice=stream_slice) == expected_body_json + assert partition_router.get_request_body_data(stream_slice=stream_slice) == expected_body_data + + +@pytest.mark.parametrize( + "parent_stream_config, expected_state", + [ + ( + ParentStreamConfig( + stream=MockStream(parent_slices, all_parent_data_with_cursor, "first_stream", cursor_field="cursor"), + parent_key="id", + partition_field="first_stream_id", + parameters={}, + config={}, + incremental_dependency=True, + ), + { + "first_stream": { + "states": [{"cursor": "first_cursor_1", "partition": "first"}, {"cursor": "second_cursor_2", "partition": "second"}] + } + }, + ), + ], + ids=[ + "test_incremental_dependency_state_update_with_cursor", + ], +) +def test_substream_slicer_parent_state_update_with_cursor(parent_stream_config, expected_state): + partition_router = SubstreamPartitionRouter(parent_stream_configs=[parent_stream_config], parameters={}, config={}) + + # Simulate reading the records and updating the state + for _ in partition_router.stream_slices(): + pass # This will process the slices and should update the parent state + + # Check if the parent state has been updated correctly + parent_state = partition_router.get_stream_state() + assert parent_state == expected_state + + +@pytest.mark.parametrize( + "field_name_first_stream, field_name_second_stream, expected_request_params", + [ + ( + "{{parameters['field_name_first_stream']}}", + "{{parameters['field_name_second_stream']}}", + {"parameter_first_stream_id": "1234", "parameter_second_stream_id": "4567"}, + ), + ( + "{{config['field_name_first_stream']}}", + "{{config['field_name_second_stream']}}", + {"config_first_stream_id": "1234", "config_second_stream_id": "4567"}, + ), + ], + ids=[ + "parameters_interpolation", + "config_interpolation", + ], +) +def test_request_params_interpolation_for_parent_stream( + field_name_first_stream: str, field_name_second_stream: str, expected_request_params: dict +): + config = {"field_name_first_stream": "config_first_stream_id", "field_name_second_stream": "config_second_stream_id"} + parameters = {"field_name_first_stream": "parameter_first_stream_id", "field_name_second_stream": "parameter_second_stream_id"} + partition_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=MockStream(parent_slices, data_first_parent_slice + data_second_parent_slice, "first_stream"), + parent_key="id", + partition_field="first_stream_id", + parameters=parameters, + config=config, + request_option=RequestOption( + inject_into=RequestOptionType.request_parameter, parameters=parameters, field_name=field_name_first_stream + ), + ), + ParentStreamConfig( + stream=MockStream(second_parent_stream_slice, more_records, "second_stream"), + parent_key="id", + partition_field="second_stream_id", + parameters=parameters, + config=config, + request_option=RequestOption( + inject_into=RequestOptionType.request_parameter, parameters=parameters, field_name=field_name_second_stream + ), + ), + ], + parameters=parameters, + config=config, + ) + stream_slice = {"first_stream_id": "1234", "second_stream_id": "4567"} + + assert partition_router.get_request_params(stream_slice=stream_slice) == expected_request_params + + +def test_given_record_is_airbyte_message_when_stream_slices_then_use_record_data(): + parent_slice = {} + partition_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=MockStream( + [parent_slice], + [ + AirbyteMessage( + type=Type.RECORD, 
record=AirbyteRecordMessage(data={"id": "record value"}, emitted_at=0, stream="stream") + ) + ], + "first_stream", + ), + parent_key="id", + partition_field="partition_field", + parameters={}, + config={}, + ) + ], + parameters={}, + config={}, + ) + + slices = list(partition_router.stream_slices()) + assert slices == [{"partition_field": "record value", "parent_slice": parent_slice}] + + +def test_given_record_is_record_object_when_stream_slices_then_use_record_data(): + parent_slice = {} + partition_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=MockStream([parent_slice], [Record({"id": "record value"}, {})], "first_stream"), + parent_key="id", + partition_field="partition_field", + parameters={}, + config={}, + ) + ], + parameters={}, + config={}, + ) + + slices = list(partition_router.stream_slices()) + assert slices == [{"partition_field": "record value", "parent_slice": parent_slice}] + + +def test_substream_using_incremental_parent_stream(): + mock_slices = [ + StreamSlice(cursor_slice={"start_time": "2024-04-27", "end_time": "2024-05-27"}, partition={}), + StreamSlice(cursor_slice={"start_time": "2024-05-27", "end_time": "2024-06-27"}, partition={}), + ] + + expected_slices = [ + {"partition_field": "may_record_0", "parent_slice": {}}, + {"partition_field": "may_record_1", "parent_slice": {}}, + {"partition_field": "jun_record_0", "parent_slice": {}}, + {"partition_field": "jun_record_1", "parent_slice": {}}, + ] + + partition_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=MockIncrementalStream( + slices=mock_slices, + records=[ + Record({"id": "may_record_0", "updated_at": "2024-05-15"}, mock_slices[0]), + Record({"id": "may_record_1", "updated_at": "2024-05-16"}, mock_slices[0]), + Record({"id": "jun_record_0", "updated_at": "2024-06-15"}, mock_slices[1]), + Record({"id": "jun_record_1", "updated_at": "2024-06-16"}, mock_slices[1]), + ], + name="first_stream", + ), + parent_key="id", + partition_field="partition_field", + parameters={}, + config={}, + ) + ], + parameters={}, + config={}, + ) + + actual_slices = list(partition_router.stream_slices()) + assert actual_slices == expected_slices + + +def test_substream_checkpoints_after_each_parent_partition(): + """ + This test validates the specific behavior that when getting all parent records for a substream, + we are still updating state so that the parent stream's state is updated after we finish getting all + parent records for the parent slice (not just the substream) + """ + mock_slices = [ + StreamSlice(cursor_slice={"start_time": "2024-04-27", "end_time": "2024-05-27"}, partition={}), + StreamSlice(cursor_slice={"start_time": "2024-05-27", "end_time": "2024-06-27"}, partition={}), + ] + + expected_slices = [ + {"partition_field": "may_record_0", "parent_slice": {}}, + {"partition_field": "may_record_1", "parent_slice": {}}, + {"partition_field": "jun_record_0", "parent_slice": {}}, + {"partition_field": "jun_record_1", "parent_slice": {}}, + ] + + expected_parent_state = [ + {"first_stream": {}}, + {"first_stream": {}}, + {"first_stream": {"start_time": "2024-04-27", "end_time": "2024-05-27"}}, + {"first_stream": {"start_time": "2024-04-27", "end_time": "2024-05-27"}}, + {"first_stream": {"start_time": "2024-05-27", "end_time": "2024-06-27"}}, + ] + + partition_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=MockIncrementalStream( + slices=mock_slices, + records=[ + Record({"id": 
"may_record_0", "updated_at": "2024-05-15"}, mock_slices[0]), + Record({"id": "may_record_1", "updated_at": "2024-05-16"}, mock_slices[0]), + Record({"id": "jun_record_0", "updated_at": "2024-06-15"}, mock_slices[1]), + Record({"id": "jun_record_1", "updated_at": "2024-06-16"}, mock_slices[1]), + ], + name="first_stream", + ), + incremental_dependency=True, + parent_key="id", + partition_field="partition_field", + parameters={}, + config={}, + ) + ], + parameters={}, + config={}, + ) + + expected_counter = 0 + for actual_slice in partition_router.stream_slices(): + assert actual_slice == expected_slices[expected_counter] + assert partition_router.get_stream_state() == expected_parent_state[expected_counter] + expected_counter += 1 + assert partition_router.get_stream_state() == expected_parent_state[expected_counter] + + +@pytest.mark.parametrize( + "use_incremental_dependency", + [ + pytest.param(False, id="test_resumable_full_refresh_stream_without_parent_checkpoint"), + pytest.param(True, id="test_resumable_full_refresh_stream_with_use_incremental_dependency_for_parent_checkpoint"), + ], +) +def test_substream_using_resumable_full_refresh_parent_stream(use_incremental_dependency): + mock_slices = [ + StreamSlice(cursor_slice={}, partition={}), + StreamSlice(cursor_slice={"next_page_token": 2}, partition={}), + StreamSlice(cursor_slice={"next_page_token": 3}, partition={}), + ] + + expected_slices = [ + {"partition_field": "makoto_yuki", "parent_slice": {}}, + {"partition_field": "yukari_takeba", "parent_slice": {}}, + {"partition_field": "mitsuru_kirijo", "parent_slice": {}}, + {"partition_field": "akihiko_sanada", "parent_slice": {}}, + {"partition_field": "junpei_iori", "parent_slice": {}}, + {"partition_field": "fuuka_yamagishi", "parent_slice": {}}, + ] + + expected_parent_state = [ + {"persona_3_characters": {}}, + {"persona_3_characters": {}}, + {"persona_3_characters": {"next_page_token": 2}}, + {"persona_3_characters": {"next_page_token": 2}}, + {"persona_3_characters": {"next_page_token": 3}}, + {"persona_3_characters": {"next_page_token": 3}}, + {"persona_3_characters": {"__ab_full_refresh_sync_complete": True}}, + ] + + partition_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=MockResumableFullRefreshStream( + slices=[StreamSlice(partition={}, cursor_slice={})], + cursor=ResumableFullRefreshCursor(parameters={}), + record_pages=[ + [ + Record(data={"id": "makoto_yuki"}, associated_slice=mock_slices[0]), + Record(data={"id": "yukari_takeba"}, associated_slice=mock_slices[0]), + ], + [ + Record(data={"id": "mitsuru_kirijo"}, associated_slice=mock_slices[1]), + Record(data={"id": "akihiko_sanada"}, associated_slice=mock_slices[1]), + ], + [ + Record(data={"id": "junpei_iori"}, associated_slice=mock_slices[2]), + Record(data={"id": "fuuka_yamagishi"}, associated_slice=mock_slices[2]), + ], + ], + name="persona_3_characters", + ), + incremental_dependency=use_incremental_dependency, + parent_key="id", + partition_field="partition_field", + parameters={}, + config={}, + ) + ], + parameters={}, + config={}, + ) + + expected_counter = 0 + for actual_slice in partition_router.stream_slices(): + assert actual_slice == expected_slices[expected_counter] + if use_incremental_dependency: + assert partition_router.get_stream_state() == expected_parent_state[expected_counter] + expected_counter += 1 + if use_incremental_dependency: + assert partition_router.get_stream_state() == expected_parent_state[expected_counter] + + +@pytest.mark.parametrize( 
+ "use_incremental_dependency", + [ + pytest.param(False, id="test_substream_resumable_full_refresh_stream_without_parent_checkpoint"), + pytest.param(True, id="test_substream_resumable_full_refresh_stream_with_use_incremental_dependency_for_parent_checkpoint"), + ], +) +def test_substream_using_resumable_full_refresh_parent_stream_slices(use_incremental_dependency): + mock_parent_slices = [ + StreamSlice(cursor_slice={}, partition={}), + StreamSlice(cursor_slice={"next_page_token": 2}, partition={}), + StreamSlice(cursor_slice={"next_page_token": 3}, partition={}), + ] + + expected_parent_slices = [ + {"partition_field": "makoto_yuki", "parent_slice": {}}, + {"partition_field": "yukari_takeba", "parent_slice": {}}, + {"partition_field": "mitsuru_kirijo", "parent_slice": {}}, + {"partition_field": "akihiko_sanada", "parent_slice": {}}, + {"partition_field": "junpei_iori", "parent_slice": {}}, + {"partition_field": "fuuka_yamagishi", "parent_slice": {}}, + ] + + expected_parent_state = [ + {"persona_3_characters": {}}, + {"persona_3_characters": {}}, + {"persona_3_characters": {"next_page_token": 2}}, + {"persona_3_characters": {"next_page_token": 2}}, + {"persona_3_characters": {"next_page_token": 3}}, + {"persona_3_characters": {"next_page_token": 3}}, + {"persona_3_characters": {"__ab_full_refresh_sync_complete": True}}, + ] + + expected_substream_state = { + "states": [ + {"partition": {"parent_slice": {}, "partition_field": "makoto_yuki"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"parent_slice": {}, "partition_field": "yukari_takeba"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"parent_slice": {}, "partition_field": "mitsuru_kirijo"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"parent_slice": {}, "partition_field": "akihiko_sanada"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"parent_slice": {}, "partition_field": "junpei_iori"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"parent_slice": {}, "partition_field": "fuuka_yamagishi"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + ], + "parent_state": {"persona_3_characters": {"__ab_full_refresh_sync_complete": True}}, + } + + partition_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=MockResumableFullRefreshStream( + slices=[StreamSlice(partition={}, cursor_slice={})], + cursor=ResumableFullRefreshCursor(parameters={}), + record_pages=[ + [ + Record(data={"id": "makoto_yuki"}, associated_slice=mock_parent_slices[0]), + Record(data={"id": "yukari_takeba"}, associated_slice=mock_parent_slices[0]), + ], + [ + Record(data={"id": "mitsuru_kirijo"}, associated_slice=mock_parent_slices[1]), + Record(data={"id": "akihiko_sanada"}, associated_slice=mock_parent_slices[1]), + ], + [ + Record(data={"id": "junpei_iori"}, associated_slice=mock_parent_slices[2]), + Record(data={"id": "fuuka_yamagishi"}, associated_slice=mock_parent_slices[2]), + ], + ], + name="persona_3_characters", + ), + incremental_dependency=use_incremental_dependency, + parent_key="id", + partition_field="partition_field", + parameters={}, + config={}, + ) + ], + parameters={}, + config={}, + ) + + substream_cursor_slicer = PerPartitionCursor( + cursor_factory=CursorFactory(create_function=partial(ChildPartitionResumableFullRefreshCursor, {})), + partition_router=partition_router, + ) + + expected_counter = 0 + for actual_slice in substream_cursor_slicer.stream_slices(): + # close the 
substream slice + substream_cursor_slicer.close_slice(actual_slice) + # check the slice has been processed + assert actual_slice == expected_parent_slices[expected_counter] + # check for parent state + if use_incremental_dependency: + assert substream_cursor_slicer._partition_router.get_stream_state() == expected_parent_state[expected_counter] + expected_counter += 1 + if use_incremental_dependency: + assert substream_cursor_slicer._partition_router.get_stream_state() == expected_parent_state[expected_counter] + + # validate final state for closed substream slices + final_state = substream_cursor_slicer.get_stream_state() + if not use_incremental_dependency: + assert final_state["states"] == expected_substream_state["states"], "State for substreams is not valid!" + else: + assert final_state == expected_substream_state, "State for substreams with incremental dependency is not valid!" + + +@pytest.mark.parametrize( + "parent_stream_configs, expected_slices", + [ + ( + [ + ParentStreamConfig( + stream=MockStream( + [{}], + [ + {"id": 1, "field_1": "value_1", "field_2": {"nested_field": "nested_value_1"}}, + {"id": 2, "field_1": "value_2", "field_2": {"nested_field": "nested_value_2"}}, + ], + "first_stream", + ), + parent_key="id", + partition_field="first_stream_id", + extra_fields=[["field_1"], ["field_2", "nested_field"]], + parameters={}, + config={}, + ) + ], + [ + {"field_1": "value_1", "field_2.nested_field": "nested_value_1"}, + {"field_1": "value_2", "field_2.nested_field": "nested_value_2"}, + ], + ), + ( + [ + ParentStreamConfig( + stream=MockStream([{}], [{"id": 1, "field_1": "value_1"}, {"id": 2, "field_1": "value_2"}], "first_stream"), + parent_key="id", + partition_field="first_stream_id", + extra_fields=[["field_1"]], + parameters={}, + config={}, + ) + ], + [{"field_1": "value_1"}, {"field_1": "value_2"}], + ), + ], + ids=[ + "test_with_nested_extra_keys", + "test_with_single_extra_key", + ], +) +def test_substream_partition_router_with_extra_keys(parent_stream_configs, expected_slices): + partition_router = SubstreamPartitionRouter(parent_stream_configs=parent_stream_configs, parameters={}, config={}) + slices = [s.extra_fields for s in partition_router.stream_slices()] + assert slices == expected_slices + + +@pytest.mark.parametrize( + "stream_slicers, expect_warning", + [ + # Case with two ListPartitionRouters, no warning expected + ( + [ + ListPartitionRouter(values=["1", "2", "3"], cursor_field="partition_field", config={}, parameters={}), + ListPartitionRouter(values=["1", "2", "3"], cursor_field="partition_field", config={}, parameters={}), + ], + False, + ), + # Case with a SubstreamPartitionRouter, warning expected + ( + [ + ListPartitionRouter(values=["1", "2", "3"], cursor_field="partition_field", config={}, parameters={}), + SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=MockStream([{}], [{"a": {"b": 0}}, {"a": {"b": 1}}, {"a": {"c": 2}}, {"a": {"b": 3}}], "first_stream"), + parent_key="a/b", + partition_field="first_stream_id", + parameters={}, + config={}, + ) + ], + parameters={}, + config={}, + ), + ], + True, + ), + # Case with nested CartesianProductStreamSlicer containing a SubstreamPartitionRouter, warning expected + ( + [ + ListPartitionRouter(values=["1", "2", "3"], cursor_field="partition_field", config={}, parameters={}), + CartesianProductStreamSlicer( + stream_slicers=[ + ListPartitionRouter(values=["1", "2", "3"], cursor_field="partition_field", config={}, parameters={}), + SubstreamPartitionRouter( + 
parent_stream_configs=[ + ParentStreamConfig( + stream=MockStream( + [{}], [{"a": {"b": 0}}, {"a": {"b": 1}}, {"a": {"c": 2}}, {"a": {"b": 3}}], "first_stream" + ), + parent_key="a/b", + partition_field="first_stream_id", + parameters={}, + config={}, + ) + ], + parameters={}, + config={}, + ), + ], + parameters={}, + ), + ], + True, + ), + ], +) +def test_cartesian_product_stream_slicer_warning_log_message(caplog, stream_slicers, expect_warning): + """Test that a warning is logged when SubstreamPartitionRouter is used within a CartesianProductStreamSlicer.""" + warning_message = "Parent state handling is not supported for CartesianProductStreamSlicer." + + with caplog.at_level(logging.WARNING, logger="airbyte"): + CartesianProductStreamSlicer(stream_slicers=stream_slicers, parameters={}) + + logged_warnings = [record.message for record in caplog.records if record.levelname == "WARNING"] + + if expect_warning: + assert warning_message in logged_warnings + else: + assert warning_message not in logged_warnings diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/__init__.py new file mode 100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/__init__.py new file mode 100644 index 000000000000..c941b3045795 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py new file mode 100644 index 000000000000..c941b3045795 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py new file mode 100644 index 000000000000..3a931a0d2863 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py @@ -0,0 +1,34 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.constant_backoff_strategy import ConstantBackoffStrategy + +BACKOFF_TIME = 10 +PARAMETERS_BACKOFF_TIME = 20 +CONFIG_BACKOFF_TIME = 30 + + +@pytest.mark.parametrize( + "test_name, attempt_count, backofftime, expected_backoff_time", + [ + ("test_constant_backoff_first_attempt", 1, BACKOFF_TIME, BACKOFF_TIME), + ("test_constant_backoff_first_attempt_float", 1, 6.7, 6.7), + ("test_constant_backoff_attempt_round_float", 1.0, 6.7, 6.7), + ("test_constant_backoff_attempt_round_float", 1.5, 6.7, 6.7), + ("test_constant_backoff_first_attempt_round_float", 1, 10.0, BACKOFF_TIME), + ("test_constant_backoff_second_attempt_round_float", 2, 10.0, BACKOFF_TIME), + ("test_constant_backoff_from_parameters", 1, "{{ parameters['backoff'] }}", PARAMETERS_BACKOFF_TIME), + ("test_constant_backoff_from_config", 1, "{{ config['backoff'] }}", CONFIG_BACKOFF_TIME), + ], +) +def test_constant_backoff(test_name, attempt_count, backofftime, expected_backoff_time): + response_mock = MagicMock() + backoff_strategy = ConstantBackoffStrategy( + parameters={"backoff": PARAMETERS_BACKOFF_TIME}, backoff_time_in_seconds=backofftime, config={"backoff": CONFIG_BACKOFF_TIME} + ) + backoff = backoff_strategy.backoff_time(response_mock, attempt_count=attempt_count) + assert backoff == expected_backoff_time diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py new file mode 100644 index 000000000000..a99050a7ba4e --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.exponential_backoff_strategy import ( + ExponentialBackoffStrategy, +) + +parameters = {"backoff": 5} +config = {"backoff": 5} + + +@pytest.mark.parametrize( + "test_name, attempt_count, factor, expected_backoff_time", + [ + ("test_exponential_backoff_first_attempt", 1, 5, 10), + ("test_exponential_backoff_second_attempt", 2, 5, 20), + ("test_exponential_backoff_from_parameters", 2, "{{parameters['backoff']}}", 20), + ("test_exponential_backoff_from_config", 2, "{{config['backoff']}}", 20), + ], +) +def test_exponential_backoff(test_name, attempt_count, factor, expected_backoff_time): + response_mock = MagicMock() + backoff_strategy = ExponentialBackoffStrategy(factor=factor, parameters=parameters, config=config) + backoff = backoff_strategy.backoff_time(response_mock, attempt_count=attempt_count) + assert backoff == expected_backoff_time + + +def test_exponential_backoff_default(): + response_mock = MagicMock() + backoff_strategy = ExponentialBackoffStrategy(parameters=parameters, config=config) + backoff = backoff_strategy.backoff_time(response_mock, attempt_count=3) + assert backoff == 40 diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py new file mode 100644 index 000000000000..81c2d34e3f76 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py @@ -0,0 +1,38 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import re +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.header_helper import get_numeric_value_from_header + + +@pytest.mark.parametrize( + "test_name, headers, requested_header, regex, expected_value", + [ + ("test_get_numeric_value_from_header", {"header": 1}, "header", None, 1), + ("test_get_numeric_value_float_from_header", {"header": 1.2}, "header", None, 1.2), + ("test_get_numeric_value_from_string_value", {"header": "10.9"}, "header", None, 10.9), + ("test_get_numeric_value_from_non_numeric", {"header": "60,120"}, "header", None, None), + ("test_get_numeric_value_from_missing_header", {"header": 1}, "notheader", None, None), + ("test_get_numeric_value_with_regex", {"header": "61,60"}, "header", re.compile("([-+]?\d+)"), 61), # noqa + ("test_get_numeric_value_with_regex_no_header", {"header": "61,60"}, "notheader", re.compile("([-+]?\d+)"), None), # noqa + ("test_get_numeric_value_with_regex_not_matching", {"header": "abc61,60"}, "header", re.compile("([-+]?\d+)"), None), # noqa + ], +) +def test_get_numeric_value_from_header(test_name, headers, requested_header, regex, expected_value): + response_mock = create_response(headers=headers) + numeric_value = get_numeric_value_from_header(response_mock, requested_header, regex) + assert numeric_value == expected_value + + +def create_response(headers=None, json_body=None): + url = "https://airbyte.io" + + response_mock = MagicMock() + response_mock.url = url + response_mock.headers = headers or {} + response_mock.json.return_value = json_body or {} + return response_mock diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py 
b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py new file mode 100644 index 000000000000..59dbb6b419a7 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py @@ -0,0 +1,63 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk import AirbyteTracedException +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.wait_time_from_header_backoff_strategy import ( + WaitTimeFromHeaderBackoffStrategy, +) +from requests import Response + +SOME_BACKOFF_TIME = 60 +_A_RETRY_HEADER = "retry-header" +_A_MAX_TIME = 100 + + +@pytest.mark.parametrize( + "test_name, header, header_value, regex, expected_backoff_time", + [ + ("test_wait_time_from_header", "wait_time", SOME_BACKOFF_TIME, None, SOME_BACKOFF_TIME), + ("test_wait_time_from_header_string", "wait_time", "60", None, SOME_BACKOFF_TIME), + ("test_wait_time_from_header_parameters", "{{ parameters['wait_time'] }}", "60", None, SOME_BACKOFF_TIME), + ("test_wait_time_from_header_config", "{{ config['wait_time'] }}", "60", None, SOME_BACKOFF_TIME), + ("test_wait_time_from_header_not_a_number", "wait_time", "61,60", None, None), + ("test_wait_time_from_header_with_regex", "wait_time", "61,60", "([-+]?\d+)", 61), # noqa + ("test_wait_time_from_header_with_regex_no_match", "wait_time", "...", "[-+]?\d+", None), # noqa + ("test_wait_time_from_header", "absent_header", None, None, None), + ], +) +def test_wait_time_from_header(test_name, header, header_value, regex, expected_backoff_time): + response_mock = MagicMock(spec=Response) + response_mock.headers = {"wait_time": header_value} + backoff_strategy = WaitTimeFromHeaderBackoffStrategy( + header=header, regex=regex, parameters={"wait_time": "wait_time"}, config={"wait_time": "wait_time"} + ) + backoff = backoff_strategy.backoff_time(response_mock, 1) + assert backoff == expected_backoff_time + + +def test_given_retry_after_smaller_than_max_time_then_raise_transient_error(): + response_mock = MagicMock(spec=Response) + retry_after = _A_MAX_TIME - 1 + response_mock.headers = {_A_RETRY_HEADER: str(retry_after)} + backoff_strategy = WaitTimeFromHeaderBackoffStrategy( + header=_A_RETRY_HEADER, max_waiting_time_in_seconds=_A_MAX_TIME, parameters={}, config={} + ) + + assert backoff_strategy.backoff_time(response_mock, 1) == retry_after + + +def test_given_retry_after_greater_than_max_time_then_raise_transient_error(): + response_mock = MagicMock(spec=Response) + response_mock.headers = {_A_RETRY_HEADER: str(_A_MAX_TIME + 1)} + backoff_strategy = WaitTimeFromHeaderBackoffStrategy( + header=_A_RETRY_HEADER, max_waiting_time_in_seconds=_A_MAX_TIME, parameters={}, config={} + ) + + with pytest.raises(AirbyteTracedException) as exception: + backoff_strategy.backoff_time(response_mock, 1) + assert exception.value.failure_type == FailureType.transient_error diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py new file mode 100644 index 000000000000..5f2bc02f95cd --- /dev/null +++ 
b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py @@ -0,0 +1,65 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +from unittest.mock import MagicMock, patch + +import pytest +import requests +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.wait_until_time_from_header_backoff_strategy import ( + WaitUntilTimeFromHeaderBackoffStrategy, +) + +SOME_BACKOFF_TIME = 60 +REGEX = "[-+]?\\d+" + + +@pytest.mark.parametrize( + "test_name, header, wait_until, min_wait, regex, expected_backoff_time", + [ + ("test_wait_until_time_from_header", "wait_until", 1600000060.0, None, None, 60), + ("test_wait_until_time_from_header_parameters", "{{parameters['wait_until']}}", 1600000060.0, None, None, 60), + ("test_wait_until_time_from_header_config", "{{config['wait_until']}}", 1600000060.0, None, None, 60), + ("test_wait_until_negative_time", "wait_until", 1500000000.0, None, None, None), + ("test_wait_until_time_less_than_min", "wait_until", 1600000060.0, 120, None, 120), + ("test_wait_until_no_header", "absent_header", 1600000000.0, None, None, None), + ("test_wait_until_time_from_header_not_numeric", "wait_until", "1600000000,1600000000", None, None, None), + ("test_wait_until_time_from_header_is_numeric", "wait_until", "1600000060", None, None, 60), + ("test_wait_until_time_from_header_with_regex", "wait_until", "1600000060,60", None, "[-+]?\d+", 60), # noqa + ("test_wait_until_time_from_header_with_regex_from_parameters", "wait_until", "1600000060,60", None, "{{parameters['regex']}}", 60), + # noqa + ("test_wait_until_time_from_header_with_regex_from_config", "wait_until", "1600000060,60", None, "{{config['regex']}}", 60), # noqa + ("test_wait_until_time_from_header_with_regex_no_match", "wait_time", "...", None, "[-+]?\d+", None), # noqa + ("test_wait_until_no_header_with_min", "absent_header", "1600000000.0", SOME_BACKOFF_TIME, None, SOME_BACKOFF_TIME), + ( + "test_wait_until_no_header_with_min_from_parameters", + "absent_header", + "1600000000.0", + "{{parameters['min_wait']}}", + None, + SOME_BACKOFF_TIME, + ), + ( + "test_wait_until_no_header_with_min_from_config", + "absent_header", + "1600000000.0", + "{{config['min_wait']}}", + None, + SOME_BACKOFF_TIME, + ), + ], +) +@patch("time.time", return_value=1600000000.0) +def test_wait_until_time_from_header(time_mock, test_name, header, wait_until, min_wait, regex, expected_backoff_time): + response_mock = MagicMock(spec=requests.Response) + response_mock.headers = {"wait_until": wait_until} + backoff_strategy = WaitUntilTimeFromHeaderBackoffStrategy( + header=header, + min_wait=min_wait, + regex=regex, + parameters={"wait_until": "wait_until", "regex": REGEX, "min_wait": SOME_BACKOFF_TIME}, + config={"wait_until": "wait_until", "regex": REGEX, "min_wait": SOME_BACKOFF_TIME}, + ) + backoff = backoff_strategy.backoff_time(response_mock, 1) + assert backoff == expected_backoff_time diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py new file mode 100644 index 000000000000..574f3eec0e75 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py @@ -0,0 +1,226 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from unittest.mock import MagicMock + +import pytest +import requests +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.declarative.requesters.error_handlers import HttpResponseFilter +from airbyte_cdk.sources.declarative.requesters.error_handlers.composite_error_handler import CompositeErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.default_error_handler import DefaultErrorHandler +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ErrorResolution, ResponseAction + +SOME_BACKOFF_TIME = 60 + + +@pytest.mark.parametrize( + "test_name, first_handler_behavior, second_handler_behavior, expected_behavior", + [ + ( + "test_chain_retrier_ok_ok", + ErrorResolution( + response_action=ResponseAction.SUCCESS, + failure_type=None, + error_message=None, + ), + ErrorResolution( + response_action=ResponseAction.SUCCESS, + failure_type=None, + error_message=None, + ), + ErrorResolution( + response_action=ResponseAction.SUCCESS, + failure_type=None, + error_message=None, + ), + ), + ( + "test_chain_retrier_ignore_fail", + ErrorResolution( + response_action=ResponseAction.IGNORE, + ), + ErrorResolution( + response_action=ResponseAction.FAIL, + ), + ErrorResolution( + response_action=ResponseAction.IGNORE, + ), + ), + ( + "test_chain_retrier_fail_ignore", + ErrorResolution( + response_action=ResponseAction.FAIL, + ), + ErrorResolution( + response_action=ResponseAction.IGNORE, + ), + ErrorResolution( + response_action=ResponseAction.IGNORE, + ), + ), + ( + "test_chain_retrier_ignore_retry", + ErrorResolution( + response_action=ResponseAction.IGNORE, + ), + ErrorResolution( + response_action=ResponseAction.RETRY, + ), + ErrorResolution( + response_action=ResponseAction.IGNORE, + ), + ), + ( + "test_chain_retrier_ignore_success", + ErrorResolution( + response_action=ResponseAction.IGNORE, + ), + ErrorResolution( + response_action=ResponseAction.SUCCESS, + ), + ErrorResolution( + response_action=ResponseAction.IGNORE, + ), + ), + ], +) +def test_composite_error_handler(test_name, first_handler_behavior, second_handler_behavior, expected_behavior): + first_error_handler = MagicMock() + first_error_handler.interpret_response.return_value = first_handler_behavior + second_error_handler = MagicMock() + second_error_handler.interpret_response.return_value = second_handler_behavior + retriers = [first_error_handler, second_error_handler] + retrier = CompositeErrorHandler(error_handlers=retriers, parameters={}) + response_mock = MagicMock() + response_mock.ok = first_handler_behavior.response_action == ResponseAction.SUCCESS or second_handler_behavior.response_action == ResponseAction.SUCCESS + assert retrier.interpret_response(response_mock) == expected_behavior + + +def test_given_unmatched_response_or_exception_then_return_default_error_resolution(): + composite_error_handler = CompositeErrorHandler( + error_handlers=[ + DefaultErrorHandler( + response_filters=[], + parameters={}, + config={}, + ) + ], + parameters={}, + ) + + error_resolution = composite_error_handler.interpret_response(ValueError("Any error")) + + assert error_resolution.response_action == ResponseAction.RETRY + assert error_resolution.failure_type == FailureType.system_error + + +def test_composite_error_handler_no_handlers(): + try: + CompositeErrorHandler(error_handlers=[], parameters={}) + assert False + except ValueError: + pass + + +def test_error_handler_compatibility_simple(): + status_code = 403 + expected_action = ResponseAction.IGNORE + response_mock = 
create_response(status_code) + default_error_handler = DefaultErrorHandler( + config={}, + parameters={}, + response_filters=[HttpResponseFilter(action=ResponseAction.IGNORE, http_codes={403}, config={}, parameters={})], + ) + composite_error_handler = CompositeErrorHandler( + error_handlers=[ + DefaultErrorHandler( + response_filters=[HttpResponseFilter(action=ResponseAction.IGNORE, http_codes={403}, parameters={}, config={})], + parameters={}, + config={}, + ) + ], + parameters={}, + ) + assert default_error_handler.interpret_response(response_mock).response_action == expected_action + assert composite_error_handler.interpret_response(response_mock).response_action == expected_action + + +@pytest.mark.parametrize( + "test_name, status_code, expected_action", + [("test_first_filter", 403, ResponseAction.IGNORE), ("test_second_filter", 404, ResponseAction.FAIL)], +) +def test_error_handler_compatibility_multiple_filters(test_name, status_code, expected_action): + response_mock = create_response(status_code) + error_handler_with_multiple_filters = CompositeErrorHandler( + error_handlers=[ + DefaultErrorHandler( + response_filters=[ + HttpResponseFilter(action=ResponseAction.IGNORE, http_codes={403}, parameters={}, config={}), + HttpResponseFilter(action=ResponseAction.FAIL, http_codes={404}, parameters={}, config={}), + ], + parameters={}, + config={}, + ), + ], + parameters={}, + ) + composite_error_handler_with_single_filters = CompositeErrorHandler( + error_handlers=[ + DefaultErrorHandler( + response_filters=[HttpResponseFilter(action=ResponseAction.IGNORE, http_codes={403}, parameters={}, config={})], + parameters={}, + config={}, + ), + DefaultErrorHandler( + response_filters=[HttpResponseFilter(action=ResponseAction.FAIL, http_codes={404}, parameters={}, config={})], + parameters={}, + config={}, + ), + ], + parameters={}, + ) + actual_action_multiple_filters = error_handler_with_multiple_filters.interpret_response(response_mock).response_action + assert actual_action_multiple_filters == expected_action + + actual_action_single_filters = composite_error_handler_with_single_filters.interpret_response(response_mock).response_action + assert actual_action_single_filters == expected_action + + +def create_response(status_code: int, headers=None, json_body=None): + url = "https://airbyte.io" + + response_mock = MagicMock(spec=requests.Response) + response_mock.request = MagicMock(spec=requests.PreparedRequest) + response_mock.status_code = status_code + response_mock.ok = status_code < 400 or status_code >= 600 + response_mock.url = url + response_mock.headers = headers or {} + response_mock.json.return_value = json_body or {} + return response_mock + + +@pytest.mark.parametrize( + "test_name, max_times, expected_max_time", + [ + ("test_single_handler", [10], 10), + ("test_multiple_handlers", [10, 15], 15), + ], +) +def test_max_time_is_max_of_underlying_handlers(test_name, max_times, expected_max_time): + composite_error_handler = CompositeErrorHandler( + error_handlers=[ + DefaultErrorHandler( + response_filters=[HttpResponseFilter(action=ResponseAction.IGNORE, http_codes={403}, parameters={}, config={})], + max_time=max_time, + parameters={}, + config={}, + ) + for max_time in max_times + ], + parameters={}, + ) + + max_time = composite_error_handler.max_time + assert max_time == expected_max_time diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py 
b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py new file mode 100644 index 000000000000..6fc99159afed --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py @@ -0,0 +1,277 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from unittest.mock import MagicMock + +import pytest +import requests +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.constant_backoff_strategy import ConstantBackoffStrategy +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.exponential_backoff_strategy import ( + ExponentialBackoffStrategy, +) +from airbyte_cdk.sources.declarative.requesters.error_handlers.default_error_handler import DefaultErrorHandler, HttpResponseFilter +from airbyte_cdk.sources.streams.http.error_handlers.default_error_mapping import DEFAULT_ERROR_MAPPING +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ErrorResolution, FailureType, ResponseAction + +SOME_BACKOFF_TIME = 60 + + +@pytest.mark.parametrize( + "test_name, http_status_code, expected_error_resolution", + [ + ( + "_with_http_response_status_200", + 200, + ErrorResolution( + response_action=ResponseAction.SUCCESS, + failure_type=None, + error_message=None, + ), + ), + ( + "_with_http_response_status_400", + 400, + DEFAULT_ERROR_MAPPING[400], + ), + ( + "_with_http_response_status_404", + 404, + DEFAULT_ERROR_MAPPING[404], + ), + ( + "_with_http_response_status_408", + 408, + DEFAULT_ERROR_MAPPING[408], + ), + ( + "_with_unmapped_http_status_418", + 418, + ErrorResolution( + response_action=ResponseAction.RETRY, + failure_type=FailureType.system_error, + error_message="Unexpected response with HTTP status 418", + ), + ), + ], +) +def test_default_error_handler_with_default_response_filter(test_name, http_status_code: int, expected_error_resolution: ErrorResolution): + response_mock = create_response(http_status_code) + error_handler = DefaultErrorHandler(config={}, parameters={}) + actual_error_resolution = error_handler.interpret_response(response_mock) + assert actual_error_resolution.response_action == expected_error_resolution.response_action + assert actual_error_resolution.failure_type == expected_error_resolution.failure_type + assert actual_error_resolution.error_message == expected_error_resolution.error_message + + +@pytest.mark.parametrize( + "test_name, http_status_code, test_response_filter, response_action, failure_type, error_message", + [ + ( + "_with_http_response_status_400_fail_with_default_failure_type", + 400, + HttpResponseFilter( + http_codes=[400], + action=ResponseAction.RETRY, + config={}, + parameters={}, + ), + ResponseAction.RETRY, + FailureType.system_error, + "Bad request. Please check your request parameters.", + ), + ( + "_with_http_response_status_402_fail_with_default_failure_type", + 402, + HttpResponseFilter( + http_codes=[402], + action=ResponseAction.FAIL, + config={}, + parameters={}, + ), + ResponseAction.FAIL, + FailureType.system_error, + "", + ), + ( + "_with_http_response_status_403_fail_with_default_failure_type", + 403, + HttpResponseFilter( + http_codes=[403], + action="FAIL", + config={}, + parameters={}, + ), + ResponseAction.FAIL, + FailureType.config_error, + "Forbidden. 
You don't have permission to access this resource.", + ), + ( + "_with_http_response_status_200_fail_with_contained_error_message", + 418, + HttpResponseFilter( + action=ResponseAction.FAIL, + error_message_contains="test", + config={}, + parameters={}, + ), + ResponseAction.FAIL, + FailureType.system_error, + "", + ), + ( + "_fail_with_predicate", + 418, + HttpResponseFilter( + action=ResponseAction.FAIL, + predicate="{{ 'error' in response }}", + config={}, + parameters={}, + ), + ResponseAction.FAIL, + FailureType.system_error, + "", + ), + ], +) +def test_default_error_handler_with_custom_response_filter( + test_name, http_status_code, test_response_filter, response_action, failure_type, error_message +): + response_mock = create_response(http_status_code) + if http_status_code == 418: + response_mock.json.return_value = {"error": "test"} + + response_filter = test_response_filter + error_handler = DefaultErrorHandler(config={}, parameters={}, response_filters=[response_filter]) + actual_error_resolution = error_handler.interpret_response(response_mock) + assert actual_error_resolution.response_action == response_action + assert actual_error_resolution.failure_type == failure_type + assert actual_error_resolution.error_message == error_message + + +@pytest.mark.parametrize( + "http_status_code, expected_response_action", + [ + (400, ResponseAction.RETRY), + (402, ResponseAction.FAIL), + ], +) +def test_default_error_handler_with_multiple_response_filters(http_status_code, expected_response_action): + response_filter_one = HttpResponseFilter( + http_codes=[400], + action=ResponseAction.RETRY, + config={}, + parameters={}, + ) + response_filter_two = HttpResponseFilter( + http_codes=[402], + action=ResponseAction.FAIL, + config={}, + parameters={}, + ) + + response_mock = create_response(http_status_code) + error_handler = DefaultErrorHandler(config={}, parameters={}, response_filters=[response_filter_one, response_filter_two]) + actual_error_resolution = error_handler.interpret_response(response_mock) + assert actual_error_resolution.response_action == expected_response_action + + +@pytest.mark.parametrize( + "first_response_filter_action, second_response_filter_action, expected_response_action", + [ + (ResponseAction.RETRY, ResponseAction.FAIL, ResponseAction.RETRY), + (ResponseAction.FAIL, ResponseAction.RETRY, ResponseAction.FAIL), + (ResponseAction.IGNORE, ResponseAction.IGNORE, ResponseAction.IGNORE), + (ResponseAction.SUCCESS, ResponseAction.IGNORE, ResponseAction.SUCCESS), + ], +) +def test_default_error_handler_with_conflicting_response_filters( + first_response_filter_action, second_response_filter_action, expected_response_action +): + response_filter_one = HttpResponseFilter( + http_codes=[400], + action=first_response_filter_action, + config={}, + parameters={}, + ) + response_filter_two = HttpResponseFilter( + http_codes=[400], + action=second_response_filter_action, + config={}, + parameters={}, + ) + + response_mock = create_response(400) + error_handler = DefaultErrorHandler(config={}, parameters={}, response_filters=[response_filter_one, response_filter_two]) + actual_error_resolution = error_handler.interpret_response(response_mock) + assert actual_error_resolution.response_action == expected_response_action + + +def test_default_error_handler_with_constant_backoff_strategy(): + response_mock = create_response(429) + error_handler = DefaultErrorHandler( + config={}, parameters={}, backoff_strategies=[ConstantBackoffStrategy(SOME_BACKOFF_TIME, config={}, parameters={})] 
+ ) + assert error_handler.backoff_time(response_or_exception=response_mock, attempt_count=0) == SOME_BACKOFF_TIME + + +@pytest.mark.parametrize( + "attempt_count", + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + ], +) +def test_default_error_handler_with_exponential_backoff_strategy(attempt_count): + response_mock = create_response(429) + error_handler = DefaultErrorHandler( + config={}, parameters={}, backoff_strategies=[ExponentialBackoffStrategy(factor=1, config={}, parameters={})] + ) + assert error_handler.backoff_time(response_or_exception=response_mock, attempt_count=attempt_count) == (1 * 2**attempt_count) + + +def create_response(status_code: int, headers=None, json_body=None): + url = "https://airbyte.io" + + response_mock = MagicMock(spec=requests.Response) + response_mock.status_code = status_code + response_mock.ok = status_code < 400 or status_code >= 600 + response_mock.url = url + response_mock.headers = headers or {} + response_mock.json.return_value = json_body or {} + response_mock.request = MagicMock(spec=requests.PreparedRequest) + return response_mock + + +def test_default_error_handler_with_unmapped_http_code(): + error_handler = DefaultErrorHandler(config={}, parameters={}) + response_mock = MagicMock(spec=requests.Response) + response_mock.status_code = 418 + response_mock.ok = False + response_mock.headers = {} + actual_error_resolution = error_handler.interpret_response(response_mock) + assert actual_error_resolution + assert actual_error_resolution.failure_type == FailureType.system_error + assert actual_error_resolution.response_action == ResponseAction.RETRY + + +def test_predicate_takes_precedent_over_default_mapped_error(): + response_mock = create_response(404, json_body={"error": "test"}) + + response_filter = HttpResponseFilter( + action=ResponseAction.FAIL, + predicate="{{ 'error' in response }}", + config={}, + parameters={}, + ) + + error_handler = DefaultErrorHandler(config={}, parameters={}, response_filters=[response_filter]) + actual_error_resolution = error_handler.interpret_response(response_mock) + assert actual_error_resolution.response_action == ResponseAction.FAIL + assert actual_error_resolution.failure_type == FailureType.system_error + assert actual_error_resolution.error_message == DEFAULT_ERROR_MAPPING.get(404).error_message diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_default_http_response_filter.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_default_http_response_filter.py new file mode 100644 index 000000000000..b3e4c517da26 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_default_http_response_filter.py @@ -0,0 +1,63 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.declarative.requesters.error_handlers.default_http_response_filter import DefaultHttpResponseFilter +from airbyte_cdk.sources.streams.http.error_handlers.default_error_mapping import DEFAULT_ERROR_MAPPING +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction +from requests import RequestException, Response + + +@pytest.mark.parametrize( + "http_code, expected_error_resolution", + [ + pytest.param(403, DEFAULT_ERROR_MAPPING[403], id="403 mapping"), + pytest.param(404, DEFAULT_ERROR_MAPPING[404], id="404 mapping"), + pytest.param(408, DEFAULT_ERROR_MAPPING[408], id="408 mapping"), + ], +) +def test_matches_mapped_http_status_code(http_code, expected_error_resolution): + + response = MagicMock(spec=Response) + response.status_code = http_code + + response_filter = DefaultHttpResponseFilter( + config={}, + parameters={}, + ) + + actual_error_resolution = response_filter.matches(response) + assert actual_error_resolution == expected_error_resolution + + +def test_matches_mapped_exception(): + + exc = MagicMock(spec=RequestException) + + response_filter = DefaultHttpResponseFilter( + config={}, + parameters={}, + ) + + actual_error_resolution = response_filter.matches(exc) + assert actual_error_resolution == DEFAULT_ERROR_MAPPING[RequestException] + + +def test_unmapped_http_status_code_returns_default_error_resolution(): + + response = MagicMock(spec=Response) + response.status_code = 508 + + response_filter = DefaultHttpResponseFilter( + config={}, + parameters={}, + ) + + actual_error_resolution = response_filter.matches(response) + assert actual_error_resolution + assert actual_error_resolution.failure_type == FailureType.system_error + assert actual_error_resolution.response_action == ResponseAction.RETRY diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py new file mode 100644 index 000000000000..9c6817c268c4 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py @@ -0,0 +1,197 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json + +import pytest +import requests +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.declarative.requesters.error_handlers import HttpResponseFilter +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ErrorResolution, ResponseAction + + +@pytest.mark.parametrize( + "action, failure_type, http_codes, predicate, error_contains, error_message, response, expected_error_resolution", + [ + pytest.param( + ResponseAction.FAIL, + None, + {501, 503}, + "", + "", + "custom error message", + {"status_code": 503}, + ErrorResolution( + response_action=ResponseAction.FAIL, failure_type=FailureType.transient_error, error_message="custom error message" + ), + id="test_http_code_matches", + ), + pytest.param( + ResponseAction.IGNORE, + None, + {403}, + "", + "", + "", + {"status_code": 403}, + ErrorResolution( + response_action=ResponseAction.IGNORE, + failure_type=FailureType.config_error, + error_message="Forbidden. 
You don't have permission to access this resource.", + ), + id="test_http_code_matches_ignore_action", + ), + pytest.param( + ResponseAction.RETRY, + None, + {429}, + "", + "", + "", + {"status_code": 429}, + ErrorResolution( + response_action=ResponseAction.RETRY, failure_type=FailureType.transient_error, error_message="Too many requests." + ), + id="test_http_code_matches_retry_action", + ), + pytest.param( + ResponseAction.FAIL, + None, + {}, + '{{ response.the_body == "do_i_match" }}', + "", + "error message was: {{ response.failure }}", + {"status_code": 404, "json": {"the_body": "do_i_match", "failure": "i failed you"}}, + ErrorResolution( + response_action=ResponseAction.FAIL, failure_type=FailureType.system_error, error_message="error message was: i failed you" + ), + id="test_predicate_matches_json", + ), + pytest.param( + ResponseAction.FAIL, + None, + {}, + '{{ headers.the_key == "header_match" }}', + "", + "error from header: {{ headers.warning }}", + {"status_code": 404, "headers": {"the_key": "header_match", "warning": "this failed"}}, + ErrorResolution( + response_action=ResponseAction.FAIL, failure_type=FailureType.system_error, error_message="error from header: this failed" + ), + id="test_predicate_matches_headers", + ), + pytest.param( + ResponseAction.FAIL, + None, + {}, + None, + "DENIED", + "", + {"status_code": 403, "json": {"error": "REQUEST_DENIED"}}, + ErrorResolution( + response_action=ResponseAction.FAIL, + failure_type=FailureType.config_error, + error_message="Forbidden. You don't have permission to access this resource.", + ), + id="test_predicate_matches_headers", + ), + pytest.param( + ResponseAction.FAIL, + None, + {400, 404}, + '{{ headers.error == "invalid_input" or response.reason == "bad request"}}', + "", + "", + {"status_code": 403, "headers": {"error": "authentication_error"}, "json": {"reason": "permission denied"}}, + None, + id="test_response_does_not_match_filter", + ), + pytest.param( + ResponseAction.FAIL, + FailureType.config_error, + {403, 404}, + "", + "", + "check permissions", + {"status_code": 403}, + ErrorResolution(response_action=ResponseAction.FAIL, failure_type=FailureType.config_error, error_message="check permissions"), + id="test_http_code_matches_failure_type_config_error", + ), + pytest.param( + ResponseAction.FAIL, + FailureType.system_error, + {403, 404}, + "", + "", + "check permissions", + {"status_code": 403}, + ErrorResolution(response_action=ResponseAction.FAIL, failure_type=FailureType.system_error, error_message="check permissions"), + id="test_http_code_matches_failure_type_system_error", + ), + pytest.param( + ResponseAction.FAIL, + FailureType.transient_error, + {500}, + "", + "", + "rate limits", + {"status_code": 500}, + ErrorResolution(response_action=ResponseAction.FAIL, failure_type=FailureType.transient_error, error_message="rate limits"), + id="test_http_code_matches_failure_type_transient_error", + ), + pytest.param( + ResponseAction.RETRY, + FailureType.config_error, + {500}, + "", + "", + "rate limits", + {"status_code": 500}, + ErrorResolution(response_action=ResponseAction.RETRY, failure_type=FailureType.transient_error, error_message="rate limits"), + id="test_http_code_matches_failure_type_config_error_action_retry_uses_default_failure_type", + ), + pytest.param( + ResponseAction.RATE_LIMITED, + None, + {500}, + "", + "", + "rate limits", + {"status_code": 500}, + ErrorResolution( + response_action=ResponseAction.RATE_LIMITED, failure_type=FailureType.transient_error, error_message="rate limits" + ), + 
id="test_http_code_matches_response_action_rate_limited", + ), + ], +) +def test_matches( + requests_mock, action, failure_type, http_codes, predicate, error_contains, error_message, response, expected_error_resolution +): + requests_mock.register_uri( + "GET", + "https://airbyte.io/", + text=response.get("json") and json.dumps(response.get("json")), + headers=response.get("headers") or {}, + status_code=response.get("status_code"), + ) + response = requests.get("https://airbyte.io/") + response_filter = HttpResponseFilter( + action=action, + failure_type=failure_type, + config={}, + parameters={}, + http_codes=http_codes, + predicate=predicate, + error_message_contains=error_contains, + error_message=error_message, + ) + + actual_response_status = response_filter.matches(response) + if expected_error_resolution: + assert actual_response_status.response_action == expected_error_resolution.response_action + assert actual_response_status.failure_type == expected_error_resolution.failure_type + assert actual_response_status.error_message == expected_error_resolution.error_message + else: + assert actual_response_status is None diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/__init__.py new file mode 100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py new file mode 100644 index 000000000000..31d9ae5e05f5 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py @@ -0,0 +1,119 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+#
+
+import json
+
+import pytest
+import requests
+from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder
+from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean
+from airbyte_cdk.sources.declarative.requesters.paginators.strategies.cursor_pagination_strategy import CursorPaginationStrategy
+
+
+@pytest.mark.parametrize(
+    "template_string, stop_condition, expected_token, page_size",
+    [
+        ("token", None, "token", None),
+        ("token", None, "token", 5),
+        ("{{ config.config_key }}", None, "config_value", None),
+        ("{{ last_record.id }}", None, 1, None),
+        ("{{ response._metadata.content }}", None, "content_value", None),
+        ("{{ parameters.key }}", None, "value", None),
+        ("{{ response.invalid_key }}", None, None, None),
+        ("token", InterpolatedBoolean("{{False}}", parameters={}), "token", None),
+        ("token", InterpolatedBoolean("{{True}}", parameters={}), None, None),
+        ("token", "{{True}}", None, None),
+        (
+            "{{ headers.next }}",
+            InterpolatedBoolean("{{ not headers.has_more }}", parameters={}),
+            "ready_to_go",
+            None,
+        ),
+        (
+            "{{ headers.link.next.url }}",
+            InterpolatedBoolean("{{ not headers.link.next.url }}", parameters={}),
+            "https://adventure.io/api/v1/records?page=2&per_page=100",
+            None,
+        ),
+    ],
+    ids=[
+        "test_static_token",
+        "test_static_token_with_page_size",
+        "test_token_from_config",
+        "test_token_from_last_record",
+        "test_token_from_response",
+        "test_token_from_parameters",
+        "test_token_not_found",
+        "test_static_token_with_stop_condition_false",
+        "test_static_token_with_stop_condition_true",
+        "test_static_token_with_string_stop_condition",
+        "test_token_from_header",
+        "test_token_from_response_header_links",
+    ],
+)
+def test_cursor_pagination_strategy(template_string, stop_condition, expected_token, page_size):
+    decoder = JsonDecoder(parameters={})
+    config = {"config_key": "config_value"}
+    parameters = {"key": "value"}
+    strategy = CursorPaginationStrategy(
+        page_size=page_size,
+        cursor_value=template_string,
+        config=config,
+        stop_condition=stop_condition,
+        decoder=decoder,
+        parameters=parameters,
+    )
+
+    response = requests.Response()
+    link_str = '<https://adventure.io/api/v1/records?page=2&per_page=100>; rel="next"'
+    response.headers = {"has_more": True, "next": "ready_to_go", "link": link_str}
+    response_body = {"_metadata": {"content": "content_value"}, "accounts": [], "end": 99, "total": 200, "characters": {}}
+    response._content = json.dumps(response_body).encode("utf-8")
+    last_record = {"id": 1, "more_records": True}
+
+    token = strategy.next_page_token(response, 1, last_record)
+    assert expected_token == token
+    assert page_size == strategy.get_page_size()
+
+
+def test_last_record_points_to_the_last_item_in_last_records_array():
+    last_records = [{"id": 0, "more_records": True}, {"id": 1, "more_records": True}]
+    strategy = CursorPaginationStrategy(
+        page_size=1,
+        cursor_value="{{ last_record.id }}",
+        config={},
+        parameters={},
+    )
+
+    response = requests.Response()
+    next_page_token = strategy.next_page_token(response, 2, last_records[-1])
+    assert next_page_token == 1
+
+
+def test_last_record_is_none_if_no_records():
+    strategy = CursorPaginationStrategy(
+        page_size=1,
+        cursor_value="{{ last_record.id }}",
+        config={},
+        parameters={},
+    )
+
+    response = requests.Response()
+    next_page_token = strategy.next_page_token(response, 0, None)
+    assert next_page_token is None
+
+
+def test_reset_with_initial_token():
+    strategy = CursorPaginationStrategy(
+        page_size=10,
+        cursor_value="{{ response.next_page }}", +
config={}, + parameters={}, + ) + + assert strategy.initial_token is None + + strategy.reset("https://for-all-mankind.nasa.com/api/v1/astronauts") + + assert strategy.initial_token == "https://for-all-mankind.nasa.com/api/v1/astronauts" diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py new file mode 100644 index 000000000000..41d973a9a8ce --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py @@ -0,0 +1,362 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json +from unittest.mock import MagicMock + +import pytest +import requests +from airbyte_cdk.sources.declarative.decoders import JsonDecoder, XmlDecoder +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.declarative.requesters.paginators.default_paginator import ( + DefaultPaginator, + PaginatorTestReadDecorator, + RequestOption, + RequestOptionType, +) +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.cursor_pagination_strategy import CursorPaginationStrategy +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.offset_increment import OffsetIncrement +from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath + + +@pytest.mark.parametrize( + "page_token_request_option, stop_condition, expected_updated_path, expected_request_params, expected_headers, expected_body_data, expected_body_json, last_record, expected_next_page_token, limit, decoder, response_body", + [ + ( + RequestPath(parameters={}), + None, + "/next_url", + {"limit": 2}, + {}, + {}, + {}, + {"id": 1}, + {"next_page_token": "https://airbyte.io/next_url"}, + 2, + JsonDecoder, + {"next": "https://airbyte.io/next_url"}, + ), + ( + RequestOption(inject_into=RequestOptionType.request_parameter, field_name="from", parameters={}), + None, + None, + {"limit": 2, "from": "https://airbyte.io/next_url"}, + {}, + {}, + {}, + {"id": 1}, + {"next_page_token": "https://airbyte.io/next_url"}, + 2, + JsonDecoder, + {"next": "https://airbyte.io/next_url"}, + ), + ( + RequestOption(inject_into=RequestOptionType.request_parameter, field_name="from", parameters={}), + InterpolatedBoolean(condition="{{True}}", parameters={}), + None, + {"limit": 2}, + {}, + {}, + {}, + {"id": 1}, + None, + 2, + JsonDecoder, + {"next": "https://airbyte.io/next_url"}, + ), + ( + RequestOption(inject_into=RequestOptionType.header, field_name="from", parameters={}), + None, + None, + {"limit": 2}, + {"from": "https://airbyte.io/next_url"}, + {}, + {}, + {"id": 1}, + {"next_page_token": "https://airbyte.io/next_url"}, + 2, + JsonDecoder, + {"next": "https://airbyte.io/next_url"}, + ), + ( + RequestOption(inject_into=RequestOptionType.body_data, field_name="from", parameters={}), + None, + None, + {"limit": 2}, + {}, + {"from": "https://airbyte.io/next_url"}, + {}, + {"id": 1}, + {"next_page_token": "https://airbyte.io/next_url"}, + 2, + JsonDecoder, + {"next": "https://airbyte.io/next_url"}, + ), + ( + RequestOption(inject_into=RequestOptionType.body_json, field_name="from", parameters={}), + None, + None, + {"limit": 2}, + {}, + {}, + {"from": "https://airbyte.io/next_url"}, + {"id": 1}, + {"next_page_token": "https://airbyte.io/next_url"}, + 2, + JsonDecoder, + {"next": "https://airbyte.io/next_url"}, + ), + ( + 
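+            # XML decoder cases: the body is raw XML (presumably a top-level <next> element wrapping the URL) so that the "{{ response.next }}" cursor can resolve it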
RequestPath(parameters={}), + None, + "/next_url", + {"limit": 2}, + {}, + {}, + {}, + {"id": 1}, + {"next_page_token": "https://airbyte.io/next_url"}, + 2, + XmlDecoder, + b"https://airbyte.io/next_url", + ), + ( + RequestOption(inject_into=RequestOptionType.request_parameter, field_name="from", parameters={}), + None, + None, + {"limit": 2, "from": "https://airbyte.io/next_url"}, + {}, + {}, + {}, + {"id": 1}, + {"next_page_token": "https://airbyte.io/next_url"}, + 2, + XmlDecoder, + b"https://airbyte.io/next_url", + ), + ], + ids=[ + "test_default_paginator_path", + "test_default_paginator_request_param", + "test_default_paginator_no_token", + "test_default_paginator_cursor_header", + "test_default_paginator_cursor_body_data", + "test_default_paginator_cursor_body_json", + "test_default_paginator_path_with_xml_decoder", + "test_default_paginator_request_param_xml_decoder", + ], +) +def test_default_paginator_with_cursor( + page_token_request_option, + stop_condition, + expected_updated_path, + expected_request_params, + expected_headers, + expected_body_data, + expected_body_json, + last_record, + expected_next_page_token, + limit, + decoder, + response_body +): + page_size_request_option = RequestOption( + inject_into=RequestOptionType.request_parameter, field_name="{{parameters['page_limit']}}", parameters={"page_limit": "limit"} + ) + cursor_value = "{{ response.next }}" + url_base = "https://airbyte.io" + config = {} + parameters = {} + strategy = CursorPaginationStrategy( + page_size=limit, + cursor_value=cursor_value, + stop_condition=stop_condition, + decoder=decoder(parameters={}), + config=config, + parameters=parameters, + ) + paginator = DefaultPaginator( + page_size_option=page_size_request_option, + page_token_option=page_token_request_option, + pagination_strategy=strategy, + config=config, + url_base=url_base, + parameters={}, + ) + + response = requests.Response() + response.headers = {"A_HEADER": "HEADER_VALUE"} + response._content = json.dumps(response_body).encode("utf-8") if decoder == JsonDecoder else response_body + + actual_next_page_token = paginator.next_page_token(response, 2, last_record) + actual_next_path = paginator.path() + actual_request_params = paginator.get_request_params() + actual_headers = paginator.get_request_headers() + actual_body_data = paginator.get_request_body_data() + actual_body_json = paginator.get_request_body_json() + assert actual_next_page_token == expected_next_page_token + assert actual_next_path == expected_updated_path + assert actual_request_params == expected_request_params + assert actual_headers == expected_headers + assert actual_body_data == expected_body_data + assert actual_body_json == expected_body_json + + +@pytest.mark.parametrize( + "field_name_page_size_interpolation, field_name_page_token_interpolation, expected_request_params", + [ + ( + "{{parameters['page_limit']}}", + "{{parameters['page_token']}}", + {"parameters_limit": 50, "parameters_token": "https://airbyte.io/next_url"}, + ), + ("{{config['page_limit']}}", "{{config['page_token']}}", {"config_limit": 50, "config_token": "https://airbyte.io/next_url"}), + ], + ids=[ + "parameters_interpolation", + "config_interpolation", + ], +) +def test_paginator_request_param_interpolation( + field_name_page_size_interpolation: str, field_name_page_token_interpolation: str, expected_request_params: dict +): + config = {"page_limit": "config_limit", "page_token": "config_token"} + parameters = {"page_limit": "parameters_limit", "page_token": "parameters_token"} + 
page_size_request_option = RequestOption( + inject_into=RequestOptionType.request_parameter, + field_name=field_name_page_size_interpolation, + parameters=parameters, + ) + cursor_value = "{{ response.next }}" + url_base = "https://airbyte.io" + limit = 50 + strategy = CursorPaginationStrategy( + page_size=limit, + cursor_value=cursor_value, + stop_condition=None, + decoder=JsonDecoder(parameters={}), + config=config, + parameters=parameters, + ) + paginator = DefaultPaginator( + page_size_option=page_size_request_option, + page_token_option=RequestOption( + inject_into=RequestOptionType.request_parameter, field_name=field_name_page_token_interpolation, parameters=parameters + ), + pagination_strategy=strategy, + config=config, + url_base=url_base, + parameters=parameters, + ) + response = requests.Response() + response.headers = {"A_HEADER": "HEADER_VALUE"} + response_body = {"next": "https://airbyte.io/next_url"} + response._content = json.dumps(response_body).encode("utf-8") + last_record = {"id": 1} + paginator.next_page_token(response, 2, last_record) + actual_request_params = paginator.get_request_params() + assert actual_request_params == expected_request_params + + +def test_page_size_option_cannot_be_set_if_strategy_has_no_limit(): + page_size_request_option = RequestOption(inject_into=RequestOptionType.request_parameter, field_name="page_size", parameters={}) + page_token_request_option = RequestOption(inject_into=RequestOptionType.request_parameter, field_name="offset", parameters={}) + cursor_value = "{{ response.next }}" + url_base = "https://airbyte.io" + config = {} + parameters = {} + strategy = CursorPaginationStrategy(page_size=None, cursor_value=cursor_value, config=config, parameters=parameters) + try: + DefaultPaginator( + page_size_option=page_size_request_option, + page_token_option=page_token_request_option, + pagination_strategy=strategy, + config=config, + url_base=url_base, + parameters={}, + ) + assert False + except ValueError: + pass + + +@pytest.mark.parametrize( + "inject_on_first_request", + [ + (True), + (False), + ], + ids=[ + "test_reset_inject_on_first_request", + "test_reset_no_inject_on_first_request", + ], +) +def test_reset(inject_on_first_request): + page_size_request_option = RequestOption(inject_into=RequestOptionType.request_parameter, field_name="limit", parameters={}) + page_token_request_option = RequestOption(inject_into=RequestOptionType.request_parameter, field_name="offset", parameters={}) + url_base = "https://airbyte.io" + config = {} + strategy = OffsetIncrement(config={}, page_size=2, inject_on_first_request=inject_on_first_request, parameters={}) + paginator = DefaultPaginator( + strategy, config, url_base, parameters={}, page_size_option=page_size_request_option, page_token_option=page_token_request_option + ) + initial_request_parameters = paginator.get_request_params() + response = requests.Response() + response._content = json.dumps({}).encode("utf-8") + paginator.next_page_token(response, 2, {"a key": "a value"}) + request_parameters_for_second_request = paginator.get_request_params() + paginator.reset() + request_parameters_after_reset = paginator.get_request_params() + assert initial_request_parameters == request_parameters_after_reset + assert request_parameters_for_second_request != request_parameters_after_reset + + +def test_initial_token_with_offset_pagination(): + page_size_request_option = RequestOption(inject_into=RequestOptionType.request_parameter, field_name="limit", parameters={}) + page_token_request_option = 
RequestOption(inject_into=RequestOptionType.request_parameter, field_name="offset", parameters={}) + url_base = "https://airbyte.io" + config = {} + strategy = OffsetIncrement(config={}, page_size=2, parameters={}, inject_on_first_request=True) + paginator = DefaultPaginator( + strategy, config, url_base, parameters={}, page_size_option=page_size_request_option, page_token_option=page_token_request_option + ) + initial_request_parameters = paginator.get_request_params() + + assert initial_request_parameters == {"limit": 2, "offset": 0} + + +def test_limit_page_fetched(): + maximum_number_of_pages = 5 + number_of_next_performed = maximum_number_of_pages - 1 + paginator = PaginatorTestReadDecorator( + DefaultPaginator( + page_size_option=MagicMock(), + page_token_option=MagicMock(), + pagination_strategy=MagicMock(), + config=MagicMock(), + url_base=MagicMock(), + parameters={}, + ), + maximum_number_of_pages, + ) + + for _ in range(number_of_next_performed): + last_token = paginator.next_page_token(MagicMock(), 1, MagicMock()) + assert last_token + + assert not paginator.next_page_token(MagicMock(), 1, MagicMock()) + + +def test_paginator_with_page_option_no_page_size(): + pagination_strategy = OffsetIncrement(config={}, page_size=None, parameters={}) + + with pytest.raises(ValueError): + DefaultPaginator( + page_size_option=MagicMock(), + page_token_option=RequestOption("limit", RequestOptionType.request_parameter, parameters={}), + pagination_strategy=pagination_strategy, + config=MagicMock(), + url_base=MagicMock(), + parameters={}, + ), diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py new file mode 100644 index 000000000000..12b81010f43b --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py @@ -0,0 +1,12 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import requests +from airbyte_cdk.sources.declarative.requesters.paginators.no_pagination import NoPagination + + +def test(): + paginator = NoPagination(parameters={}) + next_page_token = paginator.next_page_token(requests.Response(), 0, []) + assert next_page_token == {} diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py new file mode 100644 index 000000000000..d655dc9e6fa3 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py @@ -0,0 +1,80 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+#
+
+import json
+from typing import Any, Optional
+
+import pytest
+import requests
+from airbyte_cdk.sources.declarative.requesters.paginators.strategies.offset_increment import OffsetIncrement
+
+
+@pytest.mark.parametrize(
+    "page_size, parameters, last_page_size, last_record, expected_next_page_token, expected_offset",
+    [
+        pytest.param("2", {}, 2, {"id": 1}, 2, 2, id="test_same_page_size"),
+        pytest.param(2, {}, 2, {"id": 1}, 2, 2, id="test_same_page_size"),
+        pytest.param("{{ parameters['page_size'] }}", {"page_size": 3}, 2, {"id": 1}, None, 0, id="test_larger_page_size"),
+        pytest.param(None, {}, 0, [], None, 0, id="test_stop_if_no_records"),
+        pytest.param("{{ response['page_metadata']['limit'] }}", {}, 2, {"id": 1}, None, 0, id="test_page_size_from_response"),
+    ],
+)
+def test_offset_increment_paginator_strategy(page_size, parameters, last_page_size, last_record, expected_next_page_token, expected_offset):
+    paginator_strategy = OffsetIncrement(page_size=page_size, parameters=parameters, config={})
+    assert paginator_strategy._offset == 0
+
+    response = requests.Response()
+
+    response.headers = {"A_HEADER": "HEADER_VALUE"}
+    response_body = {"next": "https://airbyte.io/next_url", "page_metadata": {"limit": 5}}
+    response._content = json.dumps(response_body).encode("utf-8")
+
+    next_page_token = paginator_strategy.next_page_token(response, last_page_size, last_record)
+    assert expected_next_page_token == next_page_token
+    assert expected_offset == paginator_strategy._offset
+
+    paginator_strategy.reset()
+    assert 0 == paginator_strategy._offset
+
+
+def test_offset_increment_paginator_strategy_raises():
+    paginator_strategy = OffsetIncrement(page_size="{{ parameters['page_size'] }}", parameters={"page_size": "invalid value"}, config={})
+    with pytest.raises(Exception) as exc:
+        paginator_strategy.get_page_size()
+    assert str(exc.value) == "invalid value is of type <class 'str'>. Expected <class 'int'>"
+
+
+@pytest.mark.parametrize(
+    "inject_on_first_request, expected_initial_token",
+    [
+        pytest.param(True, 0, id="test_with_inject_offset"),
+        pytest.param(False, None, id="test_without_inject_offset"),
+    ],
+)
+def test_offset_increment_paginator_strategy_initial_token(inject_on_first_request: bool, expected_initial_token: Optional[Any]):
+    paginator_strategy = OffsetIncrement(page_size=20, parameters={}, config={}, inject_on_first_request=inject_on_first_request)
+
+    assert paginator_strategy.initial_token == expected_initial_token
+
+
+@pytest.mark.parametrize(
+    "reset_value, expected_initial_token, expected_error",
+    [
+        pytest.param(25, 25, None, id="test_reset_with_offset_value"),
+        pytest.param(None, 0, None, id="test_reset_with_default"),
+        pytest.param("Nope", None, ValueError, id="test_reset_with_invalid_value"),
+    ],
+)
+def test_offset_increment_reset(reset_value, expected_initial_token, expected_error):
+    paginator_strategy = OffsetIncrement(page_size=20, parameters={}, config={}, inject_on_first_request=True)
+
+    if expected_error:
+        with pytest.raises(expected_error):
+            paginator_strategy.reset(reset_value=reset_value)
+    else:
+        if reset_value is None:
+            paginator_strategy.reset()
+        else:
+            paginator_strategy.reset(reset_value=reset_value)
+        assert paginator_strategy.initial_token == expected_initial_token
diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_page_increment.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_page_increment.py
new file mode 100644
index 000000000000..da2bf6d9450e
--- /dev/null
+++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_page_increment.py
@@ -0,0 +1,83 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+import json
+from typing import Any, Optional
+
+import pytest
+import requests
+from airbyte_cdk.sources.declarative.requesters.paginators.strategies.page_increment import PageIncrement
+
+
+@pytest.mark.parametrize(
+    "page_size, start_from, last_page_size, last_record, expected_next_page_token, expected_offset",
+    [
+        pytest.param(2, 1, 2, {"id": 1}, 2, 2, id="test_same_page_size_start_from_0"),
+        pytest.param(3, 1, 2, {"id": 1}, None, 1, id="test_larger_page_size_start_from_0"),
+        pytest.param(2, 0, 2, {"id": 1}, 1, 1, id="test_same_page_size_start_from_1"),
+        pytest.param(3, 0, 2, {"id": 1}, None, 0, id="test_larger_page_size_start_from_0"),
+        pytest.param(None, 0, 0, None, None, 0, id="test_no_page_size"),
+        pytest.param("2", 0, 2, {"id": 1}, 1, 1, id="test_page_size_from_string"),
+        pytest.param("{{ config['value'] }}", 0, 2, {"id": 1}, 1, 1, id="test_page_size_from_config"),
+    ],
+)
+def test_page_increment_paginator_strategy(page_size, start_from, last_page_size, last_record, expected_next_page_token, expected_offset):
+    paginator_strategy = PageIncrement(page_size=page_size, parameters={}, start_from_page=start_from, config={"value": 2})
+    assert paginator_strategy._page == start_from
+
+    response = requests.Response()
+
+    response.headers = {"A_HEADER": "HEADER_VALUE"}
+    response_body = {"next": "https://airbyte.io/next_url"}
+    response._content = json.dumps(response_body).encode("utf-8")
+
+    next_page_token = paginator_strategy.next_page_token(response, last_page_size, last_record)
+    assert expected_next_page_token == next_page_token
+    assert expected_offset == paginator_strategy._page
+
+    paginator_strategy.reset()
+    assert start_from == paginator_strategy._page
+
+
+@pytest.mark.parametrize("page_size", [pytest.param("{{ config['value'] }}"), pytest.param("not-an-integer")])
+def test_page_increment_paginator_strategy_malformed_page_size(page_size):
+    with pytest.raises(Exception, match=".* is of type <class 'str'>. Expected <class 'int'>"):
+        PageIncrement(page_size=page_size, parameters={}, start_from_page=0, config={"value": "not-an-integer"})
+
+
+@pytest.mark.parametrize(
+    "inject_on_first_request, start_from_page, expected_initial_token",
+    [
+        pytest.param(True, 0, 0, id="test_with_inject_offset_page_start_from_0"),
+        pytest.param(True, 12, 12, id="test_with_inject_offset_page_start_from_12"),
+        pytest.param(False, 2, None, id="test_without_inject_offset"),
+    ],
+)
+def test_page_increment_paginator_strategy_initial_token(
+    inject_on_first_request: bool, start_from_page: int, expected_initial_token: Optional[Any]
+):
+    paginator_strategy = PageIncrement(
+        page_size=20, parameters={}, start_from_page=start_from_page, inject_on_first_request=inject_on_first_request, config={}
+    )
+
+    assert paginator_strategy.initial_token == expected_initial_token
+
+
+@pytest.mark.parametrize(
+    "reset_value, expected_initial_token, expected_error",
+    [
+        pytest.param(25, 25, None, id="test_reset_with_offset_value"),
+        pytest.param(None, 0, None, id="test_reset_with_default"),
+        pytest.param("Nope", None, ValueError, id="test_reset_with_invalid_value"),
+    ],
+)
+def test_page_increment_reset(reset_value, expected_initial_token, expected_error):
+    paginator_strategy = PageIncrement(page_size=100, parameters={}, config={}, inject_on_first_request=True)
+
+    if expected_error:
+        with pytest.raises(expected_error):
+            paginator_strategy.reset(reset_value=reset_value)
+    else:
+        paginator_strategy.reset(reset_value=reset_value)
+        assert paginator_strategy.initial_token == expected_initial_token
diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_request_option.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_request_option.py
new file mode 100644
index 000000000000..5caa11f57f16
--- /dev/null
+++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_request_option.py
@@ -0,0 +1,43 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+# + +import pytest +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType + + +@pytest.mark.parametrize( + "option_type, field_name, expected_field_name", + [ + (RequestOptionType.request_parameter, "field", "field"), + (RequestOptionType.header, "field", "field"), + (RequestOptionType.body_data, "field", "field"), + (RequestOptionType.body_json, "field", "field"), + (RequestOptionType.request_parameter, "since_{{ parameters['cursor_field'] }}", "since_updated_at"), + (RequestOptionType.header, "since_{{ parameters['cursor_field'] }}", "since_updated_at"), + (RequestOptionType.body_data, "since_{{ parameters['cursor_field'] }}", "since_updated_at"), + (RequestOptionType.body_json, "since_{{ parameters['cursor_field'] }}", "since_updated_at"), + (RequestOptionType.request_parameter, "since_{{ config['cursor_field'] }}", "since_created_at"), + (RequestOptionType.header, "since_{{ config['cursor_field'] }}", "since_created_at"), + (RequestOptionType.body_data, "since_{{ config['cursor_field'] }}", "since_created_at"), + (RequestOptionType.body_json, "since_{{ config['cursor_field'] }}", "since_created_at"), + ], + ids=[ + "test_limit_param_with_field_name", + "test_limit_header_with_field_name", + "test_limit_data_with_field_name", + "test_limit_json_with_field_name", + "test_limit_param_with_parameters_interpolation", + "test_limit_header_with_parameters_interpolation", + "test_limit_data_with_parameters_interpolation", + "test_limit_json_with_parameters_interpolation", + "test_limit_param_with_config_interpolation", + "test_limit_header_with_config_interpolation", + "test_limit_data_with_config_interpolation", + "test_limit_json_with_config_interpolation", + ], +) +def test_request_option(option_type: RequestOptionType, field_name: str, expected_field_name: str): + request_option = RequestOption(inject_into=option_type, field_name=field_name, parameters={"cursor_field": "updated_at"}) + assert request_option.field_name.eval({"cursor_field": "created_at"}) == expected_field_name + assert request_option.inject_into == option_type diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py new file mode 100644 index 000000000000..86c5e65fcda9 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py @@ -0,0 +1,103 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from unittest.mock import Mock, call + +from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy import PaginationStrategy +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.stop_condition import ( + CursorStopCondition, + PaginationStopCondition, + StopConditionPaginationStrategyDecorator, +) +from airbyte_cdk.sources.types import Record +from pytest import fixture + +ANY_RECORD = Mock() +NO_RECORD = None +ANY_RESPONSE = Mock() + + +@fixture +def mocked_cursor(): + return Mock(spec=DeclarativeCursor) + + +@fixture +def mocked_pagination_strategy(): + return Mock(spec=PaginationStrategy) + + +@fixture +def mocked_stop_condition(): + return Mock(spec=PaginationStopCondition) + + +def test_given_record_should_be_synced_when_is_met_return_false(mocked_cursor): + mocked_cursor.should_be_synced.return_value = True + assert not CursorStopCondition(mocked_cursor).is_met(ANY_RECORD) + + +def test_given_record_should_not_be_synced_when_is_met_return_true(mocked_cursor): + mocked_cursor.should_be_synced.return_value = False + assert CursorStopCondition(mocked_cursor).is_met(ANY_RECORD) + + +def test_given_stop_condition_is_met_when_next_page_token_then_return_none(mocked_pagination_strategy, mocked_stop_condition): + mocked_stop_condition.is_met.return_value = True + last_record = Mock(spec=Record) + + decorator = StopConditionPaginationStrategyDecorator(mocked_pagination_strategy, mocked_stop_condition) + + assert not decorator.next_page_token(ANY_RESPONSE, 2, last_record) + mocked_stop_condition.is_met.assert_has_calls([call(last_record)]) + + +def test_given_last_record_meets_condition_when_next_page_token_then_do_not_check_for_other_records( + mocked_pagination_strategy, mocked_stop_condition +): + mocked_stop_condition.is_met.return_value = True + last_record = Mock(spec=Record) + + StopConditionPaginationStrategyDecorator(mocked_pagination_strategy, mocked_stop_condition).next_page_token( + ANY_RESPONSE, 2, last_record + ) + + mocked_stop_condition.is_met.assert_called_once_with(last_record) + + +def test_given_stop_condition_is_not_met_when_next_page_token_then_delegate(mocked_pagination_strategy, mocked_stop_condition): + mocked_stop_condition.is_met.return_value = False + last_record = Mock(spec=Record) + decorator = StopConditionPaginationStrategyDecorator(mocked_pagination_strategy, mocked_stop_condition) + + next_page_token = decorator.next_page_token(ANY_RESPONSE, 2, last_record) + + assert next_page_token == mocked_pagination_strategy.next_page_token.return_value + mocked_pagination_strategy.next_page_token.assert_called_once_with(ANY_RESPONSE, 2, last_record) + mocked_stop_condition.is_met.assert_has_calls([call(last_record)]) + + +def test_given_no_records_when_next_page_token_then_delegate(mocked_pagination_strategy, mocked_stop_condition): + decorator = StopConditionPaginationStrategyDecorator(mocked_pagination_strategy, mocked_stop_condition) + + next_page_token = decorator.next_page_token(ANY_RESPONSE, 0, NO_RECORD) + + assert next_page_token == mocked_pagination_strategy.next_page_token.return_value + mocked_pagination_strategy.next_page_token.assert_called_once_with(ANY_RESPONSE, 0, NO_RECORD) + + +def test_when_reset_then_delegate(mocked_pagination_strategy, mocked_stop_condition): + decorator = StopConditionPaginationStrategyDecorator(mocked_pagination_strategy, mocked_stop_condition) + decorator.reset() + 
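+    # reset() is expected to be forwarded untouched to the wrapped pagination strategy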
mocked_pagination_strategy.reset.assert_called_once_with() + + +def test_when_get_page_size_then_delegate(mocked_pagination_strategy, mocked_stop_condition): + decorator = StopConditionPaginationStrategyDecorator(mocked_pagination_strategy, mocked_stop_condition) + + page_size = decorator.get_page_size() + + assert page_size == mocked_pagination_strategy.get_page_size.return_value + mocked_pagination_strategy.get_page_size.assert_called_once_with() diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/__init__.py new file mode 100644 index 000000000000..c941b3045795 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/test_datetime_based_request_options_provider.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/test_datetime_based_request_options_provider.py new file mode 100644 index 000000000000..816f83f94f8a --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/test_datetime_based_request_options_provider.py @@ -0,0 +1,120 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +import pytest +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.requesters.request_options import DatetimeBasedRequestOptionsProvider +from airbyte_cdk.sources.declarative.types import StreamSlice + + +@pytest.mark.parametrize( + "start_time_option, end_time_option, partition_field_start, partition_field_end, stream_slice, expected_request_options", + [ + pytest.param( + RequestOption(field_name="after", inject_into=RequestOptionType.request_parameter, parameters={}), + RequestOption(field_name="before", inject_into=RequestOptionType.request_parameter, parameters={}), + "custom_start", + "custom_end", + StreamSlice(cursor_slice={"custom_start": "2024-06-01", "custom_end": "2024-06-02"}, partition={}), + {"after": "2024-06-01", "before": "2024-06-02"}, + id="test_request_params", + ), + pytest.param( + RequestOption(field_name="after", inject_into=RequestOptionType.request_parameter, parameters={}), + RequestOption(field_name="before", inject_into=RequestOptionType.request_parameter, parameters={}), + None, + None, + StreamSlice(cursor_slice={"start_time": "2024-06-01", "end_time": "2024-06-02"}, partition={}), + {"after": "2024-06-01", "before": "2024-06-02"}, + id="test_request_params_with_default_partition_fields", + ), + pytest.param( + None, + RequestOption(field_name="before", inject_into=RequestOptionType.request_parameter, parameters={}), + None, + None, + StreamSlice(cursor_slice={"start_time": "2024-06-01", "end_time": "2024-06-02"}, partition={}), + {"before": "2024-06-02"}, + id="test_request_params_no_start_time_option", + ), + pytest.param( + RequestOption(field_name="after", inject_into=RequestOptionType.request_parameter, parameters={}), + None, + None, + None, + StreamSlice(cursor_slice={"start_time": "2024-06-01", "end_time": "2024-06-02"}, partition={}), + {"after": "2024-06-01"}, + id="test_request_params_no_end_time_option", + ), + pytest.param( + RequestOption(field_name="after", inject_into=RequestOptionType.request_parameter, parameters={}), + 
RequestOption(field_name="before", inject_into=RequestOptionType.request_parameter, parameters={}), + None, + None, + None, + {}, + id="test_request_params_no_slice", + ), + pytest.param( + RequestOption(field_name="after", inject_into=RequestOptionType.header, parameters={}), + RequestOption(field_name="before", inject_into=RequestOptionType.header, parameters={}), + "custom_start", + "custom_end", + StreamSlice(cursor_slice={"custom_start": "2024-06-01", "custom_end": "2024-06-02"}, partition={}), + {"after": "2024-06-01", "before": "2024-06-02"}, + id="test_request_headers", + ), + pytest.param( + RequestOption(field_name="after", inject_into=RequestOptionType.body_data, parameters={}), + RequestOption(field_name="before", inject_into=RequestOptionType.body_data, parameters={}), + "custom_start", + "custom_end", + StreamSlice(cursor_slice={"custom_start": "2024-06-01", "custom_end": "2024-06-02"}, partition={}), + {"after": "2024-06-01", "before": "2024-06-02"}, + id="test_request_request_body_data", + ), + pytest.param( + RequestOption(field_name="after", inject_into=RequestOptionType.body_json, parameters={}), + RequestOption(field_name="before", inject_into=RequestOptionType.body_json, parameters={}), + "custom_start", + "custom_end", + StreamSlice(cursor_slice={"custom_start": "2024-06-01", "custom_end": "2024-06-02"}, partition={}), + {"after": "2024-06-01", "before": "2024-06-02"}, + id="test_request_request_body_json", + ), + ], +) +def test_datetime_based_request_options_provider( + start_time_option, + end_time_option, + partition_field_start, + partition_field_end, + stream_slice, + expected_request_options +): + config = {} + request_options_provider = DatetimeBasedRequestOptionsProvider( + start_time_option=start_time_option, + end_time_option=end_time_option, + partition_field_start=partition_field_start, + partition_field_end=partition_field_end, + config=config, + parameters={} + ) + + request_option_type = start_time_option.inject_into if isinstance(start_time_option, RequestOption) else None + match request_option_type: + case RequestOptionType.request_parameter: + actual_request_options = request_options_provider.get_request_params(stream_slice=stream_slice) + case RequestOptionType.header: + actual_request_options = request_options_provider.get_request_headers(stream_slice=stream_slice) + case RequestOptionType.body_data: + actual_request_options = request_options_provider.get_request_body_data(stream_slice=stream_slice) + case RequestOptionType.body_json: + actual_request_options = request_options_provider.get_request_body_json(stream_slice=stream_slice) + case _: + # We defer to testing the default RequestOptions using get_request_params() + actual_request_options = request_options_provider.get_request_params(stream_slice=stream_slice) + + assert actual_request_options == expected_request_options diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py new file mode 100644 index 000000000000..b8239d43e0a1 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py @@ -0,0 +1,131 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import pytest +from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_options_provider import ( + InterpolatedRequestOptionsProvider, +) + +state = {"date": "2021-01-01"} +stream_slice = {"start_date": "2020-01-01"} +next_page_token = {"offset": 12345, "page": 27} +config = {"option": "OPTION"} + + +@pytest.mark.parametrize( + "test_name, input_request_params, expected_request_params", + [ + ("test_static_param", {"a_static_request_param": "a_static_value"}, {"a_static_request_param": "a_static_value"}), + ("test_value_depends_on_state", {"read_from_state": "{{ stream_state['date'] }}"}, {"read_from_state": "2021-01-01"}), + ("test_value_depends_on_stream_slice", {"read_from_slice": "{{ stream_slice['start_date'] }}"}, {"read_from_slice": "2020-01-01"}), + ("test_value_depends_on_next_page_token", {"read_from_token": "{{ next_page_token['offset'] }}"}, {"read_from_token": "12345"}), + ("test_value_depends_on_config", {"read_from_config": "{{ config['option'] }}"}, {"read_from_config": "OPTION"}), + ( + "test_parameter_is_interpolated", + {"{{ stream_state['date'] }} - {{stream_slice['start_date']}} - {{next_page_token['offset']}} - {{config['option']}}": "ABC"}, + {"2021-01-01 - 2020-01-01 - 12345 - OPTION": "ABC"}, + ), + ("test_boolean_false_value", {"boolean_false": "{{ False }}"}, {"boolean_false": "False"}), + ("test_integer_falsy_value", {"integer_falsy": "{{ 0 }}"}, {"integer_falsy": "0"}), + ("test_number_falsy_value", {"number_falsy": "{{ 0.0 }}"}, {"number_falsy": "0.0"}), + ("test_string_falsy_value", {"string_falsy": "{{ '' }}"}, {}), + ("test_none_value", {"none_value": "{{ None }}"}, {"none_value": "None"}), + ], +) +def test_interpolated_request_params(test_name, input_request_params, expected_request_params): + provider = InterpolatedRequestOptionsProvider(config=config, request_parameters=input_request_params, parameters={}) + + actual_request_params = provider.get_request_params(stream_state=state, stream_slice=stream_slice, next_page_token=next_page_token) + + assert actual_request_params == expected_request_params + + +@pytest.mark.parametrize( + "test_name, input_request_json, expected_request_json", + [ + ("test_static_json", {"a_static_request_param": "a_static_value"}, {"a_static_request_param": "a_static_value"}), + ("test_value_depends_on_state", {"read_from_state": "{{ stream_state['date'] }}"}, {"read_from_state": "2021-01-01"}), + ("test_value_depends_on_stream_slice", {"read_from_slice": "{{ stream_slice['start_date'] }}"}, {"read_from_slice": "2020-01-01"}), + ("test_value_depends_on_next_page_token", {"read_from_token": "{{ next_page_token['offset'] }}"}, {"read_from_token": 12345}), + ("test_value_depends_on_config", {"read_from_config": "{{ config['option'] }}"}, {"read_from_config": "OPTION"}), + ( + "test_interpolated_keys", + {"{{ stream_state['date'] }}": 123, "{{ config['option'] }}": "ABC"}, + {"2021-01-01": 123, "OPTION": "ABC"}, + ), + ("test_boolean_false_value", {"boolean_false": "{{ False }}"}, {"boolean_false": False}), + ("test_integer_falsy_value", {"integer_falsy": "{{ 0 }}"}, {"integer_falsy": 0}), + ("test_number_falsy_value", {"number_falsy": "{{ 0.0 }}"}, {"number_falsy": 0.0}), + ("test_string_falsy_value", {"string_falsy": "{{ '' }}"}, {}), + ("test_none_value", {"none_value": "{{ None }}"}, {}), + ("test_string", """{"nested": { "key": "{{ config['option'] }}" }}""", {"nested": {"key": "OPTION"}}), + ("test_nested_objects", {"nested": {"key": "{{ config['option'] }}"}}, {"nested": {"key": 
"OPTION"}}), + ( + "test_nested_objects_interpolated keys", + {"nested": {"{{ stream_state['date'] }}": "{{ config['option'] }}"}}, + {"nested": {"2021-01-01": "OPTION"}}, + ), + ], +) +def test_interpolated_request_json(test_name, input_request_json, expected_request_json): + provider = InterpolatedRequestOptionsProvider(config=config, request_body_json=input_request_json, parameters={}) + + actual_request_json = provider.get_request_body_json(stream_state=state, stream_slice=stream_slice, next_page_token=next_page_token) + + assert actual_request_json == expected_request_json + + +@pytest.mark.parametrize( + "test_name, input_request_data, expected_request_data", + [ + ("test_static_map_data", {"a_static_request_param": "a_static_value"}, {"a_static_request_param": "a_static_value"}), + ("test_map_depends_on_stream_slice", {"read_from_slice": "{{ stream_slice['start_date'] }}"}, {"read_from_slice": "2020-01-01"}), + ("test_map_depends_on_config", {"read_from_config": "{{ config['option'] }}"}, {"read_from_config": "OPTION"}), + ("test_defaults_to_empty_dict", None, {}), + ("test_interpolated_keys", {"{{ stream_state['date'] }} - {{ next_page_token['offset'] }}": "ABC"}, {"2021-01-01 - 12345": "ABC"}), + ], +) +def test_interpolated_request_data(test_name, input_request_data, expected_request_data): + provider = InterpolatedRequestOptionsProvider(config=config, request_body_data=input_request_data, parameters={}) + + actual_request_data = provider.get_request_body_data(stream_state=state, stream_slice=stream_slice, next_page_token=next_page_token) + + assert actual_request_data == expected_request_data + + +def test_error_on_create_for_both_request_json_and_data(): + request_json = {"body_key": "{{ stream_slice['start_date'] }}"} + request_data = "interpolate_me=5&invalid={{ config['option'] }}" + with pytest.raises(ValueError): + InterpolatedRequestOptionsProvider(config=config, request_body_json=request_json, request_body_data=request_data, parameters={}) + + +@pytest.mark.parametrize( + "request_option_type,request_input,contains_state", + [ + pytest.param("request_parameter", {"start": "{{ stream_state.get('start_date') }}"}, True, id="test_request_parameter_has_state"), + pytest.param("request_parameter", {"start": "{{ slice_interval.get('start_date') }}"}, False, id="test_request_parameter_no_state"), + pytest.param("request_header", {"start": "{{ stream_state.get('start_date') }}"}, True, id="test_request_header_has_state"), + pytest.param("request_header", {"start": "{{ slice_interval.get('start_date') }}"}, False, id="test_request_header_no_state"), + pytest.param("request_body_data", "[{'query': {'type': 'timestamp', 'value': stream_state.get('start_date')}}]", True, id="test_request_body_data_has_state"), + pytest.param("request_body_data", "[{'query': {'type': 'timestamp', 'value': stream_interval.get('start_date')}}]", False, id="test_request_body_data_no_state"), + pytest.param("request_body_json", {"start": "{{ stream_state.get('start_date') }}"}, True, id="test_request_body_json_has_state"), + pytest.param("request_body_json", {"start": "{{ slice_interval.get('start_date') }}"}, False, id="test_request_request_body_json_no_state"), + ] +) +def test_request_options_contain_stream_state(request_option_type, request_input, contains_state): + request_options_provider: InterpolatedRequestOptionsProvider + match request_option_type: + case "request_parameter": + request_options_provider = InterpolatedRequestOptionsProvider(config=config, request_parameters=request_input, 
parameters={}) + case "request_header": + request_options_provider = InterpolatedRequestOptionsProvider(config=config, request_headers=request_input, parameters={}) + case "request_body_data": + request_options_provider = InterpolatedRequestOptionsProvider(config=config, request_body_data=request_input, parameters={}) + case "request_body_json": + request_options_provider = InterpolatedRequestOptionsProvider(config=config, request_body_json=request_input, parameters={}) + case _: + request_options_provider = InterpolatedRequestOptionsProvider(config=config, parameters={}) + + assert request_options_provider.request_options_contain_stream_state() == contains_state diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_http_job_repository.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_http_job_repository.py new file mode 100644 index 000000000000..90768d8bbb60 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_http_job_repository.py @@ -0,0 +1,251 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + + +import json +from unittest import TestCase +from unittest.mock import Mock + +import pytest +from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus +from airbyte_cdk.sources.declarative.decoders import NoopDecoder +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder +from airbyte_cdk.sources.declarative.extractors import DpathExtractor, RecordSelector, ResponseToFileExtractor +from airbyte_cdk.sources.declarative.requesters.error_handlers import DefaultErrorHandler +from airbyte_cdk.sources.declarative.requesters.http_job_repository import AsyncHttpJobRepository +from airbyte_cdk.sources.declarative.requesters.http_requester import HttpRequester +from airbyte_cdk.sources.declarative.requesters.paginators import DefaultPaginator +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.cursor_pagination_strategy import CursorPaginationStrategy +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod +from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever +from airbyte_cdk.sources.types import StreamSlice +from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer +from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest, HttpResponse + +_ANY_CONFIG = {} +_ANY_SLICE = StreamSlice(partition={}, cursor_slice={}) +_URL_BASE = "https://api.sendgrid.com/v3/" +_EXPORT_PATH = "marketing/contacts/exports" +_EXPORT_URL = f"{_URL_BASE}{_EXPORT_PATH}" +_A_JOB_ID = "a-job-id" +_ANOTHER_JOB_ID = "another-job-id" +_JOB_FIRST_URL = "https://job.result.api.com/1" +_JOB_SECOND_URL = "https://job.result.api.com/2" +_A_CSV_WITH_ONE_RECORD = """id,value +a_record_id,a_value +""" +_A_CURSOR_FOR_PAGINATION = "a-cursor-for-pagination" + + +class HttpJobRepositoryTest(TestCase): + def setUp(self) -> None: + message_repository = Mock() + error_handler = DefaultErrorHandler(config=_ANY_CONFIG, parameters={}) + + self._create_job_requester = HttpRequester( + name="stream : create_job", + url_base=_URL_BASE, + path=_EXPORT_PATH, + error_handler=error_handler, + http_method=HttpMethod.POST, + config=_ANY_CONFIG, + disable_retries=False, + parameters={}, + message_repository=message_repository, + use_cache=False, + stream_response=False, + ) + + self._polling_job_requester = HttpRequester( + 
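+            # the polling requester interpolates the job id from the creation response (stored on the stream slice) into its request path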
name="stream : polling", + url_base=_URL_BASE, + path=_EXPORT_PATH + "/{{stream_slice['create_job_response'].json()['id']}}", + error_handler=error_handler, + http_method=HttpMethod.GET, + config=_ANY_CONFIG, + disable_retries=False, + parameters={}, + message_repository=message_repository, + use_cache=False, + stream_response=False, + ) + + self._download_retriever = SimpleRetriever( + requester=HttpRequester( + name="stream : fetch_result", + url_base="", + path="{{stream_slice['url']}}", + error_handler=error_handler, + http_method=HttpMethod.GET, + config=_ANY_CONFIG, + disable_retries=False, + parameters={}, + message_repository=message_repository, + use_cache=False, + stream_response=True, + ), + record_selector=RecordSelector( + extractor=ResponseToFileExtractor(), + record_filter=None, + transformations=[], + schema_normalization=TypeTransformer(TransformConfig.NoTransform), + config=_ANY_CONFIG, + parameters={}, + ), + primary_key=None, + name="any name", + paginator=DefaultPaginator( + decoder=NoopDecoder(), + page_size_option=None, + page_token_option=RequestOption( + field_name="locator", + inject_into=RequestOptionType.request_parameter, + parameters={}, + ), + pagination_strategy=CursorPaginationStrategy( + cursor_value="{{ headers['Sforce-Locator'] }}", + decoder=NoopDecoder(), + config=_ANY_CONFIG, + parameters={}, + ), + url_base=_URL_BASE, + config=_ANY_CONFIG, + parameters={}, + ), + config=_ANY_CONFIG, + parameters={}, + ) + + self._repository = AsyncHttpJobRepository( + creation_requester=self._create_job_requester, + polling_requester=self._polling_job_requester, + download_retriever=self._download_retriever, + abort_requester=None, + delete_requester=None, + status_extractor=DpathExtractor(decoder=JsonDecoder(parameters={}), field_path=["status"], config={}, parameters={} or {}), + status_mapping={ + "ready": AsyncJobStatus.COMPLETED, + "failure": AsyncJobStatus.FAILED, + "pending": AsyncJobStatus.RUNNING, + }, + urls_extractor=DpathExtractor(decoder=JsonDecoder(parameters={}), field_path=["urls"], config={}, parameters={} or {}), + ) + + self._http_mocker = HttpMocker() + self._http_mocker.__enter__() + + def tearDown(self) -> None: + self._http_mocker.__exit__(None, None, None) + + def test_given_different_statuses_when_update_jobs_status_then_update_status_properly(self) -> None: + self._mock_create_response(_A_JOB_ID) + self._http_mocker.get( + HttpRequest(url=f"{_EXPORT_URL}/{_A_JOB_ID}"), + [ + HttpResponse(body=json.dumps({"id": _A_JOB_ID, "status": "pending"})), + HttpResponse(body=json.dumps({"id": _A_JOB_ID, "status": "failure"})), + HttpResponse(body=json.dumps({"id": _A_JOB_ID, "status": "ready"})), + ] + ) + job = self._repository.start(_ANY_SLICE) + + self._repository.update_jobs_status([job]) + assert job.status() == AsyncJobStatus.RUNNING + self._repository.update_jobs_status([job]) + assert job.status() == AsyncJobStatus.FAILED + self._repository.update_jobs_status([job]) + assert job.status() == AsyncJobStatus.COMPLETED + + def test_given_unknown_status_when_update_jobs_status_then_raise_error(self) -> None: + self._mock_create_response(_A_JOB_ID) + self._http_mocker.get( + HttpRequest(url=f"{_EXPORT_URL}/{_A_JOB_ID}"), + HttpResponse(body=json.dumps({"id": _A_JOB_ID, "status": "invalid_status"})), + ) + job = self._repository.start(_ANY_SLICE) + + with pytest.raises(ValueError): + self._repository.update_jobs_status([job]) + + def test_given_multiple_jobs_when_update_jobs_status_then_all_the_jobs_are_updated(self) -> None: + self._http_mocker.post( 
+ HttpRequest(url=_EXPORT_URL), + [ + HttpResponse(body=json.dumps({"id": _A_JOB_ID})), + HttpResponse(body=json.dumps({"id": _ANOTHER_JOB_ID})), + ], + ) + self._http_mocker.get( + HttpRequest(url=f"{_EXPORT_URL}/{_A_JOB_ID}"), + HttpResponse(body=json.dumps({"id": _A_JOB_ID, "status": "ready"})), + ) + self._http_mocker.get( + HttpRequest(url=f"{_EXPORT_URL}/{_ANOTHER_JOB_ID}"), + HttpResponse(body=json.dumps({"id": _A_JOB_ID, "status": "ready"})), + ) + a_job = self._repository.start(_ANY_SLICE) + another_job = self._repository.start(_ANY_SLICE) + + self._repository.update_jobs_status([a_job, another_job]) + + assert a_job.status() == AsyncJobStatus.COMPLETED + assert another_job.status() == AsyncJobStatus.COMPLETED + + def test_given_pagination_when_fetch_records_then_yield_records_from_all_pages(self) -> None: + self._mock_create_response(_A_JOB_ID) + self._http_mocker.get( + HttpRequest(url=f"{_EXPORT_URL}/{_A_JOB_ID}"), + HttpResponse(body=json.dumps({ + "id": _A_JOB_ID, + "status": "ready", + "urls": [_JOB_FIRST_URL] + })) + ) + self._http_mocker.get( + HttpRequest(url=_JOB_FIRST_URL), + HttpResponse(body=_A_CSV_WITH_ONE_RECORD, headers={"Sforce-Locator": _A_CURSOR_FOR_PAGINATION}), + ) + self._http_mocker.get( + HttpRequest(url=_JOB_FIRST_URL, query_params={"locator": _A_CURSOR_FOR_PAGINATION}), + HttpResponse(body=_A_CSV_WITH_ONE_RECORD), + ) + + job = self._repository.start(_ANY_SLICE) + self._repository.update_jobs_status([job]) + records = list(self._repository.fetch_records(job)) + + assert len(records) == 2 + + def test_given_multiple_urls_when_fetch_records_then_fetch_from_multiple_urls(self) -> None: + self._mock_create_response(_A_JOB_ID) + self._http_mocker.get( + HttpRequest(url=f"{_EXPORT_URL}/{_A_JOB_ID}"), + HttpResponse(body=json.dumps({ + "id": _A_JOB_ID, + "status": "ready", + "urls": [ + _JOB_FIRST_URL, + _JOB_SECOND_URL, + ] + })) + ) + self._http_mocker.get( + HttpRequest(url=_JOB_FIRST_URL), + HttpResponse(body=_A_CSV_WITH_ONE_RECORD), + ) + self._http_mocker.get( + HttpRequest(url=_JOB_SECOND_URL), + HttpResponse(body=_A_CSV_WITH_ONE_RECORD), + ) + + job = self._repository.start(_ANY_SLICE) + self._repository.update_jobs_status([job]) + records = list(self._repository.fetch_records(job)) + + assert len(records) == 2 + + def _mock_create_response(self, job_id: str) -> None: + self._http_mocker.post( + HttpRequest(url=_EXPORT_URL), + HttpResponse(body=json.dumps({"id": job_id})), + ) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_http_requester.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_http_requester.py new file mode 100644 index 000000000000..404bf9f50e15 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_http_requester.py @@ -0,0 +1,690 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from typing import Any, Mapping, Optional +from unittest import mock +from unittest.mock import MagicMock +from urllib.parse import parse_qs, urlparse + +import pytest as pytest +import requests +from airbyte_cdk.sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import ConstantBackoffStrategy, ExponentialBackoffStrategy +from airbyte_cdk.sources.declarative.requesters.error_handlers.default_error_handler import DefaultErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.error_handler import ErrorHandler +from airbyte_cdk.sources.declarative.requesters.http_requester import HttpMethod, HttpRequester +from airbyte_cdk.sources.declarative.requesters.request_options import InterpolatedRequestOptionsProvider +from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.streams.http.exceptions import RequestBodyException, UserDefinedBackoffException +from airbyte_cdk.sources.types import Config +from requests import PreparedRequest + + +@pytest.fixture +def http_requester_factory(): + def factory( + name: str = "name", + url_base: str = "https://test_base_url.com", + path: str = "/", + http_method: str = HttpMethod.GET, + request_options_provider: Optional[InterpolatedRequestOptionsProvider] = None, + authenticator: Optional[DeclarativeAuthenticator] = None, + error_handler: Optional[ErrorHandler] = None, + config: Optional[Config] = None, + parameters: Mapping[str, Any] = None, + disable_retries: bool = False, + message_repository: Optional[MessageRepository] = None, + use_cache: bool = False, + ) -> HttpRequester: + return HttpRequester( + name=name, + url_base=url_base, + path=path, + config=config or {}, + parameters=parameters or {}, + authenticator=authenticator, + http_method=http_method, + request_options_provider=request_options_provider, + error_handler=error_handler, + disable_retries=disable_retries, + message_repository=message_repository or MagicMock(), + use_cache=use_cache, + ) + + return factory + + +def test_http_requester(): + http_method = HttpMethod.GET + + request_options_provider = MagicMock() + request_params = {"param": "value"} + request_body_data = "body_key_1=value_1&body_key_2=value2" + request_body_json = {"body_field": "body_value"} + request_options_provider.get_request_params.return_value = request_params + request_options_provider.get_request_body_data.return_value = request_body_data + request_options_provider.get_request_body_json.return_value = request_body_json + + request_headers_provider = MagicMock() + request_headers = {"header": "value"} + request_headers_provider.get_request_headers.return_value = request_headers + + authenticator = MagicMock() + + error_handler = MagicMock() + max_retries = 10 + backoff_time = 1000 + response_status = MagicMock() + response_status.retry_in.return_value = 10 + error_handler.max_retries = max_retries + error_handler.backoff_time.return_value = backoff_time + + config = {"url": "https://airbyte.io"} + stream_slice = {"id": "1234"} + + name = "stream_name" + + requester = HttpRequester( + name=name, + url_base=InterpolatedString.create("{{ config['url'] }}", parameters={}), + path=InterpolatedString.create("v1/{{ stream_slice['id'] }}", parameters={}), + http_method=http_method, + request_options_provider=request_options_provider, + 
authenticator=authenticator, + error_handler=error_handler, + config=config, + parameters={}, + ) + + assert requester.get_url_base() == "https://airbyte.io/" + assert requester.get_path(stream_state={}, stream_slice=stream_slice, next_page_token={}) == "v1/1234" + assert requester.get_authenticator() == authenticator + assert requester.get_method() == http_method + assert requester.get_request_params(stream_state={}, stream_slice=None, next_page_token=None) == request_params + assert requester.get_request_body_data(stream_state={}, stream_slice=None, next_page_token=None) == request_body_data + assert requester.get_request_body_json(stream_state={}, stream_slice=None, next_page_token=None) == request_body_json + + +@pytest.mark.parametrize( + "test_name, base_url, expected_base_url", + [ + ("test_no_trailing_slash", "https://example.com", "https://example.com/"), + ("test_with_trailing_slash", "https://example.com/", "https://example.com/"), + ("test_with_v1_no_trailing_slash", "https://example.com/v1", "https://example.com/v1/"), + ("test_with_v1_with_trailing_slash", "https://example.com/v1/", "https://example.com/v1/"), + ], +) +def test_base_url_has_a_trailing_slash(test_name, base_url, expected_base_url): + requester = HttpRequester( + name="name", + url_base=base_url, + path="deals", + http_method=HttpMethod.GET, + request_options_provider=MagicMock(), + authenticator=MagicMock(), + error_handler=MagicMock(), + config={}, + parameters={}, + ) + assert requester.get_url_base() == expected_base_url + + +@pytest.mark.parametrize( + "test_name, path, expected_path", + [ + ("test_no_leading_slash", "deals", "deals"), + ("test_with_leading_slash", "/deals", "deals"), + ("test_with_v1_no_leading_slash", "v1/deals", "v1/deals"), + ("test_with_v1_with_leading_slash", "/v1/deals", "v1/deals"), + ("test_with_v1_with_trailing_slash", "v1/deals/", "v1/deals/"), + ], +) +def test_path(test_name, path, expected_path): + requester = HttpRequester( + name="name", + url_base="https://example.com", + path=path, + http_method=HttpMethod.GET, + request_options_provider=MagicMock(), + authenticator=MagicMock(), + error_handler=MagicMock(), + config={}, + parameters={}, + ) + assert requester.get_path(stream_state={}, stream_slice={}, next_page_token={}) == expected_path + + +def create_requester( + url_base: Optional[str] = None, + parameters: Optional[Mapping[str, Any]] = {}, + config: Optional[Config] = None, + path: Optional[str] = None, + authenticator: Optional[DeclarativeAuthenticator] = None, + error_handler: Optional[ErrorHandler] = None, +) -> HttpRequester: + requester = HttpRequester( + name="name", + url_base=url_base or "https://example.com", + path=path or "deals", + http_method=HttpMethod.GET, + request_options_provider=None, + authenticator=authenticator, + error_handler=error_handler, + config=config or {}, + parameters=parameters or {}, + ) + requester._http_client._session.send = MagicMock() + req = requests.Response() + req.status_code = 200 + requester._http_client._session.send.return_value = req + return requester + + +def test_basic_send_request(): + options_provider = MagicMock() + options_provider.get_request_headers.return_value = {"my_header": "my_value"} + requester = create_requester() + requester._request_options_provider = options_provider + requester.send_request() + sent_request: PreparedRequest = requester._http_client._session.send.call_args_list[0][0][0] + assert sent_request.method == "GET" + assert sent_request.url == "https://example.com/deals" + assert 
sent_request.headers["my_header"] == "my_value" + assert sent_request.body is None + + +@pytest.mark.parametrize( + "provider_data, provider_json, param_data, param_json, authenticator_data, authenticator_json, expected_exception, expected_body", + [ + # merging data params from the three sources + ({"field": "value"}, None, None, None, None, None, None, "field=value"), + ({"field": "value"}, None, {"field2": "value"}, None, None, None, None, "field=value&field2=value"), + ({"field": "value"}, None, {"field2": "value"}, None, {"authfield": "val"}, None, None, "field=value&field2=value&authfield=val"), + ({"field": "value"}, None, {"field": "value"}, None, None, None, ValueError, None), + ({"field": "value"}, None, None, None, {"field": "value"}, None, ValueError, None), + ({"field": "value"}, None, {"field2": "value"}, None, {"field": "value"}, None, ValueError, None), + # merging json params from the three sources + (None, {"field": "value"}, None, None, None, None, None, '{"field": "value"}'), + (None, {"field": "value"}, None, {"field2": "value"}, None, None, None, '{"field": "value", "field2": "value"}'), + ( + None, + {"field": "value"}, + None, + {"field2": "value"}, + None, + {"authfield": "val"}, + None, + '{"field": "value", "field2": "value", "authfield": "val"}', + ), + (None, {"field": "value"}, None, {"field": "value"}, None, None, ValueError, None), + (None, {"field": "value"}, None, None, None, {"field": "value"}, ValueError, None), + # raise on mixed data and json params + ({"field": "value"}, {"field": "value"}, None, None, None, None, RequestBodyException, None), + ({"field": "value"}, None, None, {"field": "value"}, None, None, RequestBodyException, None), + (None, None, {"field": "value"}, {"field": "value"}, None, None, RequestBodyException, None), + (None, None, None, None, {"field": "value"}, {"field": "value"}, RequestBodyException, None), + ({"field": "value"}, None, None, None, None, {"field": "value"}, RequestBodyException, None), + ], +) +def test_send_request_data_json( + provider_data, provider_json, param_data, param_json, authenticator_data, authenticator_json, expected_exception, expected_body +): + options_provider = MagicMock() + options_provider.get_request_body_data.return_value = provider_data + options_provider.get_request_body_json.return_value = provider_json + authenticator = MagicMock() + authenticator.get_request_body_data.return_value = authenticator_data + authenticator.get_request_body_json.return_value = authenticator_json + requester = create_requester(authenticator=authenticator) + requester._request_options_provider = options_provider + if expected_exception is not None: + with pytest.raises(expected_exception): + requester.send_request(request_body_data=param_data, request_body_json=param_json) + else: + requester.send_request(request_body_data=param_data, request_body_json=param_json) + sent_request: PreparedRequest = requester._http_client._session.send.call_args_list[0][0][0] + if expected_body is not None: + # parenthesize the conditional so the assert compares the body rather than asserting a truthy string + assert sent_request.body == (expected_body.decode("UTF-8") if not isinstance(expected_body, str) else expected_body) + + +@pytest.mark.parametrize( + "provider_data, param_data, authenticator_data, expected_exception, expected_body", + [ + # assert body string from one source works + ("field=value", None, None, None, "field=value"), + (None, "field=value", None, None, "field=value"), + (None, None, "field=value", None, "field=value"), + # assert body string from multiple sources fails + ("field=value", "field=value", None, 
ValueError, None), + ("field=value", None, "field=value", ValueError, None), + (None, "field=value", "field=value", ValueError, None), + ("field=value", "field=value", "field=value", ValueError, None), + # assert body string and mapping from different source fails + ("field=value", {"abc": "def"}, None, ValueError, None), + ({"abc": "def"}, "field=value", None, ValueError, None), + ("field=value", None, {"abc": "def"}, ValueError, None), + ], +) +def test_send_request_string_data(provider_data, param_data, authenticator_data, expected_exception, expected_body): + options_provider = MagicMock() + options_provider.get_request_body_data.return_value = provider_data + authenticator = MagicMock() + authenticator.get_request_body_data.return_value = authenticator_data + requester = create_requester(authenticator=authenticator) + requester._request_options_provider = options_provider + if expected_exception is not None: + with pytest.raises(expected_exception): + requester.send_request(request_body_data=param_data) + else: + requester.send_request(request_body_data=param_data) + sent_request: PreparedRequest = requester._http_client._session.send.call_args_list[0][0][0] + if expected_body is not None: + assert sent_request.body == expected_body + + +@pytest.mark.parametrize( + "provider_headers, param_headers, authenticator_headers, expected_exception, expected_headers", + [ + # merging headers from the three sources + ({"header": "value"}, None, None, None, {"header": "value"}), + ({"header": "value"}, {"header2": "value"}, None, None, {"header": "value", "header2": "value"}), + ( + {"header": "value"}, + {"header2": "value"}, + {"authheader": "val"}, + None, + {"header": "value", "header2": "value", "authheader": "val"}, + ), + # raise on conflicting headers + ({"header": "value"}, {"header": "value"}, None, ValueError, None), + ({"header": "value"}, None, {"header": "value"}, ValueError, None), + ({"header": "value"}, {"header2": "value"}, {"header": "value"}, ValueError, None), + ], +) +def test_send_request_headers(provider_headers, param_headers, authenticator_headers, expected_exception, expected_headers): + # headers set by the requests framework, do not validate + default_headers = {"User-Agent": mock.ANY, "Accept-Encoding": mock.ANY, "Accept": mock.ANY, "Connection": mock.ANY} + options_provider = MagicMock() + options_provider.get_request_headers.return_value = provider_headers + authenticator = MagicMock() + authenticator.get_auth_header.return_value = authenticator_headers or {} + requester = create_requester(authenticator=authenticator) + requester._request_options_provider = options_provider + if expected_exception is not None: + with pytest.raises(expected_exception): + requester.send_request(request_headers=param_headers) + else: + requester.send_request(request_headers=param_headers) + sent_request: PreparedRequest = requester._http_client._session.send.call_args_list[0][0][0] + assert sent_request.headers == {**default_headers, **expected_headers} + + +@pytest.mark.parametrize( + "provider_params, param_params, authenticator_params, expected_exception, expected_params", + [ + # merging params from the three sources + ({"param": "value"}, None, None, None, {"param": "value"}), + ({"param": "value"}, {"param2": "value"}, None, None, {"param": "value", "param2": "value"}), + ({"param": "value"}, {"param2": "value"}, {"authparam": "val"}, None, {"param": "value", "param2": "value", "authparam": "val"}), + # raise on conflicting params + ({"param": "value"}, {"param": "value"}, 
None, ValueError, None), + ({"param": "value"}, None, {"param": "value"}, ValueError, None), + ({"param": "value"}, {"param2": "value"}, {"param": "value"}, ValueError, None), + ], +) +def test_send_request_params(provider_params, param_params, authenticator_params, expected_exception, expected_params): + options_provider = MagicMock() + options_provider.get_request_params.return_value = provider_params + authenticator = MagicMock() + authenticator.get_request_params.return_value = authenticator_params + requester = create_requester(authenticator=authenticator) + requester._request_options_provider = options_provider + if expected_exception is not None: + with pytest.raises(expected_exception): + requester.send_request(request_params=param_params) + else: + requester.send_request(request_params=param_params) + sent_request: PreparedRequest = requester._http_client._session.send.call_args_list[0][0][0] + parsed_url = urlparse(sent_request.url) + query_params = {key: value[0] for key, value in parse_qs(parsed_url.query).items()} + assert query_params == expected_params + + +@pytest.mark.parametrize( + "request_parameters, config, expected_query_params", + [ + pytest.param( + {"k": '{"updatedDateFrom": "2023-08-20T00:00:00Z", "updatedDateTo": "2023-08-20T23:59:59Z"}'}, + {}, + "k=%7B%22updatedDateFrom%22%3A+%222023-08-20T00%3A00%3A00Z%22%2C+%22updatedDateTo%22%3A+%222023-08-20T23%3A59%3A59Z%22%7D", + id="test-request-parameter-dictionary", + ), + pytest.param( + {"k": "1,2"}, + {}, + "k=1%2C2", # k=1,2 + id="test-request-parameter-comma-separated-numbers", + ), + pytest.param( + {"k": "a,b"}, + {}, + "k=a%2Cb", # k=a,b + id="test-request-parameter-comma-separated-strings", + ), + pytest.param( + {"k": '{{ config["k"] }}'}, + {"k": {"updatedDateFrom": "2023-08-20T00:00:00Z", "updatedDateTo": "2023-08-20T23:59:59Z"}}, + # {'updatedDateFrom': '2023-08-20T00:00:00Z', 'updatedDateTo': '2023-08-20T23:59:59Z'} + "k=%7B%27updatedDateFrom%27%3A+%272023-08-20T00%3A00%3A00Z%27%2C+%27updatedDateTo%27%3A+%272023-08-20T23%3A59%3A59Z%27%7D", + id="test-request-parameter-from-config-object", + ), + pytest.param( + {"k": "[1,2]"}, + {}, + "k=1&k=2", + id="test-request-parameter-list-of-numbers", + ), + pytest.param( + {"k": '["a", "b"]'}, + {}, + "k=a&k=b", + id="test-request-parameter-list-of-strings", + ), + pytest.param( + {"k": '{{ config["k"] }}'}, + {"k": [1, 2]}, + "k=1&k=2", + id="test-request-parameter-from-config-list-of-numbers", + ), + pytest.param( + {"k": '{{ config["k"] }}'}, + {"k": ["a", "b"]}, + "k=a&k=b", + id="test-request-parameter-from-config-list-of-strings", + ), + pytest.param( + {"k": '{{ config["k"] }}'}, + {"k": ["a,b"]}, + "k=a%2Cb", + id="test-request-parameter-from-config-comma-separated-strings", + ), + pytest.param( + {'["a", "b"]': '{{ config["k"] }}'}, + {"k": [1, 2]}, + "%5B%22a%22%2C+%22b%22%5D=1&%5B%22a%22%2C+%22b%22%5D=2", + id="test-key-with-list-to-be-interpolated", + ), + ], +) +def test_request_param_interpolation(request_parameters, config, expected_query_params): + options_provider = InterpolatedRequestOptionsProvider( + config=config, + request_parameters=request_parameters, + request_body_data={}, + request_headers={}, + parameters={}, + ) + requester = create_requester(error_handler=DefaultErrorHandler(parameters={}, config={})) + requester._request_options_provider = options_provider + requester.send_request() + sent_request: PreparedRequest = requester._http_client._session.send.call_args_list[0][0][0] + assert sent_request.url.split("?", 1)[-1] == 
expected_query_params + + +@pytest.mark.parametrize( + "request_parameters, config, invalid_value_for_key", + [ + pytest.param( + {"k": {"updatedDateFrom": "2023-08-20T00:00:00Z", "updatedDateTo": "2023-08-20T23:59:59Z"}}, + {}, + "k", + id="test-request-parameter-object-of-the-updated-info", + ), + pytest.param( + {"a": '{{ config["k"] }}', "b": {"end_timestamp": 1699109113}}, + {"k": 1699108113}, + "b", + id="test-key-with-multiple-keys", + ), + ], +) +def test_request_param_interpolation_with_incorrect_values(request_parameters, config, invalid_value_for_key): + options_provider = InterpolatedRequestOptionsProvider( + config=config, + request_parameters=request_parameters, + request_body_data={}, + request_headers={}, + parameters={}, + ) + requester = create_requester() + requester._request_options_provider = options_provider + with pytest.raises(ValueError) as error: + requester.send_request() + + assert ( + error.value.args[0] == f"Invalid value for `{invalid_value_for_key}` parameter. The values of request params cannot be an object." + ) + + +@pytest.mark.parametrize( + "request_body_data, config, expected_request_body_data", + [ + pytest.param( + {"k": '{"updatedDateFrom": "2023-08-20T00:00:00Z", "updatedDateTo": "2023-08-20T23:59:59Z"}'}, + {}, + # k={"updatedDateFrom": "2023-08-20T00:00:00Z", "updatedDateTo": "2023-08-20T23:59:59Z"} + "k=%7B%22updatedDateFrom%22%3A+%222023-08-20T00%3A00%3A00Z%22%2C+%22updatedDateTo%22%3A+%222023-08-20T23%3A59%3A59Z%22%7D", + id="test-request-body-dictionary", + ), + pytest.param( + {"k": "1,2"}, + {}, + "k=1%2C2", # k=1,2 + id="test-request-body-comma-separated-numbers", + ), + pytest.param( + {"k": "a,b"}, + {}, + "k=a%2Cb", # k=a,b + id="test-request-body-comma-separated-strings", + ), + pytest.param( + {"k": "[1,2]"}, + {}, + "k=1&k=2", + id="test-request-body-list-of-numbers", + ), + pytest.param( + {"k": '["a", "b"]'}, + {}, + "k=a&k=b", + id="test-request-body-list-of-strings", + ), + pytest.param( + {"k": '{{ config["k"] }}'}, + {"k": {"updatedDateFrom": "2023-08-20T00:00:00Z", "updatedDateTo": "2023-08-20T23:59:59Z"}}, + # k={'updatedDateFrom': '2023-08-20T00:00:00Z', 'updatedDateTo': '2023-08-20T23:59:59Z'} + "k=%7B%27updatedDateFrom%27%3A+%272023-08-20T00%3A00%3A00Z%27%2C+%27updatedDateTo%27%3A+%272023-08-20T23%3A59%3A59Z%27%7D", + id="test-request-body-from-config-object", + ), + pytest.param( + {"k": '{{ config["k"] }}'}, + {"k": [1, 2]}, + "k=1&k=2", + id="test-request-body-from-config-list-of-numbers", + ), + pytest.param( + {"k": '{{ config["k"] }}'}, + {"k": ["a", "b"]}, + "k=a&k=b", + id="test-request-body-from-config-list-of-strings", + ), + pytest.param( + {"k": '{{ config["k"] }}'}, + {"k": ["a,b"]}, + "k=a%2Cb", # k=a,b + id="test-request-body-from-config-comma-separated-strings", + ), + pytest.param( + {'["a", "b"]': '{{ config["k"] }}'}, + {"k": [1, 2]}, + "%5B%22a%22%2C+%22b%22%5D=1&%5B%22a%22%2C+%22b%22%5D=2", # ["a", "b"]=1&["a", "b"]=2 + id="test-key-with-list-is-not-interpolated", + ), + pytest.param( + {"k": "{'updatedDateFrom': '2023-08-20T00:00:00Z', 'updatedDateTo': '2023-08-20T23:59:59Z'}"}, + {}, + # k={'updatedDateFrom': '2023-08-20T00:00:00Z', 'updatedDateTo': '2023-08-20T23:59:59Z'} + "k=%7B%27updatedDateFrom%27%3A+%272023-08-20T00%3A00%3A00Z%27%2C+%27updatedDateTo%27%3A+%272023-08-20T23%3A59%3A59Z%27%7D", + id="test-single-quotes-are-retained", + ), + ], +) +def test_request_body_interpolation(request_body_data, config, expected_request_body_data): + options_provider = InterpolatedRequestOptionsProvider( + 
config=config, + request_parameters={}, + request_body_data=request_body_data, + request_headers={}, + parameters={}, + ) + requester = create_requester(error_handler=DefaultErrorHandler(parameters={}, config={})) + requester._request_options_provider = options_provider + requester.send_request() + sent_request: PreparedRequest = requester._http_client._session.send.call_args_list[0][0][0] + assert sent_request.body == expected_request_body_data + + +@pytest.mark.parametrize( + "requester_path, param_path, expected_path", + [ + ("deals", None, "/deals"), + ("deals", "deals2", "/deals2"), + ("deals", "/deals2", "/deals2"), + ( + "deals/{{ stream_slice.start }}/{{ next_page_token.next_page_token }}/{{ config.config_key }}/{{ parameters.param_key }}", + None, + "/deals/2012/pagetoken/config_value/param_value", + ), + ], +) +def test_send_request_path(requester_path, param_path, expected_path): + requester = create_requester(config={"config_key": "config_value"}, path=requester_path, parameters={"param_key": "param_value"}) + requester.send_request(stream_slice={"start": "2012"}, next_page_token={"next_page_token": "pagetoken"}, path=param_path) + sent_request: PreparedRequest = requester._http_client._session.send.call_args_list[0][0][0] + parsed_url = urlparse(sent_request.url) + assert parsed_url.path == expected_path + + +def test_send_request_url_base(): + requester = create_requester( + url_base="https://example.org/{{ config.config_key }}/{{ parameters.param_key }}", + config={"config_key": "config_value"}, + parameters={"param_key": "param_value"}, + error_handler=DefaultErrorHandler(parameters={}, config={}), + ) + requester.send_request() + sent_request: PreparedRequest = requester._http_client._session.send.call_args_list[0][0][0] + assert sent_request.url == "https://example.org/config_value/param_value/deals" + + +def test_send_request_stream_slice_next_page_token(): + options_provider = MagicMock() + requester = create_requester(error_handler=DefaultErrorHandler(parameters={}, config={})) + requester._request_options_provider = options_provider + stream_slice = {"id": "1234"} + next_page_token = {"next_page_token": "next_page_token"} + requester.send_request(stream_slice=stream_slice, next_page_token=next_page_token) + options_provider.get_request_params.assert_called_once_with( + stream_state=None, stream_slice=stream_slice, next_page_token=next_page_token + ) + options_provider.get_request_body_data.assert_called_once_with( + stream_state=None, stream_slice=stream_slice, next_page_token=next_page_token + ) + options_provider.get_request_body_json.assert_called_once_with( + stream_state=None, stream_slice=stream_slice, next_page_token=next_page_token + ) + options_provider.get_request_headers.assert_called_once_with( + stream_state=None, stream_slice=stream_slice, next_page_token=next_page_token + ) + + +@pytest.mark.parametrize( + "test_name, base_url, path, expected_full_url", + [ + ("test_no_slashes", "https://airbyte.io", "my_endpoint", "https://airbyte.io/my_endpoint"), + ("test_trailing_slash_on_base_url", "https://airbyte.io/", "my_endpoint", "https://airbyte.io/my_endpoint"), + ( + "test_trailing_slash_on_base_url_and_leading_slash_on_path", + "https://airbyte.io/", + "/my_endpoint", + "https://airbyte.io/my_endpoint", + ), + ("test_leading_slash_on_path", "https://airbyte.io", "/my_endpoint", "https://airbyte.io/my_endpoint"), + ("test_trailing_slash_on_path", "https://airbyte.io", "/my_endpoint/", "https://airbyte.io/my_endpoint/"), + 
("test_nested_path_no_leading_slash", "https://airbyte.io", "v1/my_endpoint", "https://airbyte.io/v1/my_endpoint"), + ("test_nested_path_with_leading_slash", "https://airbyte.io", "/v1/my_endpoint", "https://airbyte.io/v1/my_endpoint"), + ], +) +def test_join_url(test_name, base_url, path, expected_full_url): + requester = HttpRequester( + name="name", + url_base=base_url, + path=path, + http_method=HttpMethod.GET, + request_options_provider=None, + config={}, + parameters={}, + error_handler=DefaultErrorHandler(parameters={}, config={}), + ) + requester._http_client._session.send = MagicMock() + response = requests.Response() + response.status_code = 200 + requester._http_client._session.send.return_value = response + requester.send_request() + sent_request: PreparedRequest = requester._http_client._session.send.call_args_list[0][0][0] + assert sent_request.url == expected_full_url + + +@pytest.mark.usefixtures("mock_sleep") +def test_request_attempt_count_is_tracked_across_retries(http_requester_factory): + request_mock = MagicMock(spec=requests.PreparedRequest) + request_mock.headers = {} + request_mock.url = "https://example.com/deals" + request_mock.method = "GET" + request_mock.body = {} + backoff_strategy = ConstantBackoffStrategy(parameters={}, config={}, backoff_time_in_seconds=0.1) + error_handler = DefaultErrorHandler(parameters={}, config={}, max_retries=1, backoff_strategies=[backoff_strategy]) + http_requester = http_requester_factory(error_handler=error_handler) + http_requester._http_client._session.send = MagicMock() + response = requests.Response() + response.status_code = 500 + http_requester._http_client._session.send.return_value = response + + with pytest.raises(UserDefinedBackoffException): + http_requester._http_client._send_with_retry(request=request_mock, request_kwargs={}) + + assert http_requester._http_client._request_attempt_count.get(request_mock) == http_requester._http_client._max_retries + 1 + + +@pytest.mark.usefixtures("mock_sleep") +def test_request_attempt_count_with_exponential_backoff_strategy(http_requester_factory): + request_mock = MagicMock(spec=requests.PreparedRequest) + request_mock.headers = {} + request_mock.url = "https://example.com/deals" + request_mock.method = "GET" + request_mock.body = {} + backoff_strategy = ExponentialBackoffStrategy(parameters={}, config={}, factor=0.01) + error_handler = DefaultErrorHandler(parameters={}, config={}, max_retries=2, backoff_strategies=[backoff_strategy]) + http_requester = http_requester_factory(error_handler=error_handler) + http_requester._http_client._session.send = MagicMock() + response = requests.Response() + response.status_code = 500 + http_requester._http_client._session.send.return_value = response + + with pytest.raises(UserDefinedBackoffException): + http_requester._http_client._send_with_retry(request=request_mock, request_kwargs={}) + + assert http_requester._http_client._request_attempt_count.get(request_mock) == http_requester._http_client._max_retries + 1 diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py new file mode 100644 index 000000000000..3f80b7eefd31 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py @@ -0,0 +1,32 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import pytest as pytest +from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping +from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_input_provider import InterpolatedRequestInputProvider + + +@pytest.mark.parametrize( + "test_name, input_request_data, expected_request_data", + [ + ("test_static_map_data", {"a_static_request_param": "a_static_value"}, {"a_static_request_param": "a_static_value"}), + ("test_map_depends_on_stream_slice", {"read_from_slice": "{{ stream_slice['slice_key'] }}"}, {"read_from_slice": "slice_value"}), + ("test_map_depends_on_config", {"read_from_config": "{{ config['config_key'] }}"}, {"read_from_config": "value_of_config"}), + ( + "test_map_depends_on_parameters", + {"read_from_parameters": "{{ parameters['read_from_parameters'] }}"}, + {"read_from_parameters": "value_of_parameters"}, + ), + ("test_defaults_to_empty_dictionary", None, {}), + ], +) +def test_initialize_interpolated_mapping_request_input_provider(test_name, input_request_data, expected_request_data): + config = {"config_key": "value_of_config"} + stream_slice = {"slice_key": "slice_value"} + parameters = {"read_from_parameters": "value_of_parameters"} + provider = InterpolatedRequestInputProvider(request_inputs=input_request_data, config=config, parameters=parameters) + actual_request_data = provider.eval_request_inputs(stream_state={}, stream_slice=stream_slice) + + assert isinstance(provider._interpolator, InterpolatedMapping) + assert actual_request_data == expected_request_data diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/retrievers/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/retrievers/__init__.py new file mode 100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/retrievers/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/retrievers/test_simple_retriever.py b/airbyte-cdk/python/unit_tests/sources/declarative/retrievers/test_simple_retriever.py new file mode 100644 index 000000000000..2fd0594bf1b0 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/retrievers/test_simple_retriever.py @@ -0,0 +1,762 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import json +from unittest.mock import MagicMock, Mock, patch + +import pytest +import requests +from airbyte_cdk import YamlDeclarativeSource +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, SyncMode, Type +from airbyte_cdk.sources.declarative.auth.declarative_authenticator import NoAuth +from airbyte_cdk.sources.declarative.incremental import DatetimeBasedCursor, DeclarativeCursor, ResumableFullRefreshCursor +from airbyte_cdk.sources.declarative.models import DeclarativeStream as DeclarativeStreamModel +from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ModelToComponentFactory +from airbyte_cdk.sources.declarative.partition_routers import SinglePartitionRouter +from airbyte_cdk.sources.declarative.requesters.paginators import DefaultPaginator +from airbyte_cdk.sources.declarative.requesters.paginators.strategies import PageIncrement +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType +from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod +from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever, SimpleRetrieverTestReadDecorator +from airbyte_cdk.sources.types import Record, StreamSlice + +A_SLICE_STATE = {"slice_state": "slice state value"} +A_STREAM_SLICE = StreamSlice(cursor_slice={"stream slice": "slice value"}, partition={}) +A_STREAM_STATE = {"stream state": "state value"} + +primary_key = "pk" +records = [{"id": 1}, {"id": 2}] +request_response_logs = [ + AirbyteLogMessage(level=Level.INFO, message="request:{}"), + AirbyteLogMessage(level=Level.INFO, message="response{}"), +] +config = {} + + +@patch.object(SimpleRetriever, "_read_pages", return_value=iter([])) +def test_simple_retriever_full(mock_http_stream): + requester = MagicMock() + request_params = {"param": "value"} + requester.get_request_params.return_value = request_params + + paginator = MagicMock() + next_page_token = {"cursor": "cursor_value"} + paginator.path.return_value = None + paginator.next_page_token.return_value = next_page_token + paginator.get_request_headers.return_value = {} + + record_selector = MagicMock() + record_selector.select_records.return_value = records + + cursor = MagicMock(spec=DeclarativeCursor) + stream_slices = [{"date": "2022-01-01"}, {"date": "2022-01-02"}] + cursor.stream_slices.return_value = stream_slices + + response = requests.Response() + response.status_code = 200 + + underlying_state = {"date": "2021-01-01"} + cursor.get_stream_state.return_value = underlying_state + + requester.get_authenticator.return_value = NoAuth({}) + url_base = "https://airbyte.io" + requester.get_url_base.return_value = url_base + path = "/v1" + requester.get_path.return_value = path + http_method = HttpMethod.GET + requester.get_method.return_value = http_method + should_retry = True + requester.interpret_response_status.return_value = should_retry + request_body_json = {"body": "json"} + requester.request_body_json.return_value = request_body_json + + request_body_data = {"body": "data"} + requester.get_request_body_data.return_value = request_body_data + request_body_json = {"body": "json"} + requester.get_request_body_json.return_value = request_body_json + request_kwargs = {"kwarg": "value"} + requester.request_kwargs.return_value = request_kwargs + + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=requester, + paginator=paginator, + record_selector=record_selector, + stream_slicer=cursor, + cursor=cursor, + 
parameters={}, + config={}, + ) + + assert retriever.primary_key == primary_key + assert retriever.state == underlying_state + assert retriever._next_page_token(response) == next_page_token + assert retriever._request_params(None, None, None) == {} + assert retriever.stream_slices() == stream_slices + + assert retriever._last_response is None + assert retriever._last_record is None + assert list(retriever._parse_response(response, stream_state={}, records_schema={})) == records + assert retriever._last_response == response + assert retriever._last_page_size == 2 + + [r for r in retriever.read_records(SyncMode.full_refresh)] + paginator.reset.assert_called() + + +@patch.object(SimpleRetriever, "_read_pages", return_value=iter([*request_response_logs, *records])) +def test_simple_retriever_with_request_response_logs(mock_http_stream): + requester = MagicMock() + paginator = MagicMock() + record_selector = MagicMock() + stream_slicer = DatetimeBasedCursor( + start_datetime="", + end_datetime="", + step="P1D", + cursor_field="id", + datetime_format="", + cursor_granularity="P1D", + config={}, + parameters={}, + ) + + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=requester, + paginator=paginator, + record_selector=record_selector, + stream_slicer=stream_slicer, + parameters={}, + config={}, + ) + + actual_messages = [r for r in retriever.read_records(SyncMode.full_refresh)] + paginator.reset.assert_called() + + assert isinstance(actual_messages[0], AirbyteLogMessage) + assert isinstance(actual_messages[1], AirbyteLogMessage) + assert actual_messages[2] == records[0] + assert actual_messages[3] == records[1] + + +@pytest.mark.parametrize( + "initial_state, expected_reset_value, expected_next_page", + [ + pytest.param(None, None, 1, id="test_initial_sync_no_state"), + pytest.param({"next_page_token": 10}, 10, 11, id="test_reset_with_next_page_token"), + ], +) +def test_simple_retriever_resumable_full_refresh_cursor_page_increment(initial_state, expected_reset_value, expected_next_page): + expected_records = [ + Record(data={"id": "abc"}, associated_slice=None), + Record(data={"id": "def"}, associated_slice=None), + Record(data={"id": "ghi"}, associated_slice=None), + Record(data={"id": "jkl"}, associated_slice=None), + Record(data={"id": "mno"}, associated_slice=None), + Record(data={"id": "123"}, associated_slice=None), + Record(data={"id": "456"}, associated_slice=None), + Record(data={"id": "789"}, associated_slice=None), + ] + + response = requests.Response() + response.status_code = 200 + response._content = json.dumps({"data": [record.data for record in expected_records[:5]]}).encode("utf-8") + + requester = MagicMock() + requester.send_request.side_effect = [ + response, + response, + ] + + record_selector = MagicMock() + record_selector.select_records.side_effect = [ + [ + expected_records[0], + expected_records[1], + expected_records[2], + expected_records[3], + expected_records[4], + ], + [ + expected_records[5], + expected_records[6], + expected_records[7], + ], + ] + + page_increment_strategy = PageIncrement(config={}, page_size=5, parameters={}) + paginator = DefaultPaginator(config={}, pagination_strategy=page_increment_strategy, url_base="https://airbyte.io", parameters={}) + paginator.reset = Mock(wraps=paginator.reset) + + stream_slicer = ResumableFullRefreshCursor(parameters={}) + if initial_state: + stream_slicer.set_initial_state(initial_state) + + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + 
requester=requester, + paginator=paginator, + record_selector=record_selector, + stream_slicer=stream_slicer, + cursor=stream_slicer, + parameters={}, + config={}, + ) + + stream_slice = list(stream_slicer.stream_slices())[0] + actual_records = [r for r in retriever.read_records(records_schema={}, stream_slice=stream_slice)] + + assert len(actual_records) == 5 + assert actual_records == expected_records[:5] + assert retriever.state == {"next_page_token": expected_next_page} + + actual_records = [r for r in retriever.read_records(records_schema={}, stream_slice=stream_slice)] + assert len(actual_records) == 3 + assert actual_records == expected_records[5:] + assert retriever.state == {"__ab_full_refresh_sync_complete": True} + + paginator.reset.assert_called_once_with(reset_value=expected_reset_value) + + +@pytest.mark.parametrize( + "initial_state, expected_reset_value, expected_next_page", + [ + pytest.param(None, None, 1, id="test_initial_sync_no_state"), + pytest.param( + {"next_page_token": "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=tracy_stevens"}, + "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=tracy_stevens", + "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=gordo_stevens", + id="test_reset_with_next_page_token", + ), + ], +) +def test_simple_retriever_resumable_full_refresh_cursor_reset_cursor_pagination( + initial_state, expected_reset_value, expected_next_page, requests_mock +): + expected_records = [ + Record(data={"name": "ed_baldwin"}, associated_slice=None), + Record(data={"name": "danielle_poole"}, associated_slice=None), + Record(data={"name": "tracy_stevens"}, associated_slice=None), + Record(data={"name": "deke_slayton"}, associated_slice=None), + Record(data={"name": "molly_cobb"}, associated_slice=None), + Record(data={"name": "gordo_stevens"}, associated_slice=None), + Record(data={"name": "margo_madison"}, associated_slice=None), + Record(data={"name": "ellen_waverly"}, associated_slice=None), + ] + + content = """ +name: users +type: DeclarativeStream +retriever: + type: SimpleRetriever + decoder: + type: JsonDecoder + paginator: + type: "DefaultPaginator" + page_token_option: + type: RequestPath + pagination_strategy: + type: "CursorPagination" + cursor_value: "{{ response.next_page }}" + requester: + path: /astronauts + type: HttpRequester + url_base: "https://for-all-mankind.nasa.com/api/v1" + http_method: GET + authenticator: + type: ApiKeyAuthenticator + api_token: "{{ config['api_key'] }}" + inject_into: + type: RequestOption + field_name: Api-Key + inject_into: header + request_headers: {} + request_body_json: {} + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: ["data"] + partition_router: [] +primary_key: [] + """ + + factory = ModelToComponentFactory() + stream_manifest = YamlDeclarativeSource._parse(content) + stream = factory.create_component(model_type=DeclarativeStreamModel, component_definition=stream_manifest, config={}) + response_body = { + "data": [r.data for r in expected_records[:5]], + "next_page": "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=gordo_stevens", + } + requests_mock.get("https://for-all-mankind.nasa.com/api/v1/astronauts", json=response_body) + requests_mock.get("https://for-all-mankind.nasa.com/astronauts?next_page=tracy_stevens", json=response_body) + response_body_2 = { + "data": [r.data for r in expected_records[5:]], + } + requests_mock.get("https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=gordo_stevens", 
json=response_body_2) + stream.retriever.paginator.reset = Mock(wraps=stream.retriever.paginator.reset) + stream_slicer = ResumableFullRefreshCursor(parameters={}) + if initial_state: + stream_slicer.set_initial_state(initial_state) + stream.retriever.stream_slices = stream_slicer + stream.retriever.cursor = stream_slicer + stream_slice = list(stream_slicer.stream_slices())[0] + actual_records = [r for r in stream.retriever.read_records(records_schema={}, stream_slice=stream_slice)] + + assert len(actual_records) == 5 + assert actual_records == expected_records[:5] + assert stream.retriever.state == {"next_page_token": "https://for-all-mankind.nasa.com/api/v1/astronauts?next_page=gordo_stevens"} + requests_mock.get("https://for-all-mankind.nasa.com/astronauts?next_page=tracy_stevens", json=response_body) + requests_mock.get("https://for-all-mankind.nasa.com/astronauts?next_page=gordo_stevens", json=response_body_2) + actual_records = [r for r in stream.retriever.read_records(records_schema={}, stream_slice=stream_slice)] + assert len(actual_records) == 3 + assert actual_records == expected_records[5:] + assert stream.retriever.state == {"__ab_full_refresh_sync_complete": True} + + stream.retriever.paginator.reset.assert_called_once_with(reset_value=expected_reset_value) + + +def test_simple_retriever_resumable_full_refresh_cursor_reset_skip_completed_stream(): + expected_records = [ + Record(data={"id": "abc"}, associated_slice=None), + Record(data={"id": "def"}, associated_slice=None), + ] + + response = requests.Response() + response.status_code = 200 + response._content = json.dumps({}).encode("utf-8") + + requester = MagicMock() + requester.send_request.side_effect = [ + response, + ] + + record_selector = MagicMock() + record_selector.select_records.return_value = [ + expected_records[0], + expected_records[1], + ] + + page_increment_strategy = PageIncrement(config={}, page_size=5, parameters={}) + paginator = DefaultPaginator(config={}, pagination_strategy=page_increment_strategy, url_base="https://airbyte.io", parameters={}) + paginator.reset = Mock(wraps=paginator.reset) + + stream_slicer = ResumableFullRefreshCursor(parameters={}) + stream_slicer.set_initial_state({"__ab_full_refresh_sync_complete": True}) + + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=requester, + paginator=paginator, + record_selector=record_selector, + stream_slicer=stream_slicer, + cursor=stream_slicer, + parameters={}, + config={}, + ) + + stream_slice = list(stream_slicer.stream_slices())[0] + actual_records = [r for r in retriever.read_records(records_schema={}, stream_slice=stream_slice)] + + assert len(actual_records) == 0 + assert retriever.state == {"__ab_full_refresh_sync_complete": True} + + paginator.reset.assert_not_called() + + +@pytest.mark.parametrize( + "test_name, paginator_mapping, request_options_provider_mapping, expected_mapping", + [ + ("test_empty_headers", {}, {}, {}), + ("test_header_from_pagination_and_slicer", {"offset": 1000}, {"key": "value"}, {"key": "value", "offset": 1000}), + ("test_header_from_stream_slicer", {}, {"slice": "slice_value"}, {"slice": "slice_value"}), + ("test_duplicate_header_slicer_paginator", {"k": "v"}, {"k": "slice_value"}, None), + ], +) +def test_get_request_options_from_pagination(test_name, paginator_mapping, request_options_provider_mapping, expected_mapping): + # This test does not test request headers because they must be strings + paginator = MagicMock() + paginator.get_request_params.return_value = 
paginator_mapping + paginator.get_request_body_data.return_value = paginator_mapping + paginator.get_request_body_json.return_value = paginator_mapping + + request_options_provider = MagicMock() + request_options_provider.get_request_params.return_value = request_options_provider_mapping + request_options_provider.get_request_body_data.return_value = request_options_provider_mapping + request_options_provider.get_request_body_json.return_value = request_options_provider_mapping + + record_selector = MagicMock() + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=MagicMock(), + record_selector=record_selector, + paginator=paginator, + request_option_provider=request_options_provider, + parameters={}, + config={}, + ) + + request_option_type_to_method = { + RequestOptionType.request_parameter: retriever._request_params, + RequestOptionType.body_data: retriever._request_body_data, + RequestOptionType.body_json: retriever._request_body_json, + } + + for _, method in request_option_type_to_method.items(): + if expected_mapping is not None: + actual_mapping = method(None, None, None) + assert actual_mapping == expected_mapping + else: + try: + method(None, None, None) + assert False + except ValueError: + pass + + +@pytest.mark.parametrize( + "test_name, paginator_mapping, expected_mapping", + [ + ("test_only_base_headers", {}, {"key": "value"}), + ("test_header_from_pagination", {"offset": 1000}, {"key": "value", "offset": "1000"}), + ("test_duplicate_header", {"key": 1000}, None), + ], +) +def test_get_request_headers(test_name, paginator_mapping, expected_mapping): + # This test is separate from the other request options because request headers must be strings + paginator = MagicMock() + paginator.get_request_headers.return_value = paginator_mapping + requester = MagicMock(use_cache=False) + + stream_slicer = MagicMock() + stream_slicer.get_request_headers.return_value = {"key": "value"} + + record_selector = MagicMock() + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=requester, + record_selector=record_selector, + stream_slicer=stream_slicer, + paginator=paginator, + parameters={}, + config={}, + ) + + request_option_type_to_method = { + RequestOptionType.header: retriever._request_headers, + } + + for _, method in request_option_type_to_method.items(): + if expected_mapping: + actual_mapping = method(None, None, None) + assert actual_mapping == expected_mapping + else: + try: + method(None, None, None) + assert False + except ValueError: + pass + + +@pytest.mark.parametrize( + "test_name, paginator_mapping, ignore_stream_slicer_parameters_on_paginated_requests, next_page_token, expected_mapping", + [ + ( + "test_do_not_ignore_stream_slicer_params_if_ignore_is_true_but_no_next_page_token", + {"key_from_pagination": "1000"}, + True, + None, + {"key_from_pagination": "1000"}, + ), + ( + "test_do_not_ignore_stream_slicer_params_if_ignore_is_false_and_no_next_page_token", + {"key_from_pagination": "1000"}, + False, + None, + {"key_from_pagination": "1000", "key_from_slicer": "value"}, + ), + ( + "test_ignore_stream_slicer_params_on_paginated_request", + {"key_from_pagination": "1000"}, + True, + {"page": 2}, + {"key_from_pagination": "1000"}, + ), + ( + "test_do_not_ignore_stream_slicer_params_on_paginated_request", + {"key_from_pagination": "1000"}, + False, + {"page": 2}, + {"key_from_pagination": "1000", "key_from_slicer": "value"}, + ), + ], +) +def test_ignore_stream_slicer_parameters_on_paginated_requests( 
+ test_name, paginator_mapping, ignore_stream_slicer_parameters_on_paginated_requests, next_page_token, expected_mapping +): + # This test is separate from the other request options because request headers must be strings + paginator = MagicMock() + paginator.get_request_headers.return_value = paginator_mapping + requester = MagicMock(use_cache=False) + + stream_slicer = MagicMock() + stream_slicer.get_request_headers.return_value = {"key_from_slicer": "value"} + + record_selector = MagicMock() + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=requester, + record_selector=record_selector, + stream_slicer=stream_slicer, + paginator=paginator, + ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, + parameters={}, + config={}, + ) + + request_option_type_to_method = { + RequestOptionType.header: retriever._request_headers, + } + + for _, method in request_option_type_to_method.items(): + actual_mapping = method(None, None, next_page_token={"next_page_token": "1000"}) + assert actual_mapping == expected_mapping + + +@pytest.mark.parametrize( + "test_name, request_options_provider_body_data, paginator_body_data, expected_body_data", + [ + ("test_only_slicer_mapping", {"key": "value"}, {}, {"key": "value"}), + ("test_only_slicer_string", "key=value", {}, "key=value"), + ("test_slicer_mapping_and_paginator_no_duplicate", {"key": "value"}, {"offset": 1000}, {"key": "value", "offset": 1000}), + ("test_slicer_mapping_and_paginator_with_duplicate", {"key": "value"}, {"key": 1000}, None), + ("test_slicer_string_and_paginator", "key=value", {"offset": 1000}, None), + ], +) +def test_request_body_data(test_name, request_options_provider_body_data, paginator_body_data, expected_body_data): + paginator = MagicMock() + paginator.get_request_body_data.return_value = paginator_body_data + requester = MagicMock(use_cache=False) + + # stream_slicer = MagicMock() + # stream_slicer.get_request_body_data.return_value = request_options_provider_body_data + request_option_provider = MagicMock() + request_option_provider.get_request_body_data.return_value = request_options_provider_body_data + + record_selector = MagicMock() + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=requester, + record_selector=record_selector, + paginator=paginator, + request_option_provider=request_option_provider, + parameters={}, + config={}, + ) + + if expected_body_data: + actual_body_data = retriever._request_body_data(None, None, None) + assert actual_body_data == expected_body_data + else: + try: + retriever._request_body_data(None, None, None) + assert False + except ValueError: + pass + + +@pytest.mark.parametrize( + "test_name, requester_path, paginator_path, expected_path", + [ + ("test_path_from_requester", "/v1/path", None, None), + ("test_path_from_paginator", "/v1/path/", "/v2/paginator", "/v2/paginator"), + ], +) +def test_path(test_name, requester_path, paginator_path, expected_path): + paginator = MagicMock() + paginator.path.return_value = paginator_path + requester = MagicMock(use_cache=False) + + requester.get_path.return_value = requester_path + + record_selector = MagicMock() + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=requester, + record_selector=record_selector, + paginator=paginator, + parameters={}, + config={}, + ) + + actual_path = retriever._paginator_path() + assert actual_path == expected_path + + +def 
test_limit_stream_slices(): + maximum_number_of_slices = 4 + stream_slicer = MagicMock() + stream_slicer.stream_slices.return_value = _generate_slices(maximum_number_of_slices * 2) + retriever = SimpleRetrieverTestReadDecorator( + name="stream_name", + primary_key=primary_key, + requester=MagicMock(), + paginator=MagicMock(), + record_selector=MagicMock(), + stream_slicer=stream_slicer, + maximum_number_of_slices=maximum_number_of_slices, + parameters={}, + config={}, + ) + + truncated_slices = list(retriever.stream_slices()) + + assert truncated_slices == _generate_slices(maximum_number_of_slices) + + +@pytest.mark.parametrize( + "test_name, first_greater_than_second", + [ + ("test_first_greater_than_second", True), + ("test_second_greater_than_first", False), + ], +) +def test_when_read_records_then_cursor_close_slice_with_greater_record(test_name, first_greater_than_second): + first_record = Record({"first": 1}, StreamSlice(cursor_slice={}, partition={})) + second_record = Record({"second": 2}, StreamSlice(cursor_slice={}, partition={})) + records = [first_record, second_record] + record_selector = MagicMock() + record_selector.select_records.return_value = records + cursor = MagicMock(spec=DeclarativeCursor) + cursor.is_greater_than_or_equal.return_value = first_greater_than_second + paginator = MagicMock() + paginator.get_request_headers.return_value = {} + + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=MagicMock(), + paginator=paginator, + record_selector=record_selector, + stream_slicer=cursor, + cursor=cursor, + parameters={}, + config={}, + ) + stream_slice = StreamSlice(cursor_slice={}, partition={"repository": "airbyte"}) + + def retriever_read_pages(_, __, ___): + return retriever._parse_records(response=MagicMock(), stream_state={}, stream_slice=stream_slice, records_schema={}) + + with patch.object( + SimpleRetriever, + "_read_pages", + return_value=iter([first_record, second_record]), + side_effect=retriever_read_pages, + ): + list(retriever.read_records(stream_slice=stream_slice, records_schema={})) + cursor.close_slice.assert_called_once_with(stream_slice, first_record if first_greater_than_second else second_record) + + +def test_given_stream_data_is_not_record_when_read_records_then_update_slice_with_optional_record(): + stream_data = [AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=Level.INFO, message="a log message"))] + record_selector = MagicMock() + record_selector.select_records.return_value = [] + cursor = MagicMock(spec=DeclarativeCursor) + + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=MagicMock(), + paginator=Mock(), + record_selector=record_selector, + stream_slicer=cursor, + cursor=cursor, + parameters={}, + config={}, + ) + stream_slice = StreamSlice(cursor_slice={}, partition={"repository": "airbyte"}) + + def retriever_read_pages(_, __, ___): + return retriever._parse_records(response=MagicMock(), stream_state={}, stream_slice=stream_slice, records_schema={}) + + with patch.object( + SimpleRetriever, + "_read_pages", + return_value=iter(stream_data), + side_effect=retriever_read_pages, + ): + list(retriever.read_records(stream_slice=stream_slice, records_schema={})) + cursor.observe.assert_not_called() + cursor.close_slice.assert_called_once_with(stream_slice, None) + + +def _generate_slices(number_of_slices): + return [{"date": f"2022-01-0{day + 1}"} for day in range(number_of_slices)] + + +@patch.object(SimpleRetriever, "_read_pages", 
return_value=iter([])) +def test_given_state_selector_when_read_records_use_stream_state(http_stream_read_pages, mocker): + requester = MagicMock() + paginator = MagicMock() + record_selector = MagicMock() + cursor = MagicMock(spec=DeclarativeCursor) + cursor.select_state = MagicMock(return_value=A_SLICE_STATE) + cursor.get_stream_state = MagicMock(return_value=A_STREAM_STATE) + + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=requester, + paginator=paginator, + record_selector=record_selector, + stream_slicer=cursor, + cursor=cursor, + parameters={}, + config={}, + ) + + list(retriever.read_records(stream_slice=A_STREAM_SLICE, records_schema={})) + + http_stream_read_pages.assert_called_once_with(mocker.ANY, A_STREAM_STATE, A_STREAM_SLICE) + + +def test_emit_log_request_response_messages(mocker): + record_selector = MagicMock() + record_selector.select_records.return_value = records + + request = requests.PreparedRequest() + request.headers = {"header": "value"} + request.url = "http://byrde.enterprises.com/casinos" + + response = requests.Response() + response.request = request + response.status_code = 200 + + format_http_message_mock = mocker.patch("airbyte_cdk.sources.declarative.retrievers.simple_retriever.format_http_message") + requester = MagicMock() + retriever = SimpleRetrieverTestReadDecorator( + name="stream_name", + primary_key=primary_key, + requester=requester, + paginator=MagicMock(), + record_selector=record_selector, + stream_slicer=SinglePartitionRouter(parameters={}), + parameters={}, + config={}, + ) + + retriever._fetch_next_page(stream_state={}, stream_slice=StreamSlice(cursor_slice={}, partition={})) + + assert requester.send_request.call_args_list[0][1]["log_formatter"] is not None + assert requester.send_request.call_args_list[0][1]["log_formatter"](response) == format_http_message_mock.return_value diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/schema/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/schema/__init__.py new file mode 100644 index 000000000000..655bcbe54fb8 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/schema/__init__.py @@ -0,0 +1,6 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +from .source_test.SourceTest import SourceTest + +__all__ = ["SourceTest"] diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/schema/source_test/SourceTest.py b/airbyte-cdk/python/unit_tests/sources/declarative/schema/source_test/SourceTest.py new file mode 100644 index 000000000000..8d6a26cb7657 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/schema/source_test/SourceTest.py @@ -0,0 +1,8 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +class SourceTest: + def __init__(self): + pass diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/schema/source_test/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/schema/source_test/__init__.py new file mode 100644 index 000000000000..c941b3045795 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/schema/source_test/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
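The SimpleRetriever tests above repeatedly use a try/except-ValueError pattern to assert that conflicting request-option keys (for params, headers, and body) are rejected. A minimal, self-contained sketch of that behavior is below; `merge_request_options` is a hypothetical helper, not the CDK's implementation, and only the observable behavior (ValueError on a duplicate key) mirrors the tests. It also shows `pytest.raises` as a tidier equivalent of the try/except-assert-False idiom used above.

```python
# Illustrative sketch only: duplicate request-option keys are rejected, as the
# retriever tests above assert. merge_request_options is a made-up helper.
from typing import Any, Dict, Mapping

import pytest


def merge_request_options(*mappings: Mapping[str, Any]) -> Mapping[str, Any]:
    """Combine request options from several providers, rejecting conflicting keys."""
    merged: Dict[str, Any] = {}
    for mapping in mappings:
        for key, value in mapping.items():
            if key in merged and merged[key] != value:
                raise ValueError(f"Duplicate request option key: {key}")
            merged[key] = value
    return merged


def test_duplicate_keys_raise_value_error():
    # Equivalent to the try/except-assert-False pattern used in the tests above
    with pytest.raises(ValueError):
        merge_request_options({"key": "value"}, {"key": 1000})
```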
+# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/schema/source_test/schemas/sample_stream.json b/airbyte-cdk/python/unit_tests/sources/declarative/schema/source_test/schemas/sample_stream.json new file mode 100644 index 000000000000..6ef7fbed5577 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/schema/source_test/schemas/sample_stream.json @@ -0,0 +1,12 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": ["null", "object"], + "properties": { + "type": { + "$ref": "sample_shared_schema.json" + }, + "id": { + "type": ["null", "string"] + } + } +} diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/schema/source_test/schemas/shared/sample_shared_schema.json b/airbyte-cdk/python/unit_tests/sources/declarative/schema/source_test/schemas/shared/sample_shared_schema.json new file mode 100644 index 000000000000..95ea8a1655f2 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/schema/source_test/schemas/shared/sample_shared_schema.json @@ -0,0 +1,11 @@ +{ + "type": ["null", "object"], + "properties": { + "id_internal": { + "type": ["null", "integer"] + }, + "name": { + "type": ["null", "string"] + } + } +} diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_default_schema_loader.py b/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_default_schema_loader.py new file mode 100644 index 000000000000..c04c4fdcd33a --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_default_schema_loader.py @@ -0,0 +1,32 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.sources.declarative.schema import DefaultSchemaLoader + + +@pytest.mark.parametrize( + "found_schema, found_error, expected_schema", + [ + pytest.param( + {"type": "object", "properties": {}}, None, {"type": "object", "properties": {}}, id="test_has_schema_in_default_location" + ), + pytest.param(None, FileNotFoundError, {}, id="test_schema_file_does_not_exist"), + ], +) +def test_get_json_schema(found_schema, found_error, expected_schema): + default_schema_loader = DefaultSchemaLoader({}, {}) + + json_file_schema_loader = MagicMock() + if found_schema: + json_file_schema_loader.get_json_schema.return_value = {"type": "object", "properties": {}} + if found_error: + json_file_schema_loader.get_json_schema.side_effect = found_error + + default_schema_loader.default_loader = json_file_schema_loader + + actual_schema = default_schema_loader.get_json_schema() + assert actual_schema == expected_schema diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_inline_schema_loader.py b/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_inline_schema_loader.py new file mode 100644 index 000000000000..ad44ee334cfc --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_inline_schema_loader.py @@ -0,0 +1,19 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
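The DefaultSchemaLoader test above exercises a fallback: when the underlying file-based loader cannot find a schema file, an empty schema is returned instead of failing the sync. A minimal sketch of that pattern follows, assuming only a loader object exposing `get_json_schema()`; the class is illustrative and is not the CDK's `DefaultSchemaLoader`.

```python
# Illustrative fallback pattern, assuming a wrapped loader with get_json_schema();
# this sketches the behavior tested above, not the CDK's actual implementation.
from typing import Any, Mapping


class FallbackSchemaLoader:
    def __init__(self, default_loader) -> None:
        self.default_loader = default_loader

    def get_json_schema(self) -> Mapping[str, Any]:
        try:
            return self.default_loader.get_json_schema()
        except FileNotFoundError:
            # No schema file in the default location: fall back to an empty schema
            return {}
```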
+# + +import pytest +from airbyte_cdk.sources.declarative.schema import InlineSchemaLoader + + +@pytest.mark.parametrize( + "test_name, input_schema, expected_schema", + [ + ("schema", {"k": "string"}, {"k": "string"}), + ("empty_schema", {}, {}), + ], +) +def test_static_schema_loads(test_name, input_schema, expected_schema): + schema_loader = InlineSchemaLoader(input_schema, {}) + + assert schema_loader.get_json_schema() == expected_schema diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_json_file_schema_loader.py b/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_json_file_schema_loader.py new file mode 100644 index 000000000000..8ef467618634 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_json_file_schema_loader.py @@ -0,0 +1,37 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +from unittest.mock import patch + +import pytest +from airbyte_cdk.sources.declarative.schema.json_file_schema_loader import JsonFileSchemaLoader, _default_file_path + + +@pytest.mark.parametrize( + "test_name, input_path, expected_resource, expected_path", + [ + ("path_prefixed_with_dot", "./source_example/schemas/lists.json", "source_example", "schemas/lists.json"), + ("path_prefixed_with_slash", "/source_example/schemas/lists.json", "source_example", "schemas/lists.json"), + ("path_starting_with_source", "source_example/schemas/lists.json", "source_example", "schemas/lists.json"), + ("path_starting_missing_source", "schemas/lists.json", "schemas", "lists.json"), + ("path_with_file_only", "lists.json", "", "lists.json"), + ("empty_path_does_not_crash", "", "", ""), + ("empty_path_with_slash_does_not_crash", "/", "", ""), + ], +) +def test_extract_resource_and_schema_path(test_name, input_path, expected_resource, expected_path): + json_schema = JsonFileSchemaLoader({}, {}, input_path) + actual_resource, actual_path = json_schema.extract_resource_and_schema_path(input_path) + + assert actual_resource == expected_resource + assert actual_path == expected_path + + +@patch("airbyte_cdk.sources.declarative.schema.json_file_schema_loader.sys") +def test_exclude_cdk_packages(mocked_sys): + keys = ["airbyte_cdk.sources.concurrent_source.concurrent_source_adapter", "source_gitlab.utils"] + mocked_sys.modules = {key: "" for key in keys} + + default_file_path = _default_file_path() + + assert "source_gitlab" in default_file_path diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/spec/test_spec.py b/airbyte-cdk/python/unit_tests/sources/declarative/spec/test_spec.py new file mode 100644 index 000000000000..1e1ef498082f --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/spec/test_spec.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
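The parametrized cases in the JsonFileSchemaLoader test above imply a simple convention for splitting a schema path into a package resource and a relative file path. The sketch below reproduces those expected outputs; the function name is hypothetical and this is not the CDK's `extract_resource_and_schema_path` implementation.

```python
# Sketch matching the parametrized expectations above, e.g.
# "./source_example/schemas/lists.json" -> ("source_example", "schemas/lists.json")
# "lists.json" -> ("", "lists.json"); "" and "/" -> ("", "")
from typing import Tuple


def split_resource_and_schema_path(path: str) -> Tuple[str, str]:
    cleaned = path.lstrip("./")      # drop a leading "./" or "/"
    parts = cleaned.split("/", 1)    # first segment is the package resource
    if len(parts) == 1:
        return "", parts[0]          # bare file name (or empty path): no resource package
    return parts[0], parts[1]
```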
+# + +import pytest +from airbyte_cdk.models import AdvancedAuth, AuthFlowType, ConnectorSpecification +from airbyte_cdk.sources.declarative.models.declarative_component_schema import AuthFlow +from airbyte_cdk.sources.declarative.spec.spec import Spec + + +@pytest.mark.parametrize( + "spec, expected_connection_specification", + [ + ( + Spec(connection_specification={"client_id": "my_client_id"}, parameters={}), + ConnectorSpecification(connectionSpecification={"client_id": "my_client_id"}), + ), + ( + Spec(connection_specification={"client_id": "my_client_id"}, parameters={}, documentation_url="https://airbyte.io"), + ConnectorSpecification(connectionSpecification={"client_id": "my_client_id"}, documentationUrl="https://airbyte.io"), + ), + ( + Spec(connection_specification={"client_id": "my_client_id"}, parameters={}, advanced_auth=AuthFlow(auth_flow_type="oauth2.0")), + ConnectorSpecification( + connectionSpecification={"client_id": "my_client_id"}, advanced_auth=AdvancedAuth(auth_flow_type=AuthFlowType.oauth2_0) + ), + ), + ], + ids=[ + "test_only_connection_specification", + "test_with_doc_url", + "test_auth_flow", + ], +) +def test_spec(spec, expected_connection_specification): + assert spec.generate_spec() == expected_connection_specification diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/states/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/states/__init__.py new file mode 100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/states/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/__init__.py new file mode 100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/airbyte-cdk/python/unit_tests/sources/declarative/test_concurrent_declarative_source.py new file mode 100644 index 000000000000..da3e9af3ac02 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -0,0 +1,1280 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
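For reference, the mapping exercised by the Spec tests above can be used directly as shown below, mirroring the "test_with_doc_url" fixture: `generate_spec()` turns the declarative `Spec` component into a protocol-level `ConnectorSpecification`. The same pattern applies to `advanced_auth`, as the third test case shows.

```python
# Usage mirroring the "test_with_doc_url" case above.
from airbyte_cdk.sources.declarative.spec.spec import Spec

spec = Spec(
    connection_specification={"client_id": "my_client_id"},
    parameters={},
    documentation_url="https://airbyte.io",
)

connector_spec = spec.generate_spec()
assert connector_spec.connectionSpecification == {"client_id": "my_client_id"}
assert connector_spec.documentationUrl == "https://airbyte.io"
```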
+# + +import copy +import json +from datetime import datetime, timedelta, timezone +from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union + +import freezegun +import isodate +import pendulum +from airbyte_cdk.models import ( + AirbyteMessage, + AirbyteRecordMessage, + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStateType, + AirbyteStream, + AirbyteStreamState, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + DestinationSyncMode, + FailureType, + Status, + StreamDescriptor, + SyncMode, +) +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ConcurrentDeclarativeSource +from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.checkpoint import Cursor +from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream +from airbyte_cdk.sources.streams.core import StreamData +from airbyte_cdk.sources.types import Record, StreamSlice +from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest, HttpResponse +from airbyte_cdk.utils import AirbyteTracedException +from deprecated.classic import deprecated + +_CONFIG = { + "start_date": "2024-07-01T00:00:00.000Z" +} + +_CATALOG = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name="party_members", json_schema={}, supported_sync_modes=[SyncMode.incremental]), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ), + ConfiguredAirbyteStream( + stream=AirbyteStream(name="palaces", json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ), + ConfiguredAirbyteStream( + stream=AirbyteStream(name="locations", json_schema={}, supported_sync_modes=[SyncMode.incremental]), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ), + ConfiguredAirbyteStream( + stream=AirbyteStream(name="party_members_skills", json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ) + ] +) +_LOCATIONS_RESPONSE = HttpResponse(json.dumps([ + {"id": "444", "name": "Yongen-jaya", "updated_at": "2024-08-10"}, + {"id": "scramble", "name": "Shibuya", "updated_at": "2024-08-10"}, + {"id": "aoyama", "name": "Aoyama-itchome", "updated_at": "2024-08-10"}, + {"id": "shin123", "name": "Shinjuku", "updated_at": "2024-08-10"}, +])) +_PALACES_RESPONSE = HttpResponse(json.dumps([ + {"id": "0", "world": "castle", "owner": "kamoshida"}, + {"id": "1", "world": "museum", "owner": "madarame"}, + {"id": "2", "world": "bank", "owner": "kaneshiro"}, + {"id": "3", "world": "pyramid", "owner": "futaba"}, + {"id": "4", "world": "spaceport", "owner": "okumura"}, + {"id": "5", "world": "casino", "owner": "nijima"}, + {"id": "6", "world": "cruiser", "owner": "shido"}, +])) +_PARTY_MEMBERS_SKILLS_RESPONSE = HttpResponse(json.dumps([ + {"id": "0", "name": "hassou tobi"}, + {"id": "1", "name": "mafreidyne"}, + {"id": "2", "name": "myriad truths"}, +])) +_EMPTY_RESPONSE = HttpResponse(json.dumps([])) +_NOW = "2024-09-10T00:00:00" +_NO_STATE_PARTY_MEMBERS_SLICES_AND_RESPONSES = [ + ({"start": "2024-07-01", "end": "2024-07-15"}, HttpResponse(json.dumps([{"id": "amamiya", "first_name": "ren", "last_name": "amamiya", "updated_at": "2024-07-10"}]))), + ({"start": 
"2024-07-16", "end": "2024-07-30"}, _EMPTY_RESPONSE), + ({"start": "2024-07-31", "end": "2024-08-14"}, HttpResponse(json.dumps([{"id": "nijima", "first_name": "makoto", "last_name": "nijima", "updated_at": "2024-08-10"}, ]))), + ({"start": "2024-08-15", "end": "2024-08-29"}, _EMPTY_RESPONSE), + ({"start": "2024-08-30", "end": "2024-09-10"}, HttpResponse(json.dumps([{"id": "yoshizawa", "first_name": "sumire", "last_name": "yoshizawa", "updated_at": "2024-09-10"}]))), +] +_MANIFEST = { + "version": "5.0.0", + "definitions": { + "selector": { + "type": "RecordSelector", + "extractor": { + "type": "DpathExtractor", + "field_path": [] + } + }, + "requester": { + "type": "HttpRequester", + "url_base": "https://persona.metaverse.com", + "http_method": "GET", + "authenticator": { + "type": "BasicHttpAuthenticator", + "username": "{{ config['api_key'] }}", + "password": "{{ config['secret_key'] }}" + }, + "error_handler": { + "type": "DefaultErrorHandler", + "response_filters": [ + { + "http_codes": [403], + "action": "FAIL", + "failure_type": "config_error", + "error_message": "Access denied due to lack of permission or invalid API/Secret key or wrong data region." + }, + { + "http_codes": [404], + "action": "IGNORE", + "error_message": "No data available for the time range requested." + } + ] + }, + }, + "retriever": { + "type": "SimpleRetriever", + "record_selector": { + "$ref": "#/definitions/selector" + }, + "paginator": { + "type": "NoPagination" + }, + "requester": { + "$ref": "#/definitions/requester" + } + }, + "incremental_cursor": { + "type": "DatetimeBasedCursor", + "start_datetime": { + "datetime": "{{ format_datetime(config['start_date'], '%Y-%m-%d') }}" + }, + "end_datetime": { + "datetime": "{{ now_utc().strftime('%Y-%m-%d') }}" + }, + "datetime_format": "%Y-%m-%d", + "cursor_datetime_formats": ["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S"], + "cursor_granularity": "P1D", + "step": "P15D", + "cursor_field": "updated_at", + "lookback_window": "P5D", + "start_time_option": { + "type": "RequestOption", + "field_name": "start", + "inject_into": "request_parameter" + }, + "end_time_option": { + "type": "RequestOption", + "field_name": "end", + "inject_into": "request_parameter" + } + }, + "base_stream": { + "retriever": { + "$ref": "#/definitions/retriever" + } + }, + "base_incremental_stream": { + "retriever": { + "$ref": "#/definitions/retriever", + "requester": { + "$ref": "#/definitions/requester" + } + }, + "incremental_sync": { + "$ref": "#/definitions/incremental_cursor" + } + }, + "party_members_stream": { + "$ref": "#/definitions/base_incremental_stream", + "retriever": { + "$ref": "#/definitions/base_incremental_stream/retriever", + "record_selector": { + "$ref": "#/definitions/selector" + } + }, + "$parameters": { + "name": "party_members", + "primary_key": "id", + "path": "/party_members" + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "https://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "id": { + "description": "The identifier", + "type": ["null", "string"], + }, + "name": { + "description": "The name of the party member", + "type": ["null", "string"] + } + } + } + } + }, + "palaces_stream": { + "$ref": "#/definitions/base_stream", + "$parameters": { + "name": "palaces", + "primary_key": "id", + "path": "/palaces" + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "https://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "id": { + "description": "The 
identifier", + "type": ["null", "string"], + }, + "name": { + "description": "The name of the metaverse palace", + "type": ["null", "string"] + } + } + } + } + }, + "locations_stream": { + "$ref": "#/definitions/base_incremental_stream", + "retriever": { + "$ref": "#/definitions/base_incremental_stream/retriever", + "requester": { + "$ref": "#/definitions/base_incremental_stream/retriever/requester", + "request_parameters": { + "m": "active", + "i": "1", + "g": "country" + } + }, + "record_selector": { + "$ref": "#/definitions/selector" + } + }, + "incremental_sync": { + "$ref": "#/definitions/incremental_cursor", + "step": "P1M", + "cursor_field": "updated_at" + }, + "$parameters": { + "name": "locations", + "primary_key": "id", + "path": "/locations" + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "https://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "id": { + "description": "The identifier", + "type": ["null", "string"], + }, + "name": { + "description": "The name of the neighborhood location", + "type": ["null", "string"] + } + } + } + } + }, + "party_members_skills_stream": { + "$ref": "#/definitions/base_stream", + "retriever": { + "$ref": "#/definitions/base_incremental_stream/retriever", + "record_selector": { + "$ref": "#/definitions/selector" + }, + "partition_router": { + "type": "SubstreamPartitionRouter", + "parent_stream_configs": [ + { + "type": "ParentStreamConfig", + "stream": "#/definitions/party_members_stream", + "parent_key": "id", + "partition_field": "party_member_id", + } + ] + } + }, + "$parameters": { + "name": "party_members_skills", + "primary_key": "id", + "path": "/party_members/{{stream_slice.party_member_id}}/skills" + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "https://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "id": { + "description": "The identifier", + "type": ["null", "string"], + }, + "name": { + "description": "The name of the party member", + "type": ["null", "string"] + } + } + } + } + }, + }, + "streams": [ + "#/definitions/party_members_stream", + "#/definitions/palaces_stream", + "#/definitions/locations_stream", + "#/definitions/party_members_skills_stream" + ], + "check": { + "stream_names": ["party_members", "palaces", "locations"] + }, + "concurrency_level": { + "type": "ConcurrencyLevel", + "default_concurrency": "{{ config['num_workers'] or 10 }}", + "max_concurrency": 25, + } +} + + +@deprecated("See note in docstring for more information") +class DeclarativeStreamDecorator(Stream): + """ + Helper class that wraps an existing DeclarativeStream but allows for overriding the output of read_records() to + make it easier to mock behavior and test how low-code streams integrate with the Concurrent CDK. + + NOTE: We are not using that for now but the intent was to scope the tests to only testing that streams were properly instantiated and + interacted together properly. However in practice, we had a couple surprises like `get_cursor` and `stream_slices` needed to be + re-implemented as well. Because of that, we've move away from that in favour of doing tests that integrate up until the HTTP request. + The drawback of that is that we are dependent on any change later (like if the DatetimeBasedCursor changes, this will affect those + tests) but it feels less flaky than this. If we have new information in the future to infirm that, feel free to re-use this class as + necessary. 
+ """ + + def __init__(self, declarative_stream: DeclarativeStream, slice_to_records_mapping: Mapping[tuple[str, str], List[Mapping[str, Any]]]): + self._declarative_stream = declarative_stream + self._slice_to_records_mapping = slice_to_records_mapping + + @property + def name(self) -> str: + return self._declarative_stream.name + + @property + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + return self._declarative_stream.primary_key + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any]]: + if isinstance(stream_slice, StreamSlice): + slice_key = (stream_slice.get("start_time"), stream_slice.get("end_time")) + + # Extra logic to simulate raising an error during certain partitions to validate error handling + if slice_key == ("2024-08-05", "2024-09-04"): + raise AirbyteTracedException( + message=f"Received an unexpected error during interval with start: {slice_key[0]} and end: {slice_key[1]}.", + failure_type=FailureType.config_error) + + if slice_key in self._slice_to_records_mapping: + yield from self._slice_to_records_mapping.get(slice_key) + else: + yield from [] + else: + raise ValueError(f"stream_slice should be of type StreamSlice, but received {type(stream_slice)}") + + def get_json_schema(self) -> Mapping[str, Any]: + return self._declarative_stream.get_json_schema() + + def get_cursor(self) -> Optional[Cursor]: + return self._declarative_stream.get_cursor() + + +def test_group_streams(): + """ + Tests the grouping of low-code streams into ones that can be processed concurrently vs ones that must be processed concurrently + """ + + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name="party_members", json_schema={}, supported_sync_modes=[SyncMode.incremental]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ), + ConfiguredAirbyteStream( + stream=AirbyteStream(name="palaces", json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ), + ConfiguredAirbyteStream( + stream=AirbyteStream(name="locations", json_schema={}, supported_sync_modes=[SyncMode.incremental]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ), + ConfiguredAirbyteStream( + stream=AirbyteStream(name="party_members_skills", json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ) + ] + ) + + state = [] + + source = ConcurrentDeclarativeSource(source_config=_MANIFEST, config=_CONFIG, catalog=catalog, state=state) + concurrent_streams = source._concurrent_streams + synchronous_streams = source._synchronous_streams + + # 2 incremental streams + assert len(concurrent_streams) == 2 + concurrent_stream_0, concurrent_stream_1 = concurrent_streams + assert isinstance(concurrent_stream_0, DefaultStream) + assert concurrent_stream_0.name == "party_members" + assert isinstance(concurrent_stream_1, DefaultStream) + assert concurrent_stream_1.name == "locations" + + # 1 full refresh stream, 1 substream + assert len(synchronous_streams) == 2 + synchronous_stream_0, synchronous_stream_1 = synchronous_streams + assert isinstance(synchronous_stream_0, DeclarativeStream) + assert synchronous_stream_0.name == 
"palaces" + assert isinstance(synchronous_stream_1, DeclarativeStream) + assert synchronous_stream_1.name == "party_members_skills" + + +@freezegun.freeze_time(time_to_freeze=datetime(2024, 9, 1, 0, 0, 0, 0, tzinfo=timezone.utc)) +def test_create_concurrent_cursor(): + """ + Validate that the ConcurrentDeclarativeSource properly instantiates a ConcurrentCursor from the + low-code DatetimeBasedCursor component + """ + + incoming_locations_state = { + "slices": [ + {"start": "2024-07-01T00:00:00", "end": "2024-07-31T00:00:00"}, + ], + "state_type": "date-range" + } + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="locations", namespace=None), + stream_state=AirbyteStateBlob(**incoming_locations_state) + ), + ), + ] + + source = ConcurrentDeclarativeSource(source_config=_MANIFEST, config=_CONFIG, catalog=_CATALOG, state=state) + + party_members_stream = source._concurrent_streams[0] + assert isinstance(party_members_stream, DefaultStream) + party_members_cursor = party_members_stream.cursor + + assert isinstance(party_members_cursor, ConcurrentCursor) + assert party_members_cursor._stream_name == "party_members" + assert party_members_cursor._cursor_field.cursor_field_key == "updated_at" + assert party_members_cursor._start == pendulum.parse(_CONFIG.get("start_date")) + assert party_members_cursor._end_provider() == datetime(year=2024, month=9, day=1, tzinfo=timezone.utc) + assert party_members_cursor._slice_boundary_fields == ("start_time", "end_time") + assert party_members_cursor._slice_range == timedelta(days=15) + assert party_members_cursor._lookback_window == timedelta(days=5) + assert party_members_cursor._cursor_granularity == timedelta(days=1) + + locations_stream = source._concurrent_streams[1] + assert isinstance(locations_stream, DefaultStream) + locations_cursor = locations_stream.cursor + + assert isinstance(locations_cursor, ConcurrentCursor) + assert locations_cursor._stream_name == "locations" + assert locations_cursor._cursor_field.cursor_field_key == "updated_at" + assert locations_cursor._start == pendulum.parse(_CONFIG.get("start_date")) + assert locations_cursor._end_provider() == datetime(year=2024, month=9, day=1, tzinfo=timezone.utc) + assert locations_cursor._slice_boundary_fields == ("start_time", "end_time") + assert locations_cursor._slice_range == isodate.Duration(months=1) + assert locations_cursor._lookback_window == timedelta(days=5) + assert locations_cursor._cursor_granularity == timedelta(days=1) + assert locations_cursor.state == { + "slices": [ + { + "start": datetime(2024, 7, 1, 0, 0, 0, 0, tzinfo=timezone.utc), + "end": datetime(2024, 7, 31, 0, 0, 0, 0, tzinfo=timezone.utc), + } + ], + "state_type": "date-range" + } + + +def test_check(): + """ + Verifies that the ConcurrentDeclarativeSource check command is run against synchronous streams + """ + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest("https://persona.metaverse.com/party_members?start=2024-07-01&end=2024-07-15"), + HttpResponse(json.dumps({"id": "amamiya", "first_name": "ren", "last_name": "amamiya", "updated_at": "2024-07-10"})), + ) + http_mocker.get( + HttpRequest("https://persona.metaverse.com/palaces"), + HttpResponse(json.dumps({"id": "palace_1"})), + ) + http_mocker.get( + HttpRequest("https://persona.metaverse.com/locations?m=active&i=1&g=country&start=2024-07-01&end=2024-07-31"), + HttpResponse(json.dumps({"id": "location_1"})), + ) + source = 
ConcurrentDeclarativeSource(source_config=_MANIFEST, config=_CONFIG, catalog=None, state=None) + + connection_status = source.check(logger=source.logger, config=_CONFIG) + + assert connection_status.status == Status.SUCCEEDED + + +def test_discover(): + """ + Verifies that the ConcurrentDeclarativeSource discover command returns concurrent and synchronous catalog definitions + """ + expected_stream_names = ["party_members", "palaces", "locations", "party_members_skills"] + + source = ConcurrentDeclarativeSource(source_config=_MANIFEST, config=_CONFIG, catalog=None, state=None) + + actual_catalog = source.discover(logger=source.logger, config=_CONFIG) + + assert len(actual_catalog.streams) == 4 + assert actual_catalog.streams[0].name in expected_stream_names + assert actual_catalog.streams[1].name in expected_stream_names + assert actual_catalog.streams[2].name in expected_stream_names + assert actual_catalog.streams[3].name in expected_stream_names + + +def _mock_requests(http_mocker: HttpMocker, url: str, query_params: List[Dict[str, str]], responses: List[HttpResponse]) -> None: + assert len(query_params) == len(responses), "Expecting as many slices as response" + + for i in range(len(query_params)): + http_mocker.get(HttpRequest(url, query_params=query_params[i]), responses[i]) + + +def _mock_party_members_requests(http_mocker: HttpMocker, slices_and_responses: List[Tuple[Dict[str, str], HttpResponse]]) -> None: + slices = list(map(lambda slice_and_response: slice_and_response[0], slices_and_responses)) + responses = list(map(lambda slice_and_response: slice_and_response[1], slices_and_responses)) + + _mock_requests( + http_mocker, + "https://persona.metaverse.com/party_members", + slices, + responses, + ) + + +def _mock_locations_requests(http_mocker: HttpMocker, slices: List[Dict[str, str]]) -> None: + locations_query_params = list(map(lambda _slice: _slice | {"m": "active", "i": "1", "g": "country"}, slices)) + _mock_requests( + http_mocker, + "https://persona.metaverse.com/locations", + locations_query_params, + [_LOCATIONS_RESPONSE] * len(slices), + ) + + +def _mock_party_members_skills_requests(http_mocker: HttpMocker) -> None: + """ + This method assumes _mock_party_members_requests has been called before else the stream won't work. 
+ """ + http_mocker.get(HttpRequest("https://persona.metaverse.com/party_members/amamiya/skills"), _PARTY_MEMBERS_SKILLS_RESPONSE) + http_mocker.get(HttpRequest("https://persona.metaverse.com/party_members/nijima/skills"), _PARTY_MEMBERS_SKILLS_RESPONSE) + http_mocker.get(HttpRequest("https://persona.metaverse.com/party_members/yoshizawa/skills"), _PARTY_MEMBERS_SKILLS_RESPONSE) + + +@freezegun.freeze_time(_NOW) +def test_read_with_concurrent_and_synchronous_streams(): + """ + Verifies that a ConcurrentDeclarativeSource processes concurrent streams followed by synchronous streams + """ + location_slices = [ + {"start": "2024-07-01", "end": "2024-07-31"}, + {"start": "2024-08-01", "end": "2024-08-31"}, + {"start": "2024-09-01", "end": "2024-09-10"}, + ] + source = ConcurrentDeclarativeSource(source_config=_MANIFEST, config=_CONFIG, catalog=_CATALOG, state=None) + disable_emitting_sequential_state_messages(source=source) + + with HttpMocker() as http_mocker: + _mock_party_members_requests(http_mocker, _NO_STATE_PARTY_MEMBERS_SLICES_AND_RESPONSES) + _mock_locations_requests(http_mocker, location_slices) + http_mocker.get(HttpRequest("https://persona.metaverse.com/palaces"), _PALACES_RESPONSE) + _mock_party_members_skills_requests(http_mocker) + + messages = list(source.read(logger=source.logger, config=_CONFIG, catalog=_CATALOG, state=[])) + + # See _mock_party_members_requests + party_members_records = get_records_for_stream("party_members", messages) + assert len(party_members_records) == 3 + + party_members_states = get_states_for_stream(stream_name="party_members", messages=messages) + assert len(party_members_states) == 6 + assert party_members_states[5].stream.stream_state.__dict__ == AirbyteStateBlob( + state_type="date-range", + slices=[{"start": "2024-07-01", "end": "2024-09-10", "most_recent_cursor_value": "2024-09-10"}] + ).__dict__ + + # Expects 12 records, 3 slices, 4 records each slice + locations_records = get_records_for_stream(stream_name="locations", messages=messages) + assert len(locations_records) == 12 + + # 3 partitions == 3 state messages + final state message + # Because we cannot guarantee the order partitions finish, we only validate that the final state has the latest checkpoint value + locations_states = get_states_for_stream(stream_name="locations", messages=messages) + assert len(locations_states) == 4 + assert locations_states[3].stream.stream_state.__dict__ == AirbyteStateBlob( + state_type="date-range", + slices=[{"start": "2024-07-01", "end": "2024-09-10", "most_recent_cursor_value": "2024-08-10"}] + ).__dict__ + + # Expects 7 records, 1 empty slice, 7 records in slice + palaces_records = get_records_for_stream("palaces", messages) + assert len(palaces_records) == 7 + + palaces_states = get_states_for_stream(stream_name="palaces", messages=messages) + assert len(palaces_states) == 1 + assert palaces_states[0].stream.stream_state.__dict__ == AirbyteStateBlob(__ab_full_refresh_sync_complete=True).__dict__ + + # Expects 3 records, 3 slices, 3 records in slice + party_members_skills_records = get_records_for_stream("party_members_skills", messages) + assert len(party_members_skills_records) == 9 + + party_members_skills_states = get_states_for_stream(stream_name="party_members_skills", messages=messages) + assert len(party_members_skills_states) == 3 + assert party_members_skills_states[0].stream.stream_state.__dict__ == { + "states": [ + {"partition": {"parent_slice": {}, "party_member_id": "amamiya"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + ] + 
} + assert party_members_skills_states[1].stream.stream_state.__dict__ == { + "states": [ + {"partition": {"parent_slice": {}, "party_member_id": "amamiya"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"parent_slice": {}, "party_member_id": "nijima"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + ] + } + assert party_members_skills_states[2].stream.stream_state.__dict__ == { + "states": [ + {"partition": {"parent_slice": {}, "party_member_id": "amamiya"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"parent_slice": {}, "party_member_id": "nijima"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"parent_slice": {}, "party_member_id": "yoshizawa"}, "cursor": {"__ab_full_refresh_sync_complete": True}} + ] + } + + +@freezegun.freeze_time(_NOW) +def test_read_with_concurrent_and_synchronous_streams_with_concurrent_state(): + """ + Verifies that a ConcurrentDeclarativeSource processes concurrent streams correctly using the incoming + concurrent state format + """ + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="locations", namespace=None), + stream_state=AirbyteStateBlob( + state_type="date-range", + slices=[{"start": "2024-07-01", "end": "2024-07-31"}], + ), + ), + ), + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="party_members", namespace=None), + stream_state=AirbyteStateBlob( + state_type="date-range", + slices=[ + {"start": "2024-07-16", "end": "2024-07-30"}, + {"start": "2024-07-31", "end": "2024-08-14"}, + {"start": "2024-08-30", "end": "2024-09-09"}, + ] + ), + ), + ), + ] + + party_members_slices_and_responses = _NO_STATE_PARTY_MEMBERS_SLICES_AND_RESPONSES + [ + ( + {"start": "2024-09-04", "end": "2024-09-10"}, # considering lookback window + HttpResponse( + json.dumps([{"id": "yoshizawa", "first_name": "sumire", "last_name": "yoshizawa", "updated_at": "2024-09-10"}]) + ), + ) + ] + location_slices = [ + {"start": "2024-07-26", "end": "2024-08-25"}, + {"start": "2024-08-26", "end": "2024-09-10"}, + ] + + source = ConcurrentDeclarativeSource(source_config=_MANIFEST, config=_CONFIG, catalog=_CATALOG, state=state) + disable_emitting_sequential_state_messages(source=source) + + with HttpMocker() as http_mocker: + _mock_party_members_requests(http_mocker, party_members_slices_and_responses) + _mock_locations_requests(http_mocker, location_slices) + http_mocker.get(HttpRequest("https://persona.metaverse.com/palaces"), _PALACES_RESPONSE) + _mock_party_members_skills_requests(http_mocker) + + messages = list(source.read(logger=source.logger, config=_CONFIG, catalog=_CATALOG, state=state)) + + # Expects 8 records, skip successful intervals and are left with 2 slices, 4 records each slice + locations_records = get_records_for_stream("locations", messages) + assert len(locations_records) == 8 + + locations_states = get_states_for_stream(stream_name="locations", messages=messages) + assert len(locations_states) == 3 + assert locations_states[2].stream.stream_state.__dict__ == AirbyteStateBlob( + state_type="date-range", + slices=[{"start": "2024-07-01", "end": "2024-09-10", "most_recent_cursor_value": "2024-08-10"}] + ).__dict__ + + # slices to sync are: + # * {"start": "2024-07-01", "end": "2024-07-15"}: one record in _NO_STATE_PARTY_MEMBERS_SLICES_AND_RESPONSES + # * {"start": "2024-09-04", "end": "2024-09-10"}: one record from the lookback 
window defined in this test + party_members_records = get_records_for_stream("party_members", messages) + assert len(party_members_records) == 2 + + party_members_states = get_states_for_stream(stream_name="party_members", messages=messages) + assert len(party_members_states) == 4 + assert party_members_states[3].stream.stream_state.__dict__ == AirbyteStateBlob( + state_type="date-range", + slices=[{"start": "2024-07-01", "end": "2024-09-10", "most_recent_cursor_value": "2024-09-10"}] + ).__dict__ + + # Expects 7 records, 1 empty slice, 7 records in slice + palaces_records = get_records_for_stream("palaces", messages) + assert len(palaces_records) == 7 + + # Expects 3 records, 3 slices, 3 records in slice + party_members_skills_records = get_records_for_stream("party_members_skills", messages) + assert len(party_members_skills_records) == 9 + + +@freezegun.freeze_time(_NOW) +def test_read_with_concurrent_and_synchronous_streams_with_sequential_state(): + """ + Verifies that a ConcurrentDeclarativeSource processes concurrent streams correctly using the incoming + legacy state format + """ + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="locations", namespace=None), + stream_state=AirbyteStateBlob(updated_at="2024-08-06"), + ), + ), + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="party_members", namespace=None), + stream_state=AirbyteStateBlob(updated_at="2024-08-21"), + ), + ) + ] + + source = ConcurrentDeclarativeSource(source_config=_MANIFEST, config=_CONFIG, catalog=_CATALOG, state=state) + disable_emitting_sequential_state_messages(source=source) + + party_members_slices_and_responses = _NO_STATE_PARTY_MEMBERS_SLICES_AND_RESPONSES + [ + ({"start": "2024-08-16", "end": "2024-08-30"}, HttpResponse(json.dumps([{"id": "nijima", "first_name": "makoto", "last_name": "nijima", "updated_at": "2024-08-10"}]))), # considering lookback window + ({"start": "2024-08-31", "end": "2024-09-10"}, HttpResponse(json.dumps([{"id": "yoshizawa", "first_name": "sumire", "last_name": "yoshizawa", "updated_at": "2024-09-10"}]))), + ] + location_slices = [ + {"start": "2024-08-01", "end": "2024-08-31"}, + {"start": "2024-09-01", "end": "2024-09-10"}, + ] + + with HttpMocker() as http_mocker: + _mock_party_members_requests(http_mocker, party_members_slices_and_responses) + _mock_locations_requests(http_mocker, location_slices) + http_mocker.get(HttpRequest("https://persona.metaverse.com/palaces"), _PALACES_RESPONSE) + _mock_party_members_skills_requests(http_mocker) + + messages = list(source.read(logger=source.logger, config=_CONFIG, catalog=_CATALOG, state=state)) + + # Expects 8 records, skip successful intervals and are left with 2 slices, 4 records each slice + locations_records = get_records_for_stream("locations", messages) + assert len(locations_records) == 8 + + locations_states = get_states_for_stream(stream_name="locations", messages=messages) + assert len(locations_states) == 3 + assert locations_states[2].stream.stream_state.__dict__ == AirbyteStateBlob( + state_type="date-range", + slices=[{"start": "2024-07-01", "end": "2024-09-10", "most_recent_cursor_value": "2024-08-10"}] + ).__dict__ + + # From extra slices defined in party_members_slices_and_responses + party_members_records = get_records_for_stream("party_members", messages) + assert len(party_members_records) == 2 + + party_members_states = 
get_states_for_stream(stream_name="party_members", messages=messages) + assert len(party_members_states) == 3 + assert party_members_states[2].stream.stream_state.__dict__ == AirbyteStateBlob( + state_type="date-range", + slices=[{"start": "2024-07-01", "end": "2024-09-10", "most_recent_cursor_value": "2024-09-10"}] + ).__dict__ + + # Expects 7 records, 1 empty slice, 7 records in slice + palaces_records = get_records_for_stream("palaces", messages) + assert len(palaces_records) == 7 + + # Expects 3 records, 3 slices, 3 records in slice + party_members_skills_records = get_records_for_stream("party_members_skills", messages) + assert len(party_members_skills_records) == 9 + + +@freezegun.freeze_time(_NOW) +def test_read_concurrent_with_failing_partition_in_the_middle(): + """ + Verify that partial state is emitted when only some partitions are successful during a concurrent sync attempt + """ + location_slices = [ + {"start": "2024-07-01", "end": "2024-07-31"}, + # missing slice `{"start": "2024-08-01", "end": "2024-08-31"}` here + {"start": "2024-09-01", "end": "2024-09-10"}, + ] + expected_stream_state = { + "state_type": "date-range", + "slices": [location_slice | {"most_recent_cursor_value": "2024-08-10"} for location_slice in location_slices], + } + + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name="locations", json_schema={}, supported_sync_modes=[SyncMode.incremental]), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ), + ] + ) + + source = ConcurrentDeclarativeSource(source_config=_MANIFEST, config=_CONFIG, catalog=catalog, state=[]) + disable_emitting_sequential_state_messages(source=source) + + location_slices = [ + {"start": "2024-07-01", "end": "2024-07-31"}, + # missing slice `{"start": "2024-08-01", "end": "2024-08-31"}` here + {"start": "2024-09-01", "end": "2024-09-10"}, + ] + + with HttpMocker() as http_mocker: + _mock_locations_requests(http_mocker, location_slices) + + messages = [] + try: + for message in source.read(logger=source.logger, config=_CONFIG, catalog=catalog, state=[]): + messages.append(message) + except AirbyteTracedException: + assert get_states_for_stream(stream_name="locations", messages=messages)[-1].stream.stream_state.__dict__ == expected_stream_state + + +@freezegun.freeze_time(_NOW) +def test_read_concurrent_skip_streams_not_in_catalog(): + """ + Verifies that the ConcurrentDeclarativeSource only syncs streams that are specified in the incoming ConfiguredCatalog + """ + with HttpMocker() as http_mocker: + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name="palaces", json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ), + ConfiguredAirbyteStream( + stream=AirbyteStream(name="locations", json_schema={}, supported_sync_modes=[SyncMode.incremental]), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ), + ] + ) + + source = ConcurrentDeclarativeSource(source_config=_MANIFEST, config=_CONFIG, catalog=catalog, state=None) + # locations requests + location_slices = [ + {"start": "2024-07-01", "end": "2024-07-31"}, + {"start": "2024-08-01", "end": "2024-08-31"}, + {"start": "2024-09-01", "end": "2024-09-10"}, + ] + locations_query_params = list(map(lambda _slice: _slice | {"m": "active", "i": "1", "g": "country"}, location_slices)) + _mock_requests( + http_mocker, + 
"https://persona.metaverse.com/locations", + locations_query_params, + [_LOCATIONS_RESPONSE] * len(location_slices), + ) + + # palaces requests + http_mocker.get(HttpRequest("https://persona.metaverse.com/palaces"), _PALACES_RESPONSE) + + disable_emitting_sequential_state_messages(source=source) + + messages = list(source.read(logger=source.logger, config=_CONFIG, catalog=catalog, state=[])) + + locations_records = get_records_for_stream(stream_name="locations", messages=messages) + assert len(locations_records) == 12 + locations_states = get_states_for_stream(stream_name="locations", messages=messages) + assert len(locations_states) == 4 + + palaces_records = get_records_for_stream("palaces", messages) + assert len(palaces_records) == 7 + palaces_states = get_states_for_stream(stream_name="palaces", messages=messages) + assert len(palaces_states) == 1 + + assert len(get_records_for_stream(stream_name="party_members", messages=messages)) == 0 + assert len(get_states_for_stream(stream_name="party_members", messages=messages)) == 0 + + assert len(get_records_for_stream(stream_name="party_members_skills", messages=messages)) == 0 + assert len(get_states_for_stream(stream_name="party_members_skills", messages=messages)) == 0 + + +def test_default_perform_interpolation_on_concurrency_level(): + config = { + "start_date": "2024-07-01T00:00:00.000Z", + "num_workers": 20 + } + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name="palaces", json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ), + ] + ) + + source = ConcurrentDeclarativeSource(source_config=_MANIFEST, config=config, catalog=catalog, state=[]) + assert source._concurrent_source._initial_number_partitions_to_generate == 10 # We floor the number of initial partitions on creation + + +def test_default_to_single_threaded_when_no_concurrency_level(): + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name="palaces", json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ), + ] + ) + + manifest = copy.deepcopy(_MANIFEST) + del manifest["concurrency_level"] + + source = ConcurrentDeclarativeSource(source_config=manifest, config=_CONFIG, catalog=catalog, state=[]) + assert source._concurrent_source._initial_number_partitions_to_generate == 1 + + +def test_concurrency_level_initial_number_partitions_to_generate_is_always_one_or_more(): + config = { + "start_date": "2024-07-01T00:00:00.000Z", + "num_workers": 1 + } + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name="palaces", json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ), + ] + ) + + manifest = copy.deepcopy(_MANIFEST) + manifest["concurrency_level"] = { + "type": "ConcurrencyLevel", + "default_concurrency": "{{ config.get('num_workers', 1) }}", + "max_concurrency": 25, + } + + source = ConcurrentDeclarativeSource(source_config=_MANIFEST, config=config, catalog=catalog, state=[]) + assert source._concurrent_source._initial_number_partitions_to_generate == 1 + + +def test_streams_with_stream_state_interpolation_should_be_synchronous(): + manifest_with_stream_state_interpolation = copy.deepcopy(_MANIFEST) + + # Add stream_state interpolation to 
the location stream's HttpRequester + manifest_with_stream_state_interpolation["definitions"]["locations_stream"]["retriever"]["requester"]["request_parameters"] = { + "after": "{{ stream_state['updated_at'] }}", + } + + # Add a RecordFilter component that uses stream_state interpolation to the party member stream + manifest_with_stream_state_interpolation["definitions"]["party_members_stream"]["retriever"]["record_selector"]["record_filter"] = { + "type": "RecordFilter", + "condition": "{{ record.updated_at > stream_state['updated_at'] }}" + } + + source = ConcurrentDeclarativeSource( + source_config=manifest_with_stream_state_interpolation, + config=_CONFIG, + catalog=_CATALOG, + state=None + ) + + assert len(source._concurrent_streams) == 0 + assert len(source._synchronous_streams) == 4 + + +def test_given_partition_routing_and_incremental_sync_then_stream_is_not_concurrent(): + manifest = { + "version": "5.0.0", + "definitions": { + "selector": { + "type": "RecordSelector", + "extractor": { + "type": "DpathExtractor", + "field_path": [] + } + }, + "requester": { + "type": "HttpRequester", + "url_base": "https://persona.metaverse.com", + "http_method": "GET", + "authenticator": { + "type": "BasicHttpAuthenticator", + "username": "{{ config['api_key'] }}", + "password": "{{ config['secret_key'] }}" + }, + "error_handler": { + "type": "DefaultErrorHandler", + "response_filters": [ + { + "http_codes": [403], + "action": "FAIL", + "failure_type": "config_error", + "error_message": "Access denied due to lack of permission or invalid API/Secret key or wrong data region." + }, + { + "http_codes": [404], + "action": "IGNORE", + "error_message": "No data available for the time range requested." + } + ] + }, + }, + "retriever": { + "type": "SimpleRetriever", + "record_selector": { + "$ref": "#/definitions/selector" + }, + "paginator": { + "type": "NoPagination" + }, + "requester": { + "$ref": "#/definitions/requester" + } + }, + "incremental_cursor": { + "type": "DatetimeBasedCursor", + "start_datetime": { + "datetime": "{{ format_datetime(config['start_date'], '%Y-%m-%d') }}" + }, + "end_datetime": { + "datetime": "{{ now_utc().strftime('%Y-%m-%d') }}" + }, + "datetime_format": "%Y-%m-%d", + "cursor_datetime_formats": ["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S"], + "cursor_granularity": "P1D", + "step": "P15D", + "cursor_field": "updated_at", + "lookback_window": "P5D", + "start_time_option": { + "type": "RequestOption", + "field_name": "start", + "inject_into": "request_parameter" + }, + "end_time_option": { + "type": "RequestOption", + "field_name": "end", + "inject_into": "request_parameter" + } + }, + "base_stream": { + "retriever": { + "$ref": "#/definitions/retriever" + } + }, + "base_incremental_stream": { + "retriever": { + "$ref": "#/definitions/retriever", + "requester": { + "$ref": "#/definitions/requester" + } + }, + "incremental_sync": { + "$ref": "#/definitions/incremental_cursor" + } + }, + "incremental_party_members_skills_stream": { + "$ref": "#/definitions/base_incremental_stream", + "retriever": { + "$ref": "#/definitions/base_incremental_stream/retriever", + "partition_router": { + "type": "ListPartitionRouter", + "cursor_field": "party_member_id", + "values": ["party_member1", "party_member2"], + } + }, + "$parameters": { + "name": "incremental_party_members_skills", + "primary_key": "id", + "path": "/party_members/{{stream_slice.party_member_id}}/skills" + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "https://json-schema.org/draft-07/schema#", + 
"type": "object", + "properties": { + "id": { + "description": "The identifier", + "type": ["null", "string"], + }, + "name": { + "description": "The name of the party member", + "type": ["null", "string"] + } + } + } + } + }, + }, + "streams": [ + "#/definitions/incremental_party_members_skills_stream" + ], + "check": { + "stream_names": ["incremental_party_members_skills"] + }, + "concurrency_level": { + "type": "ConcurrencyLevel", + "default_concurrency": "{{ config['num_workers'] or 10 }}", + "max_concurrency": 25, + } + } + + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name="incremental_party_members_skills", json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ) + ] + ) + + state = [] + + source = ConcurrentDeclarativeSource(source_config=manifest, config=_CONFIG, catalog=catalog, state=state) + + assert len(source._concurrent_streams) == 0 + assert len(source._synchronous_streams) == 1 + + +def create_wrapped_stream(stream: DeclarativeStream) -> Stream: + slice_to_records_mapping = get_mocked_read_records_output(stream_name=stream.name) + + return DeclarativeStreamDecorator(declarative_stream=stream, slice_to_records_mapping=slice_to_records_mapping) + + +def get_mocked_read_records_output(stream_name: str) -> Mapping[tuple[str, str], List[StreamData]]: + match stream_name: + case "locations": + slices = [ + # Slices used during first incremental sync + StreamSlice(cursor_slice={"start": "2024-07-01", "end": "2024-07-31"}, partition={}), + StreamSlice(cursor_slice={"start": "2024-08-01", "end": "2024-08-31"}, partition={}), + StreamSlice(cursor_slice={"start": "2024-09-01", "end": "2024-09-09"}, partition={}), + + # Slices used during incremental checkpoint sync + StreamSlice(cursor_slice={'start': '2024-07-26', 'end': '2024-08-25'}, partition={}), + StreamSlice(cursor_slice={'start': '2024-08-26', 'end': '2024-09-09'}, partition={}), + + # Slices used during incremental sync with some partitions that exit with an error + StreamSlice(cursor_slice={"start": "2024-07-05", "end": "2024-08-04"}, partition={}), + StreamSlice(cursor_slice={"start": "2024-08-05", "end": "2024-09-04"}, partition={}), + StreamSlice(cursor_slice={"start": "2024-09-05", "end": "2024-09-09"}, partition={}), + ] + + records = [ + {"id": "444", "name": "Yongen-jaya", "updated_at": "2024-08-10"}, + {"id": "scramble", "name": "Shibuya", "updated_at": "2024-08-10"}, + {"id": "aoyama", "name": "Aoyama-itchome", "updated_at": "2024-08-10"}, + {"id": "shin123", "name": "Shinjuku", "updated_at": "2024-08-10"}, + ] + case "party_members": + slices = [ + # Slices used during first incremental sync + StreamSlice(cursor_slice={"start": "2024-07-01", "end": "2024-07-15"}, partition={}), + StreamSlice(cursor_slice={"start": "2024-07-16", "end": "2024-07-30"}, partition={}), + StreamSlice(cursor_slice={"start": "2024-07-31", "end": "2024-08-14"}, partition={}), + StreamSlice(cursor_slice={"start": "2024-08-15", "end": "2024-08-29"}, partition={}), + StreamSlice(cursor_slice={"start": "2024-08-30", "end": "2024-09-09"}, partition={}), + + # Slices used during incremental checkpoint sync. 
Unsuccessful partitions use the P5D lookback window which explains + # the skew of records midway through + StreamSlice(cursor_slice={"start": "2024-07-01", "end": "2024-07-16"}, partition={}), + StreamSlice(cursor_slice={'start': '2024-07-30', 'end': '2024-08-13'}, partition={}), + StreamSlice(cursor_slice={'start': '2024-08-14', 'end': '2024-08-14'}, partition={}), + StreamSlice(cursor_slice={'start': '2024-09-04', 'end': '2024-09-09'}, partition={}), + ] + + records = [ + {"id": "amamiya", "first_name": "ren", "last_name": "amamiya", "updated_at": "2024-07-10"}, + {"id": "nijima", "first_name": "makoto", "last_name": "nijima", "updated_at": "2024-08-10"}, + {"id": "yoshizawa", "first_name": "sumire", "last_name": "yoshizawa", "updated_at": "2024-09-10"}, + ] + case "palaces": + slices = [StreamSlice(cursor_slice={}, partition={})] + + records = [ + {"id": "0", "world": "castle", "owner": "kamoshida"}, + {"id": "1", "world": "museum", "owner": "madarame"}, + {"id": "2", "world": "bank", "owner": "kaneshiro"}, + {"id": "3", "world": "pyramid", "owner": "futaba"}, + {"id": "4", "world": "spaceport", "owner": "okumura"}, + {"id": "5", "world": "casino", "owner": "nijima"}, + {"id": "6", "world": "cruiser", "owner": "shido"}, + ] + + case "party_members_skills": + slices = [StreamSlice(cursor_slice={}, partition={})] + + records = [ + {"id": "0", "name": "hassou tobi"}, + {"id": "1", "name": "mafreidyne"}, + {"id": "2", "name": "myriad truths"}, + ] + case _: + raise ValueError(f"Stream '{stream_name}' does not have associated mocked records") + + return {(_slice.get("start"), _slice.get("end")): [Record(data=stream_data, associated_slice=_slice) for stream_data in records] for _slice in slices} + + +def get_records_for_stream(stream_name: str, messages: List[AirbyteMessage]) -> List[AirbyteRecordMessage]: + return [message.record for message in messages if message.record and message.record.stream == stream_name] + + +def get_states_for_stream(stream_name: str, messages: List[AirbyteMessage]) -> List[AirbyteStateMessage]: + return [message.state for message in messages if message.state and message.state.stream.stream_descriptor.name == stream_name] + + +def disable_emitting_sequential_state_messages(source: ConcurrentDeclarativeSource) -> None: + for concurrent_streams in source._concurrent_streams: # type: ignore # This is the easiest way to disable behavior from the test + concurrent_streams.cursor._connector_state_converter._is_sequential_state = False # type: ignore # see above diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/test_declarative_stream.py b/airbyte-cdk/python/unit_tests/sources/declarative/test_declarative_stream.py new file mode 100644 index 000000000000..8906b625fb8f --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/test_declarative_stream.py @@ -0,0 +1,221 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteTraceMessage, Level, SyncMode, TraceType, Type +from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream +from airbyte_cdk.sources.types import StreamSlice + +SLICE_NOT_CONSIDERED_FOR_EQUALITY = {} + +_name = "stream" +_primary_key = "pk" +_cursor_field = "created_at" +_json_schema = {"name": {"type": "string"}} + + +def test_declarative_stream(): + schema_loader = _schema_loader() + + state = MagicMock() + records = [ + {"pk": 1234, "field": "value"}, + {"pk": 4567, "field": "different_value"}, + AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=Level.INFO, message="This is a log message")), + AirbyteMessage(type=Type.TRACE, trace=AirbyteTraceMessage(type=TraceType.ERROR, emitted_at=12345)), + ] + stream_slices = [ + StreamSlice(partition={}, cursor_slice={"date": "2021-01-01"}), + StreamSlice(partition={}, cursor_slice={"date": "2021-01-02"}), + StreamSlice(partition={}, cursor_slice={"date": "2021-01-03"}), + ] + + retriever = MagicMock() + retriever.state = state + retriever.read_records.return_value = records + retriever.stream_slices.return_value = stream_slices + + config = {"api_key": "open_sesame"} + + stream = DeclarativeStream( + name=_name, + primary_key=_primary_key, + stream_cursor_field="{{ parameters['cursor_field'] }}", + schema_loader=schema_loader, + retriever=retriever, + config=config, + parameters={"cursor_field": "created_at"}, + ) + + assert stream.name == _name + assert stream.get_json_schema() == _json_schema + assert stream.state == state + input_slice = stream_slices[0] + assert list(stream.read_records(SyncMode.full_refresh, _cursor_field, input_slice, state)) == records + assert stream.primary_key == _primary_key + assert stream.cursor_field == _cursor_field + assert stream.stream_slices(sync_mode=SyncMode.incremental, cursor_field=_cursor_field, stream_state=None) == stream_slices + + +def test_declarative_stream_using_empty_slice(): + """ + Tests that a declarative stream can still read records when an empty mapping is passed as the stream slice. + """ + schema_loader = _schema_loader() + + records = [ + {"pk": 1234, "field": "value"}, + {"pk": 4567, "field": "different_value"}, + AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=Level.INFO, message="This is a log message")), + AirbyteMessage(type=Type.TRACE, trace=AirbyteTraceMessage(type=TraceType.ERROR, emitted_at=12345)), + ] + + retriever = MagicMock() + retriever.read_records.return_value = records + + config = {"api_key": "open_sesame"} + + stream = DeclarativeStream( + name=_name, + primary_key=_primary_key, + stream_cursor_field="{{ parameters['cursor_field'] }}", + schema_loader=schema_loader, + retriever=retriever, + config=config, + parameters={"cursor_field": "created_at"}, + ) + + assert stream.name == _name + assert stream.get_json_schema() == _json_schema + assert list(stream.read_records(SyncMode.full_refresh, _cursor_field, {})) == records + + +def test_read_records_raises_exception_if_stream_slice_is_not_per_partition_stream_slice(): + schema_loader = _schema_loader() + + retriever = MagicMock() + retriever.state = MagicMock() + retriever.read_records.return_value = [] + stream_slice = {"date": "2021-01-01"} + retriever.stream_slices.return_value = [stream_slice] + + stream = DeclarativeStream( + name=_name, + primary_key=_primary_key, + stream_cursor_field="{{ parameters['cursor_field'] }}", + schema_loader=schema_loader, + retriever=retriever, + config={}, + parameters={"cursor_field": 
"created_at"}, + ) + + with pytest.raises(ValueError): + list(stream.read_records(SyncMode.full_refresh, _cursor_field, stream_slice, MagicMock())) + + +def test_state_checkpoint_interval(): + stream = DeclarativeStream( + name="any name", + primary_key="any primary key", + stream_cursor_field="{{ parameters['cursor_field'] }}", + schema_loader=MagicMock(), + retriever=MagicMock(), + config={}, + parameters={}, + ) + + assert stream.state_checkpoint_interval is None + + +def test_state_migrations(): + intermediate_state = {"another_key", "another_value"} + final_state = {"yet_another_key", "yet_another_value"} + first_state_migration = MagicMock() + first_state_migration.should_migrate.return_value = True + first_state_migration.migrate.return_value = intermediate_state + second_state_migration = MagicMock() + second_state_migration.should_migrate.return_value = True + second_state_migration.migrate.return_value = final_state + + stream = DeclarativeStream( + name="any name", + primary_key="any primary key", + stream_cursor_field="{{ parameters['cursor_field'] }}", + schema_loader=MagicMock(), + retriever=MagicMock(), + state_migrations=[first_state_migration, second_state_migration], + config={}, + parameters={}, + ) + + input_state = {"a_key": "a_value"} + + stream.state = input_state + assert stream.state == final_state + first_state_migration.should_migrate.assert_called_once_with(input_state) + first_state_migration.migrate.assert_called_once_with(input_state) + second_state_migration.should_migrate.assert_called_once_with(intermediate_state) + second_state_migration.migrate.assert_called_once_with(intermediate_state) + + +def test_no_state_migration_is_applied_if_the_state_should_not_be_migrated(): + state_migration = MagicMock() + state_migration.should_migrate.return_value = False + + stream = DeclarativeStream( + name="any name", + primary_key="any primary key", + stream_cursor_field="{{ parameters['cursor_field'] }}", + schema_loader=MagicMock(), + retriever=MagicMock(), + state_migrations=[state_migration], + config={}, + parameters={}, + ) + + input_state = {"a_key": "a_value"} + + stream.state = input_state + assert stream.state == input_state + state_migration.should_migrate.assert_called_once_with(input_state) + assert not state_migration.migrate.called + + +@pytest.mark.parametrize( + "use_cursor, expected_supports_checkpointing", + [ + pytest.param(True, True, id="test_retriever_has_cursor"), + pytest.param(False, False, id="test_retriever_has_cursor"), + ], +) +def test_is_resumable(use_cursor, expected_supports_checkpointing): + schema_loader = _schema_loader() + + state = MagicMock() + + retriever = MagicMock() + retriever.state = state + retriever.cursor = MagicMock() if use_cursor else None + + config = {"api_key": "open_sesame"} + + stream = DeclarativeStream( + name=_name, + primary_key=_primary_key, + stream_cursor_field="{{ parameters['cursor_field'] }}", + schema_loader=schema_loader, + retriever=retriever, + config=config, + parameters={"cursor_field": "created_at"}, + ) + + assert stream.is_resumable == expected_supports_checkpointing + + +def _schema_loader(): + schema_loader = MagicMock() + schema_loader.get_json_schema.return_value = _json_schema + return schema_loader diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/test_manifest_declarative_source.py b/airbyte-cdk/python/unit_tests/sources/declarative/test_manifest_declarative_source.py new file mode 100644 index 000000000000..2d350fa12b4b --- /dev/null +++ 
b/airbyte-cdk/python/unit_tests/sources/declarative/test_manifest_declarative_source.py @@ -0,0 +1,1316 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json +import logging +import os +import sys +from copy import deepcopy +from pathlib import Path +from typing import Any, List, Mapping +from unittest.mock import call, patch + +import pytest +import requests +import yaml +from airbyte_cdk.models import ( + AirbyteLogMessage, + AirbyteMessage, + AirbyteStream, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + DestinationSyncMode, + Level, + SyncMode, + Type, +) +from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream +from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource +from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever +from jsonschema.exceptions import ValidationError + +logger = logging.getLogger("airbyte") + +EXTERNAL_CONNECTION_SPECIFICATION = { + "type": "object", + "required": ["api_token"], + "additionalProperties": False, + "properties": {"api_token": {"type": "string"}}, +} + + +class MockManifestDeclarativeSource(ManifestDeclarativeSource): + """ + Mock test class that is needed to monkey patch how we read from various files that make up a declarative source because of how our + tests write configuration files during testing. It is also used to properly namespace where files get written in specific + cases like when we temporarily write files like spec.yaml to the package unit_tests, which is the directory where it will + be read in during the tests. + """ + + +class TestManifestDeclarativeSource: + @pytest.fixture + def use_external_yaml_spec(self): + # Our way of resolving the absolute path to the root of the airbyte-cdk unit test directory where spec.yaml files should + # be written to (i.e. ~/airbyte/airbyte-cdk/python/unit_tests) because that is where they are read from during testing. 
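+ # The temporary spec.yaml is cleaned up again in this fixture's teardown (after the yield below).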
+ module = sys.modules[__name__] + module_path = os.path.abspath(module.__file__) + test_path = os.path.dirname(module_path) + spec_root = test_path.split("/sources/declarative")[0] + + spec = {"documentationUrl": "https://airbyte.com/#yaml-from-external", "connectionSpecification": EXTERNAL_CONNECTION_SPECIFICATION} + + yaml_path = os.path.join(spec_root, "spec.yaml") + with open(yaml_path, "w") as f: + f.write(yaml.dump(spec)) + yield + os.remove(yaml_path) + + def test_valid_manifest(self): + manifest = { + "version": "3.8.2", + "definitions": {}, + "description": "This is a sample source connector that is very valid.", + "streams": [ + { + "type": "DeclarativeStream", + "$parameters": {"name": "lists", "primary_key": "id", "url_base": "https://api.sendgrid.com"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ response._metadata.next }}", + "page_size": 10, + }, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": "{{ 10 }}"}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + { + "type": "DeclarativeStream", + "$parameters": {"name": "stream_with_custom_requester", "primary_key": "id", "url_base": "https://api.sendgrid.com"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ response._metadata.next }}", + "page_size": 10, + }, + }, + "requester": { + "type": "CustomRequester", + "class_name": "unit_tests.sources.declarative.external_component.SampleCustomComponent", + "path": "/v3/marketing/lists", + "custom_request_parameters": {"page_size": 10}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + ], + "check": {"type": "CheckStream", "stream_names": ["lists"]}, + } + source = ManifestDeclarativeSource(source_config=manifest) + + check_stream = source.connection_checker + check_stream.check_connection(source, logging.getLogger(""), {}) + + streams = source.streams({}) + assert len(streams) == 2 + assert isinstance(streams[0], DeclarativeStream) + assert isinstance(streams[1], DeclarativeStream) + assert source.resolved_manifest["description"] == "This is a sample source connector that is very valid." 
+ + def test_manifest_with_spec(self): + manifest = { + "version": "0.29.3", + "definitions": { + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": {"type": "CursorPagination", "cursor_value": "{{ response._metadata.next }}"}, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": "{{ 10 }}"}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + "streams": [ + { + "type": "DeclarativeStream", + "$parameters": {"name": "lists", "primary_key": "id", "url_base": "https://api.sendgrid.com"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": {"type": "CursorPagination", "cursor_value": "{{ response._metadata.next }}"}, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": "{{ 10 }}"}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + } + ], + "check": {"type": "CheckStream", "stream_names": ["lists"]}, + "spec": { + "type": "Spec", + "documentation_url": "https://airbyte.com/#yaml-from-manifest", + "connection_specification": { + "title": "Test Spec", + "type": "object", + "required": ["api_key"], + "additionalProperties": False, + "properties": { + "api_key": {"type": "string", "airbyte_secret": True, "title": "API Key", "description": "Test API Key", "order": 0} + }, + }, + }, + } + source = ManifestDeclarativeSource(source_config=manifest) + connector_specification = source.spec(logger) + assert connector_specification is not None + assert connector_specification.documentationUrl == "https://airbyte.com/#yaml-from-manifest" + assert connector_specification.connectionSpecification["title"] == "Test Spec" + assert connector_specification.connectionSpecification["required"][0] == "api_key" + assert connector_specification.connectionSpecification["additionalProperties"] is False + assert connector_specification.connectionSpecification["properties"]["api_key"] == { + "type": "string", + "airbyte_secret": True, + "title": "API Key", + "description": "Test API Key", + "order": 0, + } + + def test_manifest_with_external_spec(self, use_external_yaml_spec): + manifest = { + "version": "0.29.3", + "definitions": { + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": {"type": "CursorPagination", "cursor_value": "{{ response._metadata.next }}"}, + }, + 
"requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": "{{ 10 }}"}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + "streams": [ + { + "type": "DeclarativeStream", + "$parameters": {"name": "lists", "primary_key": "id", "url_base": "https://api.sendgrid.com"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": {"type": "CursorPagination", "cursor_value": "{{ response._metadata.next }}"}, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": "{{ 10 }}"}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + } + ], + "check": {"type": "CheckStream", "stream_names": ["lists"]}, + } + source = MockManifestDeclarativeSource(source_config=manifest) + + connector_specification = source.spec(logger) + + assert connector_specification.documentationUrl == "https://airbyte.com/#yaml-from-external" + assert connector_specification.connectionSpecification == EXTERNAL_CONNECTION_SPECIFICATION + + def test_source_is_not_created_if_toplevel_fields_are_unknown(self): + manifest = { + "version": "0.29.3", + "definitions": { + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": {"type": "CursorPagination", "cursor_value": "{{ response._metadata.next }}"}, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": 10}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + "streams": [ + { + "type": "DeclarativeStream", + "$parameters": {"name": "lists", "primary_key": "id", "url_base": "https://api.sendgrid.com"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": {"type": "CursorPagination", "cursor_value": "{{ response._metadata.next }}"}, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": 10}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + } + ], + "check": {"type": "CheckStream", "stream_names": ["lists"]}, + "not_a_valid_field": "error", + } + with pytest.raises(ValidationError): + ManifestDeclarativeSource(source_config=manifest) + + def 
test_source_missing_checker_fails_validation(self): + manifest = { + "version": "0.29.3", + "definitions": { + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": {"type": "CursorPagination", "cursor_value": "{{ response._metadata.next }}"}, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": 10}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + "streams": [ + { + "type": "DeclarativeStream", + "$parameters": {"name": "lists", "primary_key": "id", "url_base": "https://api.sendgrid.com"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": {"type": "CursorPagination", "cursor_value": "{{ response._metadata.next }}"}, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": 10}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + } + ], + "check": {"type": "CheckStream"}, + } + with pytest.raises(ValidationError): + ManifestDeclarativeSource(source_config=manifest) + + def test_source_with_missing_streams_fails(self): + manifest = {"version": "0.29.3", "definitions": None, "check": {"type": "CheckStream", "stream_names": ["lists"]}} + with pytest.raises(ValidationError): + ManifestDeclarativeSource(source_config=manifest) + + def test_source_with_missing_version_fails(self): + manifest = { + "definitions": { + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": {"type": "CursorPagination", "cursor_value": "{{ response._metadata.next }}"}, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": 10}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + "streams": [ + { + "type": "DeclarativeStream", + "$parameters": {"name": "lists", "primary_key": "id", "url_base": "https://api.sendgrid.com"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": 
{"type": "CursorPagination", "cursor_value": "{{ response._metadata.next }}"}, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": 10}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + } + ], + "check": {"type": "CheckStream", "stream_names": ["lists"]}, + } + with pytest.raises(ValidationError): + ManifestDeclarativeSource(source_config=manifest) + + @pytest.mark.parametrize( + "cdk_version, manifest_version, expected_error", + [ + pytest.param("0.35.0", "0.30.0", None, id="manifest_version_less_than_cdk_package_should_run"), + pytest.param("1.5.0", "0.29.0", None, id="manifest_version_less_than_cdk_major_package_should_run"), + pytest.param("0.29.0", "0.29.0", None, id="manifest_version_matching_cdk_package_should_run"), + pytest.param( + "0.29.0", + "0.25.0", + ValidationError, + id="manifest_version_before_beta_that_uses_the_beta_0.29.0_cdk_package_should_throw_error", + ), + pytest.param( + "1.5.0", + "0.25.0", + ValidationError, + id="manifest_version_before_beta_that_uses_package_later_major_version_than_beta_0.29.0_cdk_package_should_throw_error", + ), + pytest.param("0.34.0", "0.35.0", ValidationError, id="manifest_version_greater_than_cdk_package_should_throw_error"), + pytest.param("0.29.0", "-1.5.0", ValidationError, id="manifest_version_has_invalid_major_format"), + pytest.param("0.29.0", "0.invalid.0", ValidationError, id="manifest_version_has_invalid_minor_format"), + pytest.param("0.29.0", "0.29.0.1", ValidationError, id="manifest_version_has_extra_version_parts"), + pytest.param("0.29.0", "5.0", ValidationError, id="manifest_version_has_too_few_version_parts"), + pytest.param("0.29.0:dev", "0.29.0", ValidationError, id="manifest_version_has_extra_release"), + ], + ) + @patch("importlib.metadata.version") + def test_manifest_versions(self, version, cdk_version, manifest_version, expected_error): + # Used to mock the metadata.version() for test scenarios which normally returns the actual version of the airbyte-cdk package + version.return_value = cdk_version + + manifest = { + "version": manifest_version, + "definitions": {}, + "streams": [ + { + "type": "DeclarativeStream", + "$parameters": {"name": "lists", "primary_key": "id", "url_base": "https://api.sendgrid.com"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ response._metadata.next }}", + "page_size": 10, + }, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": "{{ 10 }}"}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + { + "type": "DeclarativeStream", + "$parameters": {"name": "stream_with_custom_requester", "primary_key": "id", "url_base": "https://api.sendgrid.com"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + 
"page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ response._metadata.next }}", + "page_size": 10, + }, + }, + "requester": { + "type": "CustomRequester", + "class_name": "unit_tests.sources.declarative.external_component.SampleCustomComponent", + "path": "/v3/marketing/lists", + "custom_request_parameters": {"page_size": 10}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + ], + "check": {"type": "CheckStream", "stream_names": ["lists"]}, + } + if expected_error: + with pytest.raises(expected_error): + ManifestDeclarativeSource(source_config=manifest) + else: + ManifestDeclarativeSource(source_config=manifest) + + def test_source_with_invalid_stream_config_fails_validation(self): + manifest = { + "version": "0.29.3", + "definitions": { + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + } + }, + "streams": [ + { + "type": "DeclarativeStream", + "$parameters": {"name": "lists", "primary_key": "id", "url_base": "https://api.sendgrid.com"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + } + ], + "check": {"type": "CheckStream", "stream_names": ["lists"]}, + } + with pytest.raises(ValidationError): + ManifestDeclarativeSource(source_config=manifest) + + def test_source_with_no_external_spec_and_no_in_yaml_spec_fails(self): + manifest = { + "version": "0.29.3", + "definitions": { + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": {"type": "CursorPagination", "cursor_value": "{{ response._metadata.next }}"}, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": "{{ 10 }}"}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + "streams": [ + { + "type": "DeclarativeStream", + "$parameters": {"name": "lists", "primary_key": "id", "url_base": "https://api.sendgrid.com"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": {"type": "CursorPagination", "cursor_value": "{{ response._metadata.next }}"}, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": "{{ 10 }}"}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + } + ], + "check": {"type": "CheckStream", "stream_names": ["lists"]}, + } + source = ManifestDeclarativeSource(source_config=manifest) + + # We expect to fail here because we have not created a 
temporary spec.yaml file + with pytest.raises(FileNotFoundError): + source.spec(logger) + + def test_manifest_without_at_least_one_stream(self): + manifest = { + "version": "0.29.3", + "definitions": { + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": {"type": "CursorPagination", "cursor_value": "{{ response._metadata.next }}"}, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": 10}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + "streams": [], + "check": {"type": "CheckStream", "stream_names": ["lists"]}, + } + with pytest.raises(ValidationError): + ManifestDeclarativeSource(source_config=manifest) + + @patch("airbyte_cdk.sources.declarative.declarative_source.DeclarativeSource.read") + def test_given_debug_when_read_then_set_log_level(self, declarative_source_read): + any_valid_manifest = { + "version": "0.29.3", + "definitions": { + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": {"type": "CursorPagination", "cursor_value": "{{ response._metadata.next }}"}, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": "10"}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + "streams": [ + { + "type": "DeclarativeStream", + "$parameters": {"name": "lists", "primary_key": "id", "url_base": "https://api.sendgrid.com"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ response._metadata.next }}", + "page_size": 10, + }, + }, + "requester": { + "path": "/v3/marketing/lists", + "authenticator": {"type": "BearerAuthenticator", "api_token": "{{ config.apikey }}"}, + "request_parameters": {"page_size": "{{ 10 }}"}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + { + "type": "DeclarativeStream", + "$parameters": {"name": "stream_with_custom_requester", "primary_key": "id", "url_base": "https://api.sendgrid.com"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page_size"}, + 
"page_token_option": {"type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ response._metadata.next }}", + "page_size": 10, + }, + }, + "requester": { + "type": "CustomRequester", + "class_name": "unit_tests.sources.declarative.external_component.SampleCustomComponent", + "path": "/v3/marketing/lists", + "custom_request_parameters": {"page_size": 10}, + }, + "record_selector": {"extractor": {"field_path": ["result"]}}, + }, + }, + ], + "check": {"type": "CheckStream", "stream_names": ["lists"]}, + } + source = ManifestDeclarativeSource(source_config=any_valid_manifest, debug=True) + + debug_logger = logging.getLogger("logger.debug") + list(source.read(debug_logger, {}, {}, {})) + + assert debug_logger.isEnabledFor(logging.DEBUG) + + +def request_log_message(request: dict) -> AirbyteMessage: + return AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=Level.INFO, message=f"request:{json.dumps(request)}")) + + +def response_log_message(response: dict) -> AirbyteMessage: + return AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=Level.INFO, message=f"response:{json.dumps(response)}")) + + +def _create_request(): + url = "https://example.com/api" + headers = {"Content-Type": "application/json"} + return requests.Request("POST", url, headers=headers, json={"key": "value"}).prepare() + + +def _create_response(body): + response = requests.Response() + response.status_code = 200 + response._content = bytes(json.dumps(body), "utf-8") + response.headers["Content-Type"] = "application/json" + return response + + +def _create_page(response_body): + response = _create_response(response_body) + response.request = _create_request() + return response + + +@pytest.mark.parametrize( + "test_name, manifest, pages, expected_records, expected_calls", + [ + ( + "test_read_manifest_no_pagination_no_partitions", + { + "version": "0.34.2", + "type": "DeclarativeSource", + "check": {"type": "CheckStream", "stream_names": ["Rates"]}, + "streams": [ + { + "type": "DeclarativeStream", + "name": "Rates", + "primary_key": [], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": { + "ABC": {"type": "number"}, + "AED": {"type": "number"}, + }, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.apilayer.com", + "path": "/exchangerates_data/latest", + "http_method": "GET", + "request_parameters": {}, + "request_headers": {}, + "request_body_json": {}, + "authenticator": { + "type": "ApiKeyAuthenticator", + "header": "apikey", + "api_token": "{{ config['api_key'] }}", + }, + }, + "record_selector": {"type": "RecordSelector", "extractor": {"type": "DpathExtractor", "field_path": ["rates"]}}, + "paginator": {"type": "NoPagination"}, + }, + } + ], + "spec": { + "connection_specification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": ["api_key"], + "properties": {"api_key": {"type": "string", "title": "API Key", "airbyte_secret": True}}, + "additionalProperties": True, + }, + "documentation_url": "https://example.org", + "type": "Spec", + }, + }, + ( + _create_page({"rates": [{"ABC": 0}, {"AED": 1}], "_metadata": {"next": "next"}}), + _create_page({"rates": [{"USD": 2}], "_metadata": {"next": "next"}}), + ) + * 10, + [{"ABC": 0}, {"AED": 1}], + [call({}, {})], + ), + ( + "test_read_manifest_with_added_fields", + { + "version": "0.34.2", + "type": 
"DeclarativeSource", + "check": {"type": "CheckStream", "stream_names": ["Rates"]}, + "streams": [ + { + "type": "DeclarativeStream", + "name": "Rates", + "primary_key": [], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": { + "ABC": {"type": "number"}, + "AED": {"type": "number"}, + }, + "type": "object", + }, + }, + "transformations": [ + { + "type": "AddFields", + "fields": [{"type": "AddedFieldDefinition", "path": ["added_field_key"], "value": "added_field_value"}], + } + ], + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.apilayer.com", + "path": "/exchangerates_data/latest", + "http_method": "GET", + "request_parameters": {}, + "request_headers": {}, + "request_body_json": {}, + "authenticator": { + "type": "ApiKeyAuthenticator", + "header": "apikey", + "api_token": "{{ config['api_key'] }}", + }, + }, + "record_selector": {"type": "RecordSelector", "extractor": {"type": "DpathExtractor", "field_path": ["rates"]}}, + "paginator": {"type": "NoPagination"}, + }, + } + ], + "spec": { + "connection_specification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": ["api_key"], + "properties": {"api_key": {"type": "string", "title": "API Key", "airbyte_secret": True}}, + "additionalProperties": True, + }, + "documentation_url": "https://example.org", + "type": "Spec", + }, + }, + ( + _create_page({"rates": [{"ABC": 0}, {"AED": 1}], "_metadata": {"next": "next"}}), + _create_page({"rates": [{"USD": 2}], "_metadata": {"next": "next"}}), + ) + * 10, + [{"ABC": 0, "added_field_key": "added_field_value"}, {"AED": 1, "added_field_key": "added_field_value"}], + [call({}, {})], + ), + ( + "test_read_with_pagination_no_partitions", + { + "version": "0.34.2", + "type": "DeclarativeSource", + "check": {"type": "CheckStream", "stream_names": ["Rates"]}, + "streams": [ + { + "type": "DeclarativeStream", + "name": "Rates", + "primary_key": [], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": { + "ABC": {"type": "number"}, + "AED": {"type": "number"}, + "USD": {"type": "number"}, + }, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.apilayer.com", + "path": "/exchangerates_data/latest", + "http_method": "GET", + "request_parameters": {}, + "request_headers": {}, + "request_body_json": {}, + "authenticator": { + "type": "ApiKeyAuthenticator", + "header": "apikey", + "api_token": "{{ config['api_key'] }}", + }, + }, + "record_selector": {"type": "RecordSelector", "extractor": {"type": "DpathExtractor", "field_path": ["rates"]}}, + "paginator": { + "type": "DefaultPaginator", + "page_size": 2, + "page_size_option": {"inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"inject_into": "path", "type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ response._metadata.next }}", + "page_size": 2, + }, + }, + }, + } + ], + "spec": { + "connection_specification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": ["api_key"], + "properties": {"api_key": {"type": "string", "title": "API Key", "airbyte_secret": True}}, + "additionalProperties": True, + }, + "documentation_url": "https://example.org", + "type": "Spec", + }, + }, + ( 
+ _create_page({"rates": [{"ABC": 0}, {"AED": 1}], "_metadata": {"next": "next"}}), + _create_page({"rates": [{"USD": 2}], "_metadata": {}}), + ) + * 10, + [{"ABC": 0}, {"AED": 1}, {"USD": 2}], + [call({}, {}), call({"next_page_token": "next"}, {"next_page_token": "next"})], + ), + ( + "test_no_pagination_with_partition_router", + { + "version": "0.34.2", + "type": "DeclarativeSource", + "check": {"type": "CheckStream", "stream_names": ["Rates"]}, + "streams": [ + { + "type": "DeclarativeStream", + "name": "Rates", + "primary_key": [], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": {"ABC": {"type": "number"}, "AED": {"type": "number"}, "partition": {"type": "number"}}, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.apilayer.com", + "path": "/exchangerates_data/latest", + "http_method": "GET", + "request_parameters": {}, + "request_headers": {}, + "request_body_json": {}, + "authenticator": { + "type": "ApiKeyAuthenticator", + "header": "apikey", + "api_token": "{{ config['api_key'] }}", + }, + }, + "partition_router": {"type": "ListPartitionRouter", "values": ["0", "1"], "cursor_field": "partition"}, + "record_selector": {"type": "RecordSelector", "extractor": {"type": "DpathExtractor", "field_path": ["rates"]}}, + "paginator": {"type": "NoPagination"}, + }, + } + ], + "spec": { + "connection_specification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": ["api_key"], + "properties": {"api_key": {"type": "string", "title": "API Key", "airbyte_secret": True}}, + "additionalProperties": True, + }, + "documentation_url": "https://example.org", + "type": "Spec", + }, + }, + ( + _create_page({"rates": [{"ABC": 0, "partition": 0}, {"AED": 1, "partition": 0}], "_metadata": {"next": "next"}}), + _create_page({"rates": [{"ABC": 2, "partition": 1}], "_metadata": {"next": "next"}}), + ), + [{"ABC": 0, "partition": 0}, {"AED": 1, "partition": 0}, {"ABC": 2, "partition": 1}], + [ + call({"states": []}, {"partition": "0"}, None), + call( + {"states": [{"partition": {"partition": "0"}, "cursor": {"__ab_full_refresh_sync_complete": True}}]}, + {"partition": "1"}, + None, + ), + ], + ), + ( + "test_with_pagination_and_partition_router", + { + "version": "0.34.2", + "type": "DeclarativeSource", + "check": {"type": "CheckStream", "stream_names": ["Rates"]}, + "streams": [ + { + "type": "DeclarativeStream", + "name": "Rates", + "primary_key": [], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": {"ABC": {"type": "number"}, "AED": {"type": "number"}, "partition": {"type": "number"}}, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.apilayer.com", + "path": "/exchangerates_data/latest", + "http_method": "GET", + "request_parameters": {}, + "request_headers": {}, + "request_body_json": {}, + "authenticator": { + "type": "ApiKeyAuthenticator", + "header": "apikey", + "api_token": "{{ config['api_key'] }}", + }, + }, + "partition_router": {"type": "ListPartitionRouter", "values": ["0", "1"], "cursor_field": "partition"}, + "record_selector": {"type": "RecordSelector", "extractor": {"type": "DpathExtractor", "field_path": ["rates"]}}, + "paginator": { + "type": "DefaultPaginator", + "page_size": 2, + 
"page_size_option": {"inject_into": "request_parameter", "field_name": "page_size"}, + "page_token_option": {"inject_into": "path", "type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ response._metadata.next }}", + "page_size": 2, + }, + }, + }, + } + ], + "spec": { + "connection_specification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": ["api_key"], + "properties": {"api_key": {"type": "string", "title": "API Key", "airbyte_secret": True}}, + "additionalProperties": True, + }, + "documentation_url": "https://example.org", + "type": "Spec", + }, + }, + ( + _create_page({"rates": [{"ABC": 0, "partition": 0}, {"AED": 1, "partition": 0}], "_metadata": {"next": "next"}}), + _create_page({"rates": [{"USD": 3, "partition": 0}], "_metadata": {}}), + _create_page({"rates": [{"ABC": 2, "partition": 1}], "_metadata": {}}), + ), + [{"ABC": 0, "partition": 0}, {"AED": 1, "partition": 0}, {"USD": 3, "partition": 0}, {"ABC": 2, "partition": 1}], + [ + call({"states": []}, {"partition": "0"}, None), + call({"states": []}, {"partition": "0"}, {"next_page_token": "next"}), + call( + {"states": [{"partition": {"partition": "0"}, "cursor": {"__ab_full_refresh_sync_complete": True}}]}, + {"partition": "1"}, + None, + ), + ], + ), + ], +) +def test_read_manifest_declarative_source(test_name, manifest, pages, expected_records, expected_calls): + _stream_name = "Rates" + with patch.object(SimpleRetriever, "_fetch_next_page", side_effect=pages) as mock_retriever: + output_data = [message.record.data for message in _run_read(manifest, _stream_name) if message.record] + assert output_data == expected_records + mock_retriever.assert_has_calls(expected_calls) + + +def test_only_parent_streams_use_cache(): + applications_stream = { + "type": "DeclarativeStream", + "$parameters": {"name": "applications", "primary_key": "id", "url_base": "https://harvest.greenhouse.io/v1/"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "per_page"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ headers['link']['next']['url'] }}", + "stop_condition": "{{ 'next' not in headers['link'] }}", + "page_size": 100, + }, + }, + "requester": { + "path": "applications", + "authenticator": {"type": "BasicHttpAuthenticator", "username": "{{ config['api_key'] }}"}, + }, + "record_selector": {"extractor": {"type": "DpathExtractor", "field_path": []}}, + }, + } + + manifest = { + "version": "0.29.3", + "definitions": {}, + "streams": [ + deepcopy(applications_stream), + { + "type": "DeclarativeStream", + "$parameters": {"name": "applications_interviews", "primary_key": "id", "url_base": "https://harvest.greenhouse.io/v1/"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "per_page"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ headers['link']['next']['url'] 
}}", + "stop_condition": "{{ 'next' not in headers['link'] }}", + "page_size": 100, + }, + }, + "requester": { + "path": "applications_interviews", + "authenticator": {"type": "BasicHttpAuthenticator", "username": "{{ config['api_key'] }}"}, + }, + "record_selector": {"extractor": {"type": "DpathExtractor", "field_path": []}}, + "partition_router": { + "parent_stream_configs": [ + {"parent_key": "id", "partition_field": "parent_id", "stream": deepcopy(applications_stream)} + ], + "type": "SubstreamPartitionRouter", + }, + }, + }, + { + "type": "DeclarativeStream", + "$parameters": {"name": "jobs", "primary_key": "id", "url_base": "https://harvest.greenhouse.io/v1/"}, + "schema_loader": { + "name": "{{ parameters.stream_name }}", + "file_path": "./source_sendgrid/schemas/{{ parameters.name }}.yaml", + }, + "retriever": { + "paginator": { + "type": "DefaultPaginator", + "page_size": 10, + "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "per_page"}, + "page_token_option": {"type": "RequestPath"}, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": "{{ headers['link']['next']['url'] }}", + "stop_condition": "{{ 'next' not in headers['link'] }}", + "page_size": 100, + }, + }, + "requester": { + "path": "jobs", + "authenticator": {"type": "BasicHttpAuthenticator", "username": "{{ config['api_key'] }}"}, + }, + "record_selector": {"extractor": {"type": "DpathExtractor", "field_path": []}}, + }, + }, + ], + "check": {"type": "CheckStream", "stream_names": ["applications"]}, + } + source = ManifestDeclarativeSource(source_config=manifest) + + streams = source.streams({}) + assert len(streams) == 3 + + # Main stream with caching (parent for substream `applications_interviews`) + assert streams[0].name == "applications" + assert streams[0].retriever.requester.use_cache + + # Substream + assert streams[1].name == "applications_interviews" + assert not streams[1].retriever.requester.use_cache + + # Parent stream created for substream + assert streams[1].retriever.stream_slicer._partition_router.parent_stream_configs[0].stream.name == "applications" + assert streams[1].retriever.stream_slicer._partition_router.parent_stream_configs[0].stream.retriever.requester.use_cache + + # Main stream without caching + assert streams[2].name == "jobs" + assert not streams[2].retriever.requester.use_cache + + +def _run_read(manifest: Mapping[str, Any], stream_name: str) -> List[AirbyteMessage]: + source = ManifestDeclarativeSource(source_config=manifest) + catalog = ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name=stream_name, json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ) + ] + ) + return list(source.read(logger, {}, catalog, {})) + + +def test_declarative_component_schema_valid_ref_links(): + def load_yaml(file_path) -> Mapping[str, Any]: + with open(file_path, "r") as file: + return yaml.safe_load(file) + + def extract_refs(data, base_path="#") -> List[str]: + refs = [] + if isinstance(data, dict): + for key, value in data.items(): + if key == "$ref" and isinstance(value, str) and value.startswith("#"): + ref_path = value + refs.append(ref_path) + else: + refs.extend(extract_refs(value, base_path)) + elif isinstance(data, list): + for item in data: + refs.extend(extract_refs(item, base_path)) + return refs + + def resolve_pointer(data: Mapping[str, Any], pointer: str) -> bool: + parts = 
pointer.split("/")[1:] # Skip the first empty part due to leading '#/' + current = data + try: + for part in parts: + part = part.replace("~1", "/").replace("~0", "~") # Unescape JSON Pointer + current = current[part] + return True + except (KeyError, TypeError): + return False + + def validate_refs(yaml_file: str) -> List[str]: + data = load_yaml(yaml_file) + refs = extract_refs(data) + invalid_refs = [ref for ref in refs if not resolve_pointer(data, ref.replace("#", ""))] + return invalid_refs + + yaml_file_path = ( + Path(__file__).resolve().parent.parent.parent.parent / "airbyte_cdk/sources/declarative/declarative_component_schema.yaml" + ) + assert not validate_refs(yaml_file_path) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/test_types.py b/airbyte-cdk/python/unit_tests/sources/declarative/test_types.py new file mode 100644 index 000000000000..b6eb42f940b6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/test_types.py @@ -0,0 +1,62 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + +import pytest +from airbyte_cdk.sources.types import StreamSlice + + +@pytest.mark.parametrize( + "stream_slice, expected_partition", + [ + pytest.param(StreamSlice(partition={}, cursor_slice={}), {}, id="test_partition_with_empty_partition"), + pytest.param( + StreamSlice(partition=StreamSlice(partition={}, cursor_slice={}), cursor_slice={}), {}, id="test_partition_nested_empty" + ), + pytest.param( + StreamSlice(partition={"key": "value"}, cursor_slice={}), {"key": "value"}, id="test_partition_with_mapping_partition" + ), + pytest.param(StreamSlice(partition={}, cursor_slice={"cursor": "value"}), {}, id="test_partition_with_only_cursor"), + pytest.param( + StreamSlice(partition=StreamSlice(partition={}, cursor_slice={}), cursor_slice={"cursor": "value"}), + {}, + id="test_partition_nested_empty_and_cursor_value_mapping", + ), + pytest.param( + StreamSlice(partition=StreamSlice(partition={}, cursor_slice={"cursor": "value"}), cursor_slice={}), + {}, + id="test_partition_nested_empty_and_cursor_value", + ), + ], +) +def test_partition(stream_slice, expected_partition): + partition = stream_slice.partition + + assert partition == expected_partition + + +@pytest.mark.parametrize( + "stream_slice, expected_cursor_slice", + [ + pytest.param(StreamSlice(partition={}, cursor_slice={}), {}, id="test_cursor_slice_with_empty_cursor"), + pytest.param( + StreamSlice(partition={}, cursor_slice=StreamSlice(partition={}, cursor_slice={})), {}, id="test_cursor_slice_nested_empty" + ), + pytest.param( + StreamSlice(partition={}, cursor_slice={"key": "value"}), {"key": "value"}, id="test_cursor_slice_with_mapping_cursor_slice" + ), + pytest.param(StreamSlice(partition={"partition": "value"}, cursor_slice={}), {}, id="test_cursor_slice_with_only_partition"), + pytest.param( + StreamSlice(partition={"partition": "value"}, cursor_slice=StreamSlice(partition={}, cursor_slice={})), + {}, + id="test_cursor_slice_nested_empty_and_partition_mapping", + ), + pytest.param( + StreamSlice(partition=StreamSlice(partition={"partition": "value"}, cursor_slice={}), cursor_slice={}), + {}, + id="test_cursor_slice_nested_empty_and_partition", + ), + ], +) +def test_cursor_slice(stream_slice, expected_cursor_slice): + cursor_slice = stream_slice.cursor_slice + + assert cursor_slice == expected_cursor_slice diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/test_yaml_declarative_source.py b/airbyte-cdk/python/unit_tests/sources/declarative/test_yaml_declarative_source.py new file 
mode 100644 index 000000000000..fc35f5b3d3f2 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/test_yaml_declarative_source.py @@ -0,0 +1,149 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +import os +import tempfile + +import pytest +from airbyte_cdk.sources.declarative.parsers.custom_exceptions import UndefinedReferenceException +from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource +from yaml.parser import ParserError + +logger = logging.getLogger("airbyte") + + +EXTERNAL_CONNECTION_SPECIFICATION = { + "type": "object", + "required": ["api_token"], + "additionalProperties": False, + "properties": {"api_token": {"type": "string"}}, +} + + +class MockYamlDeclarativeSource(YamlDeclarativeSource): + """ + Mock test class that is needed to monkey patch how we read from various files that make up a declarative source because of how our + tests write configuration files during testing. It is also used to properly namespace where files get written in specific + cases like when we temporarily write files like spec.yaml to the package unit_tests, which is the directory where it will + be read in during the tests. + """ + + def _read_and_parse_yaml_file(self, path_to_yaml_file): + """ + We override the default behavior because we use tempfile to write the yaml manifest to a temporary directory which is + not mounted during runtime which prevents pkgutil.get_data() from being able to find the yaml file needed to generate + # the declarative source. For tests we use open() which supports using an absolute path. + """ + with open(path_to_yaml_file, "r") as f: + config_content = f.read() + parsed_config = YamlDeclarativeSource._parse(config_content) + return parsed_config + + +class TestYamlDeclarativeSource: + def test_source_is_created_if_toplevel_fields_are_known(self): + content = """ + version: "0.29.3" + definitions: + schema_loader: + name: "{{ parameters.stream_name }}" + file_path: "./source_sendgrid/schemas/{{ parameters.name }}.yaml" + retriever: + paginator: + type: "DefaultPaginator" + page_size: 10 + page_size_option: + inject_into: request_parameter + field_name: page_size + page_token_option: + type: RequestPath + pagination_strategy: + type: "CursorPagination" + cursor_value: "{{ response._metadata.next }}" + page_size: 10 + requester: + url_base: "https://api.sendgrid.com" + path: "/v3/marketing/lists" + authenticator: + type: "BearerAuthenticator" + api_token: "{{ config.apikey }}" + request_parameters: + page_size: "{{ 10 }}" + record_selector: + extractor: + field_path: ["result"] + streams: + - type: DeclarativeStream + $parameters: + name: "lists" + primary_key: id + schema_loader: "#/definitions/schema_loader" + retriever: "#/definitions/retriever" + check: + type: CheckStream + stream_names: ["lists"] + """ + temporary_file = TestFileContent(content) + MockYamlDeclarativeSource(temporary_file.filename) + + def test_source_fails_for_invalid_yaml(self): + content = """ + version: "version" + definitions: + this is not parsable yaml: " at all + streams: + - type: DeclarativeStream + $parameters: + name: "lists" + primary_key: id + url_base: "https://api.sendgrid.com" + check: + type: CheckStream + stream_names: ["lists"] + """ + temporary_file = TestFileContent(content) + with pytest.raises(ParserError): + MockYamlDeclarativeSource(temporary_file.filename) + + def test_source_with_missing_reference_fails(self): + content = """ + version: "version" + definitions: + schema_loader: + name: 
"{{ parameters.stream_name }}" + file_path: "./source_sendgrid/schemas/{{ parameters.name }}.yaml" + streams: + - type: DeclarativeStream + $parameters: + name: "lists" + primary_key: id + url_base: "https://api.sendgrid.com" + schema_loader: "#/definitions/schema_loader" + retriever: "#/definitions/retriever" + check: + type: CheckStream + stream_names: ["lists"] + """ + temporary_file = TestFileContent(content) + with pytest.raises(UndefinedReferenceException): + MockYamlDeclarativeSource(temporary_file.filename) + + +class TestFileContent: + def __init__(self, content): + self.file = tempfile.NamedTemporaryFile(mode="w", delete=False) + + with self.file as f: + f.write(content) + + @property + def filename(self): + return self.file.name + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + os.unlink(self.filename) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_add_fields.py b/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_add_fields.py new file mode 100644 index 000000000000..9b46cf49b99b --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_add_fields.py @@ -0,0 +1,136 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import Any, List, Mapping, Optional, Tuple + +import pytest +from airbyte_cdk.sources.declarative.transformations import AddFields +from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition +from airbyte_cdk.sources.types import FieldPointer + + +@pytest.mark.parametrize( + ["input_record", "field", "field_type", "kwargs", "expected"], + [ + pytest.param({"k": "v"}, [(["path"], "static_value")], None, {}, {"k": "v", "path": "static_value"}, id="add new static value"), + pytest.param({"k": "v"}, [(["path"], "{{ 1 }}")], None, {}, {"k": "v", "path": 1}, id="add an expression evaluated as a number"), + pytest.param( + {"k": "v"}, + [(["path"], "{{ 1 }}")], + str, + {}, + {"k": "v", "path": "1"}, + id="add an expression evaluated as a string using the value_type field", + ), + pytest.param( + {"k": "v"}, + [(["path"], "static_value"), (["path2"], "static_value2")], + None, + {}, + {"k": "v", "path": "static_value", "path2": "static_value2"}, + id="add new multiple static values", + ), + pytest.param( + {"k": "v"}, + [(["nested", "path"], "static_value")], + None, + {}, + {"k": "v", "nested": {"path": "static_value"}}, + id="set static value at nested path", + ), + pytest.param({"k": "v"}, [(["k"], "new_value")], None, {}, {"k": "new_value"}, id="update value which already exists"), + pytest.param({"k": [0, 1]}, [(["k", 3], "v")], None, {}, {"k": [0, 1, None, "v"]}, id="Set element inside array"), + pytest.param( + {"k": "v"}, + [(["k2"], '{{ config["shop"] }}')], + None, + {"config": {"shop": "in-n-out"}}, + {"k": "v", "k2": "in-n-out"}, + id="set a value from the config using bracket notation", + ), + pytest.param( + {"k": "v"}, + [(["k2"], "{{ config.shop }}")], + None, + {"config": {"shop": "in-n-out"}}, + {"k": "v", "k2": "in-n-out"}, + id="set a value from the config using dot notation", + ), + pytest.param( + {"k": "v"}, + [(["k2"], '{{ stream_state["cursor"] }}')], + None, + {"stream_state": {"cursor": "t0"}}, + {"k": "v", "k2": "t0"}, + id="set a value from the state using bracket notation", + ), + pytest.param( + {"k": "v"}, + [(["k2"], "{{ stream_state.cursor }}")], + None, + {"stream_state": {"cursor": "t0"}}, + {"k": "v", "k2": "t0"}, + id="set a value from the state 
using dot notation", + ), + pytest.param( + {"k": "v"}, + [(["k2"], '{{ stream_slice["start_date"] }}')], + None, + {"stream_slice": {"start_date": "oct1"}}, + {"k": "v", "k2": "oct1"}, + id="set a value from the stream slice using bracket notation", + ), + pytest.param( + {"k": "v"}, + [(["k2"], "{{ stream_slice.start_date }}")], + None, + {"stream_slice": {"start_date": "oct1"}}, + {"k": "v", "k2": "oct1"}, + id="set a value from the stream slice using dot notation", + ), + pytest.param( + {"k": "v"}, + [(["k2"], "{{ record.k }}")], + None, + {}, + {"k": "v", "k2": "v"}, + id="set a value from a field in the record using dot notation", + ), + pytest.param( + {"k": "v"}, + [(["k2"], '{{ record["k"] }}')], + None, + {}, + {"k": "v", "k2": "v"}, + id="set a value from a field in the record using bracket notation", + ), + pytest.param( + {"k": {"nested": "v"}}, + [(["k2"], "{{ record.k.nested }}")], + None, + {}, + {"k": {"nested": "v"}, "k2": "v"}, + id="set a value from a nested field in the record using bracket notation", + ), + pytest.param( + {"k": {"nested": "v"}}, + [(["k2"], '{{ record["k"]["nested"] }}')], + None, + {}, + {"k": {"nested": "v"}, "k2": "v"}, + id="set a value from a nested field in the record using bracket notation", + ), + pytest.param({"k": "v"}, [(["k2"], "{{ 2 + 2 }}")], None, {}, {"k": "v", "k2": 4}, id="set a value from a jinja expression"), + ], +) +def test_add_fields( + input_record: Mapping[str, Any], + field: List[Tuple[FieldPointer, str]], + field_type: Optional[str], + kwargs: Mapping[str, Any], + expected: Mapping[str, Any], +): + inputs = [AddedFieldDefinition(path=v[0], value=v[1], value_type=field_type, parameters={}) for v in field] + AddFields(fields=inputs, parameters={"alas": "i live"}).transform(input_record, **kwargs) + assert input_record == expected diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_keys_to_lower_transformation.py b/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_keys_to_lower_transformation.py new file mode 100644 index 000000000000..7464b9f04fd2 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_keys_to_lower_transformation.py @@ -0,0 +1,13 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import KeysToLowerTransformation + +_ANY_VALUE = -1 + + +def test_transform() -> None: + record = {"wIth_CapITal": _ANY_VALUE, "anOThEr_witH_Caps": _ANY_VALUE} + KeysToLowerTransformation().transform(record) + assert {"with_capital": _ANY_VALUE, "another_with_caps": _ANY_VALUE} diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_remove_fields.py b/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_remove_fields.py new file mode 100644 index 000000000000..89b17e8d0f75 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_remove_fields.py @@ -0,0 +1,89 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
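+# These tests exercise the RemoveFields transformation: flat and nested field pointers, array indices, the "**" wildcard, and Jinja-based removal conditions.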
+# + +from typing import Any, List, Mapping + +import pytest +from airbyte_cdk.sources.declarative.transformations import RemoveFields +from airbyte_cdk.sources.types import FieldPointer + + +@pytest.mark.parametrize( + ["input_record", "field_pointers", "condition", "expected"], + [ + pytest.param({"k1": "v", "k2": "v"}, [["k1"]], None, {"k2": "v"}, id="remove a field that exists (flat dict), condition = None"), + pytest.param({"k1": "v", "k2": "v"}, [["k1"]], "", {"k2": "v"}, id="remove a field that exists (flat dict)"), + pytest.param({"k1": "v", "k2": "v"}, [["k3"]], "", {"k1": "v", "k2": "v"}, id="remove a field that doesn't exist (flat dict)"), + pytest.param({"k1": "v", "k2": "v"}, [["k1"], ["k2"]], "", {}, id="remove multiple fields that exist (flat dict)"), + # TODO: should we instead splice the element out of the array? I think that's the more intuitive solution + # Otherwise one could just set the field's value to null. + pytest.param({"k1": [1, 2]}, [["k1", 0]], "", {"k1": [None, 2]}, id="remove field inside array (int index)"), + pytest.param({"k1": [1, 2]}, [["k1", "0"]], "", {"k1": [None, 2]}, id="remove field inside array (string index)"), + pytest.param( + {"k1": "v", "k2": "v", "k3": [0, 1], "k4": "v"}, + [["k1"], ["k2"], ["k3", 0]], + "", + {"k3": [None, 1], "k4": "v"}, + id="test all cases (flat)", + ), + pytest.param({"k1": [0, 1]}, [[".", "k1", 10]], "", {"k1": [0, 1]}, id="remove array index that doesn't exist (flat)"), + pytest.param( + {".": {"k1": [0, 1]}}, [[".", "k1", 10]], "", {".": {"k1": [0, 1]}}, id="remove array index that doesn't exist (nested)" + ), + pytest.param({".": {"k2": "v", "k1": "v"}}, [[".", "k1"]], "", {".": {"k2": "v"}}, id="remove nested field that exists"), + pytest.param( + {".": {"k2": "v", "k1": "v"}}, [[".", "k3"]], "", {".": {"k2": "v", "k1": "v"}}, id="remove field that doesn't exist (nested)" + ), + pytest.param( + {".": {"k2": "v", "k1": "v"}}, [[".", "k1"], [".", "k2"]], "", {".": {}}, id="remove multiple fields that exist (nested)" + ), + pytest.param( + {".": {"k1": [0, 1]}}, + [[".", "k1", 0]], + "", + {".": {"k1": [None, 1]}}, + id="remove multiple fields that exist in arrays (nested)", + ), + pytest.param( + {".": {"k1": [{"k2": "v", "k3": "v"}, {"k4": "v"}]}}, + [[".", "k1", 0, "k2"], [".", "k1", 1, "k4"]], + "", + {".": {"k1": [{"k3": "v"}, {}]}}, + id="remove fields that exist in arrays (deeply nested)", + ), + pytest.param( + {"k1": "v", "k2": "v"}, + [["**"]], + "{{ False }}", + {"k1": "v", "k2": "v"}, + id="do not remove any field if condition is boolean False", + ), + pytest.param({"k1": "v", "k2": "v"}, [["**"]], "{{ True }}", {}, id="remove all field if condition is boolean True"), + pytest.param( + {"k1": "v", "k2": "v1", "k3": "v1", "k4": {"k_nested": "v1", "k_nested2": "v2"}}, + [["**"]], + "{{ property == 'v1' }}", + {"k1": "v", "k4": {"k_nested2": "v2"}}, + id="recursively remove any field that matches property condition and leave that does not", + ), + pytest.param( + {"k1": "v", "k2": "some_long_string", "k3": "some_long_string", "k4": {"k_nested": "v1", "k_nested2": "v2"}}, + [["**"]], + "{{ property|length > 5 }}", + {"k1": "v", "k4": {"k_nested": "v1", "k_nested2": "v2"}}, + id="remove any field that have length > 5 and leave that does not", + ), + pytest.param( + {"k1": 255, "k2": "some_string", "k3": "some_long_string", "k4": {"k_nested": 123123, "k_nested2": "v2"}}, + [["**"]], + "{{ property is integer }}", + {"k2": "some_string", "k3": "some_long_string", "k4": {"k_nested2": "v2"}}, + id="recursively 
remove any field that of type integer and leave that does not", + ), + ], +) +def test_remove_fields(input_record: Mapping[str, Any], field_pointers: List[FieldPointer], condition: str, expected: Mapping[str, Any]): + transformation = RemoveFields(field_pointers=field_pointers, condition=condition, parameters={}) + transformation.transform(input_record) + assert input_record == expected diff --git a/airbyte-cdk/python/unit_tests/sources/embedded/test_embedded_integration.py b/airbyte-cdk/python/unit_tests/sources/embedded/test_embedded_integration.py new file mode 100644 index 000000000000..7560dc403ecd --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/embedded/test_embedded_integration.py @@ -0,0 +1,162 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import unittest +from typing import Any, Mapping, Optional +from unittest.mock import MagicMock + +from airbyte_cdk.models import ( + AirbyteCatalog, + AirbyteLogMessage, + AirbyteMessage, + AirbyteRecordMessage, + AirbyteStateMessage, + AirbyteStream, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + ConnectorSpecification, + DestinationSyncMode, + Level, + SyncMode, + Type, +) +from airbyte_cdk.sources.embedded.base_integration import BaseEmbeddedIntegration +from airbyte_cdk.utils import AirbyteTracedException + + +class TestIntegration(BaseEmbeddedIntegration): + def _handle_record(self, record: AirbyteRecordMessage, id: Optional[str]) -> Mapping[str, Any]: + return {"data": record.data, "id": id} + + +class EmbeddedIntegrationTestCase(unittest.TestCase): + def setUp(self): + self.source_class = MagicMock() + self.source = MagicMock() + self.source_class.return_value = self.source + self.source.spec.return_value = ConnectorSpecification( + connectionSpecification={ + "properties": { + "test": { + "type": "string", + } + } + } + ) + self.config = {"test": "abc"} + self.integration = TestIntegration(self.source, self.config) + self.stream1 = AirbyteStream( + name="test", + source_defined_primary_key=[["test"]], + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], + ) + self.stream2 = AirbyteStream(name="test2", json_schema={}, supported_sync_modes=[SyncMode.full_refresh]) + self.source.discover.return_value = AirbyteCatalog(streams=[self.stream2, self.stream1]) + + def test_integration(self): + self.source.read.return_value = [ + AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=Level.INFO, message="test")), + AirbyteMessage(type=Type.RECORD, record=AirbyteRecordMessage(stream="test", data={"test": 1}, emitted_at=1)), + AirbyteMessage(type=Type.RECORD, record=AirbyteRecordMessage(stream="test", data={"test": 2}, emitted_at=2)), + AirbyteMessage(type=Type.RECORD, record=AirbyteRecordMessage(stream="test", data={"test": 3}, emitted_at=3)), + ] + result = list(self.integration._load_data("test", None)) + self.assertEqual( + result, + [ + {"data": {"test": 1}, "id": "1"}, + {"data": {"test": 2}, "id": "2"}, + {"data": {"test": 3}, "id": "3"}, + ], + ) + self.source.discover.assert_called_once_with(self.config) + self.source.read.assert_called_once_with( + self.config, + ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=self.stream1, + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + primary_key=[["test"]], + ) + ] + ), + None, + ) + + def test_failed_check(self): + self.config = {"test": 123} + with self.assertRaises(AirbyteTracedException) as error: + TestIntegration(self.source, self.config) + assert 
str(error.exception) == "123 is not of type 'string'" + + def test_state(self): + state = AirbyteStateMessage(data={}) + self.source.read.return_value = [ + AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=Level.INFO, message="test")), + AirbyteMessage(type=Type.RECORD, record=AirbyteRecordMessage(stream="test", data={"test": 1}, emitted_at=1)), + AirbyteMessage(type=Type.STATE, state=state), + ] + result = list(self.integration._load_data("test", None)) + self.assertEqual( + result, + [ + {"data": {"test": 1}, "id": "1"}, + ], + ) + self.integration.last_state = state + + def test_incremental(self): + state = AirbyteStateMessage(data={}) + list(self.integration._load_data("test", state)) + self.source.read.assert_called_once_with( + self.config, + ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=self.stream1, + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + primary_key=[["test"]], + ) + ] + ), + state, + ) + + def test_incremental_without_state(self): + list(self.integration._load_data("test")) + self.source.read.assert_called_once_with( + self.config, + ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=self.stream1, + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + primary_key=[["test"]], + ) + ] + ), + None, + ) + + def test_incremental_unsupported(self): + state = AirbyteStateMessage(data={}) + list(self.integration._load_data("test2", state)) + self.source.read.assert_called_once_with( + self.config, + ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=self.stream2, + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.append, + ) + ] + ), + state, + ) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/__init__.py b/airbyte-cdk/python/unit_tests/sources/file_based/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/availability_strategy/__init__.py b/airbyte-cdk/python/unit_tests/sources/file_based/availability_strategy/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py b/airbyte-cdk/python/unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py new file mode 100644 index 000000000000..7206d234939d --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py @@ -0,0 +1,100 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
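+# These tests exercise DefaultFileBasedAvailabilityStrategy.check_availability_and_parsability with mocked streams, parsers, and stream readers.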
+# + +import unittest +from datetime import datetime +from unittest.mock import Mock, PropertyMock + +from airbyte_cdk.sources.file_based.availability_strategy.default_file_based_availability_strategy import ( + DefaultFileBasedAvailabilityStrategy, +) +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat +from airbyte_cdk.sources.file_based.exceptions import CustomFileBasedException +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream + +_FILE_WITH_UNKNOWN_EXTENSION = RemoteFile(uri="a.unknown_extension", last_modified=datetime.now(), file_type="csv") +_ANY_CONFIG = FileBasedStreamConfig( + name="config.name", + file_type="parquet", + format=JsonlFormat(), +) +_ANY_SCHEMA = {"key": "value"} + + +class DefaultFileBasedAvailabilityStrategyTest(unittest.TestCase): + def setUp(self) -> None: + self._stream_reader = Mock(spec=AbstractFileBasedStreamReader) + self._strategy = DefaultFileBasedAvailabilityStrategy(self._stream_reader) + + self._parser = Mock(spec=FileTypeParser) + self._parser.check_config.return_value = (True, None) + self._stream = Mock(spec=AbstractFileBasedStream) + self._stream.get_parser.return_value = self._parser + self._stream.catalog_schema = _ANY_SCHEMA + self._stream.config = _ANY_CONFIG + self._stream.validation_policy = PropertyMock(validate_schema_before_sync=False) + self._stream.stream_reader = self._stream_reader + + def test_given_file_extension_does_not_match_when_check_availability_and_parsability_then_stream_is_still_available(self) -> None: + """ + Before, we had a validation on the file extension, but it turns out that in production, users sometimes have a mismatch there. The + example we've seen was for the JSONL parser where the file extension was just `.json`. Note that more than one record was extracted + from this stream, so it's not just that the file is one JSON object. + """ + self._stream.get_files.return_value = [_FILE_WITH_UNKNOWN_EXTENSION] + self._parser.parse_records.return_value = [{"a record": 1}] + + is_available, reason = self._strategy.check_availability_and_parsability(self._stream, Mock(), Mock()) + + assert is_available + + def test_not_available_given_no_files(self) -> None: + """ + If no files are returned, then the stream is not available. 
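+ The returned reason should mention that no files were identified in the stream.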
+ """ + self._stream.get_files.return_value = [] + + is_available, reason = self._strategy.check_availability_and_parsability(self._stream, Mock(), Mock()) + + assert not is_available + assert "No files were identified in the stream" in reason + + def test_parse_records_is_not_called_with_parser_max_n_files_for_parsability_set(self) -> None: + """ + If the stream parser sets parser_max_n_files_for_parsability to 0, then we should not call parse_records on it + """ + self._parser.parser_max_n_files_for_parsability = 0 + self._stream.get_files.return_value = [_FILE_WITH_UNKNOWN_EXTENSION] + + is_available, reason = self._strategy.check_availability_and_parsability(self._stream, Mock(), Mock()) + + assert is_available + assert not self._parser.parse_records.called + assert self._stream_reader.open_file.called + + def test_passing_config_check(self) -> None: + """ + Test if the DefaultFileBasedAvailabilityStrategy correctly handles the check_config method defined on the parser. + """ + self._parser.check_config.return_value = (False, "Ran into error") + is_available, error_message = self._strategy.check_availability_and_parsability(self._stream, Mock(), Mock()) + assert not is_available + assert "Ran into error" in error_message + + def test_catching_and_raising_custom_file_based_exception(self) -> None: + """ + Test if the DefaultFileBasedAvailabilityStrategy correctly handles the CustomFileBasedException + by raising a CheckAvailabilityError when the get_files method is called. + """ + # Mock the get_files method to raise CustomFileBasedException when called + self._stream.get_files.side_effect = CustomFileBasedException("Custom exception for testing.") + + # Invoke the check_availability_and_parsability method and check if it correctly handles the exception + is_available, error_message = self._strategy.check_availability_and_parsability(self._stream, Mock(), Mock()) + assert not is_available + assert "Custom exception for testing." in error_message diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/config/__init__.py b/airbyte-cdk/python/unit_tests/sources/file_based/config/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_abstract_file_based_spec.py b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_abstract_file_based_spec.py new file mode 100644 index 000000000000..3c3d72c23300 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_abstract_file_based_spec.py @@ -0,0 +1,28 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
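+# These tests validate format configurations against the JSON schema generated from each pydantic format model.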
+# + +from typing import Type + +import pytest +from airbyte_cdk.sources.file_based.config.file_based_stream_config import AvroFormat, CsvFormat, ParquetFormat +from jsonschema import ValidationError, validate +from pydantic.v1 import BaseModel + + +@pytest.mark.parametrize( + "file_format, file_type, expected_error", + [ + pytest.param(ParquetFormat, "parquet", None, id="test_parquet_format_is_a_valid_parquet_file_type"), + pytest.param(AvroFormat, "avro", None, id="test_avro_format_is_a_valid_avro_file_type"), + pytest.param(CsvFormat, "parquet", ValidationError, id="test_csv_format_is_not_a_valid_parquet_file_type"), + ], +) +def test_parquet_file_type_is_not_a_valid_csv_file_type(file_format: BaseModel, file_type: str, expected_error: Type[Exception]) -> None: + format_config = {file_type: {"filetype": file_type, "decimal_as_float": True}} + + if expected_error: + with pytest.raises(expected_error): + validate(instance=format_config[file_type], schema=file_format.schema()) + else: + validate(instance=format_config[file_type], schema=file_format.schema()) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py new file mode 100644 index 000000000000..c233bd7ac9e9 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py @@ -0,0 +1,34 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import unittest + +import pytest +from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated, CsvHeaderFromCsv, CsvHeaderUserProvided +from pydantic.v1.error_wrappers import ValidationError + + +class CsvHeaderDefinitionTest(unittest.TestCase): + def test_given_user_provided_and_not_column_names_provided_then_raise_exception(self) -> None: + with pytest.raises(ValidationError): + CsvHeaderUserProvided(column_names=[]) + + def test_given_user_provided_and_column_names_then_config_is_valid(self) -> None: + # no error means that this test succeeds + CsvHeaderUserProvided(column_names=["1", "2", "3"]) + + def test_given_user_provided_then_csv_does_not_have_header_row(self) -> None: + assert not CsvHeaderUserProvided(column_names=["1", "2", "3"]).has_header_row() + + def test_given_autogenerated_then_csv_does_not_have_header_row(self) -> None: + assert not CsvHeaderAutogenerated().has_header_row() + + def test_given_from_csv_then_csv_has_header_row(self) -> None: + assert CsvHeaderFromCsv().has_header_row() + + +class CsvDelimiterTest(unittest.TestCase): + def test_tab_delimter(self): + assert CsvFormat(delimiter=r"\t").delimiter == "\t" + assert len(CsvFormat(delimiter=r"\t").delimiter) == 1 diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_file_based_stream_config.py b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_file_based_stream_config.py new file mode 100644 index 000000000000..4c5d69a7dab5 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_file_based_stream_config.py @@ -0,0 +1,84 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
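+# These tests cover CSV format handling on FileBasedStreamConfig: valid configs, default values, invalid option values, and validation policies.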
+# + +from typing import Any, Mapping, Type + +import pytest as pytest +from airbyte_cdk.sources.file_based.config.file_based_stream_config import CsvFormat, FileBasedStreamConfig +from pydantic.v1.error_wrappers import ValidationError + + +@pytest.mark.parametrize( + "file_type, input_format, expected_format, expected_error", + [ + pytest.param( + "csv", + {"filetype": "csv", "delimiter": "d", "quote_char": "q", "escape_char": "e", "encoding": "ascii", "double_quote": True}, + {"filetype": "csv", "delimiter": "d", "quote_char": "q", "escape_char": "e", "encoding": "ascii", "double_quote": True}, + None, + id="test_valid_format", + ), + pytest.param( + "csv", + {"filetype": "csv", "double_quote": False}, + {"delimiter": ",", "quote_char": '"', "encoding": "utf8", "double_quote": False}, + None, + id="test_default_format_values", + ), + pytest.param( + "csv", {"filetype": "csv", "delimiter": "nope", "double_quote": True}, None, ValidationError, id="test_invalid_delimiter" + ), + pytest.param( + "csv", {"filetype": "csv", "quote_char": "nope", "double_quote": True}, None, ValidationError, id="test_invalid_quote_char" + ), + pytest.param( + "csv", {"filetype": "csv", "escape_char": "nope", "double_quote": True}, None, ValidationError, id="test_invalid_escape_char" + ), + pytest.param( + "csv", + {"filetype": "csv", "delimiter": ",", "quote_char": '"', "encoding": "not_a_format", "double_quote": True}, + {}, + ValidationError, + id="test_invalid_encoding_type", + ), + pytest.param( + "invalid", {"filetype": "invalid", "double_quote": False}, {}, ValidationError, id="test_config_format_file_type_mismatch" + ), + ], +) +def test_csv_config( + file_type: str, input_format: Mapping[str, Any], expected_format: Mapping[str, Any], expected_error: Type[Exception] +) -> None: + stream_config = {"name": "stream1", "file_type": file_type, "globs": ["*"], "validation_policy": "Emit Record", "format": input_format} + + if expected_error: + with pytest.raises(expected_error): + FileBasedStreamConfig(**stream_config) + else: + actual_config = FileBasedStreamConfig(**stream_config) + if actual_config.format is not None: + for expected_format_field, expected_format_value in expected_format.items(): + assert isinstance(actual_config.format, CsvFormat) + assert getattr(actual_config.format, expected_format_field) == expected_format_value + else: + assert False, "Expected format to be set" + + +def test_invalid_validation_policy() -> None: + stream_config = { + "name": "stream1", + "file_type": "csv", + "globs": ["*"], + "validation_policy": "Not Valid Policy", + "format": { + "filetype": "csv", + "delimiter": "d", + "quote_char": "q", + "escape_char": "e", + "encoding": "ascii", + "double_quote": True, + }, + } + with pytest.raises(ValidationError): + FileBasedStreamConfig(**stream_config) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/discovery_policy/__init__.py b/airbyte-cdk/python/unit_tests/sources/file_based/discovery_policy/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py b/airbyte-cdk/python/unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py new file mode 100644 index 000000000000..b7ad67115cff --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py @@ -0,0 +1,31 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
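+# These tests check that DefaultDiscoveryPolicy falls back to a hardcoded schema inference file limit unless the parser provides its own.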
+# + +import unittest +from unittest.mock import Mock + +from airbyte_cdk.sources.file_based.discovery_policy.default_discovery_policy import DefaultDiscoveryPolicy +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser + + +class DefaultDiscoveryPolicyTest(unittest.TestCase): + def setUp(self) -> None: + self._policy = DefaultDiscoveryPolicy() + + self._parser = Mock(spec=FileTypeParser) + self._parser.parser_max_n_files_for_schema_inference = None + + def test_hardcoded_schema_inference_file_limit_is_returned(self) -> None: + """ + If the parser is not providing a limit, then we should use the hardcoded limit + """ + assert self._policy.get_max_n_files_for_schema_inference(self._parser) == 10 + + def test_parser_limit_is_respected(self) -> None: + """ + If the parser is providing a limit, then we should use that limit + """ + self._parser.parser_max_n_files_for_schema_inference = 1 + + assert self._policy.get_max_n_files_for_schema_inference(self._parser) == 1 diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/__init__.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_avro_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_avro_parser.py new file mode 100644 index 000000000000..a45d424b7a2b --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_avro_parser.py @@ -0,0 +1,250 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import datetime +import uuid + +import pytest +from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat +from airbyte_cdk.sources.file_based.file_types import AvroParser + +_default_avro_format = AvroFormat() +_double_as_string_avro_format = AvroFormat(double_as_string=True) +_uuid_value = uuid.uuid4() + + +@pytest.mark.parametrize( + "avro_format, avro_type, expected_json_type, expected_error", + [ + # Primitive types + pytest.param(_default_avro_format, "null", {"type": "null"}, None, id="test_null"), + pytest.param(_default_avro_format, "boolean", {"type": "boolean"}, None, id="test_boolean"), + pytest.param(_default_avro_format, "int", {"type": "integer"}, None, id="test_int"), + pytest.param(_default_avro_format, "long", {"type": "integer"}, None, id="test_long"), + pytest.param(_default_avro_format, "float", {"type": "number"}, None, id="test_float"), + pytest.param(_default_avro_format, "double", {"type": "number"}, None, id="test_double"), + pytest.param(_double_as_string_avro_format, "double", {"type": "string"}, None, id="test_double_as_string"), + pytest.param(_default_avro_format, "bytes", {"type": "string"}, None, id="test_bytes"), + pytest.param(_default_avro_format, "string", {"type": "string"}, None, id="test_string"), + pytest.param(_default_avro_format, "void", None, ValueError, id="test_invalid_type"), + # Complex types + pytest.param( + _default_avro_format, + { + "type": "record", + "name": "SubRecord", + "fields": [{"name": "precise", "type": "double"}, {"name": "robo", "type": "bytes"}, {"name": "simple", "type": "long"}], + }, + { + "type": "object", + "properties": { + "precise": {"type": "number"}, + "robo": {"type": "string"}, + "simple": {"type": "integer"}, + }, + }, + None, + id="test_record", + ), + pytest.param( + _default_avro_format, + { + "type": "record", + "name": "SubRecord", + "fields": [{"name": "precise", "type": "double"}, 
{"name": "obj_array", "type": {"type": "array", "items": "float"}}], + }, + {"type": "object", "properties": {"precise": {"type": "number"}, "obj_array": {"type": "array", "items": {"type": "number"}}}}, + None, + id="test_record_with_nested_array", + ), + pytest.param( + _default_avro_format, + { + "type": "record", + "name": "SubRecord", + "fields": [ + { + "name": "nested_record", + "type": {"type": "record", "name": "SubRecord", "fields": [{"name": "question", "type": "boolean"}]}, + } + ], + }, + { + "type": "object", + "properties": { + "nested_record": { + "type": "object", + "properties": {"question": {"type": "boolean"}}, + } + }, + }, + None, + id="test_record_with_nested_record", + ), + pytest.param( + _default_avro_format, {"type": "array", "items": "float"}, {"type": "array", "items": {"type": "number"}}, None, id="test_array" + ), + pytest.param( + _default_avro_format, + {"type": "array", "items": {"type": "record", "name": "SubRecord", "fields": [{"name": "precise", "type": "double"}]}}, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "precise": {"type": "number"}, + }, + }, + }, + None, + id="test_array_of_records", + ), + pytest.param(_default_avro_format, {"type": "array", "not_items": "string"}, None, ValueError, id="test_array_missing_items"), + pytest.param( + _default_avro_format, {"type": "array", "items": "invalid_avro_type"}, None, ValueError, id="test_array_invalid_item_type" + ), + pytest.param( + _default_avro_format, + {"type": "enum", "name": "IMF", "symbols": ["Ethan", "Benji", "Luther"]}, + {"type": "string", "enum": ["Ethan", "Benji", "Luther"]}, + None, + id="test_enum", + ), + pytest.param(_default_avro_format, {"type": "enum", "name": "IMF"}, None, ValueError, id="test_enum_missing_symbols"), + pytest.param( + _default_avro_format, {"type": "enum", "symbols": ["mission", "not", "accepted"]}, None, ValueError, id="test_enum_missing_name" + ), + pytest.param( + _default_avro_format, + {"type": "map", "values": "int"}, + {"type": "object", "additionalProperties": {"type": "integer"}}, + None, + id="test_map", + ), + pytest.param( + _default_avro_format, + {"type": "map", "values": {"type": "record", "name": "SubRecord", "fields": [{"name": "agent", "type": "string"}]}}, + {"type": "object", "additionalProperties": {"type": "object", "properties": {"agent": {"type": "string"}}}}, + None, + id="test_map_object", + ), + pytest.param(_default_avro_format, {"type": "map"}, None, ValueError, id="test_map_missing_values"), + pytest.param( + _default_avro_format, + {"type": "fixed", "name": "limit", "size": 12}, + {"type": "string", "pattern": "^[0-9A-Fa-f]{24}$"}, + None, + id="test_fixed", + ), + pytest.param(_default_avro_format, {"type": "fixed", "name": "limit"}, None, ValueError, id="test_fixed_missing_size"), + pytest.param( + _default_avro_format, {"type": "fixed", "name": "limit", "size": "50"}, None, ValueError, id="test_fixed_size_not_integer" + ), + # Logical types + pytest.param( + _default_avro_format, + {"type": "bytes", "logicalType": "decimal", "precision": 9, "scale": 4}, + {"type": "string", "pattern": f"^-?\\d{{{1, 5}}}(?:\\.\\d{1, 4})?$"}, + None, + id="test_decimal", + ), + pytest.param( + _default_avro_format, + {"type": "bytes", "logicalType": "decimal", "scale": 4}, + None, + ValueError, + id="test_decimal_missing_precision", + ), + pytest.param( + _default_avro_format, + {"type": "bytes", "logicalType": "decimal", "precision": 9}, + None, + ValueError, + id="test_decimal_missing_scale", + ), + 
pytest.param(_default_avro_format, {"type": "bytes", "logicalType": "uuid"}, {"type": "string"}, None, id="test_uuid"), + pytest.param( + _default_avro_format, {"type": "int", "logicalType": "date"}, {"type": "string", "format": "date"}, None, id="test_date" + ), + pytest.param(_default_avro_format, {"type": "int", "logicalType": "time-millis"}, {"type": "integer"}, None, id="test_time_millis"), + pytest.param( + _default_avro_format, {"type": "long", "logicalType": "time-micros"}, {"type": "integer"}, None, id="test_time_micros" + ), + pytest.param( + _default_avro_format, + {"type": "long", "logicalType": "timestamp-millis"}, + {"type": "string", "format": "date-time"}, + None, + id="test_timestamp_millis", + ), + pytest.param( + _default_avro_format, {"type": "long", "logicalType": "timestamp-micros"}, {"type": "string"}, None, id="test_timestamp_micros" + ), + pytest.param( + _default_avro_format, + {"type": "long", "logicalType": "local-timestamp-millis"}, + {"type": "string", "format": "date-time"}, + None, + id="test_local_timestamp_millis", + ), + pytest.param( + _default_avro_format, + {"type": "long", "logicalType": "local-timestamp-micros"}, + {"type": "string"}, + None, + id="test_local_timestamp_micros", + ), + pytest.param( + _default_avro_format, + {"type": "string", "logicalType": "invalid-logical-type"}, + None, + ValueError, + id="test_invalid_logical_type", + ), + ], +) +def test_convert_primitive_avro_type_to_json(avro_format, avro_type, expected_json_type, expected_error): + if expected_error: + with pytest.raises(expected_error): + AvroParser._convert_avro_type_to_json(avro_format, "field_name", avro_type) + else: + actual_json_type = AvroParser._convert_avro_type_to_json(avro_format, "field_name", avro_type) + assert actual_json_type == expected_json_type + + +@pytest.mark.parametrize( + "avro_format, record_type, record_value, expected_value", + [ + pytest.param(_default_avro_format, "boolean", True, True, id="test_boolean"), + pytest.param(_default_avro_format, "int", 123, 123, id="test_int"), + pytest.param(_default_avro_format, "long", 123, 123, id="test_long"), + pytest.param(_default_avro_format, "float", 123.456, 123.456, id="test_float"), + pytest.param(_default_avro_format, "double", 123.456, 123.456, id="test_double_default_config"), + pytest.param(_double_as_string_avro_format, "double", 123.456, "123.456", id="test_double_as_string"), + pytest.param(_default_avro_format, "bytes", b"hello world", "hello world", id="test_bytes"), + pytest.param(_default_avro_format, "string", "hello world", "hello world", id="test_string"), + pytest.param(_default_avro_format, {"logicalType": "decimal"}, 3.1415, "3.1415", id="test_decimal"), + pytest.param(_default_avro_format, {"logicalType": "uuid"}, _uuid_value, str(_uuid_value), id="test_uuid"), + pytest.param(_default_avro_format, {"logicalType": "date"}, datetime.date(2023, 8, 7), "2023-08-07", id="test_date"), + pytest.param(_default_avro_format, {"logicalType": "time-millis"}, 70267068, 70267068, id="test_time_millis"), + pytest.param(_default_avro_format, {"logicalType": "time-micros"}, 70267068, 70267068, id="test_time_micros"), + pytest.param( + _default_avro_format, + {"logicalType": "local-timestamp-millis"}, + datetime.datetime(2023, 8, 7, 19, 31, 7, 68000, tzinfo=datetime.timezone.utc), + "2023-08-07T19:31:07.068+00:00", + id="test_timestamp_millis", + ), + pytest.param( + _default_avro_format, + {"logicalType": "local-timestamp-micros"}, + datetime.datetime(2023, 8, 7, 19, 31, 7, 68000, 
tzinfo=datetime.timezone.utc), + "2023-08-07T19:31:07.068000+00:00", + id="test_timestamo_micros", + ), + ], +) +def test_to_output_value(avro_format, record_type, record_value, expected_value): + parser = AvroParser() + assert parser._to_output_value(avro_format, record_type, record_value) == expected_value diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py new file mode 100644 index 000000000000..9280ffb60e69 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -0,0 +1,645 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import asyncio +import csv +import io +import logging +import unittest +from datetime import datetime +from typing import Any, Dict, Generator, List, Set +from unittest import TestCase, mock +from unittest.mock import Mock + +import pytest +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.file_based.config.csv_format import ( + DEFAULT_FALSE_VALUES, + DEFAULT_TRUE_VALUES, + CsvFormat, + CsvHeaderAutogenerated, + CsvHeaderUserProvided, + InferenceType, +) +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.exceptions import RecordParseError +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode +from airbyte_cdk.sources.file_based.file_types.csv_parser import CsvParser, _CsvReader +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + +PROPERTY_TYPES = { + "col1": "null", + "col2": "boolean", + "col3": "integer", + "col4": "number", + "col5": "string", + "col6": "object", + "col7": "array", + "col8": "array", + "col9": "array", + "col10": "string", +} + +logger = logging.getLogger() + + +@pytest.mark.parametrize( + "row, true_values, false_values, expected_output", + [ + pytest.param( + { + "col1": "", + "col2": "true", + "col3": "1", + "col4": "1.1", + "col5": "asdf", + "col6": '{"a": "b"}', + "col7": "[1, 2]", + "col8": '["1", "2"]', + "col9": '[{"a": "b"}, {"a": "c"}]', + "col10": "asdf", + }, + DEFAULT_TRUE_VALUES, + DEFAULT_FALSE_VALUES, + { + "col1": None, + "col2": True, + "col3": 1, + "col4": 1.1, + "col5": "asdf", + "col6": {"a": "b"}, + "col7": [1, 2], + "col8": ["1", "2"], + "col9": [{"a": "b"}, {"a": "c"}], + "col10": "asdf", + }, + id="cast-all-cols", + ), + pytest.param({"col1": "1"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col1": "1"}, id="cannot-cast-to-null"), + pytest.param({"col2": "1"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col2": True}, id="cast-1-to-bool"), + pytest.param({"col2": "0"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col2": False}, id="cast-0-to-bool"), + pytest.param({"col2": "yes"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col2": True}, id="cast-yes-to-bool"), + pytest.param( + {"col2": "this_is_a_true_value"}, + ["this_is_a_true_value"], + DEFAULT_FALSE_VALUES, + {"col2": True}, + id="cast-custom-true-value-to-bool", + ), + pytest.param( + {"col2": "this_is_a_false_value"}, + DEFAULT_TRUE_VALUES, + ["this_is_a_false_value"], + {"col2": False}, + id="cast-custom-false-value-to-bool", + ), + pytest.param({"col2": "no"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col2": False}, id="cast-no-to-bool"), + pytest.param({"col2": "10"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col2": "10"}, 
id="cannot-cast-to-bool"), + pytest.param({"col3": "1.1"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col3": "1.1"}, id="cannot-cast-to-int"), + pytest.param({"col4": "asdf"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col4": "asdf"}, id="cannot-cast-to-float"), + pytest.param({"col6": "{'a': 'b'}"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col6": "{'a': 'b'}"}, id="cannot-cast-to-dict"), + pytest.param( + {"col7": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col7": "['a', 'b']"}, id="cannot-cast-to-list-of-ints" + ), + pytest.param( + {"col8": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col8": "['a', 'b']"}, id="cannot-cast-to-list-of-strings" + ), + pytest.param( + {"col9": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col9": "['a', 'b']"}, id="cannot-cast-to-list-of-objects" + ), + pytest.param({"col11": "x"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {}, id="item-not-in-props-doesn't-error"), + ], +) +def test_cast_to_python_type(row: Dict[str, str], true_values: Set[str], false_values: Set[str], expected_output: Dict[str, Any]) -> None: + csv_format = CsvFormat(true_values=true_values, false_values=false_values) + assert CsvParser._cast_types(row, PROPERTY_TYPES, csv_format, logger) == expected_output + + +@pytest.mark.parametrize( + "row, strings_can_be_null, expected_output", + [ + pytest.param( + {"id": "1", "name": "bob", "age": 10, "is_cool": False}, + False, + {"id": "1", "name": "bob", "age": 10, "is_cool": False}, + id="test-no-values-are-null", + ), + pytest.param( + {"id": "1", "name": "bob", "age": "null", "is_cool": "null"}, + False, + {"id": "1", "name": "bob", "age": None, "is_cool": None}, + id="test-non-string-values-are-none-if-in-null-values", + ), + pytest.param( + {"id": "1", "name": "null", "age": 10, "is_cool": False}, + False, + {"id": "1", "name": "null", "age": 10, "is_cool": False}, + id="test-string-values-are-not-none-if-strings-cannot-be-null", + ), + pytest.param( + {"id": "1", "name": "null", "age": 10, "is_cool": False}, + True, + {"id": "1", "name": None, "age": 10, "is_cool": False}, + id="test-string-values-none-if-strings-can-be-null", + ), + ], +) +def test_to_nullable(row, strings_can_be_null, expected_output): + property_types = {"id": "string", "name": "string", "age": "integer", "is_cool": "boolean"} + null_values = {"null"} + nulled_row = CsvParser._to_nullable(row, property_types, null_values, strings_can_be_null) + assert nulled_row == expected_output + + +_DEFAULT_TRUE_VALUES = {"1", "yes", "yeah", "right"} +_DEFAULT_FALSE_VALUES = {"0", "no", "nop", "wrong"} + + +class SchemaInferenceTestCase(TestCase): + _A_NULL_VALUE = "null" + _HEADER_NAME = "header" + + def setUp(self) -> None: + self._config_format = CsvFormat() + self._config_format.true_values = _DEFAULT_TRUE_VALUES + self._config_format.false_values = _DEFAULT_FALSE_VALUES + self._config_format.null_values = {self._A_NULL_VALUE} + self._config_format.inference_type = InferenceType.NONE + self._config = Mock() + self._config.get_input_schema.return_value = None + self._config.format = self._config_format + + self._file = RemoteFile(uri="a uri", last_modified=datetime.now()) + self._stream_reader = Mock(spec=AbstractFileBasedStreamReader) + self._logger = Mock(spec=logging.Logger) + self._csv_reader = Mock(spec=_CsvReader) + self._parser = CsvParser(self._csv_reader) + + def test_given_user_schema_defined_when_infer_schema_then_return_user_schema(self) -> None: + self._config.get_input_schema.return_value = {self._HEADER_NAME: 
{"type": "potato"}} + self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "potato") + + def test_given_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY + self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") + + def test_given_integers_only_when_infer_schema_then_type_is_integer(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY + self._test_infer_schema(["2", "90329", "5645"], "integer") + + def test_given_integer_overlap_with_bool_value_only_when_infer_schema_then_type_is_integer(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY + self._test_infer_schema(["1", "90329", "5645"], "integer") # here, "1" is also considered a boolean + + def test_given_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY + self._test_infer_schema(["2", "90329", "2.312"], "number") + + def test_given_arrays_when_infer_schema_then_type_is_string(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY + self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "string") + + def test_given_objects_when_infer_schema_then_type_is_object(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY + self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "string") + + def test_given_strings_only_when_infer_schema_then_type_is_string(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY + self._test_infer_schema(["a string", "another string"], "string") + + def test_given_a_null_value_when_infer_then_ignore_null(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY + self._test_infer_schema(["2", "90329", "5645", self._A_NULL_VALUE], "integer") + + def test_given_only_null_values_when_infer_then_type_is_string(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY + self._test_infer_schema([self._A_NULL_VALUE, self._A_NULL_VALUE, self._A_NULL_VALUE], "string") + + def test_given_big_file_when_infer_schema_then_stop_early(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY + self._csv_reader.read_data.return_value = ({self._HEADER_NAME: row} for row in ["2." 
+ "2" * 1_000_000] + ["this is a string"]) + inferred_schema = self._infer_schema() + # since the type is number, we know the string at the end was not considered + assert inferred_schema == {self._HEADER_NAME: {"type": "number"}} + + def test_given_empty_csv_file_when_infer_schema_then_raise_config_error(self) -> None: + self._csv_reader.read_data.return_value = [] + with pytest.raises(AirbyteTracedException) as exception: + self._infer_schema() + assert exception.value.failure_type == FailureType.config_error + + def _test_infer_schema(self, rows: List[str], expected_type: str) -> None: + self._csv_reader.read_data.return_value = ({self._HEADER_NAME: row} for row in rows) + inferred_schema = self._infer_schema() + assert inferred_schema == {self._HEADER_NAME: {"type": expected_type}} + + def _infer_schema(self): + loop = asyncio.new_event_loop() + task = loop.create_task(self._parser.infer_schema(self._config, self._file, self._stream_reader, self._logger)) + loop.run_until_complete(task) + return task.result() + + +class CsvFileBuilder: + def __init__(self) -> None: + self._prefixed_rows: List[str] = [] + self._data: List[str] = [] + + def with_prefixed_rows(self, rows: List[str]) -> "CsvFileBuilder": + self._prefixed_rows = rows + return self + + def with_data(self, data: List[str]) -> "CsvFileBuilder": + self._data = data + return self + + def build(self) -> io.StringIO: + return io.StringIO("\n".join(self._prefixed_rows + self._data)) + + +class CsvReaderTest(unittest.TestCase): + _CONFIG_NAME = "config_name" + + def setUp(self) -> None: + self._config_format = CsvFormat() + self._config = Mock() + self._config.name = self._CONFIG_NAME + self._config.format = self._config_format + + self._file = RemoteFile(uri="a uri", last_modified=datetime.now()) + self._stream_reader = Mock(spec=AbstractFileBasedStreamReader) + self._logger = Mock(spec=logging.Logger) + self._csv_reader = _CsvReader() + + def test_given_skip_rows_when_read_data_then_do_not_considered_prefixed_rows(self) -> None: + self._config_format.skip_rows_before_header = 2 + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_prefixed_rows(["first line", "second line"]) + .with_data( + [ + "header", + "a value", + "another value", + ] + ) + .build() + ) + + data_generator = self._read_data() + + assert list(data_generator) == [{"header": "a value"}, {"header": "another value"}] + + def test_given_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None: + self._config_format.header_definition = CsvHeaderAutogenerated() + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3,4,5,6"]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}] + + def test_given_skip_row_before_and_after_and_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None: + self._config_format.header_definition = CsvHeaderAutogenerated() + self._config_format.skip_rows_before_header = 1 + self._config_format.skip_rows_after_header = 2 + self._stream_reader.open_file.return_value = ( + CsvFileBuilder().with_data(["skip before", "skip after 1", "skip after 2", "0,1,2,3,4,5,6"]).build() + ) + + data_generator = self._read_data() + + assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}] + + def test_given_user_provided_headers_when_read_data_then_use_user_provided_headers(self) -> None: + 
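+ # CsvHeaderUserProvided means the file has no header row; the configured column names are applied to the data rows.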
self._config_format.header_definition = CsvHeaderUserProvided(column_names=["first", "second", "third", "fourth"]) + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [{"first": "0", "second": "1", "third": "2", "fourth": "3"}] + + def test_given_len_mistmatch_on_user_provided_headers_when_read_data_then_raise_error(self) -> None: + self._config_format.header_definition = CsvHeaderUserProvided(column_names=["missing", "one", "column"]) + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build() + + with pytest.raises(RecordParseError): + list(self._read_data()) + + def test_given_skip_rows_after_header_when_read_data_then_do_not_parse_skipped_rows(self) -> None: + self._config_format.skip_rows_after_header = 1 + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1,header2", + "skipped row: important that the is no comma in this string to test if columns do not match in skipped rows", + "a value 1,a value 2", + "another value 1,another value 2", + ] + ) + .build() + ) + + data_generator = self._read_data() + + assert list(data_generator) == [ + {"header1": "a value 1", "header2": "a value 2"}, + {"header1": "another value 1", "header2": "another value 2"}, + ] + + def test_given_quote_delimiter_when_read_data_then_parse_properly(self) -> None: + self._config_format.delimiter = "|" + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1|header2", + "a value 1|a value 2", + ] + ) + .build() + ) + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": "a value 1", "header2": "a value 2"}] + + def test_given_quote_char_when_read_data_then_parse_properly(self) -> None: + self._config_format.quote_char = "|" + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1,header2", + "|a,value,1|,|a,value,2|", + ] + ) + .build() + ) + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": "a,value,1", "header2": "a,value,2"}] + + def test_given_escape_char_when_read_data_then_parse_properly(self) -> None: + self._config_format.escape_char = "|" + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1,header2", + '"a |"value|", 1",a value 2', + ] + ) + .build() + ) + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": 'a "value", 1', "header2": "a value 2"}] + + def test_given_double_quote_on_when_read_data_then_parse_properly(self) -> None: + self._config_format.double_quote = True + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1,header2", + '1,"Text with doublequote: ""This is a text."""', + ] + ) + .build() + ) + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": "1", "header2": 'Text with doublequote: "This is a text."'}] + + def test_given_double_quote_off_when_read_data_then_parse_properly(self) -> None: + self._config_format.double_quote = False + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1,header2", + '1,"Text with doublequote: ""This is a text."""', + ] + ) + .build() + ) + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": "1", "header2": 'Text with doublequote: "This is a text."""'}] + + def 
test_given_generator_closed_when_read_data_then_unregister_dialect(self) -> None: + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header", + "a value", + "another value", + ] + ) + .build() + ) + + dialects_before = set(csv.list_dialects()) + data_generator = self._read_data() + next(data_generator) + [new_dialect] = set(csv.list_dialects()) - dialects_before + assert self._CONFIG_NAME in new_dialect + data_generator.close() + assert new_dialect not in csv.list_dialects() + + def test_given_too_many_values_for_columns_when_read_data_then_raise_exception_and_unregister_dialect(self) -> None: + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header", + "a value", + "too many values,value,value,value", + ] + ) + .build() + ) + + dialects_before = set(csv.list_dialects()) + data_generator = self._read_data() + next(data_generator) + [new_dialect] = set(csv.list_dialects()) - dialects_before + assert self._CONFIG_NAME in new_dialect + + with pytest.raises(RecordParseError): + next(data_generator) + assert new_dialect not in csv.list_dialects() + + def test_given_too_few_values_for_columns_when_read_data_then_raise_exception_and_unregister_dialect(self) -> None: + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1,header2,header3", + "value1,value2,value3", + "a value", + ] + ) + .build() + ) + + dialects_before = set(csv.list_dialects()) + data_generator = self._read_data() + next(data_generator) + [new_dialect] = set(csv.list_dialects()) - dialects_before + assert self._CONFIG_NAME in new_dialect + + with pytest.raises(RecordParseError): + next(data_generator) + assert new_dialect not in csv.list_dialects() + + def test_parse_field_size_larger_than_default_python_maximum(self) -> None: + # The field size for the csv module will be set as a side-effect of initializing the CsvParser class. 
+ assert csv.field_size_limit() == 2**31 + long_string = 130 * 1024 * "a" + assert len(long_string.encode("utf-8")) > (128 * 1024) + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1,header2", + f'1,"{long_string}"', + ] + ) + .build() + ) + + data_generator = self._read_data() + assert list(data_generator) == [{"header1": "1", "header2": long_string}] + + def test_read_data_with_encoding_error(self) -> None: + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["something"]).build() + self._csv_reader._get_headers = Mock(side_effect=UnicodeDecodeError("encoding", b"", 0, 1, "reason")) + + with pytest.raises(AirbyteTracedException) as ate: + data_generator = self._read_data() + assert len(list(data_generator)) == 0 + + assert "encoding" in ate.value.message + assert self._csv_reader._get_headers.called + + def _read_data(self) -> Generator[Dict[str, str], None, None]: + data_generator = self._csv_reader.read_data( + self._config, + self._file, + self._stream_reader, + self._logger, + FileReadMode.READ, + ) + return data_generator + + +_TOO_MANY_VALUES = [ + "header", + "too many values,value,value,value", +] + +_TOO_FEW_VALUES = [ + "header1,header2,header3", + "a value", + "value1,value2,value3", +] + + +@pytest.mark.parametrize( + "ignore_errors_on_fields_mismatch, data, error_message", + [ + ( + True, + _TOO_MANY_VALUES, + "Skipping record in line 2 of file a uri; invalid CSV row with missing column.", + ), + ( + False, + _TOO_MANY_VALUES, + None, + ), + ( + True, + _TOO_FEW_VALUES, + "Skipping record in line 2 of file a uri; invalid CSV row with extra column.", + ), + ( + False, + _TOO_FEW_VALUES, + None, + ), + ], +) +def test_mismatch_between_values_and_header(ignore_errors_on_fields_mismatch, data, error_message) -> None: + config_format = CsvFormat() + config = Mock() + config.name = "config_name" + config.format = config_format + + file = RemoteFile(uri="a uri", last_modified=datetime.now()) + stream_reader = Mock(spec=AbstractFileBasedStreamReader) + logger = Mock(spec=logging.Logger) + csv_reader = _CsvReader() + + config_format.ignore_errors_on_fields_mismatch = ignore_errors_on_fields_mismatch + stream_reader.open_file.return_value = CsvFileBuilder().with_data(data).build() + + data_generator = csv_reader.read_data( + config, + file, + stream_reader, + logger, + FileReadMode.READ, + ) + + # Check if exception is raised only when skip_wrong_number_of_fields_error is False + if not ignore_errors_on_fields_mismatch: + with pytest.raises(RecordParseError): + print(list(data_generator)) + else: + # Expect no exception when skip_wrong_number_of_fields_error is True + list(data_generator) + logger.error.assert_called_with(error_message) + + +def test_encoding_is_passed_to_stream_reader() -> None: + parser = CsvParser() + encoding = "ascii" + stream_reader = Mock() + mock_obj = stream_reader.open_file.return_value + mock_obj.__enter__ = Mock(return_value=io.StringIO("c1,c2\nv1,v2")) + mock_obj.__exit__ = Mock(return_value=None) + file = RemoteFile(uri="s3://bucket/key.csv", last_modified=datetime.now()) + config = FileBasedStreamConfig(name="test", validation_policy="Emit Record", file_type="csv", format=CsvFormat(encoding=encoding)) + list(parser.parse_records(config, file, stream_reader, logger, {"properties": {"c1": {"type": "string"}, "c2": {"type": "string"}}})) + stream_reader.open_file.assert_has_calls( + [ + mock.call(file, FileReadMode.READ, encoding, logger), + mock.call().__enter__(), + 
mock.call().__exit__(None, None, None), + ] + ) + + mock_obj.__enter__ = Mock(return_value=io.StringIO("c1,c2\nv1,v2")) + loop = asyncio.get_event_loop() + loop.run_until_complete(parser.infer_schema(config, file, stream_reader, logger)) + stream_reader.open_file.assert_called_with(file, FileReadMode.READ, encoding, logger) + stream_reader.open_file.assert_has_calls( + [ + mock.call(file, FileReadMode.READ, encoding, logger), + mock.call().__enter__(), + mock.call().__exit__(None, None, None), + mock.call(file, FileReadMode.READ, encoding, logger), + mock.call().__enter__(), + mock.call().__exit__(None, None, None), + ] + ) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_excel_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_excel_parser.py new file mode 100644 index 000000000000..bd9d8338f094 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_excel_parser.py @@ -0,0 +1,122 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + + +import datetime +from io import BytesIO +from unittest.mock import MagicMock, Mock, mock_open, patch + +import pandas as pd +import pytest +from airbyte_cdk.sources.file_based.config.file_based_stream_config import ExcelFormat, FileBasedStreamConfig, ValidationPolicy +from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, RecordParseError +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader +from airbyte_cdk.sources.file_based.file_types.excel_parser import ExcelParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType + + +@pytest.fixture +def mock_stream_reader(): + return Mock(spec=AbstractFileBasedStreamReader) + + +@pytest.fixture +def mock_logger(): + return Mock() + + +@pytest.fixture +def file_config(): + return FileBasedStreamConfig( + name="test.xlsx", + file_type="excel", + format=ExcelFormat(sheet_name="Sheet1"), + validation_policy=ValidationPolicy.emit_record, + ) + + +@pytest.fixture +def remote_file(): + return RemoteFile(uri="s3://mybucket/test.xlsx", last_modified=datetime.datetime.now()) + + +@pytest.fixture +def setup_parser(remote_file): + parser = ExcelParser() + + # Sample data for the mock Excel file + data = pd.DataFrame( + { + "column1": [1, 2, 3], + "column2": ["a", "b", "c"], + "column3": [True, False, True], + "column4": pd.to_datetime(["2021-01-01", "2022-01-01", "2023-01-01"]), + } + ) + + # Convert the DataFrame to an Excel byte stream + excel_bytes = BytesIO() + with pd.ExcelWriter(excel_bytes, engine="xlsxwriter") as writer: + data.to_excel(writer, index=False) + excel_bytes.seek(0) + + # Mock the stream_reader's open_file method to return the Excel byte stream + stream_reader = MagicMock(spec=AbstractFileBasedStreamReader) + stream_reader.open_file.return_value = BytesIO(excel_bytes.read()) + + return parser, FileBasedStreamConfig(name="test_stream", format=ExcelFormat()), remote_file, stream_reader, MagicMock(), data + + +@patch("pandas.ExcelFile") +@pytest.mark.asyncio +async def test_infer_schema(mock_excel_file, setup_parser): + parser, config, file, stream_reader, logger, data = setup_parser + + # Mock the parse method of the pandas ExcelFile object + mock_excel_file.return_value.parse.return_value = data + + # Call infer_schema + schema = await parser.infer_schema(config, file, stream_reader, logger) + + # Define the expected schema + expected_schema: SchemaType = { + 
"column1": {"type": "number"}, + "column2": {"type": "string"}, + "column3": {"type": "boolean"}, + "column4": {"type": "string", "format": "date-time"}, + } + + # Validate the schema + assert schema == expected_schema + + # Assert that the stream_reader's open_file was called correctly + stream_reader.open_file.assert_called_once_with(file, parser.file_read_mode, parser.ENCODING, logger) + + # Assert that the logger was not used for warnings/errors + logger.info.assert_not_called() + logger.error.assert_not_called() + + +def test_invalid_format(mock_stream_reader, mock_logger, remote_file): + parser = ExcelParser() + invalid_config = FileBasedStreamConfig( + name="test.xlsx", + file_type="csv", + format={"filetype": "csv"}, + validation_policy=ValidationPolicy.emit_record, + ) + + with pytest.raises(ConfigValidationError): + list(parser.parse_records(invalid_config, remote_file, mock_stream_reader, mock_logger)) + + +def test_file_read_error(mock_stream_reader, mock_logger, file_config, remote_file): + parser = ExcelParser() + with patch("builtins.open", mock_open(read_data=b"corrupted data")): + with patch("pandas.ExcelFile") as mock_excel: + mock_excel.return_value.parse.side_effect = ValueError("Failed to parse file") + + with pytest.raises(RecordParseError): + list(parser.parse_records(file_config, remote_file, mock_stream_reader, mock_logger)) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_jsonl_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_jsonl_parser.py new file mode 100644 index 000000000000..af5d83d77d04 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_jsonl_parser.py @@ -0,0 +1,158 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import asyncio +import io +import json +from typing import Any, Dict +from unittest.mock import MagicMock, Mock + +import pytest +from airbyte_cdk.sources.file_based.exceptions import RecordParseError +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader +from airbyte_cdk.sources.file_based.file_types import JsonlParser + +JSONL_CONTENT_WITHOUT_MULTILINE_JSON_OBJECTS = [ + b'{"a": 1, "b": "1"}', + b'{"a": 2, "b": "2"}', +] +JSONL_CONTENT_WITH_MULTILINE_JSON_OBJECTS = [ + b"{", + b' "a": 1,', + b' "b": "1"', + b"}", + b"{", + b' "a": 2,', + b' "b": "2"', + b"}", +] +INVALID_JSON_CONTENT = [ + b"{", + b' "a": 1,', + b' "b": "1"', + b"{", + b' "a": 2,', + b' "b": "2"', + b"}", +] + + +@pytest.fixture +def stream_reader() -> MagicMock: + return MagicMock(spec=AbstractFileBasedStreamReader) + + +def _infer_schema(stream_reader: MagicMock) -> Dict[str, Any]: + loop = asyncio.new_event_loop() + task = loop.create_task(JsonlParser().infer_schema(Mock(), Mock(), stream_reader, Mock())) + loop.run_until_complete(task) + return task.result() # type: ignore # asyncio has no typing + + +def test_when_infer_then_return_proper_types(stream_reader: MagicMock) -> None: + record = {"col1": 1, "col2": 2.2, "col3": "3", "col4": ["a", "list"], "col5": {"inner": "obj"}, "col6": None, "col7": True} + stream_reader.open_file.return_value.__enter__.return_value = io.BytesIO(json.dumps(record).encode("utf-8")) + + schema = _infer_schema(stream_reader) + + assert schema == { + "col1": {"type": "integer"}, + "col2": {"type": "number"}, + "col3": {"type": "string"}, + "col4": {"type": "array"}, + "col5": {"type": "object"}, + "col6": {"type": "null"}, + "col7": {"type": "boolean"}, + } + + +def test_given_str_io_when_infer_then_return_proper_types(stream_reader: MagicMock) -> None: + stream_reader.open_file.return_value.__enter__.return_value = io.StringIO('{"col": 1}') + + schema = _infer_schema(stream_reader) + + assert schema == {"col": {"type": "integer"}} + + +def test_given_empty_record_when_infer_then_return_empty_schema(stream_reader: MagicMock) -> None: + stream_reader.open_file.return_value.__enter__.return_value = io.BytesIO("{}".encode("utf-8")) + schema = _infer_schema(stream_reader) + assert schema == {} + + +def test_given_no_records_when_infer_then_return_empty_schema(stream_reader: MagicMock) -> None: + stream_reader.open_file.return_value.__enter__.return_value = io.BytesIO("".encode("utf-8")) + schema = _infer_schema(stream_reader) + assert schema == {} + + +def test_given_limit_hit_when_infer_then_stop_considering_records(stream_reader: MagicMock) -> None: + jsonl_file_content = '{"key": 2.' + "2" * JsonlParser.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE + '}\n{"key": "a string"}' + stream_reader.open_file.return_value.__enter__.return_value = io.BytesIO(jsonl_file_content.encode("utf-8")) + + schema = _infer_schema(stream_reader) + + assert schema == {"key": {"type": "number"}} + + +def test_given_multiline_json_objects_and_read_limit_hit_when_infer_then_return_parse_until_at_least_one_record( + stream_reader: MagicMock, +) -> None: + jsonl_file_content = '{\n"key": 2.' 
+ "2" * JsonlParser.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE + "\n}" + stream_reader.open_file.return_value.__enter__.return_value = io.BytesIO(jsonl_file_content.encode("utf-8")) + + schema = _infer_schema(stream_reader) + + assert schema == {"key": {"type": "number"}} + + +def test_given_multiline_json_objects_and_hits_read_limit_when_infer_then_return_proper_types(stream_reader: MagicMock) -> None: + stream_reader.open_file.return_value.__enter__.return_value = JSONL_CONTENT_WITH_MULTILINE_JSON_OBJECTS + schema = _infer_schema(stream_reader) + assert schema == {"a": {"type": "integer"}, "b": {"type": "string"}} + + +def test_given_multiple_records_then_merge_types(stream_reader: MagicMock) -> None: + stream_reader.open_file.return_value.__enter__.return_value = io.BytesIO('{"col1": 1}\n{"col1": 2.3}'.encode("utf-8")) + schema = _infer_schema(stream_reader) + assert schema == {"col1": {"type": "number"}} + + +def test_given_one_json_per_line_when_parse_records_then_return_records(stream_reader: MagicMock) -> None: + stream_reader.open_file.return_value.__enter__.return_value = JSONL_CONTENT_WITHOUT_MULTILINE_JSON_OBJECTS + records = list(JsonlParser().parse_records(Mock(), Mock(), stream_reader, Mock(), None)) + assert records == [{"a": 1, "b": "1"}, {"a": 2, "b": "2"}] + + +def test_given_one_json_per_line_when_parse_records_then_do_not_send_warning(stream_reader: MagicMock) -> None: + stream_reader.open_file.return_value.__enter__.return_value = JSONL_CONTENT_WITHOUT_MULTILINE_JSON_OBJECTS + logger = Mock() + + list(JsonlParser().parse_records(Mock(), Mock(), stream_reader, logger, None)) + + assert logger.warning.call_count == 0 + + +def test_given_multiline_json_object_when_parse_records_then_return_records(stream_reader: MagicMock) -> None: + stream_reader.open_file.return_value.__enter__.return_value = JSONL_CONTENT_WITH_MULTILINE_JSON_OBJECTS + records = list(JsonlParser().parse_records(Mock(), Mock(), stream_reader, Mock(), None)) + assert records == [{"a": 1, "b": "1"}, {"a": 2, "b": "2"}] + + +def test_given_multiline_json_object_when_parse_records_then_log_once_one_record_yielded(stream_reader: MagicMock) -> None: + stream_reader.open_file.return_value.__enter__.return_value = JSONL_CONTENT_WITH_MULTILINE_JSON_OBJECTS + logger = Mock() + + next(iter(JsonlParser().parse_records(Mock(), Mock(), stream_reader, logger, None))) + + assert logger.warning.call_count == 1 + + +def test_given_unparsable_json_when_parse_records_then_raise_error(stream_reader: MagicMock) -> None: + stream_reader.open_file.return_value.__enter__.return_value = INVALID_JSON_CONTENT + logger = Mock() + + with pytest.raises(RecordParseError): + list(JsonlParser().parse_records(Mock(), Mock(), stream_reader, logger, None)) + assert logger.warning.call_count == 0 diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_parquet_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_parquet_parser.py new file mode 100644 index 000000000000..c4768facc7dd --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_parquet_parser.py @@ -0,0 +1,275 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import asyncio +import datetime +import math +from typing import Any, Mapping, Union +from unittest.mock import Mock + +import pyarrow as pa +import pytest +from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy +from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat +from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat +from airbyte_cdk.sources.file_based.file_types import ParquetParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from pyarrow import Scalar + +_default_parquet_format = ParquetFormat() +_decimal_as_float_parquet_format = ParquetFormat(decimal_as_float=True) + + +@pytest.mark.parametrize( + "parquet_type, expected_type, parquet_format", + [ + pytest.param(pa.bool_(), {"type": "boolean"}, _default_parquet_format, id="test_parquet_bool"), + pytest.param(pa.int8(), {"type": "integer"}, _default_parquet_format, id="test_parquet_int8"), + pytest.param(pa.int16(), {"type": "integer"}, _default_parquet_format, id="test_parquet_int16"), + pytest.param(pa.int32(), {"type": "integer"}, _default_parquet_format, id="test_parquet_int32"), + pytest.param(pa.int64(), {"type": "integer"}, _default_parquet_format, id="test_parquet_int64"), + pytest.param(pa.uint8(), {"type": "integer"}, _default_parquet_format, id="test_parquet_uint8"), + pytest.param(pa.uint16(), {"type": "integer"}, _default_parquet_format, id="test_parquet_uint16"), + pytest.param(pa.uint32(), {"type": "integer"}, _default_parquet_format, id="test_parquet_uint32"), + pytest.param(pa.uint64(), {"type": "integer"}, _default_parquet_format, id="test_parquet_uint64"), + pytest.param(pa.float16(), {"type": "number"}, _default_parquet_format, id="test_parquet_float16"), + pytest.param(pa.float32(), {"type": "number"}, _default_parquet_format, id="test_parquet_float32"), + pytest.param(pa.float64(), {"type": "number"}, _default_parquet_format, id="test_parquet_float64"), + pytest.param(pa.time32("s"), {"type": "string"}, _default_parquet_format, id="test_parquet_time32s"), + pytest.param(pa.time32("ms"), {"type": "string"}, _default_parquet_format, id="test_parquet_time32ms"), + pytest.param(pa.time64("us"), {"type": "string"}, _default_parquet_format, id="test_parquet_time64us"), + pytest.param(pa.time64("ns"), {"type": "string"}, _default_parquet_format, id="test_parquet_time64us"), + pytest.param(pa.timestamp("s"), {"type": "string", "format": "date-time"}, _default_parquet_format, id="test_parquet_timestamps_s"), + pytest.param( + pa.timestamp("ms"), {"type": "string", "format": "date-time"}, _default_parquet_format, id="test_parquet_timestamp_ms" + ), + pytest.param( + pa.timestamp("s", "utc"), + {"type": "string", "format": "date-time"}, + _default_parquet_format, + id="test_parquet_timestamps_s_with_tz", + ), + pytest.param( + pa.timestamp("ms", "est"), + {"type": "string", "format": "date-time"}, + _default_parquet_format, + id="test_parquet_timestamps_ms_with_tz", + ), + pytest.param(pa.date32(), {"type": "string", "format": "date"}, _default_parquet_format, id="test_parquet_date32"), + pytest.param(pa.date64(), {"type": "string", "format": "date"}, _default_parquet_format, id="test_parquet_date64"), + pytest.param(pa.duration("s"), {"type": "integer"}, _default_parquet_format, id="test_duration_s"), + pytest.param(pa.duration("ms"), {"type": "integer"}, _default_parquet_format, id="test_duration_ms"), + 
pytest.param(pa.duration("us"), {"type": "integer"}, _default_parquet_format, id="test_duration_us"), + pytest.param(pa.duration("ns"), {"type": "integer"}, _default_parquet_format, id="test_duration_ns"), + pytest.param(pa.month_day_nano_interval(), {"type": "array"}, _default_parquet_format, id="test_parquet_month_day_nano_interval"), + pytest.param(pa.binary(), {"type": "string"}, _default_parquet_format, id="test_binary"), + pytest.param(pa.binary(2), {"type": "string"}, _default_parquet_format, id="test_fixed_size_binary"), + pytest.param(pa.string(), {"type": "string"}, _default_parquet_format, id="test_parquet_string"), + pytest.param(pa.utf8(), {"type": "string"}, _default_parquet_format, id="test_utf8"), + pytest.param(pa.large_binary(), {"type": "string"}, _default_parquet_format, id="test_large_binary"), + pytest.param(pa.large_string(), {"type": "string"}, _default_parquet_format, id="test_large_string"), + pytest.param(pa.large_utf8(), {"type": "string"}, _default_parquet_format, id="test_large_utf8"), + pytest.param(pa.dictionary(pa.int32(), pa.string()), {"type": "object"}, _default_parquet_format, id="test_dictionary"), + pytest.param(pa.struct([pa.field("field", pa.int32())]), {"type": "object"}, _default_parquet_format, id="test_struct"), + pytest.param(pa.list_(pa.int32()), {"type": "array"}, _default_parquet_format, id="test_list"), + pytest.param(pa.large_list(pa.int32()), {"type": "array"}, _default_parquet_format, id="test_large_list"), + pytest.param(pa.decimal128(2), {"type": "string"}, _default_parquet_format, id="test_decimal128"), + pytest.param(pa.decimal256(2), {"type": "string"}, _default_parquet_format, id="test_decimal256"), + pytest.param(pa.decimal128(2), {"type": "number"}, _decimal_as_float_parquet_format, id="test_decimal128_as_float"), + pytest.param(pa.decimal256(2), {"type": "number"}, _decimal_as_float_parquet_format, id="test_decimal256_as_float"), + pytest.param(pa.map_(pa.int32(), pa.int32()), {"type": "object"}, _default_parquet_format, id="test_map"), + pytest.param(pa.null(), {"type": "null"}, _default_parquet_format, id="test_null"), + ], +) +def test_type_mapping(parquet_type: pa.DataType, expected_type: Mapping[str, str], parquet_format: ParquetFormat) -> None: + if expected_type is None: + with pytest.raises(ValueError): + ParquetParser.parquet_type_to_schema_type(parquet_type, parquet_format) + else: + assert ParquetParser.parquet_type_to_schema_type(parquet_type, parquet_format) == expected_type + + +@pytest.mark.parametrize( + "pyarrow_type, parquet_format, parquet_object, expected_value", + [ + pytest.param(pa.bool_(), _default_parquet_format, True, True, id="test_bool"), + pytest.param(pa.int8(), _default_parquet_format, -1, -1, id="test_int8"), + pytest.param(pa.int16(), _default_parquet_format, 2, 2, id="test_int16"), + pytest.param(pa.int32(), _default_parquet_format, 3, 3, id="test_int32"), + pytest.param(pa.int64(), _default_parquet_format, 4, 4, id="test_int64"), + pytest.param(pa.uint8(), _default_parquet_format, 4, 4, id="test_parquet_uint8"), + pytest.param(pa.uint16(), _default_parquet_format, 5, 5, id="test_parquet_uint16"), + pytest.param(pa.uint32(), _default_parquet_format, 6, 6, id="test_parquet_uint32"), + pytest.param(pa.uint64(), _default_parquet_format, 6, 6, id="test_parquet_uint64"), + pytest.param(pa.float32(), _default_parquet_format, 2.7, 2.7, id="test_parquet_float32"), + pytest.param(pa.float64(), _default_parquet_format, 3.14, 3.14, id="test_parquet_float64"), + pytest.param(pa.time32("s"), 
_default_parquet_format, datetime.time(1, 2, 3), "01:02:03", id="test_parquet_time32s"), + pytest.param(pa.time32("ms"), _default_parquet_format, datetime.time(3, 4, 5), "03:04:05", id="test_parquet_time32ms"), + pytest.param(pa.time64("us"), _default_parquet_format, datetime.time(6, 7, 8), "06:07:08", id="test_parquet_time64us"), + pytest.param(pa.time64("ns"), _default_parquet_format, datetime.time(9, 10, 11), "09:10:11", id="test_parquet_time64us"), + pytest.param( + pa.timestamp("s"), + _default_parquet_format, + datetime.datetime(2023, 7, 7, 10, 11, 12), + "2023-07-07T10:11:12", + id="test_parquet_timestamps_s", + ), + pytest.param( + pa.timestamp("ms"), + _default_parquet_format, + datetime.datetime(2024, 8, 8, 11, 12, 13), + "2024-08-08T11:12:13", + id="test_parquet_timestamp_ms", + ), + pytest.param( + pa.timestamp("s", "utc"), + _default_parquet_format, + datetime.datetime(2020, 1, 1, 1, 1, 1, tzinfo=datetime.timezone.utc), + "2020-01-01T01:01:01+00:00", + id="test_parquet_timestamps_s_with_tz", + ), + pytest.param( + pa.timestamp("ms", "utc"), + _default_parquet_format, + datetime.datetime(2021, 2, 3, 4, 5, tzinfo=datetime.timezone.utc), + "2021-02-03T04:05:00+00:00", + id="test_parquet_timestamps_ms_with_tz", + ), + pytest.param(pa.date32(), _default_parquet_format, datetime.date(2023, 7, 7), "2023-07-07", id="test_parquet_date32"), + pytest.param(pa.date64(), _default_parquet_format, datetime.date(2023, 7, 8), "2023-07-08", id="test_parquet_date64"), + pytest.param(pa.duration("s"), _default_parquet_format, 12345, 12345, id="test_duration_s"), + pytest.param(pa.duration("ms"), _default_parquet_format, 12345, 12345, id="test_duration_ms"), + pytest.param(pa.duration("us"), _default_parquet_format, 12345, 12345, id="test_duration_us"), + pytest.param(pa.duration("ns"), _default_parquet_format, 12345, 12345, id="test_duration_ns"), + pytest.param( + pa.month_day_nano_interval(), + _default_parquet_format, + datetime.timedelta(days=3, microseconds=4), + [0, 3, 4000], + id="test_parquet_month_day_nano_interval", + ), + pytest.param(pa.binary(), _default_parquet_format, b"this is a binary string", "this is a binary string", id="test_binary"), + pytest.param(pa.binary(2), _default_parquet_format, b"t1", "t1", id="test_fixed_size_binary"), + pytest.param(pa.string(), _default_parquet_format, "this is a string", "this is a string", id="test_parquet_string"), + pytest.param(pa.utf8(), _default_parquet_format, "utf8".encode("utf8"), "utf8", id="test_utf8"), + pytest.param(pa.large_binary(), _default_parquet_format, b"large binary string", "large binary string", id="test_large_binary"), + pytest.param(pa.large_string(), _default_parquet_format, "large string", "large string", id="test_large_string"), + pytest.param(pa.large_utf8(), _default_parquet_format, "large utf8", "large utf8", id="test_large_utf8"), + pytest.param(pa.struct([pa.field("field", pa.int32())]), _default_parquet_format, {"field": 1}, {"field": 1}, id="test_struct"), + pytest.param(pa.list_(pa.int32()), _default_parquet_format, [1, 2, 3], [1, 2, 3], id="test_list"), + pytest.param(pa.large_list(pa.int32()), _default_parquet_format, [4, 5, 6], [4, 5, 6], id="test_large_list"), + pytest.param(pa.decimal128(5, 3), _default_parquet_format, 12, "12.000", id="test_decimal128"), + pytest.param(pa.decimal256(8, 2), _default_parquet_format, 13, "13.00", id="test_decimal256"), + pytest.param(pa.decimal128(5, 3), _decimal_as_float_parquet_format, 12, 12.000, id="test_decimal128"), + pytest.param(pa.decimal256(8, 2), 
_decimal_as_float_parquet_format, 13, 13.00, id="test_decimal256"), + pytest.param( + pa.map_(pa.string(), pa.int32()), _default_parquet_format, {"hello": 1, "world": 2}, {"hello": 1, "world": 2}, id="test_map" + ), + pytest.param(pa.null(), _default_parquet_format, None, None, id="test_null"), + ], +) +def test_value_transformation( + pyarrow_type: pa.DataType, parquet_format: ParquetFormat, parquet_object: Scalar, expected_value: Any +) -> None: + pyarrow_value = pa.array([parquet_object], type=pyarrow_type)[0] + py_value = ParquetParser._to_output_value(pyarrow_value, parquet_format) + if isinstance(py_value, float): + assert math.isclose(py_value, expected_value, abs_tol=0.01) + else: + assert py_value == expected_value + + +def test_value_dictionary() -> None: + # Setting the dictionary is more involved than other data types so we test it in a separate test + dictionary_values = ["apple", "banana", "cherry"] + indices = [0, 1, 2, 0, 1] + indices_array = pa.array(indices, type=pa.int8()) + dictionary = pa.DictionaryArray.from_arrays(indices_array, dictionary_values) + py_value = ParquetParser._to_output_value(dictionary, _default_parquet_format) + assert py_value == {"indices": [0, 1, 2, 0, 1], "values": ["apple", "banana", "cherry"]} + + +@pytest.mark.parametrize( + "parquet_type, parquet_format", + [ + pytest.param(pa.bool_(), _default_parquet_format, id="test_parquet_bool"), + pytest.param(pa.int8(), _default_parquet_format, id="test_parquet_int8"), + pytest.param(pa.int16(), _default_parquet_format, id="test_parquet_int16"), + pytest.param(pa.int32(), _default_parquet_format, id="test_parquet_int32"), + pytest.param(pa.int64(), _default_parquet_format, id="test_parquet_int64"), + pytest.param(pa.uint8(), _default_parquet_format, id="test_parquet_uint8"), + pytest.param(pa.uint16(), _default_parquet_format, id="test_parquet_uint16"), + pytest.param(pa.uint32(), _default_parquet_format, id="test_parquet_uint32"), + pytest.param(pa.uint64(), _default_parquet_format, id="test_parquet_uint64"), + pytest.param(pa.float16(), _default_parquet_format, id="test_parquet_float16"), + pytest.param(pa.float32(), _default_parquet_format, id="test_parquet_float32"), + pytest.param(pa.float64(), _default_parquet_format, id="test_parquet_float64"), + pytest.param(pa.time32("s"), _default_parquet_format, id="test_parquet_time32s"), + pytest.param(pa.time32("ms"), _default_parquet_format, id="test_parquet_time32ms"), + pytest.param(pa.time64("us"), _default_parquet_format, id="test_parquet_time64us"), + pytest.param(pa.time64("ns"), _default_parquet_format, id="test_parquet_time64ns"), + pytest.param(pa.timestamp("s"), _default_parquet_format, id="test_parquet_timestamps_s"), + pytest.param(pa.timestamp("ms"), _default_parquet_format, id="test_parquet_timestamp_ms"), + pytest.param(pa.timestamp("s", "utc"), _default_parquet_format, id="test_parquet_timestamps_s_with_tz"), + pytest.param(pa.timestamp("ms", "est"), _default_parquet_format, id="test_parquet_timestamps_ms_with_tz"), + pytest.param(pa.date32(), _default_parquet_format, id="test_parquet_date32"), + pytest.param(pa.date64(), _default_parquet_format, id="test_parquet_date64"), + pytest.param(pa.duration("s"), _default_parquet_format, id="test_duration_s"), + pytest.param(pa.duration("ms"), _default_parquet_format, id="test_duration_ms"), + pytest.param(pa.duration("us"), _default_parquet_format, id="test_duration_us"), + pytest.param(pa.duration("ns"), _default_parquet_format, id="test_duration_ns"), + 
pytest.param(pa.month_day_nano_interval(), _default_parquet_format, id="test_parquet_month_day_nano_interval"), + pytest.param(pa.binary(), _default_parquet_format, id="test_binary"), + pytest.param(pa.binary(2), _default_parquet_format, id="test_fixed_size_binary"), + pytest.param(pa.string(), _default_parquet_format, id="test_parquet_string"), + pytest.param(pa.utf8(), _default_parquet_format, id="test_utf8"), + pytest.param(pa.large_binary(), _default_parquet_format, id="test_large_binary"), + pytest.param(pa.large_string(), _default_parquet_format, id="test_large_string"), + pytest.param(pa.large_utf8(), _default_parquet_format, id="test_large_utf8"), + pytest.param(pa.dictionary(pa.int32(), pa.string()), _default_parquet_format, id="test_dictionary"), + pytest.param(pa.struct([pa.field("field", pa.int32())]), _default_parquet_format, id="test_struct"), + pytest.param(pa.list_(pa.int32()), _default_parquet_format, id="test_list"), + pytest.param(pa.large_list(pa.int32()), _default_parquet_format, id="test_large_list"), + pytest.param(pa.decimal128(2), _default_parquet_format, id="test_decimal128"), + pytest.param(pa.decimal256(2), _default_parquet_format, id="test_decimal256"), + pytest.param(pa.decimal128(2), _decimal_as_float_parquet_format, id="test_decimal128_as_float"), + pytest.param(pa.decimal256(2), _decimal_as_float_parquet_format, id="test_decimal256_as_float"), + pytest.param(pa.map_(pa.int32(), pa.int32()), _default_parquet_format, id="test_map"), + pytest.param(pa.null(), _default_parquet_format, id="test_null"), + ], +) +def test_null_value_does_not_throw(parquet_type, parquet_format) -> None: + pyarrow_value = pa.scalar(None, type=parquet_type) + assert ParquetParser._to_output_value(pyarrow_value, parquet_format) is None + + +@pytest.mark.parametrize( + "file_format", + [ + pytest.param( + CsvFormat( + filetype="csv", + delimiter=",", + escape_char="\\", + quote_char='"', + ), + id="test_csv_format", + ), + pytest.param(JsonlFormat(), id="test_jsonl_format"), + ], +) +def test_wrong_file_format(file_format: Union[CsvFormat, JsonlFormat]) -> None: + parser = ParquetParser() + config = FileBasedStreamConfig( + name="test.parquet", + file_type=file_format.filetype, + format={file_format.filetype: file_format}, + validation_policy=ValidationPolicy.emit_record, + ) + file = RemoteFile(uri="s3://mybucket/test.parquet", last_modified=datetime.datetime.now()) + stream_reader = Mock() + logger = Mock() + with pytest.raises(ValueError): + asyncio.get_event_loop().run_until_complete(parser.infer_schema(config, file, stream_reader, logger)) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_unstructured_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_unstructured_parser.py new file mode 100644 index 000000000000..9bc096c5136e --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_unstructured_parser.py @@ -0,0 +1,593 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
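# A minimal sketch of the pyarrow-to-Python conversion the parquet tests above check: wrap a
# Python object in a one-element Arrow array of the target type, pull the value back out with
# .as_py(), and render date/time values as ISO-8601 strings. This illustrates the mapping the
# tests assert on; it is not the CDK's ParquetParser._to_output_value.
import datetime

import pyarrow as pa


def scalar_to_output(value, arrow_type: pa.DataType):
    scalar = pa.array([value], type=arrow_type)[0]
    py_value = scalar.as_py()
    if isinstance(py_value, (datetime.date, datetime.time, datetime.datetime)):
        return py_value.isoformat()
    return py_value


assert scalar_to_output(3, pa.int32()) == 3
assert scalar_to_output(datetime.date(2023, 7, 7), pa.date32()) == "2023-07-07"
assert scalar_to_output(datetime.time(1, 2, 3), pa.time32("s")) == "01:02:03"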
+# + +import asyncio +from datetime import datetime +from unittest import mock +from unittest.mock import MagicMock, call, mock_open, patch + +import pytest +import requests +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.config.unstructured_format import APIParameterConfigModel, APIProcessingConfigModel, UnstructuredFormat +from airbyte_cdk.sources.file_based.exceptions import RecordParseError +from airbyte_cdk.sources.file_based.file_types import UnstructuredParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from unstructured.documents.elements import ElementMetadata, Formula, ListItem, Text, Title +from unstructured.file_utils.filetype import FileType + +FILE_URI = "path/to/file.xyz" + + +@pytest.mark.parametrize( + "filetype, format_config, raises", + [ + pytest.param( + FileType.MD, + UnstructuredFormat(skip_unprocessable_files=False), + False, + id="markdown_file", + ), + pytest.param( + FileType.CSV, + UnstructuredFormat(skip_unprocessable_files=False), + True, + id="wrong_file_format", + ), + pytest.param( + FileType.CSV, + UnstructuredFormat(skip_unprocessable_files=True), + False, + id="wrong_file_format_skipping", + ), + pytest.param( + FileType.PDF, + UnstructuredFormat(skip_unprocessable_files=False), + False, + id="pdf_file", + ), + pytest.param( + FileType.DOCX, + UnstructuredFormat(skip_unprocessable_files=False), + False, + id="docx_file", + ), + pytest.param( + FileType.PPTX, + UnstructuredFormat(skip_unprocessable_files=False), + False, + id="pptx_file", + ), + ], +) +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.detect_filetype") +def test_infer_schema(mock_detect_filetype, filetype, format_config, raises): + # use a fresh event loop to avoid leaking into other tests + main_loop = asyncio.get_event_loop() + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + stream_reader = MagicMock() + mock_open(stream_reader.open_file) + fake_file = MagicMock() + fake_file.uri = FILE_URI + logger = MagicMock() + mock_detect_filetype.return_value = filetype + config = MagicMock() + config.format = format_config + if raises: + with pytest.raises(RecordParseError): + loop.run_until_complete(UnstructuredParser().infer_schema(config, fake_file, stream_reader, logger)) + else: + schema = loop.run_until_complete(UnstructuredParser().infer_schema(config, MagicMock(), MagicMock(), MagicMock())) + assert schema == { + "content": {"type": "string", "description": "Content of the file as markdown. Might be null if the file could not be parsed"}, + "document_key": {"type": "string", "description": "Unique identifier of the document, e.g. 
the file path"}, + "_ab_source_file_parse_error": { + "type": "string", + "description": "Error message if the file could not be parsed even though the file is supported", + }, + } + loop.close() + asyncio.set_event_loop(main_loop) + + +@pytest.mark.parametrize( + "filetype, format_config, parse_result, raises, expected_records, parsing_error", + [ + pytest.param( + FileType.MD, + UnstructuredFormat(skip_unprocessable_files=False), + "test", + False, + [ + { + "content": "test", + "document_key": FILE_URI, + "_ab_source_file_parse_error": None, + } + ], + False, + id="markdown_file", + ), + pytest.param( + FileType.CSV, + UnstructuredFormat(skip_unprocessable_files=False), + None, + True, + None, + False, + id="wrong_file_format", + ), + pytest.param( + FileType.CSV, + UnstructuredFormat(skip_unprocessable_files=True), + None, + False, + [ + { + "content": None, + "document_key": FILE_URI, + "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=path/to/file.xyz message=File type FileType.CSV is not supported. Supported file types are FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT", + } + ], + False, + id="skip_unprocessable_files", + ), + pytest.param( + FileType.PDF, + UnstructuredFormat(skip_unprocessable_files=False), + [ + Title("heading"), + Text("This is the text"), + ListItem("This is a list item"), + Formula("This is a formula"), + ], + False, + [ + { + "content": "# heading\n\nThis is the text\n\n- This is a list item\n\n```\nThis is a formula\n```", + "document_key": FILE_URI, + "_ab_source_file_parse_error": None, + } + ], + False, + id="pdf_file", + ), + pytest.param( + FileType.PDF, + UnstructuredFormat(skip_unprocessable_files=False), + [ + Title("first level heading", metadata=ElementMetadata(category_depth=1)), + Title("second level heading", metadata=ElementMetadata(category_depth=2)), + ], + False, + [ + { + "content": "# first level heading\n\n## second level heading", + "document_key": FILE_URI, + "_ab_source_file_parse_error": None, + } + ], + False, + id="multi_level_headings", + ), + pytest.param( + FileType.DOCX, + UnstructuredFormat(skip_unprocessable_files=False), + [ + Title("heading"), + Text("This is the text"), + ListItem("This is a list item"), + Formula("This is a formula"), + ], + False, + [ + { + "content": "# heading\n\nThis is the text\n\n- This is a list item\n\n```\nThis is a formula\n```", + "document_key": FILE_URI, + "_ab_source_file_parse_error": None, + } + ], + False, + id="docx_file", + ), + pytest.param( + FileType.DOCX, + UnstructuredFormat(skip_unprocessable_files=True), + "", + False, + [ + { + "content": None, + "document_key": FILE_URI, + "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. 
Contact Support if you need assistance.\nfilename=path/to/file.xyz message=weird parsing error", + } + ], + True, + id="exception_during_parsing", + ), + ], +) +@patch("unstructured.partition.pdf.partition_pdf") +@patch("unstructured.partition.pptx.partition_pptx") +@patch("unstructured.partition.docx.partition_docx") +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.detect_filetype") +def test_parse_records( + mock_detect_filetype, + mock_partition_docx, + mock_partition_pptx, + mock_partition_pdf, + filetype, + format_config, + parse_result, + raises, + expected_records, + parsing_error, +): + stream_reader = MagicMock() + mock_open(stream_reader.open_file, read_data=bytes(str(parse_result), "utf-8")) + fake_file = RemoteFile(uri=FILE_URI, last_modified=datetime.now()) + fake_file.uri = FILE_URI + logger = MagicMock() + config = MagicMock() + config.format = format_config + mock_detect_filetype.return_value = filetype + if parsing_error: + mock_partition_docx.side_effect = Exception("weird parsing error") + mock_partition_pptx.side_effect = Exception("weird parsing error") + mock_partition_pdf.side_effect = Exception("weird parsing error") + else: + mock_partition_docx.return_value = parse_result + mock_partition_pptx.return_value = parse_result + mock_partition_pdf.return_value = parse_result + if raises: + with pytest.raises(RecordParseError): + list(UnstructuredParser().parse_records(config, fake_file, stream_reader, logger, MagicMock())) + else: + assert list(UnstructuredParser().parse_records(config, fake_file, stream_reader, logger, MagicMock())) == expected_records + + +@pytest.mark.parametrize( + "format_config, raises_for_status, json_response, is_ok, expected_error", + [ + pytest.param( + UnstructuredFormat(skip_unprocessable_file_types=False), + False, + {"status": "ok"}, + True, + None, + id="local", + ), + pytest.param( + UnstructuredFormat(skip_unprocessable_file_types=False, strategy="fast"), + False, + {"status": "ok"}, + True, + None, + id="local_ok_strategy", + ), + pytest.param( + UnstructuredFormat(skip_unprocessable_file_types=False, strategy="hi_res"), + False, + {"status": "ok"}, + False, + "Hi-res strategy is not supported for local processing", + id="local_unsupported_strategy", + ), + pytest.param( + UnstructuredFormat(skip_unprocessable_file_types=False, processing=APIProcessingConfigModel(mode="api", api_key="test")), + False, + [{"type": "Title", "text": "Airbyte source connection test"}], + True, + None, + id="api_ok", + ), + pytest.param( + UnstructuredFormat(skip_unprocessable_file_types=False, processing=APIProcessingConfigModel(mode="api", api_key="test")), + True, + None, + False, + "API error", + id="api_error", + ), + pytest.param( + UnstructuredFormat(skip_unprocessable_file_types=False, processing=APIProcessingConfigModel(mode="api", api_key="test")), + False, + {"unexpected": "response"}, + False, + "Error", + id="unexpected_handling_error", + ), + ], +) +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.requests") +def test_check_config(requests_mock, format_config, raises_for_status, json_response, is_ok, expected_error): + mock_response = MagicMock() + mock_response.json.return_value = json_response + if raises_for_status: + mock_response.raise_for_status.side_effect = Exception("API error") + requests_mock.post.return_value = mock_response + result, error = UnstructuredParser().check_config(FileBasedStreamConfig(name="test", format=format_config)) + assert result == is_ok + if expected_error: + assert 
expected_error in error + + +@pytest.mark.parametrize( + "filetype, format_config, raises_for_status, file_content, json_response, expected_requests, raises, expected_records, http_status_code", + [ + pytest.param( + FileType.PDF, + UnstructuredFormat(skip_unprocessable_file_types=False, processing=APIProcessingConfigModel(mode="api", api_key="test")), + None, + "test", + [{"type": "Text", "text": "test"}], + [ + call( + "https://api.unstructured.io/general/v0/general", + headers={"accept": "application/json", "unstructured-api-key": "test"}, + data={"strategy": "auto"}, + files={"files": ("filename", mock.ANY, "application/pdf")}, + ) + ], + False, + [{"content": "test", "document_key": FILE_URI, "_ab_source_file_parse_error": None}], + 200, + id="basic_request", + ), + pytest.param( + FileType.PDF, + UnstructuredFormat( + skip_unprocessable_file_types=False, + strategy="hi_res", + processing=APIProcessingConfigModel( + mode="api", + api_key="test", + api_url="http://localhost:8000", + parameters=[ + APIParameterConfigModel(name="include_page_breaks", value="true"), + APIParameterConfigModel(name="ocr_languages", value="eng"), + APIParameterConfigModel(name="ocr_languages", value="kor"), + ], + ), + ), + None, + "test", + [{"type": "Text", "text": "test"}], + [ + call( + "http://localhost:8000/general/v0/general", + headers={"accept": "application/json", "unstructured-api-key": "test"}, + data={"strategy": "hi_res", "include_page_breaks": "true", "ocr_languages": ["eng", "kor"]}, + files={"files": ("filename", mock.ANY, "application/pdf")}, + ) + ], + False, + [{"content": "test", "document_key": FILE_URI, "_ab_source_file_parse_error": None}], + 200, + id="request_with_params", + ), + pytest.param( + FileType.MD, + UnstructuredFormat(skip_unprocessable_file_types=False, processing=APIProcessingConfigModel(mode="api", api_key="test")), + None, + "# Mymarkdown", + None, + None, + False, + [{"content": "# Mymarkdown", "document_key": FILE_URI, "_ab_source_file_parse_error": None}], + 200, + id="handle_markdown_locally", + ), + pytest.param( + FileType.PDF, + UnstructuredFormat(skip_unprocessable_file_types=False, processing=APIProcessingConfigModel(mode="api", api_key="test")), + [ + requests.exceptions.RequestException("API error"), + requests.exceptions.RequestException("API error"), + requests.exceptions.RequestException("API error"), + requests.exceptions.RequestException("API error"), + requests.exceptions.RequestException("API error"), + ], + "test", + None, + [ + call( + "https://api.unstructured.io/general/v0/general", + headers={"accept": "application/json", "unstructured-api-key": "test"}, + data={"strategy": "auto"}, + files={"files": ("filename", mock.ANY, "application/pdf")}, + ), + call().raise_for_status(), + call( + "https://api.unstructured.io/general/v0/general", + headers={"accept": "application/json", "unstructured-api-key": "test"}, + data={"strategy": "auto"}, + files={"files": ("filename", mock.ANY, "application/pdf")}, + ), + call().raise_for_status(), + call( + "https://api.unstructured.io/general/v0/general", + headers={"accept": "application/json", "unstructured-api-key": "test"}, + data={"strategy": "auto"}, + files={"files": ("filename", mock.ANY, "application/pdf")}, + ), + call().raise_for_status(), + call( + "https://api.unstructured.io/general/v0/general", + headers={"accept": "application/json", "unstructured-api-key": "test"}, + data={"strategy": "auto"}, + files={"files": ("filename", mock.ANY, "application/pdf")}, + ), + call().raise_for_status(), + 
call( + "https://api.unstructured.io/general/v0/general", + headers={"accept": "application/json", "unstructured-api-key": "test"}, + data={"strategy": "auto"}, + files={"files": ("filename", mock.ANY, "application/pdf")}, + ), + call().raise_for_status(), + ], + True, + None, + 200, + id="retry_and_raise_on_api_error", + ), + pytest.param( + FileType.PDF, + UnstructuredFormat(skip_unprocessable_file_types=False, processing=APIProcessingConfigModel(mode="api", api_key="test")), + [ + requests.exceptions.RequestException("API error"), + requests.exceptions.RequestException("API error"), + None, + ], + "test", + [{"type": "Text", "text": "test"}], + [ + call( + "https://api.unstructured.io/general/v0/general", + headers={"accept": "application/json", "unstructured-api-key": "test"}, + data={"strategy": "auto"}, + files={"files": ("filename", mock.ANY, "application/pdf")}, + ), + call().raise_for_status(), + call( + "https://api.unstructured.io/general/v0/general", + headers={"accept": "application/json", "unstructured-api-key": "test"}, + data={"strategy": "auto"}, + files={"files": ("filename", mock.ANY, "application/pdf")}, + ), + call().raise_for_status(), + call( + "https://api.unstructured.io/general/v0/general", + headers={"accept": "application/json", "unstructured-api-key": "test"}, + data={"strategy": "auto"}, + files={"files": ("filename", mock.ANY, "application/pdf")}, + ), + call().raise_for_status(), + ], + False, + [{"content": "test", "document_key": FILE_URI, "_ab_source_file_parse_error": None}], + 200, + id="retry_and_recover", + ), + pytest.param( + FileType.PDF, + UnstructuredFormat(skip_unprocessable_file_types=False, processing=APIProcessingConfigModel(mode="api", api_key="test")), + [ + Exception("Unexpected error"), + ], + "test", + [{"type": "Text", "text": "test"}], + [ + call( + "https://api.unstructured.io/general/v0/general", + headers={"accept": "application/json", "unstructured-api-key": "test"}, + data={"strategy": "auto"}, + files={"files": ("filename", mock.ANY, "application/pdf")}, + ), + call().raise_for_status(), + ], + True, + None, + 200, + id="no_retry_on_unexpected_error", + ), + pytest.param( + FileType.PDF, + UnstructuredFormat(skip_unprocessable_file_types=False, processing=APIProcessingConfigModel(mode="api", api_key="test")), + [ + requests.exceptions.RequestException("API error", response=MagicMock(status_code=400)), + ], + "test", + [{"type": "Text", "text": "test"}], + [ + call( + "https://api.unstructured.io/general/v0/general", + headers={"accept": "application/json", "unstructured-api-key": "test"}, + data={"strategy": "auto"}, + files={"files": ("filename", mock.ANY, "application/pdf")}, + ), + call().raise_for_status(), + ], + True, + None, + 400, + id="no_retry_on_400_error", + ), + pytest.param( + FileType.PDF, + UnstructuredFormat(skip_unprocessable_file_types=False, processing=APIProcessingConfigModel(mode="api", api_key="test")), + None, + "test", + [{"detail": "Something went wrong"}], + [ + call( + "https://api.unstructured.io/general/v0/general", + headers={"accept": "application/json", "unstructured-api-key": "test"}, + data={"strategy": "auto"}, + files={"files": ("filename", mock.ANY, "application/pdf")}, + ), + ], + False, + [ + { + "content": None, + "document_key": FILE_URI, + "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. 
Contact Support if you need assistance.\nfilename=path/to/file.xyz message=[{'detail': 'Something went wrong'}]", + } + ], + 422, + id="error_record_on_422_error", + ), + ], +) +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.requests") +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.detect_filetype") +@patch("time.sleep", side_effect=lambda _: None) +def test_parse_records_remotely( + time_mock, + mock_detect_filetype, + requests_mock, + filetype, + format_config, + raises_for_status, + file_content, + json_response, + expected_requests, + raises, + expected_records, + http_status_code, +): + stream_reader = MagicMock() + mock_open(stream_reader.open_file, read_data=bytes(str(file_content), "utf-8")) + fake_file = RemoteFile(uri=FILE_URI, last_modified=datetime.now()) + fake_file.uri = FILE_URI + logger = MagicMock() + config = MagicMock() + config.format = format_config + mock_detect_filetype.return_value = filetype + mock_response = MagicMock() + mock_response.json.return_value = json_response + mock_response.status_code = http_status_code + if raises_for_status: + mock_response.raise_for_status.side_effect = raises_for_status + requests_mock.post.return_value = mock_response + requests_mock.exceptions.RequestException = requests.exceptions.RequestException + + if raises: + with pytest.raises(AirbyteTracedException) as exc: + list(UnstructuredParser().parse_records(config, fake_file, stream_reader, logger, MagicMock())) + # Failures from the API are treated as config errors + assert exc.value.failure_type == FailureType.config_error + else: + assert list(UnstructuredParser().parse_records(config, fake_file, stream_reader, logger, MagicMock())) == expected_records + + if expected_requests: + requests_mock.post.assert_has_calls(expected_requests) + else: + requests_mock.post.assert_not_called() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/helpers.py b/airbyte-cdk/python/unit_tests/sources/file_based/helpers.py new file mode 100644 index 000000000000..6d4966e2c2c9 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/helpers.py @@ -0,0 +1,70 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
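# A minimal sketch of the retry behaviour the remote-processing tests above assert on: retry a
# POST a fixed number of times on requests.exceptions.RequestException, back off between
# attempts, and re-raise once the attempts are exhausted. The function and its parameters are
# illustrative stand-ins, not the UnstructuredParser's actual API. (The tests above also show
# the real parser treats 4xx responses as non-retryable; that refinement is omitted here.)
import time

import requests


def post_with_retries(url: str, max_attempts: int = 5, backoff_seconds: float = 1.0, **kwargs) -> requests.Response:
    last_error: Exception = RuntimeError("no attempts made")
    for attempt in range(max_attempts):
        try:
            response = requests.post(url, **kwargs)
            response.raise_for_status()  # HTTP errors surface as RequestException subclasses
            return response
        except requests.exceptions.RequestException as error:
            last_error = error
            time.sleep(backoff_seconds * (attempt + 1))  # simple linear backoff
    raise last_error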
+# + +import logging +from datetime import datetime +from io import IOBase +from typing import Any, Dict, List, Mapping, Optional + +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.discovery_policy import DefaultDiscoveryPolicy +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode +from airbyte_cdk.sources.file_based.file_types.csv_parser import CsvParser +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.file_types.jsonl_parser import JsonlParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy +from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedConcurrentCursor +from airbyte_cdk.sources.file_based.stream.cursor import DefaultFileBasedCursor +from unit_tests.sources.file_based.in_memory_files_source import InMemoryFilesStreamReader + + +class EmptySchemaParser(CsvParser): + async def infer_schema( + self, config: FileBasedStreamConfig, file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger + ) -> Dict[str, Any]: + return {} + + +class LowInferenceLimitDiscoveryPolicy(DefaultDiscoveryPolicy): + def get_max_n_files_for_schema_inference(self, parser: FileTypeParser) -> int: + return 1 + + +class LowInferenceBytesJsonlParser(JsonlParser): + MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1 + + +class TestErrorListMatchingFilesInMemoryFilesStreamReader(InMemoryFilesStreamReader): + def get_matching_files( + self, + globs: List[str], + from_date: Optional[datetime] = None, + ) -> List[RemoteFile]: + raise Exception("Error listing files") + + +class TestErrorOpenFileInMemoryFilesStreamReader(InMemoryFilesStreamReader): + def open_file(self, file: RemoteFile, file_read_mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase: + raise Exception("Error opening file") + + +class FailingSchemaValidationPolicy(AbstractSchemaValidationPolicy): + ALWAYS_FAIL = "always_fail" + validate_schema_before_sync = True + + def record_passes_validation_policy(self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]) -> bool: + return False + + +class LowHistoryLimitCursor(DefaultFileBasedCursor): + DEFAULT_MAX_HISTORY_SIZE = 3 + + +class LowHistoryLimitConcurrentCursor(FileBasedConcurrentCursor): + DEFAULT_MAX_HISTORY_SIZE = 3 + + +def make_remote_files(files: List[str]) -> List[RemoteFile]: + return [RemoteFile(uri=f, last_modified=datetime.strptime("2023-06-05T03:54:07.000Z", "%Y-%m-%dT%H:%M:%S.%fZ")) for f in files] diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/in_memory_files_source.py b/airbyte-cdk/python/unit_tests/sources/file_based/in_memory_files_source.py new file mode 100644 index 000000000000..bf3bb79671a8 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/in_memory_files_source.py @@ -0,0 +1,237 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
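# The helpers above mostly follow one pattern: subclass a production class and override a single
# class-level constant (or one method) so a limit is hit quickly in tests. A generic, hypothetical
# illustration of that pattern; Buffer and SmallBuffer are invented for this sketch and are not
# CDK classes.
import pytest


class Buffer:
    MAX_ITEMS = 10_000

    def __init__(self) -> None:
        self._items: list = []

    def add(self, item) -> None:
        if len(self._items) >= self.MAX_ITEMS:
            raise OverflowError("buffer full")
        self._items.append(item)


class SmallBuffer(Buffer):
    # Shrinking the constant lets a test exercise the overflow branch with three items
    # instead of ten thousand.
    MAX_ITEMS = 3


def test_overflow_is_raised_quickly() -> None:
    buffer = SmallBuffer()
    for item in range(3):
        buffer.add(item)
    with pytest.raises(OverflowError):
        buffer.add("one too many")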
+# + +import csv +import io +import json +import logging +import tempfile +from datetime import datetime +from io import IOBase +from typing import Any, Dict, Iterable, List, Mapping, Optional + +import avro.io as ai +import avro.schema as avro_schema +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +from airbyte_cdk.models import ConfiguredAirbyteCatalog, ConfiguredAirbyteCatalogSerializer +from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy, DefaultFileBasedAvailabilityStrategy +from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec +from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy +from airbyte_cdk.sources.file_based.file_based_source import FileBasedSource +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy +from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor, DefaultFileBasedCursor +from airbyte_cdk.sources.source import TState +from avro import datafile +from pydantic.v1 import AnyUrl + + +class InMemoryFilesSource(FileBasedSource): + _concurrency_level = 10 + + def __init__( + self, + files: Mapping[str, Any], + file_type: str, + availability_strategy: Optional[AbstractFileBasedAvailabilityStrategy], + discovery_policy: Optional[AbstractDiscoveryPolicy], + validation_policies: Mapping[str, AbstractSchemaValidationPolicy], + parsers: Mapping[str, FileTypeParser], + stream_reader: Optional[AbstractFileBasedStreamReader], + catalog: Optional[Mapping[str, Any]], + config: Optional[Mapping[str, Any]], + state: Optional[TState], + file_write_options: Mapping[str, Any], + cursor_cls: Optional[AbstractFileBasedCursor], + ): + # Attributes required for test purposes + self.files = files + self.file_type = file_type + self.catalog = catalog + self.configured_catalog = ConfiguredAirbyteCatalogSerializer.load(self.catalog) if self.catalog else None + self.config = config + self.state = state + + # Source setup + stream_reader = stream_reader or InMemoryFilesStreamReader(files=files, file_type=file_type, file_write_options=file_write_options) + availability_strategy = availability_strategy or DefaultFileBasedAvailabilityStrategy(stream_reader) # type: ignore[assignment] + super().__init__( + stream_reader, + spec_class=InMemorySpec, + catalog=self.configured_catalog, + config=self.config, + state=self.state, + availability_strategy=availability_strategy, + discovery_policy=discovery_policy or DefaultDiscoveryPolicy(), + parsers=parsers, + validation_policies=validation_policies or DEFAULT_SCHEMA_VALIDATION_POLICIES, + cursor_cls=cursor_cls or DefaultFileBasedCursor, + ) + + def read_catalog(self, catalog_path: str) -> ConfiguredAirbyteCatalog: + return self.configured_catalog + + +class InMemoryFilesStreamReader(AbstractFileBasedStreamReader): + def __init__(self, files: Mapping[str, Mapping[str, Any]], file_type: str, file_write_options: Optional[Mapping[str, Any]] = None): + self.files = files + self.file_type = file_type + self.file_write_options = file_write_options + super().__init__() + + @property + def config(self) -> 
Optional[AbstractFileBasedSpec]: + return self._config + + @config.setter + def config(self, value: AbstractFileBasedSpec) -> None: + self._config = value + + def get_matching_files( + self, + globs: List[str], + prefix: Optional[str], + logger: logging.Logger, + ) -> Iterable[RemoteFile]: + yield from self.filter_files_by_globs_and_start_date( + [ + RemoteFile( + uri=f, + mime_type=data.get("mime_type", None), + last_modified=datetime.strptime(data["last_modified"], "%Y-%m-%dT%H:%M:%S.%fZ"), + ) + for f, data in self.files.items() + ], + globs, + ) + + def file_size(self, file: RemoteFile) -> int: + return 0 + + def get_file(self, file: RemoteFile, local_directory: str, logger: logging.Logger) -> Dict[str, Any]: + return {} + + def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase: + if self.file_type == "csv": + return self._make_csv_file_contents(file.uri) + elif self.file_type == "jsonl": + return self._make_jsonl_file_contents(file.uri) + elif self.file_type == "unstructured": + return self._make_binary_file_contents(file.uri) + else: + raise NotImplementedError(f"No implementation for file type: {self.file_type}") + + def _make_csv_file_contents(self, file_name: str) -> IOBase: + + # Some tests define the csv as an array of strings to make it easier to validate the handling + # of quotes, delimiter, and escpare chars. + if isinstance(self.files[file_name]["contents"][0], str): + return io.StringIO("\n".join([s.strip() for s in self.files[file_name]["contents"]])) + + fh = io.StringIO() + + if self.file_write_options: + csv.register_dialect("in_memory_dialect", **self.file_write_options) + writer = csv.writer(fh, dialect="in_memory_dialect") + writer.writerows(self.files[file_name]["contents"]) + csv.unregister_dialect("in_memory_dialect") + else: + writer = csv.writer(fh) + writer.writerows(self.files[file_name]["contents"]) + fh.seek(0) + return fh + + def _make_jsonl_file_contents(self, file_name: str) -> IOBase: + fh = io.BytesIO() + + for line in self.files[file_name]["contents"]: + try: + fh.write((json.dumps(line) + "\n").encode("utf-8")) + except TypeError: + # Intentionally trigger json validation error + fh.write((str(line) + "\n").encode("utf-8")) + fh.seek(0) + return fh + + def _make_binary_file_contents(self, file_name: str) -> IOBase: + fh = io.BytesIO() + + fh.write(self.files[file_name]["contents"]) + fh.seek(0) + return fh + + +class InMemorySpec(AbstractFileBasedSpec): + @classmethod + def documentation_url(cls) -> AnyUrl: + return AnyUrl(scheme="https", url="https://docs.airbyte.com/integrations/sources/in_memory_files") # type: ignore + + +class TemporaryParquetFilesStreamReader(InMemoryFilesStreamReader): + """ + A file reader that writes RemoteFiles to a temporary file and then reads them back. + """ + + def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase: + return io.BytesIO(self._create_file(file.uri)) + + def _create_file(self, file_name: str) -> bytes: + contents = self.files[file_name]["contents"] + schema = self.files[file_name].get("schema") + + df = pd.DataFrame(contents[1:], columns=contents[0]) + with tempfile.TemporaryFile() as fp: + table = pa.Table.from_pandas(df, schema) + pq.write_table(table, fp) + + fp.seek(0) + return fp.read() + + +class TemporaryAvroFilesStreamReader(InMemoryFilesStreamReader): + """ + A file reader that writes RemoteFiles to a temporary file and then reads them back. 
+ """ + + def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase: + return io.BytesIO(self._make_file_contents(file.uri)) + + def _make_file_contents(self, file_name: str) -> bytes: + contents = self.files[file_name]["contents"] + schema = self.files[file_name]["schema"] + stream_schema = avro_schema.make_avsc_object(schema) + + rec_writer = ai.DatumWriter(stream_schema) + with tempfile.TemporaryFile() as fp: + file_writer = datafile.DataFileWriter(fp, rec_writer, stream_schema) + for content in contents: + data = {col["name"]: content[i] for i, col in enumerate(schema["fields"])} + file_writer.append(data) + file_writer.flush() + fp.seek(0) + return fp.read() + + +class TemporaryExcelFilesStreamReader(InMemoryFilesStreamReader): + """ + A file reader that writes RemoteFiles to a temporary file and then reads them back. + """ + + def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase: + return io.BytesIO(self._make_file_contents(file.uri)) + + def _make_file_contents(self, file_name: str) -> bytes: + contents = self.files[file_name]["contents"] + df = pd.DataFrame(contents) + + with io.BytesIO() as fp: + writer = pd.ExcelWriter(fp, engine="xlsxwriter") + df.to_excel(writer, index=False, sheet_name="Sheet1") + writer._save() + fp.seek(0) + return fp.read() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/__init__.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/avro_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/avro_scenarios.py new file mode 100644 index 000000000000..7b891a168f7a --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/avro_scenarios.py @@ -0,0 +1,750 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import datetime +import decimal + +from unit_tests.sources.file_based.in_memory_files_source import TemporaryAvroFilesStreamReader +from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder +from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder + +_single_avro_file = { + "a.avro": { + "schema": { + "type": "record", + "name": "sampleAvro", + "fields": [ + {"name": "col1", "type": "string"}, + {"name": "col2", "type": "int"}, + ], + }, + "contents": [ + ("val11", 12), + ("val21", 22), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } +} + +_multiple_avro_combine_schema_file = { + "a.avro": { + "schema": { + "type": "record", + "name": "sampleAvro", + "fields": [ + {"name": "col_double", "type": "double"}, + {"name": "col_string", "type": "string"}, + {"name": "col_album", "type": {"type": "record", "name": "Album", "fields": [{"name": "album", "type": "string"}]}}, + ], + }, + "contents": [ + (20.02, "Robbers", {"album": "The 1975"}), + (20.23, "Somebody Else", {"album": "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It"}), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.avro": { + "schema": { + "type": "record", + "name": "sampleAvro", + "fields": [ + {"name": "col_double", "type": "double"}, + {"name": "col_string", "type": "string"}, + {"name": "col_song", "type": {"type": "record", "name": "Song", "fields": [{"name": "title", "type": "string"}]}}, + ], + }, + "contents": [ + (1975.1975, "It's Not Living (If It's Not with You)", {"title": "Love It If We Made It"}), + (5791.5791, "The 1975", {"title": "About You"}), + ], + "last_modified": "2023-06-06T03:54:07.000Z", + }, +} + +_avro_all_types_file = { + "a.avro": { + "schema": { + "type": "record", + "name": "sampleAvro", + "fields": [ + # Primitive Types + {"name": "col_bool", "type": "boolean"}, + {"name": "col_int", "type": "int"}, + {"name": "col_long", "type": "long"}, + {"name": "col_float", "type": "float"}, + {"name": "col_double", "type": "double"}, + {"name": "col_bytes", "type": "bytes"}, + {"name": "col_string", "type": "string"}, + # Complex Types + { + "name": "col_record", + "type": { + "type": "record", + "name": "SongRecord", + "fields": [ + {"name": "artist", "type": "string"}, + {"name": "song", "type": "string"}, + {"name": "year", "type": "int"}, + ], + }, + }, + {"name": "col_enum", "type": {"type": "enum", "name": "Genre", "symbols": ["POP_ROCK", "INDIE_ROCK", "ALTERNATIVE_ROCK"]}}, + {"name": "col_array", "type": {"type": "array", "items": "string"}}, + {"name": "col_map", "type": {"type": "map", "values": "string"}}, + {"name": "col_fixed", "type": {"type": "fixed", "name": "MyFixed", "size": 4}}, + # Logical Types + {"name": "col_decimal", "type": {"type": "bytes", "logicalType": "decimal", "precision": 10, "scale": 5}}, + {"name": "col_uuid", "type": {"type": "string", "logicalType": "uuid"}}, + {"name": "col_date", "type": {"type": "int", "logicalType": "date"}}, + {"name": "col_time_millis", "type": {"type": "int", "logicalType": "time-millis"}}, + {"name": "col_time_micros", "type": {"type": "long", "logicalType": "time-micros"}}, + {"name": "col_timestamp_millis", "type": {"type": "long", "logicalType": "timestamp-millis"}}, + {"name": "col_timestamp_micros", "type": {"type": "long", "logicalType": "timestamp-micros"}}, + ], + }, + "contents": [ + ( + True, + 27, + 1992, + 999.09723456, + 9123456.12394, + b"\x00\x01\x02\x03", + "Love It If We Made It", + {"artist": "The 1975", 
"song": "About You", "year": 2022}, + "POP_ROCK", + [ + "The 1975", + "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It", + "The 1975 A Brief Inquiry into Online Relationships", + "Notes on a Conditional Form", + "Being Funny in a Foreign Language", + ], + {"lead_singer": "Matty Healy", "lead_guitar": "Adam Hann", "bass_guitar": "Ross MacDonald", "drummer": "George Daniel"}, + b"\x12\x34\x56\x78", + decimal.Decimal("1234.56789"), + "123e4567-e89b-12d3-a456-426655440000", + datetime.date(2022, 5, 29), + datetime.time(6, 0, 0, 456000), + datetime.time(12, 0, 0, 456789), + datetime.datetime(2022, 5, 29, 0, 0, 0, 456000, tzinfo=datetime.timezone.utc), + datetime.datetime(2022, 5, 30, 0, 0, 0, 456789, tzinfo=datetime.timezone.utc), + ), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } +} + +_multiple_avro_stream_file = { + "odesza_songs.avro": { + "schema": { + "type": "record", + "name": "sampleAvro", + "fields": [ + {"name": "col_title", "type": "string"}, + { + "name": "col_album", + "type": { + "type": "enum", + "name": "Album", + "symbols": ["SUMMERS_GONE", "IN_RETURN", "A_MOMENT_APART", "THE_LAST_GOODBYE"], + }, + }, + {"name": "col_year", "type": "int"}, + {"name": "col_vocals", "type": "boolean"}, + ], + }, + "contents": [ + ("Late Night", "A_MOMENT_APART", 2017, False), + ("White Lies", "IN_RETURN", 2014, True), + ("Wide Awake", "THE_LAST_GOODBYE", 2022, True), + ("Sun Models", "SUMMERS_GONE", 2012, True), + ("All We Need", "IN_RETURN", 2014, True), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "california_festivals.avro": { + "schema": { + "type": "record", + "name": "sampleAvro", + "fields": [ + {"name": "col_name", "type": "string"}, + { + "name": "col_location", + "type": { + "type": "record", + "name": "LocationRecord", + "fields": [ + {"name": "country", "type": "string"}, + {"name": "state", "type": "string"}, + {"name": "city", "type": "string"}, + ], + }, + }, + {"name": "col_attendance", "type": "long"}, + ], + }, + "contents": [ + ("Coachella", {"country": "USA", "state": "California", "city": "Indio"}, 250000), + ("CRSSD", {"country": "USA", "state": "California", "city": "San Diego"}, 30000), + ("Lightning in a Bottle", {"country": "USA", "state": "California", "city": "Buena Vista Lake"}, 18000), + ("Outside Lands", {"country": "USA", "state": "California", "city": "San Francisco"}, 220000), + ], + "last_modified": "2023-06-06T03:54:07.000Z", + }, +} + +single_avro_scenario = ( + TestScenarioBuilder() + .set_name("single_avro_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "avro"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryAvroFilesStreamReader(files=_single_avro_file, file_type="avro")) + .set_file_type("avro") + ) + .set_expected_check_status("SUCCEEDED") + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": 12, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.avro", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": 22, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.avro", + }, + "stream": "stream1", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": 
["null", "integer"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +).build() + +multiple_avro_combine_schema_scenario = ( + TestScenarioBuilder() + .set_name("multiple_avro_combine_schema_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "avro"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryAvroFilesStreamReader(files=_multiple_avro_combine_schema_file, file_type="avro")) + .set_file_type("avro") + ) + .set_expected_records( + [ + { + "data": { + "col_double": 20.02, + "col_string": "Robbers", + "col_album": {"album": "The 1975"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.avro", + }, + "stream": "stream1", + }, + { + "data": { + "col_double": 20.23, + "col_string": "Somebody Else", + "col_album": {"album": "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.avro", + }, + "stream": "stream1", + }, + { + "data": { + "col_double": 1975.1975, + "col_string": "It's Not Living (If It's Not with You)", + "col_song": {"title": "Love It If We Made It"}, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.avro", + }, + "stream": "stream1", + }, + { + "data": { + "col_double": 5791.5791, + "col_string": "The 1975", + "col_song": {"title": "About You"}, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.avro", + }, + "stream": "stream1", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col_double": {"type": ["null", "number"]}, + "col_string": {"type": ["null", "string"]}, + "col_album": { + "properties": { + "album": {"type": ["null", "string"]}, + }, + "type": ["null", "object"], + }, + "col_song": { + "properties": { + "title": {"type": ["null", "string"]}, + }, + "type": ["null", "object"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +).build() + +avro_all_types_scenario = ( + TestScenarioBuilder() + .set_name("avro_all_types_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "avro"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryAvroFilesStreamReader(files=_avro_all_types_file, file_type="avro")) + .set_file_type("avro") + ) + .set_expected_records( + [ + { + "data": { + "col_bool": True, + "col_int": 27, + "col_long": 1992, + "col_float": 999.09723456, + "col_double": 9123456.12394, + "col_bytes": "\x00\x01\x02\x03", + "col_string": "Love It If We Made It", + "col_record": {"artist": "The 1975", "song": "About You", "year": 2022}, + "col_enum": "POP_ROCK", + "col_array": [ + "The 1975", + "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It", + "The 1975 A Brief 
Inquiry into Online Relationships", + "Notes on a Conditional Form", + "Being Funny in a Foreign Language", + ], + "col_map": { + "lead_singer": "Matty Healy", + "lead_guitar": "Adam Hann", + "bass_guitar": "Ross MacDonald", + "drummer": "George Daniel", + }, + "col_fixed": "\x12\x34\x56\x78", + "col_decimal": "1234.56789", + "col_uuid": "123e4567-e89b-12d3-a456-426655440000", + "col_date": "2022-05-29", + "col_time_millis": "06:00:00.456000", + "col_time_micros": "12:00:00.456789", + "col_timestamp_millis": "2022-05-29T00:00:00.456+00:00", + "col_timestamp_micros": "2022-05-30T00:00:00.456789+00:00", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.avro", + }, + "stream": "stream1", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col_array": {"items": {"type": ["null", "string"]}, "type": ["null", "array"]}, + "col_bool": {"type": ["null", "boolean"]}, + "col_bytes": {"type": ["null", "string"]}, + "col_double": {"type": ["null", "number"]}, + "col_enum": {"enum": ["POP_ROCK", "INDIE_ROCK", "ALTERNATIVE_ROCK"], "type": ["null", "string"]}, + "col_fixed": {"pattern": "^[0-9A-Fa-f]{8}$", "type": ["null", "string"]}, + "col_float": {"type": ["null", "number"]}, + "col_int": {"type": ["null", "integer"]}, + "col_long": {"type": ["null", "integer"]}, + "col_map": {"additionalProperties": {"type": ["null", "string"]}, "type": ["null", "object"]}, + "col_record": { + "properties": { + "artist": {"type": ["null", "string"]}, + "song": {"type": ["null", "string"]}, + "year": {"type": ["null", "integer"]}, + }, + "type": ["null", "object"], + }, + "col_string": {"type": ["null", "string"]}, + "col_decimal": {"pattern": "^-?\\d{(1, 5)}(?:\\.\\d(1, 5))?$", "type": ["null", "string"]}, + "col_uuid": {"type": ["null", "string"]}, + "col_date": {"format": "date", "type": ["null", "string"]}, + "col_time_millis": {"type": ["null", "integer"]}, + "col_time_micros": {"type": ["null", "integer"]}, + "col_timestamp_millis": {"format": "date-time", "type": ["null", "string"]}, + "col_timestamp_micros": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +).build() + +multiple_streams_avro_scenario = ( + TestScenarioBuilder() + .set_name("multiple_streams_avro_stream") + .set_config( + { + "streams": [ + { + "name": "songs_stream", + "format": {"filetype": "avro"}, + "globs": ["*_songs.avro"], + "validation_policy": "Emit Record", + }, + { + "name": "festivals_stream", + "format": {"filetype": "avro"}, + "globs": ["*_festivals.avro"], + "validation_policy": "Emit Record", + }, + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryAvroFilesStreamReader(files=_multiple_avro_stream_file, file_type="avro")) + .set_file_type("avro") + ) + .set_expected_records( + [ + { + "data": { + "col_title": "Late Night", + "col_album": "A_MOMENT_APART", + "col_year": 2017, + "col_vocals": False, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "odesza_songs.avro", + }, + "stream": "songs_stream", + }, + { + "data": { + "col_title": "White Lies", + "col_album": "IN_RETURN", + "col_year": 2014, + "col_vocals": True, + 
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "odesza_songs.avro", + }, + "stream": "songs_stream", + }, + { + "data": { + "col_title": "Wide Awake", + "col_album": "THE_LAST_GOODBYE", + "col_year": 2022, + "col_vocals": True, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "odesza_songs.avro", + }, + "stream": "songs_stream", + }, + { + "data": { + "col_title": "Sun Models", + "col_album": "SUMMERS_GONE", + "col_year": 2012, + "col_vocals": True, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "odesza_songs.avro", + }, + "stream": "songs_stream", + }, + { + "data": { + "col_title": "All We Need", + "col_album": "IN_RETURN", + "col_year": 2014, + "col_vocals": True, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "odesza_songs.avro", + }, + "stream": "songs_stream", + }, + { + "data": { + "col_name": "Coachella", + "col_location": {"country": "USA", "state": "California", "city": "Indio"}, + "col_attendance": 250000, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "california_festivals.avro", + }, + "stream": "festivals_stream", + }, + { + "data": { + "col_name": "CRSSD", + "col_location": {"country": "USA", "state": "California", "city": "San Diego"}, + "col_attendance": 30000, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "california_festivals.avro", + }, + "stream": "festivals_stream", + }, + { + "data": { + "col_name": "Lightning in a Bottle", + "col_location": {"country": "USA", "state": "California", "city": "Buena Vista Lake"}, + "col_attendance": 18000, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "california_festivals.avro", + }, + "stream": "festivals_stream", + }, + { + "data": { + "col_name": "Outside Lands", + "col_location": {"country": "USA", "state": "California", "city": "San Francisco"}, + "col_attendance": 220000, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "california_festivals.avro", + }, + "stream": "festivals_stream", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col_title": {"type": ["null", "string"]}, + "col_album": { + "type": ["null", "string"], + "enum": ["SUMMERS_GONE", "IN_RETURN", "A_MOMENT_APART", "THE_LAST_GOODBYE"], + }, + "col_year": {"type": ["null", "integer"]}, + "col_vocals": {"type": ["null", "boolean"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "songs_stream", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col_name": {"type": ["null", "string"]}, + "col_location": { + "properties": { + "country": {"type": ["null", "string"]}, + "state": {"type": ["null", "string"]}, + "city": {"type": ["null", "string"]}, + }, + "type": ["null", "object"], + }, + "col_attendance": {"type": ["null", "integer"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "festivals_stream", + "source_defined_cursor": True, + "supported_sync_modes": 
["full_refresh", "incremental"], + "is_resumable": True, + }, + ] + } + ) +).build() + +avro_file_with_double_as_number_scenario = ( + TestScenarioBuilder() + .set_name("avro_file_with_double_as_number_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": {"filetype": "avro", "double_as_string": False}, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryAvroFilesStreamReader(files=_multiple_avro_combine_schema_file, file_type="avro")) + .set_file_type("avro") + ) + .set_expected_records( + [ + { + "data": { + "col_double": 20.02, + "col_string": "Robbers", + "col_album": {"album": "The 1975"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.avro", + }, + "stream": "stream1", + }, + { + "data": { + "col_double": 20.23, + "col_string": "Somebody Else", + "col_album": {"album": "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.avro", + }, + "stream": "stream1", + }, + { + "data": { + "col_double": 1975.1975, + "col_string": "It's Not Living (If It's Not with You)", + "col_song": {"title": "Love It If We Made It"}, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.avro", + }, + "stream": "stream1", + }, + { + "data": { + "col_double": 5791.5791, + "col_string": "The 1975", + "col_song": {"title": "About You"}, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.avro", + }, + "stream": "stream1", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col_double": {"type": ["null", "number"]}, + "col_string": {"type": ["null", "string"]}, + "col_album": { + "properties": { + "album": {"type": ["null", "string"]}, + }, + "type": ["null", "object"], + }, + "col_song": { + "properties": { + "title": {"type": ["null", "string"]}, + }, + "type": ["null", "object"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +).build() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/check_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/check_scenarios.py new file mode 100644 index 000000000000..26136d9cf025 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/check_scenarios.py @@ -0,0 +1,220 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError +from unit_tests.sources.file_based.helpers import ( + FailingSchemaValidationPolicy, + TestErrorListMatchingFilesInMemoryFilesStreamReader, + TestErrorOpenFileInMemoryFilesStreamReader, +) +from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder +from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder + +_base_success_scenario = ( + TestScenarioBuilder() + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_check_status("SUCCEEDED") +) + + +success_csv_scenario = (_base_success_scenario.copy().set_name("success_csv_scenario")).build() + + +success_multi_stream_scenario = ( + _base_success_scenario.copy() + .set_name("success_multi_stream_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv", "*.gz"], + "validation_policy": "Emit Record", + }, + { + "name": "stream2", + "format": {"filetype": "csv"}, + "globs": ["*.csv", "*.gz"], + "validation_policy": "Emit Record", + }, + ] + } + ) +).build() + + +success_extensionless_scenario = ( + _base_success_scenario.copy() + .set_name("success_extensionless_file_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + _base_success_scenario.source_builder.copy().set_files( + { + "a": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + ) +).build() + + +success_user_provided_schema_scenario = ( + _base_success_scenario.copy() + .set_name("success_user_provided_schema_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "string", "col2": "string"}', + } + ], + } + ) +).build() + + +_base_failure_scenario = _base_success_scenario.copy().set_expected_check_status("FAILED") + + +error_empty_stream_scenario = ( + _base_failure_scenario.copy() + .set_name("error_empty_stream_scenario") + .set_source_builder(_base_failure_scenario.copy().source_builder.copy().set_files({})) + .set_expected_check_error(None, FileBasedSourceError.EMPTY_STREAM.value) +).build() + + +error_listing_files_scenario = ( + _base_failure_scenario.copy() + .set_name("error_listing_files_scenario") + .set_source_builder( + _base_failure_scenario.source_builder.copy().set_stream_reader( + TestErrorListMatchingFilesInMemoryFilesStreamReader(files=_base_failure_scenario.source_builder._files, file_type="csv") + ) + ) + .set_expected_check_error(None, FileBasedSourceError.ERROR_LISTING_FILES.value) +).build() + + +error_reading_file_scenario = ( + _base_failure_scenario.copy() + .set_name("error_reading_file_scenario") + .set_source_builder( + _base_failure_scenario.source_builder.copy().set_stream_reader( + TestErrorOpenFileInMemoryFilesStreamReader(files=_base_failure_scenario.source_builder._files, file_type="csv") 
+ ) + ) + .set_expected_check_error(None, FileBasedSourceError.ERROR_READING_FILE.value) +).build() + + +error_record_validation_user_provided_schema_scenario = ( + _base_failure_scenario.copy() + .set_name("error_record_validation_user_provided_schema_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "always_fail", + "input_schema": '{"col1": "number", "col2": "string"}', + } + ], + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + .set_validation_policies({FailingSchemaValidationPolicy.ALWAYS_FAIL: FailingSchemaValidationPolicy()}) + ) + .set_expected_check_error(None, FileBasedSourceError.ERROR_VALIDATING_RECORD.value) +).build() + + +error_multi_stream_scenario = ( + _base_failure_scenario.copy() + .set_name("error_multi_stream_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + }, + { + "name": "stream2", + "format": {"filetype": "jsonl"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + }, + ], + } + ) + .set_expected_check_error(None, FileBasedSourceError.ERROR_READING_FILE.value) +).build() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py new file mode 100644 index 000000000000..e5a7ee419452 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py @@ -0,0 +1,2865 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
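The concurrent incremental scenarios that follow all hinge on one state shape: a `history` map of file name to last-modified timestamp plus an `_ab_source_file_last_modified` cursor of the form `<timestamp>_<filename>`, supplied through `StateBuilder` and echoed back in the expected records as history snapshots. A small sketch of how that input state is assembled, using values copied from the first scenario below:

```python
from airbyte_cdk.test.state_builder import StateBuilder

# The cursor value pairs a last-modified timestamp with a file name and matches an entry in the
# history map.
stream_state = {
    "history": {"some_old_file.csv": "2023-06-01T03:54:07.000000Z"},
    "_ab_source_file_last_modified": "2023-06-01T03:54:07.000000Z_some_old_file.csv",
}

# StateBuilder wraps the per-stream dict into the state-message list that the scenarios pass as
# IncrementalScenarioConfig(input_state=...); scenarios with no prior state pass an empty list instead.
input_state = StateBuilder().with_stream_state("stream1", stream_state).build()
```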
+# + +from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedConcurrentCursor +from airbyte_cdk.test.state_builder import StateBuilder +from unit_tests.sources.file_based.helpers import LowHistoryLimitConcurrentCursor +from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder +from unit_tests.sources.file_based.scenarios.scenario_builder import IncrementalScenarioConfig, TestScenarioBuilder + +single_csv_input_state_is_earlier_scenario_concurrent = ( + TestScenarioBuilder() + .set_name("single_csv_input_state_is_earlier_concurrent") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + .set_cursor_cls(FileBasedConcurrentCursor) + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": {"some_old_file.csv": "2023-06-01T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-01T03:54:07.000000Z_some_old_file.csv", + }, + ) + .build(), + ) + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": {"some_old_file.csv": "2023-06-01T03:54:07.000000Z", "a.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_a.csv", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + } + ] + } + ) +).build() + +single_csv_file_is_skipped_if_same_modified_at_as_in_history_concurrent = ( + TestScenarioBuilder() + .set_name("single_csv_file_is_skipped_if_same_modified_at_as_in_history_concurrent") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + .set_cursor_cls(FileBasedConcurrentCursor) + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_a.csv", + }, + ) + .build(), + ) + ) + .set_expected_records( + [ + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z"}, + 
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_a.csv", + } + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + } + ] + } + ) +).build() + +single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history_concurrent = ( + TestScenarioBuilder() + .set_name("single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history_concurrent") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + .set_cursor_cls(FileBasedConcurrentCursor) + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": {"a.csv": "2023-06-01T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-01T03:54:07.000000Z_a.csv", + }, + ) + .build(), + ) + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_a.csv", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + } + ] + } + ) +).build() + +single_csv_no_input_state_scenario_concurrent = ( + TestScenarioBuilder() + .set_name("single_csv_input_state_is_earlier_again_concurrent") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + } + } + ) + .set_file_type("csv") + .set_cursor_cls(FileBasedConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "json_schema": { + "type": "object", + 
"properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_a.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=[], + ) + ) +).build() + +multi_csv_same_timestamp_scenario_concurrent = ( + TestScenarioBuilder() + .set_name("multi_csv_same_timestamp_concurrent") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(FileBasedConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z", "b.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_b.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=[], + ) + ) +).build() + +single_csv_input_state_is_later_scenario_concurrent = ( + TestScenarioBuilder() + .set_name("single_csv_input_state_is_later_concurrent") + .set_config( + { + "streams": [ + { + 
"name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + } + } + ) + .set_file_type("csv") + .set_cursor_cls(FileBasedConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": { + "recent_file.csv": "2023-07-15T23:59:59.000000Z", + "a.csv": "2023-06-05T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-07-15T23:59:59.000000Z_recent_file.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": {"recent_file.csv": "2023-07-15T23:59:59.000000Z"}, + "_ab_source_file_last_modified": "2023-07-15T23:59:59.000000Z_recent_file.csv", + }, + ) + .build(), + ) + ) +).build() + +multi_csv_different_timestamps_scenario_concurrent = ( + TestScenarioBuilder() + .set_name("multi_csv_stream_different_timestamps_concurrent") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-04T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(FileBasedConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-04T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + 
"_ab_source_file_last_modified": "2023-06-04T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": { + "a.csv": "2023-06-04T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-04T03:54:07.000000Z_a.csv", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": {"a.csv": "2023-06-04T03:54:07.000000Z", "b.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_b.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=[], + ) + ) +).build() + +multi_csv_per_timestamp_scenario_concurrent = ( + TestScenarioBuilder() + .set_name("multi_csv_per_timestamp_concurrent") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(FileBasedConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z", "b.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_b.csv", + }, + { + "data": { + "col1": "val11c", 
+ "col2": "val12c", + "col3": "val13c", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21c", + "col2": "val22c", + "col3": "val23c", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "history": { + "a.csv": "2023-06-05T03:54:07.000000Z", + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-06T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_c.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=[], + ) + ) +).build() + +multi_csv_skip_file_if_already_in_history_concurrent = ( + TestScenarioBuilder() + .set_name("skip_files_already_in_history_concurrent") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(FileBasedConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + # {"data": {"col1": "val11a", "col2": "val12a"}, "stream": "stream1"}, # this file is skipped + # {"data": {"col1": "val21a", "col2": "val22a"}, "stream": "stream1"}, # this file is skipped + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z", "b.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_b.csv", + }, + { + "data": { + "col1": "val11c", + "col2": "val12c", + "col3": "val13c", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21c", + "col2": "val22c", + "col3": "val23c", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "history": { + "a.csv": 
"2023-06-05T03:54:07.000000Z", + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-06T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_c.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + {"history": {"a.csv": "2023-06-05T03:54:07.000000Z"}, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_a.csv"}, + ) + .build(), + ) + ) +).build() + +multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_newer = ( + TestScenarioBuilder() + .set_name("multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_newer") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(FileBasedConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + # {"data": {"col1": "val11a", "col2": "val12a"}, "stream": "stream1"}, # this file is skipped + # {"data": {"col1": "val21a", "col2": "val22a"}, "stream": "stream1"}, # this file is skipped + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + # {"data": {"col1": "val11c", "col2": "val12c", "col3": "val13c"}, "stream": "stream1"}, # this file is skipped + # {"data": {"col1": "val21c", "col2": "val22c", "col3": "val23c"}, "stream": "stream1"}, # this file is skipped + { + "history": { + "a.csv": "2023-06-05T03:54:07.000000Z", + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-06T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_c.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z", "c.csv": "2023-06-06T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_c.csv", + }, + ) + .build(), + ) + ) 
+).build() + +multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_older = ( + TestScenarioBuilder() + .set_name("multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_older") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(FileBasedConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + # {"data": {"col1": "val11a", "col2": "val12a"}, "stream": "stream1"}, # this file is skipped + # {"data": {"col1": "val21a", "col2": "val22a"}, "stream": "stream1"}, # this file is skipped + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + # {"data": {"col1": "val11c", "col2": "val12c", "col3": "val13c"}, "stream": "stream1"}, # this file is skipped + # {"data": {"col1": "val21c", "col2": "val22c", "col3": "val23c"}, "stream": "stream1"}, # this file is skipped + { + "history": { + "a.csv": "2023-06-05T03:54:07.000000Z", + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-06T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_c.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z", "c.csv": "2023-06-06T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-03T03:54:07.000000Z_x.csv", + }, + ) + .build() + ) + ) +).build() + +multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_newer = ( + TestScenarioBuilder() + .set_name("multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_newer") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + 
("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-07T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-10T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(LowHistoryLimitConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": { + "very_old_file.csv": "2023-06-02T03:54:07.000000Z", + "old_file_same_timestamp_as_a.csv": "2023-06-06T03:54:07.000000Z", + "a.csv": "2023-06-06T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_old_file_same_timestamp_as_a.csv", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-07T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-07T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": { + "old_file_same_timestamp_as_a.csv": "2023-06-06T03:54:07.000000Z", + "a.csv": "2023-06-06T03:54:07.000000Z", + "b.csv": "2023-06-07T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-07T03:54:07.000000Z_b.csv", + }, + { + "data": { + "col1": "val11c", + "col2": "val12c", + "col3": "val13c", + "_ab_source_file_last_modified": "2023-06-10T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21c", + "col2": "val22c", + "col3": "val23c", + "_ab_source_file_last_modified": "2023-06-10T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "history": { + "old_file_same_timestamp_as_a.csv": "2023-06-06T03:54:07.000000Z", + "b.csv": "2023-06-07T03:54:07.000000Z", + "c.csv": "2023-06-10T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-10T03:54:07.000000Z_c.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": { + "very_very_old_file.csv": "2023-06-01T03:54:07.000000Z", + "very_old_file.csv": "2023-06-02T03:54:07.000000Z", + "old_file_same_timestamp_as_a.csv": "2023-06-06T03:54:07.000000Z", + }, + 
"_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_old_file_same_timestamp_as_a.csv", + }, + ) + .build(), + ) + ) +).build() + +multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_older = ( + TestScenarioBuilder() + .set_name("multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_older") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-07T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-10T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(LowHistoryLimitConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": { + "very_old_file.csv": "2023-06-02T03:54:07.000000Z", + "old_file_same_timestamp_as_a.csv": "2023-06-06T03:54:07.000000Z", + "a.csv": "2023-06-06T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_old_file_same_timestamp_as_a.csv", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-07T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-07T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": { + "old_file_same_timestamp_as_a.csv": "2023-06-06T03:54:07.000000Z", + "a.csv": "2023-06-06T03:54:07.000000Z", + "b.csv": "2023-06-07T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-07T03:54:07.000000Z_b.csv", + }, + { + "data": { + "col1": "val11c", + "col2": "val12c", + "col3": "val13c", + "_ab_source_file_last_modified": "2023-06-10T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21c", + "col2": "val22c", + "col3": "val23c", + "_ab_source_file_last_modified": "2023-06-10T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + 
}, + { + "history": { + "old_file_same_timestamp_as_a.csv": "2023-06-06T03:54:07.000000Z", + "b.csv": "2023-06-07T03:54:07.000000Z", + "c.csv": "2023-06-10T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-10T03:54:07.000000Z_c.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": { + "very_very_old_file.csv": "2023-06-01T03:54:07.000000Z", + "very_old_file.csv": "2023-06-02T03:54:07.000000Z", + "old_file_same_timestamp_as_a.csv": "2023-06-06T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-05-01T03:54:07.000000Z_very_very_very_old_file.csv", + }, + ) + .build(), + ) + ) +).build() + +multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_newer = ( + TestScenarioBuilder() + .set_name("multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_newer") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + "days_to_sync_if_history_is_full": 3, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "d.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11d", "val12d", "val13d"), + ("val21d", "val22d", "val23d"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(LowHistoryLimitConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11c", + "col2": "val12c", + "col3": "val13c", + "_ab_source_file_last_modified": 
"2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21c", + "col2": "val22c", + "col3": "val23c", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11d", + "col2": "val12d", + "col3": "val13d", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "d.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21d", + "col2": "val22d", + "col3": "val23d", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "d.csv", + }, + "stream": "stream1", + }, + { + "history": { + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-05T03:54:07.000000Z", + "d.csv": "2023-06-05T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_d.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=[], + ) + ) +).build() + +multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_older = ( + TestScenarioBuilder() + .set_name("multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_older") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + "days_to_sync_if_history_is_full": 3, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "d.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11d", "val12d", "val13d"), + ("val21d", "val22d", "val23d"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(LowHistoryLimitConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + 
"data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11c", + "col2": "val12c", + "col3": "val13c", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21c", + "col2": "val22c", + "col3": "val23c", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11d", + "col2": "val12d", + "col3": "val13d", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "d.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21d", + "col2": "val22d", + "col3": "val23d", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "d.csv", + }, + "stream": "stream1", + }, + { + "history": { + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-05T03:54:07.000000Z", + "d.csv": "2023-06-05T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_d.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=[], + ) + ) +).build() + +multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_older = ( + TestScenarioBuilder() + .set_name("multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_older") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + "days_to_sync_if_history_is_full": 3, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "d.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11d", "val12d", "val13d"), + ("val21d", "val22d", "val23d"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + } + ) + .set_cursor_cls(LowHistoryLimitConcurrentCursor) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "history": { + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-05T03:54:07.000000Z", + "d.csv": "2023-06-05T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_d.csv", + } + ] + ) + .set_incremental_scenario_config( + 
IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": { + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-05T03:54:07.000000Z", + "d.csv": "2023-06-05T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_b.csv", + }, + ) + .build(), + ) + ) +).build() + +multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_newer = ( + TestScenarioBuilder() + .set_name("multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_newer") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + "days_to_sync_if_history_is_full": 3, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "d.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11d", "val12d", "val13d"), + ("val21d", "val22d", "val23d"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + } + ) + .set_cursor_cls(LowHistoryLimitConcurrentCursor) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "history": { + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-05T03:54:07.000000Z", + "d.csv": "2023-06-05T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_d.csv", + } + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": { + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-05T03:54:07.000000Z", + "d.csv": "2023-06-05T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_d.csv", + }, + ) + .build(), + ) + ) +).build() + + +multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_older = ( + TestScenarioBuilder() + .set_name("multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_older") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + "days_to_sync_if_history_is_full": 3, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + 
"last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-07T03:54:07.000000Z", + }, + "d.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11d", "val12d", "val13d"), + ("val21d", "val22d", "val23d"), + ], + "last_modified": "2023-06-08T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(LowHistoryLimitConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + # {"data": {"col1": "val11a", "col2": "val12a"}, "stream": "stream1"}, # This file is skipped because it is older than the time_window + # {"data": {"col1": "val21a", "col2": "val22a"}, "stream": "stream1"}, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": { + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + "e.csv": "2023-06-08T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_e.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": { + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + "e.csv": "2023-06-08T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_e.csv", + }, + ) + .build(), + ) + ) +).build() + +multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_newer = ( + TestScenarioBuilder() + .set_name("multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_newer") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + "days_to_sync_if_history_is_full": 3, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + 
("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-07T03:54:07.000000Z", + }, + "d.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11d", "val12d", "val13d"), + ("val21d", "val22d", "val23d"), + ], + "last_modified": "2023-06-08T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(LowHistoryLimitConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + # {"data": {"col1": "val11a", "col2": "val12a"}, "stream": "stream1"}, # This file is skipped because it is older than the time_window + # {"data": {"col1": "val21a", "col2": "val22a"}, "stream": "stream1"}, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": { + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + "e.csv": "2023-06-08T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_e.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": { + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + "e.csv": "2023-06-08T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_e.csv", + }, + ) + .build(), + ) + ) +).build() + +multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_newer = ( + TestScenarioBuilder() + .set_name( + "multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_newer" + ) + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + "days_to_sync_if_history_is_full": 3, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-07T03:54:07.000000Z", + }, + "d.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11d", "val12d", "val13d"), + ("val21d", "val22d", "val23d"), + ], + "last_modified": "2023-06-08T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + 
.set_cursor_cls(LowHistoryLimitConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": { + "a.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_d.csv", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": { + "b.csv": "2023-06-06T03:54:07.000000Z", + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_d.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": { + "old_file.csv": "2023-06-05T00:00:00.000000Z", + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_d.csv", + }, + ) + .build(), + ) + ) +).build() + +multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_older = ( + TestScenarioBuilder() + .set_name( + "multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_older" + ) + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + "days_to_sync_if_history_is_full": 3, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-07T03:54:07.000000Z", + }, + "d.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11d", "val12d", "val13d"), + ("val21d", "val22d", "val23d"), + ], + 
"last_modified": "2023-06-08T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(LowHistoryLimitConcurrentCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": { + "a.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_d.csv", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": { + "b.csv": "2023-06-06T03:54:07.000000Z", + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_d.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": { + "old_file.csv": "2023-06-05T00:00:00.000000Z", + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-04T00:00:00.000000Z_very_old_file.csv", + }, + ) + .build(), + ) + ) +).build() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py new file mode 100644 index 000000000000..2fff455b08ca --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -0,0 +1,3391 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from airbyte_cdk.models import AirbyteAnalyticsTraceMessage, SyncMode +from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat +from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError +from airbyte_cdk.test.catalog_builder import CatalogBuilder +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from unit_tests.sources.file_based.helpers import EmptySchemaParser, LowInferenceLimitDiscoveryPolicy +from unit_tests.sources.file_based.in_memory_files_source import InMemoryFilesSource +from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder +from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenario, TestScenarioBuilder + +single_csv_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("single_csv_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_spec( + { + "documentationUrl": "https://docs.airbyte.com/integrations/sources/in_memory_files", + "connectionSpecification": { + "title": "InMemorySpec", + "description": "Used during spec; allows the developer to configure the cloud provider specific options\nthat are needed when users configure a file-based source.", + "type": "object", + "properties": { + "start_date": { + "title": "Start Date", + "description": "UTC date and time in the format 2017-01-25T00:00:00.000000Z. Any file modified before this date will not be replicated.", + "examples": ["2021-01-01T00:00:00.000000Z"], + "format": "date-time", + "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{6}Z$", + "pattern_descriptor": "YYYY-MM-DDTHH:mm:ss.SSSSSSZ", + "order": 1, + "type": "string", + }, + "streams": { + "title": "The list of streams to sync", + "description": 'Each instance of this configuration defines a stream. Use this to define which files belong in the stream, their format, and how they should be parsed and validated. When sending data to warehouse destination such as Snowflake or BigQuery, each stream is a separate table.', + "order": 10, + "type": "array", + "items": { + "title": "FileBasedStreamConfig", + "type": "object", + "properties": { + "name": {"title": "Name", "description": "The name of the stream.", "type": "string"}, + "globs": { + "title": "Globs", + "description": 'The pattern used to specify which files should be selected from the file system. For more information on glob pattern matching look here.', + "type": "array", + "items": {"type": "string"}, + "order": 1, + "default": ["**"], + }, + "legacy_prefix": { + "title": "Legacy Prefix", + "airbyte_hidden": True, + "type": "string", + "description": "The path prefix configured in v3 versions of the S3 connector. 
This option is deprecated in favor of a single glob.", + }, + "validation_policy": { + "title": "Validation Policy", + "description": "The name of the validation policy that dictates sync behavior when a record does not adhere to the stream schema.", + "default": "Emit Record", + "enum": ["Emit Record", "Skip Record", "Wait for Discover"], + }, + "input_schema": { + "title": "Input Schema", + "description": "The schema that will be used to validate records extracted from the file. This will override the stream schema that is auto-detected from incoming files.", + "type": "string", + }, + "primary_key": { + "title": "Primary Key", + "description": "The column or columns (for a composite key) that serves as the unique identifier of a record. If empty, the primary key will default to the parser's default primary key.", + "type": "string", + "airbyte_hidden": True, + }, + "days_to_sync_if_history_is_full": { + "title": "Days To Sync If History Is Full", + "description": "When the state history of the file store is full, syncs will only read files that were last modified in the provided day range.", + "default": 3, + "type": "integer", + }, + "format": { + "title": "Format", + "description": "The configuration options that are used to alter how to read incoming files that deviate from the standard formatting.", + "type": "object", + "oneOf": [ + { + "title": "Avro Format", + "type": "object", + "properties": { + "filetype": {"title": "Filetype", "default": "avro", "const": "avro", "type": "string"}, + "double_as_string": { + "title": "Convert Double Fields to Strings", + "description": "Whether to convert double fields to strings. This is recommended if you have decimal numbers with a high degree of precision because there can be a loss precision when handling floating point numbers.", + "default": False, + "type": "boolean", + }, + }, + "required": ["filetype"], + }, + { + "title": "CSV Format", + "type": "object", + "properties": { + "filetype": {"title": "Filetype", "default": "csv", "const": "csv", "type": "string"}, + "delimiter": { + "title": "Delimiter", + "description": "The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.", + "default": ",", + "type": "string", + }, + "quote_char": { + "title": "Quote Character", + "description": "The character used for quoting CSV values. To disallow quoting, make this field blank.", + "default": '"', + "type": "string", + }, + "escape_char": { + "title": "Escape Character", + "description": "The character used for escaping special characters. To disallow escaping, leave this field blank.", + "type": "string", + }, + "encoding": { + "title": "Encoding", + "description": 'The character encoding of the CSV data. Leave blank to default to UTF8. See list of python encodings for allowable options.', + "default": "utf8", + "type": "string", + }, + "double_quote": { + "title": "Double Quote", + "description": "Whether two quotes in a quoted CSV value denote a single quote in the data.", + "default": True, + "type": "boolean", + }, + "null_values": { + "title": "Null Values", + "description": "A set of case-sensitive strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field.", + "default": [], + "type": "array", + "items": {"type": "string"}, + "uniqueItems": True, + }, + "strings_can_be_null": { + "title": "Strings Can Be Null", + "description": "Whether strings can be interpreted as null values. 
If true, strings that match the null_values set will be interpreted as null. If false, strings that match the null_values set will be interpreted as the string itself.", + "default": True, + "type": "boolean", + }, + "skip_rows_before_header": { + "title": "Skip Rows Before Header", + "description": "The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.", + "default": 0, + "type": "integer", + }, + "skip_rows_after_header": { + "title": "Skip Rows After Header", + "description": "The number of rows to skip after the header row.", + "default": 0, + "type": "integer", + }, + "header_definition": { + "title": "CSV Header Definition", + "type": "object", + "description": "How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.", + "default": {"header_definition_type": "From CSV"}, + "oneOf": [ + { + "title": "From CSV", + "type": "object", + "properties": { + "header_definition_type": { + "title": "Header Definition Type", + "default": "From CSV", + "const": "From CSV", + "type": "string", + }, + }, + "required": ["header_definition_type"], + }, + { + "title": "Autogenerated", + "type": "object", + "properties": { + "header_definition_type": { + "title": "Header Definition Type", + "default": "Autogenerated", + "const": "Autogenerated", + "type": "string", + }, + }, + "required": ["header_definition_type"], + }, + { + "title": "User Provided", + "type": "object", + "properties": { + "header_definition_type": { + "title": "Header Definition Type", + "default": "User Provided", + "const": "User Provided", + "type": "string", + }, + "column_names": { + "title": "Column Names", + "description": "The column names that will be used while emitting the CSV records", + "type": "array", + "items": {"type": "string"}, + }, + }, + "required": ["column_names", "header_definition_type"], + }, + ], + }, + "true_values": { + "title": "True Values", + "description": "A set of case-sensitive strings that should be interpreted as true values.", + "default": ["y", "yes", "t", "true", "on", "1"], + "type": "array", + "items": {"type": "string"}, + "uniqueItems": True, + }, + "false_values": { + "title": "False Values", + "description": "A set of case-sensitive strings that should be interpreted as false values.", + "default": ["n", "no", "f", "false", "off", "0"], + "type": "array", + "items": {"type": "string"}, + "uniqueItems": True, + }, + "inference_type": { + "title": "Inference Type", + "description": "How to infer the types of the columns. 
If none, inference default to strings.", + "default": "None", + "airbyte_hidden": True, + "enum": ["None", "Primitive Types Only"], + }, + "ignore_errors_on_fields_mismatch": { + "type": "boolean", + "title": "Ignore errors on field mismatch", + "default": False, + "description": "Whether to ignore errors that occur when the number of fields in the CSV does not match the number of columns in the schema.", + }, + }, + "required": ["filetype"], + }, + { + "title": "Jsonl Format", + "type": "object", + "properties": { + "filetype": {"title": "Filetype", "default": "jsonl", "const": "jsonl", "type": "string"} + }, + "required": ["filetype"], + }, + { + "title": "Parquet Format", + "type": "object", + "properties": { + "filetype": { + "title": "Filetype", + "default": "parquet", + "const": "parquet", + "type": "string", + }, + "decimal_as_float": { + "title": "Convert Decimal Fields to Floats", + "description": "Whether to convert decimal fields to floats. There is a loss of precision when converting decimals to floats, so this is not recommended.", + "default": False, + "type": "boolean", + }, + }, + "required": ["filetype"], + }, + { + "title": "Unstructured Document Format", + "type": "object", + "properties": { + "filetype": { + "title": "Filetype", + "default": "unstructured", + "const": "unstructured", + "type": "string", + }, + "skip_unprocessable_files": { + "type": "boolean", + "default": True, + "title": "Skip Unprocessable Files", + "description": "If true, skip files that cannot be parsed and pass the error message along as the _ab_source_file_parse_error field. If false, fail the sync.", + "always_show": True, + }, + "strategy": { + "type": "string", + "always_show": True, + "order": 0, + "default": "auto", + "title": "Parsing Strategy", + "enum": ["auto", "fast", "ocr_only", "hi_res"], + "description": "The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf", + }, + "processing": { + "title": "Processing", + "description": "Processing configuration", + "default": {"mode": "local"}, + "type": "object", + "oneOf": [ + { + "title": "Local", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "local", + "const": "local", + "enum": ["local"], + "type": "string", + } + }, + "description": "Process files locally, supporting `fast` and `ocr` modes. 
This is the default option.", + "required": ["mode"], + }, + { + "title": "via API", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "api", + "const": "api", + "enum": ["api"], + "type": "string", + }, + "api_key": { + "title": "API Key", + "description": "The API key to use matching the environment", + "default": "", + "always_show": True, + "airbyte_secret": True, + "type": "string", + }, + "api_url": { + "title": "API URL", + "description": "The URL of the unstructured API to use", + "default": "https://api.unstructured.io", + "always_show": True, + "examples": ["https://api.unstructured.com"], + "type": "string", + }, + "parameters": { + "title": "Additional URL Parameters", + "description": "List of parameters send to the API", + "default": [], + "always_show": True, + "type": "array", + "items": { + "title": "APIParameterConfigModel", + "type": "object", + "properties": { + "name": { + "title": "Parameter name", + "description": "The name of the unstructured API parameter to use", + "examples": ["combine_under_n_chars", "languages"], + "type": "string", + }, + "value": { + "title": "Value", + "description": "The value of the parameter", + "examples": ["true", "hi_res"], + "type": "string", + }, + }, + "required": ["name", "value"], + }, + }, + }, + "description": "Process files via an API, using the `hi_res` mode. This option is useful for increased performance and accuracy, but requires an API key and a hosted instance of unstructured.", + "required": ["mode"], + }, + ], + }, + }, + "description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file.", + "required": ["filetype"], + }, + { + "title": "Excel Format", + "type": "object", + "properties": { + "filetype": {"title": "Filetype", "default": "excel", "const": "excel", "type": "string"} + }, + "required": ["filetype"], + }, + ], + }, + "schemaless": { + "title": "Schemaless", + "description": "When enabled, syncs will not validate or structure records against the stream's schema.", + "default": False, + "type": "boolean", + }, + "recent_n_files_to_read_for_schema_discovery": { + "title": "Files To Read For Schema Discover", + "description": "The number of resent files which will be used to discover the schema for this stream.", + "exclusiveMinimum": 0, + "type": "integer", + }, + }, + "required": ["name", "format"], + }, + }, + "delivery_method": { + "airbyte_hidden": True, + "title": "Delivery Method", + "default": "use_records_transfer", + "type": "object", + "order": 7, + "display_type": "radio", + "group": "advanced", + "oneOf": [ + { + "title": "Replicate Records", + "type": "object", + "properties": { + "delivery_type": { + "title": "Delivery Type", + "default": "use_records_transfer", + "const": "use_records_transfer", + "enum": [ + "use_records_transfer" + ], + "type": "string" + } + }, + "description": "Recommended - Extract and load structured records into your destination of choice. This is the classic method of moving data in Airbyte. It allows for blocking and hashing individual fields or files from a structured schema. 
Data can be flattened, typed and deduped depending on the destination.", + "required": [ + "delivery_type" + ] + }, + { + "title": "Copy Raw Files", + "type": "object", + "properties": { + "delivery_type": { + "title": "Delivery Type", + "default": "use_file_transfer", + "const": "use_file_transfer", + "enum": [ + "use_file_transfer" + ], + "type": "string" + } + }, + "description": "Copy raw files without parsing their contents. Bits are copied into the destination exactly as they appeared in the source. Recommended for use with unstructured text data, non-text and compressed files.", + "required": [ + "delivery_type" + ] + } + ] + }, + }, + "required": ["streams"], + }, + "supportsDBT": False, + "supportsNormalization": False, + } + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_analytics_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_analytics") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["a.csv"], + "validation_policy": "Emit Record", + }, + { + "name": "stream2", + "format": {"filetype": "csv"}, + "globs": ["b.csv"], + "validation_policy": "Emit Record", + }, + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "col3": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream2", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + ] + } + ) + 
.set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream2", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream2", + }, + ] + ) + .set_expected_analytics( + [ + AirbyteAnalyticsTraceMessage(type="file-cdk-csv-stream-count", value="2"), + ] + ) +).build() + +multi_csv_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("multi_csv_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "col3": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +multi_csv_stream_n_file_exceeds_limit_for_inference = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("multi_csv_stream_n_file_exceeds_limit_for_inference") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + 
("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + .set_discovery_policy(LowInferenceLimitDiscoveryPolicy()) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +multi_csv_stream_n_file_exceeds_config_limit_for_inference = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("multi_csv_stream_n_file_exceeds_config_limit_for_inference") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Emit Record", + "recent_n_files_to_read_for_schema_discovery": 3, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3", "col4"), + ("val11c", "val12c", "val13c", "val14c"), + ("val21c", "val22c", "val23c", "val24c"), + ], + "last_modified": "2023-06-06T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "col3": {"type": ["null", "string"]}, + "col4": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + 
{ + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11c", + "col2": "val12c", + "col3": "val13c", + "col4": "val14c", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21c", + "col2": "val22c", + "col3": "val23c", + "col4": "val24c", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +invalid_csv_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("invalid_csv_scenario") # too many values for the number of headers + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1",), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records([]) + .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value) + .set_expected_logs( + { + "read": [ + { + "level": "ERROR", + "message": f"{FileBasedSourceError.INVALID_SCHEMA_ERROR.value} stream=stream1 file=a.csv line_no=1 n_skipped=0", + }, + ] + } + ) + .set_expected_read_error( + AirbyteTracedException, + "Please check the logged errors for more information.", + ) +).build() + +invalid_csv_multi_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("invalid_csv_multi_scenario") # too many values for the number of headers + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Emit Record", + }, + { + "name": "stream2", + "format": {"filetype": "csv"}, + "globs": ["b.csv"], + "validation_policy": "Emit Record", + }, + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1",), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.csv": { + "contents": [ + ("col3",), + ("val13b", "val14b"), + ("val23b", "val24b"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { 
+ "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + { + "json_schema": { + "type": "object", + "properties": { + "col3": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream2", + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + ] + } + ) + .set_expected_records([]) + .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value) + .set_expected_logs( + { + "read": [ + { + "level": "ERROR", + "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=1 n_skipped=0", + }, + { + "level": "ERROR", + "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream2 file=b.csv line_no=1 n_skipped=0", + }, + ] + } + ) + .set_expected_read_error(AirbyteTracedException, "Please check the logged errors for more information.") +).build() + +csv_single_stream_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_single_stream_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.jsonl": { + "contents": [ + {"col1": "val11b", "col2": "val12b", "col3": "val13b"}, + {"col1": "val12b", "col2": "val22b", "col3": "val23b"}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_multi_stream_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_multi_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + }, + { + "name": "stream2", + "format": 
{"filetype": "csv"}, + "globs": ["b.csv"], + "validation_policy": "Emit Record", + }, + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.csv": { + "contents": [ + ("col3",), + ("val13b",), + ("val23b",), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "col3": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + }, + { + "json_schema": { + "type": "object", + "properties": { + "col3": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream2", + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": {"col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, + "stream": "stream1", + }, + { + "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, + "stream": "stream1", + }, + { + "data": {"col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, + "stream": "stream2", + }, + { + "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, + "stream": "stream2", + }, + ] + ) +).build() + +csv_custom_format_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_custom_format") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": { + "filetype": "csv", + "delimiter": "#", + "quote_char": "|", + "escape_char": "!", + "double_quote": True, + }, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11", "val12", "val |13|"), + ("val21", "val22", "val23"), + ("val,31", "val |,32|", "val, !!!! 
33"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + .set_file_write_options( + { + "delimiter": "#", + "quotechar": "|", + } + ) + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "col3": "val |13|", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "col3": "val23", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val,31", + "col2": "val |,32|", + "col3": "val, !! 33", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +multi_stream_custom_format = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("multi_stream_custom_format_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*.csv"], + "validation_policy": "Emit Record", + "format": {"filetype": "csv", "delimiter": "#", "escape_char": "!", "double_quote": True, "newlines_in_values": False}, + }, + { + "name": "stream2", + "globs": ["b.csv"], + "validation_policy": "Emit Record", + "format": { + "filetype": "csv", + "delimiter": "#", + "escape_char": "@", + "double_quote": True, + "newlines_in_values": False, + }, + }, + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val !! 12a"), + ("val !! 
21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.csv": { + "contents": [ + ("col3",), + ("val @@@@ 13b",), + ("val23b",), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + .set_file_write_options( + { + "delimiter": "#", + } + ) + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + }, + { + "json_schema": { + "type": "object", + "properties": { + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream2", + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val ! 12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val ! 21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col3": "val @@@@ 13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, + "stream": "stream1", + }, + { + "data": { + "col3": "val @@ 13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream2", + }, + { + "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, + "stream": "stream2", + }, + ] + ) +).build() + +empty_schema_inference_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("empty_schema_inference_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + .set_parsers({CsvFormat: EmptySchemaParser()}) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + 
.set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value) +).build() + +schemaless_csv_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("schemaless_csv_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Skip Record", + "schemaless": True, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "data": {"type": "object"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "data": {"col1": "val11a", "col2": "val12a"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "data": {"col1": "val21a", "col2": "val22a"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "data": {"col1": "val11b", "col2": "val12b", "col3": "val13b"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "data": {"col1": "val21b", "col2": "val22b", "col3": "val23b"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +schemaless_csv_multi_stream_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("schemaless_csv_multi_stream_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["a.csv"], + "validation_policy": "Skip Record", + "schemaless": True, + }, + { + "name": "stream2", + "format": {"filetype": "csv"}, + "globs": ["b.csv"], + "validation_policy": "Skip Record", + }, + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.csv": { + "contents": [ + ("col3",), + ("val13b",), + ("val23b",), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "data": {"type": "object"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + }, + { + 
"json_schema": { + "type": "object", + "properties": { + "col3": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream2", + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + ] + } + ) + .set_expected_records( + [ + { + "data": { + "data": {"col1": "val11a", "col2": "val12a"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "data": {"col1": "val21a", "col2": "val22a"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": {"col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, + "stream": "stream2", + }, + { + "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, + "stream": "stream2", + }, + ] + ) +).build() + +schemaless_with_user_input_schema_fails_connection_check_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("schemaless_with_user_input_schema_fails_connection_check_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Skip Record", + "input_schema": '{"col1": "string", "col2": "string", "col3": "string"}', + "schemaless": True, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + ) + .set_catalog(CatalogBuilder().with_stream("stream1", SyncMode.full_refresh).build()) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "data": {"type": "object"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_check_status("FAILED") + .set_expected_check_error(None, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value) + .set_expected_discover_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value) + .set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value) +).build() + +schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["a.csv"], + "validation_policy": "Skip Record", + "schemaless": True, + "input_schema": '{"col1": "string", "col2": "string", "col3": "string"}', + }, + { + "name": "stream2", + 
"format": {"filetype": "csv"}, + "globs": ["b.csv"], + "validation_policy": "Skip Record", + }, + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.csv": { + "contents": [ + ("col3",), + ("val13b",), + ("val23b",), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + ) + .set_catalog(CatalogBuilder().with_stream("stream1", SyncMode.full_refresh).with_stream("stream2", SyncMode.full_refresh).build()) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "data": {"type": "object"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + }, + { + "json_schema": { + "type": "object", + "properties": { + "col3": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream2", + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + ] + } + ) + .set_expected_check_status("FAILED") + .set_expected_check_error(None, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value) + .set_expected_discover_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value) + .set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value) +).build() + +csv_string_can_be_null_with_input_schemas_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_string_can_be_null_with_input_schema") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "string", "col2": "string"}', + "format": { + "filetype": "csv", + "null_values": ["null"], + }, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("2", "null"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": "string"}, + "col2": {"type": "string"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "2", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_string_are_not_null_if_strings_can_be_null_is_false_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_string_are_not_null_if_strings_can_be_null_is_false") + .set_config( + { + "streams": [ + { + "name": 
"stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "string", "col2": "string"}', + "format": { + "filetype": "csv", + "null_values": ["null"], + "strings_can_be_null": False, + }, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("2", "null"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": "string"}, + "col2": {"type": "string"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "2", + "col2": "null", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_string_not_null_if_no_null_values_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_string_not_null_if_no_null_values") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": { + "filetype": "csv", + }, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("2", "null"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "2", + "col2": "null", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_strings_can_be_null_not_quoted_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_strings_can_be_null_no_input_schema") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": {"filetype": "csv", "null_values": ["null"]}, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("2", "null"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": 
"string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "2", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_newline_in_values_quoted_value_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_newline_in_values_quoted_value") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": { + "filetype": "csv", + }, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + '''"col1","col2"''', + '''"2","val\n2"''', + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "2", + "col2": "val\n2", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_newline_in_values_not_quoted_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_newline_in_values_not_quoted") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": { + "filetype": "csv", + }, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + """col1,col2""", + """2,val\n2""", + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + # Note that the value for col2 is truncated to "val" because the newline is not escaped + { + "data": { + "col1": "2", + "col2": "val", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) + .set_expected_read_error( + AirbyteTracedException, + f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=2 n_skipped=0", + ) + .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value) + .set_expected_read_error( + 
AirbyteTracedException, + "Please check the logged errors for more information.", + ) +).build() + +csv_escape_char_is_set_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_escape_char_is_set") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": { + "filetype": "csv", + "double_quotes": False, + "quote_char": '"', + "delimiter": ",", + "escape_char": "\\", + }, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + """col1,col2""", + '''val11,"val\\"2"''', + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": 'val"2', + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_double_quote_is_set_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_doublequote_is_set") + # This scenario tests that quotes are properly escaped when double_quotes is True + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": { + "filetype": "csv", + "double_quotes": True, + "quote_char": '"', + "delimiter": ",", + }, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + """col1,col2""", + '''val11,"val""2"''', + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": 'val"2', + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_custom_delimiter_with_escape_char_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_custom_delimiter_with_escape_char") + # This scenario tests that a value can contain the delimiter if it is wrapped in the quote_char + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": {"filetype": "csv", "double_quotes": True, "quote_char": "@", "delimiter": "|", "escape_char": 
"+"}, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + """col1|col2""", + """val"1,1|val+|2""", + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": 'val"1,1', + "col2": "val|2", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_custom_delimiter_in_double_quotes_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_custom_delimiter_in_double_quotes") + # This scenario tests that a value can contain the delimiter if it is wrapped in the quote_char + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": { + "filetype": "csv", + "double_quotes": True, + "quote_char": "@", + "delimiter": "|", + }, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + """col1|col2""", + """val"1,1|@val|2@""", + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": 'val"1,1', + "col2": "val|2", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_skip_before_header_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_skip_before_header") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": {"filetype": "csv", "skip_rows_before_header": 2}, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("skip_this", "skip_this"), + ("skip_this_too", "skip_this_too"), + ("col1", "col2"), + ("val11", "val12"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + 
"_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_skip_after_header_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_skip_after_header") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": {"filetype": "csv", "skip_rows_after_header": 2}, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("skip_this", "skip_this"), + ("skip_this_too", "skip_this_too"), + ("val11", "val12"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_skip_before_and_after_header_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_skip_before_after_header") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": { + "filetype": "csv", + "skip_rows_before_header": 1, + "skip_rows_after_header": 1, + }, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("skip_this", "skip_this"), + ("col1", "col2"), + ("skip_this_too", "skip_this_too"), + ("val11", "val12"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_autogenerate_column_names_scenario: TestScenario[InMemoryFilesSource] = ( + 
TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_autogenerate_column_names") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": { + "filetype": "csv", + "header_definition": {"header_definition_type": "Autogenerated"}, + }, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("val11", "val12"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "f0": {"type": ["null", "string"]}, + "f1": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "f0": "val11", + "f1": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_custom_bool_values_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_custom_bool_values") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "boolean", "col2": "boolean"}', + "format": { + "filetype": "csv", + "true_values": ["this_is_true"], + "false_values": ["this_is_false"], + }, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("this_is_true", "this_is_false"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": "boolean"}, + "col2": {"type": "boolean"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": True, + "col2": False, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_custom_null_values_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_custom_null_values") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "boolean", "col2": "string"}', + "format": { + "filetype": "csv", + "null_values": ["null"], + }, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("null", "na"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + 
"default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": "boolean"}, + "col2": {"type": "string"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col2": "na", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +earlier_csv_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("earlier_csv_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ], + "start_date": "2023-06-10T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_check_status("FAILED") + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + "data": {"type": "object"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records(None) +).build() + +csv_no_records_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("csv_empty_no_records") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "boolean", "col2": "string"}', + "format": { + "filetype": "csv", + "null_values": ["null"], + }, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [("col1", "col2")], # column headers, but no data rows + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": "boolean"}, + "col2": {"type": "string"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records([]) +).build() + +csv_no_files_scenario: TestScenario[InMemoryFilesSource] = ( + TestScenarioBuilder[InMemoryFilesSource]() + .set_name("no_files_csv_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ], + "start_date": "2023-06-10T03:54:07.000000Z", + } + ) + .set_source_builder(FileBasedSourceBuilder().set_files({}).set_file_type("csv")) + .set_expected_check_status("FAILED") + .set_expected_catalog( + { + "streams": [ + { + 
"default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + "data": {"type": "object"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records(None) +).build() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/excel_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/excel_scenarios.py new file mode 100644 index 000000000000..6653296535d5 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/excel_scenarios.py @@ -0,0 +1,436 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +import datetime + +from unit_tests.sources.file_based.in_memory_files_source import TemporaryExcelFilesStreamReader +from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder +from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder + +_single_excel_file = { + "a.xlsx": { + "contents": [ + {"col1": "val11", "col2": "val12"}, + {"col1": "val21", "col2": "val22"}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } +} + +_multiple_excel_combine_schema_file = { + "a.xlsx": { + "contents": [ + {"col_double": 20.02, "col_string": "Robbers", "col_album": "The 1975"}, + { + "col_double": 20.23, + "col_string": "Somebody Else", + "col_album": "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It", + }, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.xlsx": { + "contents": [ + {"col_double": 1975.1975, "col_string": "It's Not Living (If It's Not with You)", "col_song": "Love It If We Made It"}, + {"col_double": 5791.5791, "col_string": "The 1975", "col_song": "About You"}, + ], + "last_modified": "2023-06-06T03:54:07.000Z", + }, +} + +_excel_all_types_file = { + "a.xlsx": { + "contents": [ + { + "col_bool": True, + "col_int": 27, + "col_long": 1992, + "col_float": 999.09723456, + "col_string": "Love It If We Made It", + "col_date": datetime.date(2022, 5, 29), + "col_time_millis": datetime.time(6, 0, 0, 456000), + "col_time_micros": datetime.time(12, 0, 0, 456789), + } + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } +} + +_multiple_excel_stream_file = { + "odesza_songs.xlsx": { + "contents": [ + {"col_title": "Late Night", "col_album": "A_MOMENT_APART", "col_year": 2017, "col_vocals": False}, + {"col_title": "White Lies", "col_album": "IN_RETURN", "col_year": 2014, "col_vocals": True}, + {"col_title": "Wide Awake", "col_album": "THE_LAST_GOODBYE", "col_year": 2022, "col_vocals": True}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "california_festivals.xlsx": { + "contents": [ + { + "col_name": "Lightning in a Bottle", + "col_location": {"country": "USA", "state": "California", "city": "Buena Vista Lake"}, + "col_attendance": 18000, + }, + { + "col_name": "Outside Lands", + "col_location": {"country": "USA", "state": "California", "city": "San Francisco"}, + "col_attendance": 220000, + }, + ], + "last_modified": "2023-06-06T03:54:07.000Z", + }, +} + +single_excel_scenario = ( + TestScenarioBuilder() + .set_name("single_excel_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "excel"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + 
FileBasedSourceBuilder() + .set_stream_reader(TemporaryExcelFilesStreamReader(files=_single_excel_file, file_type="excel")) + .set_file_type("excel") + ) + .set_expected_check_status("SUCCEEDED") + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.xlsx", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.xlsx", + }, + "stream": "stream1", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +).build() + +multiple_excel_combine_schema_scenario = ( + TestScenarioBuilder() + .set_name("multiple_excel_combine_schema_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "excel"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryExcelFilesStreamReader(files=_multiple_excel_combine_schema_file, file_type="excel")) + .set_file_type("excel") + ) + .set_expected_records( + [ + { + "data": { + "col_double": 20.02, + "col_string": "Robbers", + "col_album": "The 1975", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.xlsx", + }, + "stream": "stream1", + }, + { + "data": { + "col_double": 20.23, + "col_string": "Somebody Else", + "col_album": "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.xlsx", + }, + "stream": "stream1", + }, + { + "data": { + "col_double": 1975.1975, + "col_string": "It's Not Living (If It's Not with You)", + "col_song": "Love It If We Made It", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.xlsx", + }, + "stream": "stream1", + }, + { + "data": { + "col_double": 5791.5791, + "col_string": "The 1975", + "col_song": "About You", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.xlsx", + }, + "stream": "stream1", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col_double": {"type": ["null", "number"]}, + "col_string": {"type": ["null", "string"]}, + "col_album": {"type": ["null", "string"]}, + "col_song": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +).build() + +excel_all_types_scenario = ( + TestScenarioBuilder() + .set_name("excel_all_types_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "excel"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + 
} + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryExcelFilesStreamReader(files=_excel_all_types_file, file_type="excel")) + .set_file_type("excel") + ) + .set_expected_records( + [ + { + "data": { + "col_bool": True, + "col_int": 27, + "col_long": 1992, + "col_float": 999.09723456, + "col_string": "Love It If We Made It", + "col_date": "2022-05-29T00:00:00.000000", + "col_time_millis": "06:00:00.456000", + "col_time_micros": "12:00:00.456789", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.xlsx", + }, + "stream": "stream1", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col_bool": {"type": ["null", "boolean"]}, + "col_int": {"type": ["null", "number"]}, + "col_long": {"type": ["null", "number"]}, + "col_float": {"type": ["null", "number"]}, + "col_string": {"type": ["null", "string"]}, + "col_date": {"format": "date-time", "type": ["null", "string"]}, + "col_time_millis": {"type": ["null", "string"]}, + "col_time_micros": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +).build() + +multiple_streams_excel_scenario = ( + TestScenarioBuilder() + .set_name("multiple_streams_excel_stream") + .set_config( + { + "streams": [ + { + "name": "songs_stream", + "format": {"filetype": "excel"}, + "globs": ["*_songs.xlsx"], + "validation_policy": "Emit Record", + }, + { + "name": "festivals_stream", + "format": {"filetype": "excel"}, + "globs": ["*_festivals.xlsx"], + "validation_policy": "Emit Record", + }, + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryExcelFilesStreamReader(files=_multiple_excel_stream_file, file_type="excel")) + .set_file_type("excel") + ) + .set_expected_records( + [ + { + "data": { + "col_title": "Late Night", + "col_album": "A_MOMENT_APART", + "col_year": 2017, + "col_vocals": False, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "odesza_songs.xlsx", + }, + "stream": "songs_stream", + }, + { + "data": { + "col_title": "White Lies", + "col_album": "IN_RETURN", + "col_year": 2014, + "col_vocals": True, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "odesza_songs.xlsx", + }, + "stream": "songs_stream", + }, + { + "data": { + "col_title": "Wide Awake", + "col_album": "THE_LAST_GOODBYE", + "col_year": 2022, + "col_vocals": True, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "odesza_songs.xlsx", + }, + "stream": "songs_stream", + }, + { + "data": { + "col_name": "Lightning in a Bottle", + "col_location": "{'country': 'USA', 'state': 'California', 'city': 'Buena Vista Lake'}", + "col_attendance": 18000, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "california_festivals.xlsx", + }, + "stream": "festivals_stream", + }, + { + "data": { + "col_name": "Outside Lands", + "col_location": "{'country': 'USA', 'state': 'California', 'city': 'San Francisco'}", + "col_attendance": 220000, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "california_festivals.xlsx", + }, + "stream": 
"festivals_stream", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col_title": {"type": ["null", "string"]}, + "col_album": {"type": ["null", "string"]}, + "col_year": {"type": ["null", "number"]}, + "col_vocals": {"type": ["null", "boolean"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "songs_stream", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col_name": {"type": ["null", "string"]}, + "col_location": {"type": ["null", "string"]}, + "col_attendance": {"type": ["null", "number"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "festivals_stream", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + ] + } + ) +).build() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/file_based_source_builder.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/file_based_source_builder.py new file mode 100644 index 000000000000..6675df380c7c --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/file_based_source_builder.py @@ -0,0 +1,93 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from copy import deepcopy +from typing import Any, Mapping, Optional, Type + +from airbyte_cdk.sources.file_based.availability_strategy.abstract_file_based_availability_strategy import ( + AbstractFileBasedAvailabilityStrategy, +) +from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy +from airbyte_cdk.sources.file_based.file_based_source import default_parsers +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy +from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor +from airbyte_cdk.sources.source import TState +from unit_tests.sources.file_based.in_memory_files_source import InMemoryFilesSource +from unit_tests.sources.file_based.scenarios.scenario_builder import SourceBuilder + + +class FileBasedSourceBuilder(SourceBuilder[InMemoryFilesSource]): + def __init__(self) -> None: + self._files: Mapping[str, Any] = {} + self._file_type: Optional[str] = None + self._availability_strategy: Optional[AbstractFileBasedAvailabilityStrategy] = None + self._discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy() + self._validation_policies: Optional[Mapping[str, AbstractSchemaValidationPolicy]] = None + self._parsers = default_parsers + self._stream_reader: Optional[AbstractFileBasedStreamReader] = None + self._file_write_options: Mapping[str, Any] = {} + self._cursor_cls: Optional[Type[AbstractFileBasedCursor]] = None + self._config: Optional[Mapping[str, Any]] = None + self._state: Optional[TState] = None + + def build( + self, configured_catalog: Optional[Mapping[str, Any]], config: Optional[Mapping[str, Any]], state: Optional[TState] + ) -> InMemoryFilesSource: + if self._file_type 
is None: + raise ValueError("file_type is not set") + return InMemoryFilesSource( + self._files, + self._file_type, + self._availability_strategy, + self._discovery_policy, + self._validation_policies, + self._parsers, + self._stream_reader, + configured_catalog, + config, + state, + self._file_write_options, + self._cursor_cls, + ) + + def set_files(self, files: Mapping[str, Any]) -> "FileBasedSourceBuilder": + self._files = files + return self + + def set_file_type(self, file_type: str) -> "FileBasedSourceBuilder": + self._file_type = file_type + return self + + def set_parsers(self, parsers: Mapping[Type[Any], FileTypeParser]) -> "FileBasedSourceBuilder": + self._parsers = parsers + return self + + def set_availability_strategy(self, availability_strategy: AbstractFileBasedAvailabilityStrategy) -> "FileBasedSourceBuilder": + self._availability_strategy = availability_strategy + return self + + def set_discovery_policy(self, discovery_policy: AbstractDiscoveryPolicy) -> "FileBasedSourceBuilder": + self._discovery_policy = discovery_policy + return self + + def set_validation_policies(self, validation_policies: Mapping[str, AbstractSchemaValidationPolicy]) -> "FileBasedSourceBuilder": + self._validation_policies = validation_policies + return self + + def set_stream_reader(self, stream_reader: AbstractFileBasedStreamReader) -> "FileBasedSourceBuilder": + self._stream_reader = stream_reader + return self + + def set_cursor_cls(self, cursor_cls: AbstractFileBasedCursor) -> "FileBasedSourceBuilder": + self._cursor_cls = cursor_cls + return self + + def set_file_write_options(self, file_write_options: Mapping[str, Any]) -> "FileBasedSourceBuilder": + self._file_write_options = file_write_options + return self + + def copy(self) -> "FileBasedSourceBuilder": + return deepcopy(self) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/incremental_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/incremental_scenarios.py new file mode 100644 index 000000000000..95cde6b73a48 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/incremental_scenarios.py @@ -0,0 +1,1941 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from airbyte_cdk.sources.file_based.stream.cursor import DefaultFileBasedCursor +from airbyte_cdk.test.state_builder import StateBuilder +from unit_tests.sources.file_based.helpers import LowHistoryLimitCursor +from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder +from unit_tests.sources.file_based.scenarios.scenario_builder import IncrementalScenarioConfig, TestScenarioBuilder + +single_csv_input_state_is_earlier_scenario = ( + TestScenarioBuilder() + .set_name("single_csv_input_state_is_earlier") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + .set_cursor_cls(DefaultFileBasedCursor) + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": {"some_old_file.csv": "2023-06-01T03:54:07.000000Z"}, + }, + ) + .build(), + ) + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": {"some_old_file.csv": "2023-06-01T03:54:07.000000Z", "a.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_a.csv", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + } + ] + } + ) +).build() + +single_csv_file_is_skipped_if_same_modified_at_as_in_history = ( + TestScenarioBuilder() + .set_name("single_csv_file_is_skipped_if_same_modified_at_as_in_history") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + .set_cursor_cls(DefaultFileBasedCursor) + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z"}, + }, + ) + .build(), + ) + ) + .set_expected_records( + [ + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_a.csv", + } + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "source_defined_cursor": True, + 
"supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + } + ] + } + ) +).build() + +single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history = ( + TestScenarioBuilder() + .set_name("single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + .set_cursor_cls(DefaultFileBasedCursor) + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": {"a.csv": "2023-06-01T03:54:07.000000Z"}, + }, + ) + .build(), + ) + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_a.csv", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + } + ] + } + ) +).build() + +single_csv_no_input_state_scenario = ( + TestScenarioBuilder() + .set_name("single_csv_input_state_is_earlier_again") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + } + } + ) + .set_file_type("csv") + .set_cursor_cls(DefaultFileBasedCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + 
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_a.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder().build(), + ) + ) +).build() + +multi_csv_same_timestamp_scenario = ( + TestScenarioBuilder() + .set_name("multi_csv_same_timestamp") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(DefaultFileBasedCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z", "b.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_b.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder().build(), + ) + ) +).build() + +single_csv_input_state_is_later_scenario = ( + TestScenarioBuilder() + .set_name("single_csv_input_state_is_later") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + } + 
} + ) + .set_file_type("csv") + .set_cursor_cls(DefaultFileBasedCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": { + "recent_file.csv": "2023-07-15T23:59:59.000000Z", + "a.csv": "2023-06-05T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-07-15T23:59:59.000000Z_recent_file.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": {"recent_file.csv": "2023-07-15T23:59:59.000000Z"}, + }, + ) + .build(), + ) + ) +).build() + +multi_csv_different_timestamps_scenario = ( + TestScenarioBuilder() + .set_name("multi_csv_stream_different_timestamps") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-04T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(DefaultFileBasedCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-04T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-04T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": { + "a.csv": "2023-06-04T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-04T03:54:07.000000Z_a.csv", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": 
"stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": {"a.csv": "2023-06-04T03:54:07.000000Z", "b.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_b.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder().build(), + ) + ) +).build() + +multi_csv_per_timestamp_scenario = ( + TestScenarioBuilder() + .set_name("multi_csv_per_timestamp") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(DefaultFileBasedCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z", "b.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_b.csv", + }, + { + "data": { + "col1": "val11c", + "col2": "val12c", + "col3": "val13c", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21c", + "col2": "val22c", + "col3": "val23c", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "history": { + "a.csv": "2023-06-05T03:54:07.000000Z", + "b.csv": 
"2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-06T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_c.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder().build(), + ) + ) +).build() + +multi_csv_skip_file_if_already_in_history = ( + TestScenarioBuilder() + .set_name("skip_files_already_in_history") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(DefaultFileBasedCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + # {"data": {"col1": "val11a", "col2": "val12a"}, "stream": "stream1"}, # this file is skipped + # {"data": {"col1": "val21a", "col2": "val22a"}, "stream": "stream1"}, # this file is skipped + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z", "b.csv": "2023-06-05T03:54:07.000000Z"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_b.csv", + }, + { + "data": { + "col1": "val11c", + "col2": "val12c", + "col3": "val13c", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21c", + "col2": "val22c", + "col3": "val23c", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "history": { + "a.csv": "2023-06-05T03:54:07.000000Z", + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-06T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_c.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z"}, + }, + ) + .build(), + ) + ) +).build() + 
+multi_csv_include_missing_files_within_history_range = ( + TestScenarioBuilder() + .set_name("include_missing_files_within_history_range") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(DefaultFileBasedCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + # {"data": {"col1": "val11a", "col2": "val12a"}, "stream": "stream1"}, # this file is skipped + # {"data": {"col1": "val21a", "col2": "val22a"}, "stream": "stream1"}, # this file is skipped + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + # {"data": {"col1": "val11c", "col2": "val12c", "col3": "val13c"}, "stream": "stream1"}, # this file is skipped + # {"data": {"col1": "val21c", "col2": "val22c", "col3": "val23c"}, "stream": "stream1"}, # this file is skipped + { + "history": { + "a.csv": "2023-06-05T03:54:07.000000Z", + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-06T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_c.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": {"a.csv": "2023-06-05T03:54:07.000000Z", "c.csv": "2023-06-06T03:54:07.000000Z"}, + }, + ) + .build(), + ) + ) +).build() + +multi_csv_remove_old_files_if_history_is_full_scenario = ( + TestScenarioBuilder() + .set_name("multi_csv_remove_old_files_if_history_is_full") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + 
("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-07T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-10T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(LowHistoryLimitCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": { + "very_old_file.csv": "2023-06-02T03:54:07.000000Z", + "old_file_same_timestamp_as_a.csv": "2023-06-06T03:54:07.000000Z", + "a.csv": "2023-06-06T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_old_file_same_timestamp_as_a.csv", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-07T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-07T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": { + "old_file_same_timestamp_as_a.csv": "2023-06-06T03:54:07.000000Z", + "a.csv": "2023-06-06T03:54:07.000000Z", + "b.csv": "2023-06-07T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-07T03:54:07.000000Z_b.csv", + }, + { + "data": { + "col1": "val11c", + "col2": "val12c", + "col3": "val13c", + "_ab_source_file_last_modified": "2023-06-10T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21c", + "col2": "val22c", + "col3": "val23c", + "_ab_source_file_last_modified": "2023-06-10T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "history": { + "old_file_same_timestamp_as_a.csv": "2023-06-06T03:54:07.000000Z", + "b.csv": "2023-06-07T03:54:07.000000Z", + "c.csv": "2023-06-10T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-10T03:54:07.000000Z_c.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": { + "very_very_old_file.csv": "2023-06-01T03:54:07.000000Z", + "very_old_file.csv": "2023-06-02T03:54:07.000000Z", + "old_file_same_timestamp_as_a.csv": "2023-06-06T03:54:07.000000Z", + }, + }, + ) + .build(), + ) + ) +).build() + +multi_csv_same_timestamp_more_files_than_history_size_scenario = ( + TestScenarioBuilder() + .set_name("multi_csv_same_timestamp_more_files_than_history_size") + .set_config( + { + "streams": [ + { + 
"name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + "days_to_sync_if_history_is_full": 3, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "d.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11d", "val12d", "val13d"), + ("val21d", "val22d", "val23d"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(LowHistoryLimitCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11c", + "col2": "val12c", + "col3": "val13c", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21c", + "col2": "val22c", + "col3": "val23c", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11d", + "col2": "val12d", + "col3": "val13d", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "d.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21d", + "col2": "val22d", + "col3": "val23d", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "d.csv", + }, + "stream": "stream1", + }, + { + "history": { + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-05T03:54:07.000000Z", + "d.csv": "2023-06-05T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_d.csv", + }, + ] + ) + .set_incremental_scenario_config( + 
IncrementalScenarioConfig( + input_state=StateBuilder().build(), + ) + ) +).build() + +multi_csv_sync_recent_files_if_history_is_incomplete_scenario = ( + TestScenarioBuilder() + .set_name("multi_csv_sync_recent_files_if_history_is_incomplete") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + "days_to_sync_if_history_is_full": 3, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "d.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11d", "val12d", "val13d"), + ("val21d", "val22d", "val23d"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + } + ) + .set_cursor_cls(LowHistoryLimitCursor) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "history": { + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-05T03:54:07.000000Z", + "d.csv": "2023-06-05T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_d.csv", + } + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": { + "b.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-05T03:54:07.000000Z", + "d.csv": "2023-06-05T03:54:07.000000Z", + }, + }, + ) + .build(), + ) + ) +).build() + +multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario = ( + TestScenarioBuilder() + .set_name("multi_csv_sync_recent_files_if_history_is_incomplete__different_timestamps") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + "days_to_sync_if_history_is_full": 3, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-07T03:54:07.000000Z", + }, + "d.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11d", 
"val12d", "val13d"), + ("val21d", "val22d", "val23d"), + ], + "last_modified": "2023-06-08T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(LowHistoryLimitCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + # {"data": {"col1": "val11a", "col2": "val12a"}, "stream": "stream1"}, # This file is skipped because it is older than the time_window + # {"data": {"col1": "val21a", "col2": "val22a"}, "stream": "stream1"}, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": { + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + "e.csv": "2023-06-08T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_e.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": { + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + "e.csv": "2023-06-08T03:54:07.000000Z", + }, + }, + ) + .build(), + ) + ) +).build() + +multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario = ( + TestScenarioBuilder() + .set_name("multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + "days_to_sync_if_history_is_full": 3, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-06T03:54:07.000000Z", + }, + "c.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11c", "val12c", "val13c"), + ("val21c", "val22c", "val23c"), + ], + "last_modified": "2023-06-07T03:54:07.000000Z", + }, + "d.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11d", "val12d", "val13d"), + ("val21d", "val22d", "val23d"), + ], + "last_modified": "2023-06-08T03:54:07.000000Z", + }, + } + ) + .set_file_type("csv") + .set_cursor_cls(LowHistoryLimitCursor) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + 
"type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "history": { + "a.csv": "2023-06-05T03:54:07.000000Z", + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_d.csv", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "history": { + "b.csv": "2023-06-06T03:54:07.000000Z", + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_d.csv", + }, + ] + ) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=StateBuilder() + .with_stream_state( + "stream1", + { + "history": { + "old_file.csv": "2023-06-05T00:00:00.000000Z", + "c.csv": "2023-06-07T03:54:07.000000Z", + "d.csv": "2023-06-08T03:54:07.000000Z", + }, + }, + ) + .build(), + ) + ) +).build() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/jsonl_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/jsonl_scenarios.py new file mode 100644 index 000000000000..23e879306e4c --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/jsonl_scenarios.py @@ -0,0 +1,941 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat +from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from unit_tests.sources.file_based.helpers import LowInferenceBytesJsonlParser, LowInferenceLimitDiscoveryPolicy +from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder +from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder + +single_jsonl_scenario = ( + TestScenarioBuilder() + .set_name("single_jsonl_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "jsonl"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.jsonl": { + "contents": [ + {"col1": "val11", "col2": "val12"}, + {"col1": "val21", "col2": "val22"}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("jsonl") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": { + "type": "string", + }, + "_ab_source_file_url": { + "type": "string", + }, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + ] + ) +).build() + + +multi_jsonl_with_different_keys_scenario = ( + TestScenarioBuilder() + .set_name("multi_jsonl_with_different_keys_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "jsonl"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.jsonl": { + "contents": [ + {"col1": "val11a", "col2": "val12a"}, + {"col1": "val21a", "col2": "val22a"}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.jsonl": { + "contents": [ + {"col1": "val11b", "col2": "val12b", "col3": "val13b"}, + {"col1": "val21b", "col3": "val23b"}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("jsonl") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "col3": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": { + "type": "string", + }, + "_ab_source_file_url": { + "type": "string", + }, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, 
+ "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.jsonl", + }, + "stream": "stream1", + }, + ] + ) +).build() + + +multi_jsonl_stream_n_file_exceeds_limit_for_inference = ( + TestScenarioBuilder() + .set_name("multi_jsonl_stream_n_file_exceeds_limit_for_inference") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "jsonl"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.jsonl": { + "contents": [ + {"col1": "val11a", "col2": "val12a"}, + {"col1": "val21a", "col2": "val22a"}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.jsonl": { + "contents": [ + {"col1": "val11b", "col2": "val12b", "col3": "val13b"}, + {"col1": "val21b", "col3": "val23b"}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("jsonl") + .set_discovery_policy(LowInferenceLimitDiscoveryPolicy()) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": { + "type": "string", + }, + "_ab_source_file_url": { + "type": "string", + }, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.jsonl", + }, + "stream": "stream1", + }, + ] + ) +).build() + + +multi_jsonl_stream_n_bytes_exceeds_limit_for_inference = ( + TestScenarioBuilder() + .set_name("multi_jsonl_stream_n_bytes_exceeds_limit_for_inference") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "jsonl"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.jsonl": { + "contents": [ + {"col1": "val11a", "col2": "val12a"}, + {"col1": "val21a", "col2": "val22a"}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.jsonl": { + "contents": [ + {"col1": "val11b", "col2": "val12b"}, + {"col1": "val21b", "col2": "val22b", "col3": "val23b"}, + ], + 
"last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("jsonl") + .set_parsers({JsonlFormat: LowInferenceBytesJsonlParser()}) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "col2": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": { + "type": "string", + }, + "_ab_source_file_url": { + "type": "string", + }, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.jsonl", + }, + "stream": "stream1", + }, + ] + ) +).build() + + +invalid_jsonl_scenario = ( + TestScenarioBuilder() + .set_name("invalid_jsonl_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "jsonl"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.jsonl": { + "contents": [ + {"col1": "val1"}, + "invalid", + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("jsonl") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": { + "type": "string", + }, + "_ab_source_file_url": { + "type": "string", + }, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records([]) + .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value) + .set_expected_read_error(AirbyteTracedException, "Please check the logged errors for more information.") + .set_expected_logs( + { + "read": [ + { + "level": "ERROR", + "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.jsonl line_no=2 n_skipped=0", + }, + ] + } + ) +).build() + + +jsonl_multi_stream_scenario = ( + TestScenarioBuilder() + .set_name("jsonl_multi_stream_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "jsonl"}, + "globs": ["*.jsonl"], + "validation_policy": "Emit Record", + }, + { + "name": "stream2", + "format": {"filetype": "jsonl"}, + "globs": ["b.jsonl"], + "validation_policy": "Emit Record", + }, + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.jsonl": { + "contents": [ + {"col1": 1, "col2": "record1"}, + {"col1": 2, "col2": "record2"}, + ], + "last_modified": 
"2023-06-05T03:54:07.000Z", + }, + "b.jsonl": { + "contents": [ + {"col3": 1.1}, + {"col3": 2.2}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("jsonl") + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "integer"]}, + "col2": { + "type": ["null", "string"], + }, + "col3": {"type": ["null", "number"]}, + "_ab_source_file_last_modified": { + "type": "string", + }, + "_ab_source_file_url": { + "type": "string", + }, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + }, + { + "json_schema": { + "type": "object", + "properties": { + "col3": {"type": ["null", "number"]}, + "_ab_source_file_last_modified": { + "type": "string", + }, + "_ab_source_file_url": { + "type": "string", + }, + }, + }, + "name": "stream2", + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": 1, + "col2": "record1", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "col1": 2, + "col2": "record2", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + { + "data": {"col3": 1.1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, + "stream": "stream1", + }, + { + "data": {"col3": 2.2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, + "stream": "stream1", + }, + { + "data": {"col3": 1.1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, + "stream": "stream2", + }, + { + "data": {"col3": 2.2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, + "stream": "stream2", + }, + ] + ) +).build() + + +schemaless_jsonl_scenario = ( + TestScenarioBuilder() + .set_name("schemaless_jsonl_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "jsonl"}, + "globs": ["*"], + "validation_policy": "Skip Record", + "schemaless": True, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.jsonl": { + "contents": [ + {"col1": 1, "col2": "record1"}, + {"col1": 2, "col2": "record2"}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.jsonl": { + "contents": [ + {"col1": 3, "col2": "record3", "col3": 1.1}, + {"col1": 4, "col2": "record4", "col3": 1.1}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("jsonl") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "data": {"type": "object"}, + "_ab_source_file_last_modified": { + "type": "string", + }, + "_ab_source_file_url": { + "type": "string", + }, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "data": {"col1": 1, "col2": "record1"}, + 
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "data": {"col1": 2, "col2": "record2"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "data": {"col1": 3, "col2": "record3", "col3": 1.1}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "data": {"col1": 4, "col2": "record4", "col3": 1.1}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.jsonl", + }, + "stream": "stream1", + }, + ] + ) +).build() + + +schemaless_jsonl_multi_stream_scenario = ( + TestScenarioBuilder() + .set_name("schemaless_jsonl_multi_stream_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "jsonl"}, + "globs": ["a.jsonl"], + "validation_policy": "Skip Record", + "schemaless": True, + }, + { + "name": "stream2", + "format": {"filetype": "jsonl"}, + "globs": ["b.jsonl"], + "validation_policy": "Skip Record", + }, + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.jsonl": { + "contents": [ + {"col1": 1, "col2": "record1"}, + {"col1": 2, "col2": "record2"}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.jsonl": { + "contents": [ + {"col3": 1.1}, + {"col3": 2.2}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("jsonl") + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "data": {"type": "object"}, + "_ab_source_file_last_modified": { + "type": "string", + }, + "_ab_source_file_url": { + "type": "string", + }, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + }, + { + "json_schema": { + "type": "object", + "properties": { + "col3": {"type": ["null", "number"]}, + "_ab_source_file_last_modified": { + "type": "string", + }, + "_ab_source_file_url": { + "type": "string", + }, + }, + }, + "name": "stream2", + "source_defined_cursor": True, + "default_cursor_field": ["_ab_source_file_last_modified"], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + ] + } + ) + .set_expected_records( + [ + { + "data": { + "data": {"col1": 1, "col2": "record1"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "data": {"col1": 2, "col2": "record2"}, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + { + "data": {"col3": 1.1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, + "stream": "stream2", + }, + { + "data": {"col3": 2.2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, + "stream": "stream2", + }, + ] + ) +).build() + +jsonl_user_input_schema_scenario = ( + TestScenarioBuilder() + .set_name("jsonl_user_input_schema_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "jsonl"}, + "globs": ["*"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "integer", "col2": 
"string"}', + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.jsonl": { + "contents": [ + {"col1": 1, "col2": "val12"}, + {"col1": 2, "col2": "val22"}, + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("jsonl") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": "integer"}, + "col2": { + "type": "string", + }, + "_ab_source_file_last_modified": { + "type": "string", + }, + "_ab_source_file_url": { + "type": "string", + }, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": 1, + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + { + "data": { + "col1": 2, + "col2": "val22", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.jsonl", + }, + "stream": "stream1", + }, + ] + ) +).build() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/parquet_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/parquet_scenarios.py new file mode 100644 index 000000000000..732660e564d8 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/parquet_scenarios.py @@ -0,0 +1,763 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import datetime +import decimal + +import pyarrow as pa +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from unit_tests.sources.file_based.in_memory_files_source import TemporaryParquetFilesStreamReader +from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder +from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder + +_single_parquet_file = { + "a.parquet": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } +} + +_single_partitioned_parquet_file = { + "path_prefix/partition1=1/partition2=2/a.parquet": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } +} + +_parquet_file_with_decimal = { + "a.parquet": { + "contents": [ + ("col1",), + (decimal.Decimal("13.00"),), + ], + "schema": pa.schema( + [ + pa.field("col1", pa.decimal128(5, 2)), + ] + ), + "last_modified": "2023-06-05T03:54:07.000Z", + } +} + +_multiple_parquet_file = { + "a.parquet": { + "contents": [ + ("col1", "col2"), + ("val11a", "val12a"), + ("val21a", "val22a"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.parquet": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, +} + +_parquet_file_with_various_types = { + "a.parquet": { + "contents": [ + ( + "col_bool", + "col_int8", + "col_int16", + "col_int32", + "col_uint8", + "col_uint16", + "col_uint32", + "col_uint64", + "col_float32", + "col_float64", + "col_string", + "col_date32", + "col_date64", + "col_timestamp_without_tz", + "col_timestamp_with_tz", + "col_time32s", + "col_time32ms", + "col_time64us", + "col_struct", + "col_list", + "col_duration", + "col_binary", + ), + ( + 
True, + -1, + 1, + 2, + 2, + 3, + 4, + 5, + 3.14, + 5.0, + "2020-01-01", + datetime.date(2021, 1, 1), + datetime.date(2022, 1, 1), + datetime.datetime(2023, 1, 1, 1, 2, 3), + datetime.datetime(2024, 3, 4, 5, 6, 7, tzinfo=datetime.timezone.utc), + datetime.time(1, 2, 3), + datetime.time(2, 3, 4), + datetime.time(1, 2, 3, 4), + {"struct_key": "struct_value"}, + [1, 2, 3, 4], + 12345, + b"binary string. Hello world!", + ), + ], + "schema": pa.schema( + [ + pa.field("col_bool", pa.bool_()), + pa.field("col_int8", pa.int8()), + pa.field("col_int16", pa.int16()), + pa.field("col_int32", pa.int32()), + pa.field("col_uint8", pa.uint8()), + pa.field("col_uint16", pa.uint16()), + pa.field("col_uint32", pa.uint32()), + pa.field("col_uint64", pa.uint64()), + pa.field("col_float32", pa.float32()), + pa.field("col_float64", pa.float64()), + pa.field("col_string", pa.string()), + pa.field("col_date32", pa.date32()), + pa.field("col_date64", pa.date64()), + pa.field("col_timestamp_without_tz", pa.timestamp("s")), + pa.field("col_timestamp_with_tz", pa.timestamp("s", tz="UTC")), + pa.field("col_time32s", pa.time32("s")), + pa.field("col_time32ms", pa.time32("ms")), + pa.field("col_time64us", pa.time64("us")), + pa.field("col_struct", pa.struct([pa.field("struct_key", pa.string())])), + pa.field("col_list", pa.list_(pa.int32())), + pa.field("col_duration", pa.duration("s")), + pa.field("col_binary", pa.binary()), + ] + ), + "last_modified": "2023-06-05T03:54:07.000Z", + } +} + +single_parquet_scenario = ( + TestScenarioBuilder() + .set_name("single_parquet_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "parquet"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryParquetFilesStreamReader(files=_single_parquet_file, file_type="parquet")) + .set_file_type("parquet") + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.parquet", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.parquet", + }, + "stream": "stream1", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +).build() + +single_partitioned_parquet_scenario = ( + TestScenarioBuilder() + .set_name("single_partitioned_parquet_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "parquet"}, + "globs": ["path_prefix/**/*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryParquetFilesStreamReader(files=_single_partitioned_parquet_file, file_type="parquet")) + .set_file_type("parquet") + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "partition1": "1", + "partition2": "2", + "_ab_source_file_last_modified": 
"2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "path_prefix/partition1=1/partition2=2/a.parquet", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "partition1": "1", + "partition2": "2", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "path_prefix/partition1=1/partition2=2/a.parquet", + }, + "stream": "stream1", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "partition1": {"type": ["null", "string"]}, + "partition2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +).build() + +multi_parquet_scenario = ( + TestScenarioBuilder() + .set_name("multi_parquet_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "parquet"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_file_type("parquet") + .set_stream_reader(TemporaryParquetFilesStreamReader(files=_multiple_parquet_file, file_type="parquet")) + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "col3": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": "val12a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.parquet", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21a", + "col2": "val22a", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.parquet", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.parquet", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.parquet", + }, + "stream": "stream1", + }, + ] + ) +).build() + +parquet_various_types_scenario = ( + TestScenarioBuilder() + .set_name("parquet_various_types") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "parquet"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryParquetFilesStreamReader(files=_parquet_file_with_various_types, file_type="parquet")) + .set_file_type("parquet") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col_bool": { + "type": ["null", 
"boolean"], + }, + "col_int8": { + "type": ["null", "integer"], + }, + "col_int16": { + "type": ["null", "integer"], + }, + "col_int32": { + "type": ["null", "integer"], + }, + "col_uint8": { + "type": ["null", "integer"], + }, + "col_uint16": { + "type": ["null", "integer"], + }, + "col_uint32": { + "type": ["null", "integer"], + }, + "col_uint64": { + "type": ["null", "integer"], + }, + "col_float32": { + "type": ["null", "number"], + }, + "col_float64": { + "type": ["null", "number"], + }, + "col_string": { + "type": ["null", "string"], + }, + "col_date32": {"type": ["null", "string"], "format": "date"}, + "col_date64": {"type": ["null", "string"], "format": "date"}, + "col_timestamp_without_tz": {"type": ["null", "string"], "format": "date-time"}, + "col_timestamp_with_tz": {"type": ["null", "string"], "format": "date-time"}, + "col_time32s": { + "type": ["null", "string"], + }, + "col_time32ms": { + "type": ["null", "string"], + }, + "col_time64us": { + "type": ["null", "string"], + }, + "col_struct": { + "type": ["null", "object"], + }, + "col_list": { + "type": ["null", "array"], + }, + "col_duration": { + "type": ["null", "integer"], + }, + "col_binary": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": { + "type": "string", + }, + "_ab_source_file_url": { + "type": "string", + }, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col_bool": True, + "col_int8": -1, + "col_int16": 1, + "col_int32": 2, + "col_uint8": 2, + "col_uint16": 3, + "col_uint32": 4, + "col_uint64": 5, + "col_float32": 3.14, + "col_float64": 5.0, + "col_string": "2020-01-01", + "col_date32": "2021-01-01", + "col_date64": "2022-01-01", + "col_timestamp_without_tz": "2023-01-01T01:02:03", + "col_timestamp_with_tz": "2024-03-04T05:06:07+00:00", + "col_time32s": "01:02:03", + "col_time32ms": "02:03:04", + "col_time64us": "01:02:03.000004", + "col_struct": {"struct_key": "struct_value"}, + "col_list": [1, 2, 3, 4], + "col_duration": 12345, + "col_binary": "binary string. 
Hello world!", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.parquet", + }, + "stream": "stream1", + }, + ] + ) +).build() + +parquet_file_with_decimal_no_config_scenario = ( + TestScenarioBuilder() + .set_name("parquet_file_with_decimal_no_config") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "parquet"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryParquetFilesStreamReader(files=_parquet_file_with_decimal, file_type="parquet")) + .set_file_type("parquet") + ) + .set_expected_records( + [ + { + "data": { + "col1": "13.00", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.parquet", + }, + "stream": "stream1", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +).build() + +parquet_file_with_decimal_as_string_scenario = ( + TestScenarioBuilder() + .set_name("parquet_file_with_decimal_as_string") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": {"filetype": "parquet", "decimal_as_float": False}, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryParquetFilesStreamReader(files=_parquet_file_with_decimal, file_type="parquet")) + .set_file_type("parquet") + ) + .set_expected_records( + [ + { + "data": { + "col1": "13.00", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.parquet", + }, + "stream": "stream1", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +).build() + +parquet_file_with_decimal_as_float_scenario = ( + TestScenarioBuilder() + .set_name("parquet_file_with_decimal_as_float") + .set_config( + { + "streams": [ + { + "name": "stream1", + "globs": ["*"], + "validation_policy": "Emit Record", + "format": {"filetype": "parquet", "decimal_as_float": True}, + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryParquetFilesStreamReader(files=_parquet_file_with_decimal, file_type="parquet")) + .set_file_type("parquet") + ) + .set_expected_records( + [ + { + "data": {"col1": 13.00, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.parquet"}, + "stream": "stream1", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "number"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": 
"string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +).build() + +parquet_file_with_decimal_legacy_config_scenario = ( + TestScenarioBuilder() + .set_name("parquet_file_with_decimal_legacy_config") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": { + "filetype": "parquet", + }, + "globs": ["*"], + "validation_policy": "emit_record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryParquetFilesStreamReader(files=_parquet_file_with_decimal, file_type="parquet")) + .set_file_type("parquet") + ) + .set_expected_records( + [ + { + "data": {"col1": 13.00, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.parquet"}, + "stream": "stream1", + }, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "number"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +).build() + +parquet_with_invalid_config_scenario = ( + TestScenarioBuilder() + .set_name("parquet_with_invalid_config") + .set_config({"streams": [{"name": "stream1", "globs": ["*"], "validation_policy": "Emit Record", "format": {"filetype": "csv"}}]}) + .set_source_builder( + FileBasedSourceBuilder() + .set_stream_reader(TemporaryParquetFilesStreamReader(files=_single_parquet_file, file_type="parquet")) + .set_file_type("parquet") + ) + .set_expected_records([]) + .set_expected_logs({"read": [{"level": "ERROR", "message": "Error parsing record"}]}) + .set_expected_discover_error(AirbyteTracedException, "Error inferring schema from files") + .set_expected_read_error(AirbyteTracedException, "Please check the logged errors for more information.") + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +).build() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/scenario_builder.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/scenario_builder.py new file mode 100644 index 000000000000..8158225ac8f4 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/scenario_builder.py @@ -0,0 +1,243 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# +from abc import ABC, abstractmethod +from copy import deepcopy +from dataclasses import dataclass, field +from typing import Any, Generic, List, Mapping, Optional, Set, Tuple, Type, TypeVar + +from airbyte_cdk.models import ( + AirbyteAnalyticsTraceMessage, + AirbyteStateMessageSerializer, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteCatalogSerializer, + SyncMode, +) +from airbyte_cdk.sources import AbstractSource +from airbyte_cdk.sources.source import TState + + +@dataclass +class IncrementalScenarioConfig: + input_state: List[Mapping[str, Any]] = field(default_factory=list) + expected_output_state: Optional[Mapping[str, Any]] = None + + +SourceType = TypeVar("SourceType", bound=AbstractSource) + + +class SourceBuilder(ABC, Generic[SourceType]): + """ + A builder that creates a source instance of type SourceType + """ + + @abstractmethod + def build( + self, configured_catalog: Optional[Mapping[str, Any]], config: Optional[Mapping[str, Any]], state: Optional[TState] + ) -> SourceType: + raise NotImplementedError() + + +class TestScenario(Generic[SourceType]): + def __init__( + self, + name: str, + config: Mapping[str, Any], + source: SourceType, + expected_spec: Optional[Mapping[str, Any]], + expected_check_status: Optional[str], + expected_catalog: Optional[Mapping[str, Any]], + expected_logs: Optional[Mapping[str, List[Mapping[str, Any]]]], + expected_records: List[Mapping[str, Any]], + expected_check_error: Tuple[Optional[Type[Exception]], Optional[str]], + expected_discover_error: Tuple[Optional[Type[Exception]], Optional[str]], + expected_read_error: Tuple[Optional[Type[Exception]], Optional[str]], + incremental_scenario_config: Optional[IncrementalScenarioConfig], + expected_analytics: Optional[List[AirbyteAnalyticsTraceMessage]] = None, + log_levels: Optional[Set[str]] = None, + catalog: Optional[ConfiguredAirbyteCatalog] = None, + ): + if log_levels is None: + log_levels = {"ERROR", "WARN", "WARNING"} + self.name = name + self.config = config + self.catalog = catalog + self.source = source + self.expected_spec = expected_spec + self.expected_check_status = expected_check_status + self.expected_catalog = expected_catalog + self.expected_logs = expected_logs + self.expected_records = expected_records + self.expected_check_error = expected_check_error + self.expected_discover_error = expected_discover_error + self.expected_read_error = expected_read_error + self.incremental_scenario_config = incremental_scenario_config + self.expected_analytics = expected_analytics + self.log_levels = log_levels + self.validate() + + def validate(self) -> None: + assert self.name + + def configured_catalog(self, sync_mode: SyncMode) -> Optional[Mapping[str, Any]]: + # The preferred way of returning the catalog for the TestScenario is by providing it at the initialization. 
The previous solution + # relied on `self.source.streams` which might raise an exception, hence skewing the test results, as the user might expect the + # exception to be raised as part of the actual check/discover/read commands + # Note that to avoid a breaking change, we still attempt to automatically generate the catalog based on the expected catalog's streams + if self.catalog: + return ConfiguredAirbyteCatalogSerializer.dump(self.catalog) + + catalog: Mapping[str, Any] = {"streams": []} + for stream in (self.expected_catalog or {}).get("streams", []): + catalog["streams"].append( + { + "stream": { + "name": stream["name"], + "json_schema": {}, + "supported_sync_modes": [sync_mode.value], + }, + "sync_mode": sync_mode.value, + "destination_sync_mode": "append", + } + ) + + return catalog + + def input_state(self) -> List[Mapping[str, Any]]: + if self.incremental_scenario_config: + return self.incremental_scenario_config.input_state + else: + return [] + + +class TestScenarioBuilder(Generic[SourceType]): + """ + A builder that creates a TestScenario instance for a source of type SourceType + """ + + def __init__(self) -> None: + self._name = "" + self._config: Mapping[str, Any] = {} + self._catalog: Optional[ConfiguredAirbyteCatalog] = None + self._expected_spec: Optional[Mapping[str, Any]] = None + self._expected_check_status: Optional[str] = None + self._expected_catalog: Mapping[str, Any] = {} + self._expected_logs: Optional[Mapping[str, Any]] = None + self._expected_records: List[Mapping[str, Any]] = [] + self._expected_check_error: Tuple[Optional[Type[Exception]], Optional[str]] = None, None + self._expected_discover_error: Tuple[Optional[Type[Exception]], Optional[str]] = None, None + self._expected_read_error: Tuple[Optional[Type[Exception]], Optional[str]] = None, None + self._incremental_scenario_config: Optional[IncrementalScenarioConfig] = None + self._expected_analytics: Optional[List[AirbyteAnalyticsTraceMessage]] = None + self.source_builder: Optional[SourceBuilder[SourceType]] = None + self._log_levels = None + + def set_name(self, name: str) -> "TestScenarioBuilder[SourceType]": + self._name = name + return self + + def set_config(self, config: Mapping[str, Any]) -> "TestScenarioBuilder[SourceType]": + self._config = config + return self + + def set_expected_spec(self, expected_spec: Mapping[str, Any]) -> "TestScenarioBuilder[SourceType]": + self._expected_spec = expected_spec + return self + + def set_catalog(self, catalog: ConfiguredAirbyteCatalog) -> "TestScenarioBuilder[SourceType]": + self._catalog = catalog + return self + + def set_expected_check_status(self, expected_check_status: str) -> "TestScenarioBuilder[SourceType]": + self._expected_check_status = expected_check_status + return self + + def set_expected_catalog(self, expected_catalog: Mapping[str, Any]) -> "TestScenarioBuilder[SourceType]": + self._expected_catalog = expected_catalog + return self + + def set_expected_logs(self, expected_logs: Mapping[str, List[Mapping[str, Any]]]) -> "TestScenarioBuilder[SourceType]": + self._expected_logs = expected_logs + return self + + def set_expected_records(self, expected_records: Optional[List[Mapping[str, Any]]]) -> "TestScenarioBuilder[SourceType]": + self._expected_records = expected_records + return self + + def set_incremental_scenario_config(self, incremental_scenario_config: IncrementalScenarioConfig) -> "TestScenarioBuilder[SourceType]": + self._incremental_scenario_config = incremental_scenario_config + return self + + def set_expected_check_error(self, error: Optional[Type[Exception]], message: str) -> 
"TestScenarioBuilder[SourceType]": + self._expected_check_error = error, message + return self + + def set_expected_discover_error(self, error: Type[Exception], message: str) -> "TestScenarioBuilder[SourceType]": + self._expected_discover_error = error, message + return self + + def set_expected_read_error(self, error: Type[Exception], message: str) -> "TestScenarioBuilder[SourceType]": + self._expected_read_error = error, message + return self + + def set_log_levels(self, levels: Set[str]) -> "TestScenarioBuilder": + self._log_levels = levels + return self + + def set_source_builder(self, source_builder: SourceBuilder[SourceType]) -> "TestScenarioBuilder[SourceType]": + self.source_builder = source_builder + return self + + def set_expected_analytics(self, expected_analytics: Optional[List[AirbyteAnalyticsTraceMessage]]) -> "TestScenarioBuilder[SourceType]": + self._expected_analytics = expected_analytics + return self + + def copy(self) -> "TestScenarioBuilder[SourceType]": + return deepcopy(self) + + def build(self) -> "TestScenario[SourceType]": + if self.source_builder is None: + raise ValueError("source_builder is not set") + if self._incremental_scenario_config and self._incremental_scenario_config.input_state: + state = [ + AirbyteStateMessageSerializer.load(s) if isinstance(s, dict) else s for s in self._incremental_scenario_config.input_state + ] + else: + state = None + source = self.source_builder.build( + self._configured_catalog(SyncMode.incremental if self._incremental_scenario_config else SyncMode.full_refresh), + self._config, + state, + ) + return TestScenario( + self._name, + self._config, + source, + self._expected_spec, + self._expected_check_status, + self._expected_catalog, + self._expected_logs, + self._expected_records, + self._expected_check_error, + self._expected_discover_error, + self._expected_read_error, + self._incremental_scenario_config, + self._expected_analytics, + self._log_levels, + self._catalog, + ) + + def _configured_catalog(self, sync_mode: SyncMode) -> Optional[Mapping[str, Any]]: + if not self._expected_catalog: + return None + catalog: Mapping[str, Any] = {"streams": []} + for stream in self._expected_catalog["streams"]: + catalog["streams"].append( + { + "stream": stream, + "sync_mode": sync_mode.value, + "destination_sync_mode": "append", + } + ) + + return catalog diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py new file mode 100644 index 000000000000..97c0c491510a --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py @@ -0,0 +1,602 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import base64 + +import nltk +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder +from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder + +# import nltk data for pdf parser +nltk.download("punkt") +nltk.download("averaged_perceptron_tagger") + +json_schema = { + "type": "object", + "properties": { + "content": { + "type": ["null", "string"], + "description": "Content of the file as markdown. Might be null if the file could not be parsed", + }, + "document_key": {"type": ["null", "string"], "description": "Unique identifier of the document, e.g. 
the file path"}, + "_ab_source_file_parse_error": { + "type": ["null", "string"], + "description": "Error message if the file could not be parsed even though the file is supported", + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, +} + +simple_markdown_scenario = ( + TestScenarioBuilder() + .set_name("simple_markdown_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "unstructured"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.md": { + "contents": bytes( + "# Title 1\n\n## Title 2\n\n### Title 3\n\n#### Title 4\n\n##### Title 5\n\n###### Title 6\n\n", "UTF-8" + ), + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.md": { + "contents": bytes("Just some text", "UTF-8"), + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "c": { + "contents": bytes("Detected via mime type", "UTF-8"), + "last_modified": "2023-06-05T03:54:07.000Z", + "mime_type": "text/markdown", + }, + } + ) + .set_file_type("unstructured") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": json_schema, + "name": "stream1", + "source_defined_cursor": True, + "source_defined_primary_key": [["document_key"]], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "document_key": "a.md", + "content": "# Title 1\n\n## Title 2\n\n### Title 3\n\n#### Title 4\n\n##### Title 5\n\n###### Title 6\n\n", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.md", + }, + "stream": "stream1", + }, + { + "data": { + "document_key": "b.md", + "content": "Just some text", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.md", + }, + "stream": "stream1", + }, + { + "data": { + "document_key": "c", + "content": "Detected via mime type", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "c", + }, + "stream": "stream1", + }, + ] + ) +).build() + +simple_txt_scenario = ( + TestScenarioBuilder() + .set_name("simple_txt_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "unstructured"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.txt": { + "contents": bytes("Just some raw text", "UTF-8"), + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b": { + "contents": bytes("Detected via mime type", "UTF-8"), + "last_modified": "2023-06-05T03:54:07.000Z", + "mime_type": "text/plain", + }, + } + ) + .set_file_type("unstructured") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": json_schema, + "name": "stream1", + "source_defined_cursor": True, + "source_defined_primary_key": [["document_key"]], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "document_key": "a.txt", + "content": "Just some raw text", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.txt", + }, + "stream": "stream1", + }, + { + "data": { + "document_key": "b", + "content": "Detected via mime type", + "_ab_source_file_last_modified": 
"2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b", + }, + "stream": "stream1", + }, + ] + ) +).build() + +# If skip unprocessable file types is set to false, then discover will fail if it encounters a non-matching file type +unstructured_invalid_file_type_discover_scenario_no_skip = ( + TestScenarioBuilder() + .set_name("unstructured_invalid_file_type_discover_scenario_no_skip") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "unstructured", "skip_unprocessable_files": False}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": bytes("Just a humble text file", "UTF-8"), + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("unstructured") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": json_schema, + "name": "stream1", + "source_defined_cursor": True, + "source_defined_primary_key": [["document_key"]], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records([]) + .set_expected_discover_error(AirbyteTracedException, "Error inferring schema from files") + .set_expected_read_error( + AirbyteTracedException, + "Please check the logged errors for more information.", + ) +).build() + +# If skip unprocessable file types is set to true, then discover will succeed even if there are non-matching file types +unstructured_invalid_file_type_discover_scenario_skip = ( + TestScenarioBuilder() + .set_name("unstructured_invalid_file_type_discover_scenario_skip") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "unstructured", "skip_unprocessable_files": True}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": bytes("Just a humble text file", "UTF-8"), + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("unstructured") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": json_schema, + "name": "stream1", + "source_defined_cursor": True, + "source_defined_primary_key": [["document_key"]], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "document_key": "a.csv", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=a.csv message=File type FileType.CSV is not supported. Supported file types are FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT", + }, + "stream": "stream1", + } + ] + ) +).build() + +# TODO When working on https://github.com/airbytehq/airbyte/issues/31605, this test should be split into two tests: +# 1. Test that the file is skipped if skip_unprocessable_files is set to true +# 2. 
Test that the sync fails if skip_unprocessable_files is set to false +unstructured_invalid_file_type_read_scenario = ( + TestScenarioBuilder() + .set_name("unstructured_invalid_file_type_read_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "unstructured", "skip_unprocessable_files": False}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.md": { + "contents": bytes("A harmless markdown file", "UTF-8"), + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.csv": { + "contents": bytes("An evil text file", "UTF-8"), + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("unstructured") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": json_schema, + "name": "stream1", + "source_defined_cursor": True, + "source_defined_primary_key": [["document_key"]], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "document_key": "a.md", + "content": "A harmless markdown file", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.md", + }, + "stream": "stream1", + }, + ] + ) +).build() + +pdf_file = base64.b64decode( + "JVBERi0xLjEKJcKlwrHDqwoKMSAwIG9iagogIDw8IC9UeXBlIC9DYXRhbG9nCiAgICAgL1BhZ2VzIDIgMCBSCiAgPj4KZW5kb2JqCgoyIDAgb2JqCiAgPDwgL1R5cGUgL1BhZ2VzCiAgICAgL0tpZHMgWzMgMCBSXQogICAgIC9Db3VudCAxCiAgICAgL01lZGlhQm94IFswIDAgMzAwIDE0NF0KICA+PgplbmRvYmoKCjMgMCBvYmoKICA8PCAgL1R5cGUgL1BhZ2UKICAgICAgL1BhcmVudCAyIDAgUgogICAgICAvUmVzb3VyY2VzCiAgICAgICA8PCAvRm9udAogICAgICAgICAgIDw8IC9GMQogICAgICAgICAgICAgICA8PCAvVHlwZSAvRm9udAogICAgICAgICAgICAgICAgICAvU3VidHlwZSAvVHlwZTEKICAgICAgICAgICAgICAgICAgL0Jhc2VGb250IC9UaW1lcy1Sb21hbgogICAgICAgICAgICAgICA+PgogICAgICAgICAgID4+CiAgICAgICA+PgogICAgICAvQ29udGVudHMgNCAwIFIKICA+PgplbmRvYmoKCjQgMCBvYmoKICA8PCAvTGVuZ3RoIDU1ID4+CnN0cmVhbQogIEJUCiAgICAvRjEgMTggVGYKICAgIDAgMCBUZAogICAgKEhlbGxvIFdvcmxkKSBUagogIEVUCmVuZHN0cmVhbQplbmRvYmoKCnhyZWYKMCA1CjAwMDAwMDAwMDAgNjU1MzUgZiAKMDAwMDAwMDAxOCAwMDAwMCBuIAowMDAwMDAwMDc3IDAwMDAwIG4gCjAwMDAwMDAxNzggMDAwMDAgbiAKMDAwMDAwMDQ1NyAwMDAwMCBuIAp0cmFpbGVyCiAgPDwgIC9Sb290IDEgMCBSCiAgICAgIC9TaXplIDUKICA+PgpzdGFydHhyZWYKNTY1CiUlRU9GCg==" +) + +docx_file = base64.b64decode( + 
"UEsDBBQACAgIAEkqVFcAAAAAAAAAAAAAAAASAAAAd29yZC9udW1iZXJpbmcueG1spZNNTsMwEIVPwB0i79skFSAUNe2CCjbsgAO4jpNYtT3W2Eno7XGbv1IklIZV5Izf98bj5/X2S8mg5mgF6JTEy4gEXDPIhC5S8vnxsngigXVUZ1SC5ik5cku2m7t1k+hK7Tn6fYFHaJsolpLSOZOEoWUlV9QuwXDtizmgos4vsQgVxUNlFgyUoU7shRTuGK6i6JF0GEhJhTrpEAslGIKF3J0kCeS5YLz79Aqc4ttKdsAqxbU7O4bIpe8BtC2FsT1NzaX5YtlD6r8OUSvZ72vMFLcMaePnrGRr1ABmBoFxa/3fXVsciHE0YYAnxKCY0sJPz74TRYUeMKd0XIEG76X37oZ2Ro0HGWdh5ZRG2tKb2CPF4+8u6Ix5XuqNmJTiK4JXuQqHQM5BsJKi6wFyDkECO/DsmeqaDmHOiklxviJlghZI1RhSe9PNxtFVXN5LavhIK/5He0WozBj3+zm0ixcYP9wGWPWAcPMNUEsHCEkTQ39oAQAAPQUAAFBLAwQUAAgICABJKlRXAAAAAAAAAAAAAAAAEQAAAHdvcmQvc2V0dGluZ3MueG1spZVLbtswEIZP0DsY3Nt6xHYLIXKAJGi7aFZODzAmKYkwXyApq759qQcl2wEKxV2J/IfzzXA0Gj0+/RF8caLGMiVzlKxitKASK8JkmaPf79+X39DCOpAEuJI0R2dq0dPuy2OTWeqcP2UXniBtJnCOKud0FkUWV1SAXSlNpTcWyghwfmvKSIA51nqJldDg2IFx5s5RGsdbNGBUjmojswGxFAwbZVXhWpdMFQXDdHgEDzMnbu/yqnAtqHRdxMhQ7nNQ0lZM20AT99K8sQqQ078ucRI8nGv0nGjEQOMLLXgfqFGGaKMwtdarr71xJCbxjAK2iNFjTgrXMUMmApgcMW1z3IDG2Csfeyhah5ouMtXC8jmJ9KZf7GDAnD9mAXfU89Jfs1ldfEPwXq42Y0Peg8AVGBcA/B4CV/hIyQvIE4zNTMpZ7XxDIgxKA2JqUvupN5vEN+2yr0DTiVb+H+2HUbWe2n19D+3iC0w2nwOkAbDzI5AwqzmcnwEfS5+WJN1VF012At/NCYq6Q7SAmrt3OOyd0sH4NY17cz8Kp9W+H6sjZIP8UoLwn9fV1HxThLam2rD5N2hDRlcxudm3TvQNtO7DHsokR5yVlUtavvM74qd2tzmU6WBLO1va27oNYOyHoT89LCYtDdrFuYegPUzaOmjrSdsEbTNp26BtW606a2o4k0dfhrBs9UJxrhpKfk72D9JQj/Ar2/0FUEsHCAbSYFUWAgAADwcAAFBLAwQUAAgICABJKlRXAAAAAAAAAAAAAAAAEgAAAHdvcmQvZm9udFRhYmxlLnhtbKWUTU7DMBCFT8AdIu/bpAgQippUCAQbdsABBsdJrNoea+w09Pa4ND9QJJSGVZSM3/fG4xevNx9aRTtBTqLJ2GqZsEgYjoU0VcbeXh8XtyxyHkwBCo3I2F44tskv1m1aovEuCnLjUs0zVntv0zh2vBYa3BKtMKFYImnw4ZWqWANtG7vgqC14+S6V9Pv4MkluWIfBjDVk0g6x0JITOiz9QZJiWUouukevoCm+R8kD8kYL478cYxIq9IDG1dK6nqbn0kKx7iG7vzax06pf19opbgVBG85Cq6NRi1RYQi6cC18fjsWBuEomDPCAGBRTWvjp2XeiQZoBc0jGCWjwXgbvbmhfqHEj4yycmtLIsfQs3wlo/7sLmDHP73orJ6X4hBBUvqEhkHMQvAbyPUDNISjkW1Hcg9nBEOaimhTnE1IhoSLQY0jdWSe7Sk7i8lKDFSOt+h/tibCxY9yv5tC+/YGr6/MAlz0g7+6/qE0N6BD+O5KgWJyv4+5izD8BUEsHCK2HbQB5AQAAWgUAAFBLAwQUAAgICABJKlRXAAAAAAAAAAAAAAAADwAAAHdvcmQvc3R5bGVzLnhtbN2X7W7aMBSGr2D3gPK/TUgCQ6hp1Q91m1R11dpdwCExxMKxLduBsqufnS8gCVUakNYOfgQf+7zn+PFxbC6uXhMyWCEhMaOBNTx3rAGiIYswXQTW75f7s4k1kApoBIRRFFgbJK2ryy8X66lUG4LkQPtTOU3CwIqV4lPblmGMEpDnjCOqO+dMJKB0UyzsBMQy5WchSzgoPMMEq43tOs7YKmRYYKWCTguJswSHgkk2V8ZlyuZzHKLiUXqILnFzlzsWpgmiKotoC0R0DozKGHNZqiV91XRnXIqs3prEKiHluDXvEi0SsNaLkZA80JqJiAsWIim19S7vrBSHTgeARqLy6JLCfswykwQwrWRMadSEqtjnOnYBLZPaTmTLQpIuieRdD3gmQGyaWUAPnrv+HHeq4pqC9lKpqAqyj0QYg1ClAOmjQFi4RNEt0BVUxRwtOpVzTSnCsBCQbItUvmtlh06tXJ5j4GirtjhO7ZtgKd+Wu99HbWcHDkfvE3BLgUv9AoxYeIfmkBIlTVM8iaJZtLLHPaNKDtZTkCHGgXUtMOjw62kodxoIpLqWGHZM8TWV1XjbSMk/2rwCvVFct7TcyrqNAF2UNkSNzS6Ssesp8nor0+QQ4kyCYLOp3a9jq2j8Sok2QKpYIcsL2V0hu8ElOye0hNpw7c5BmPrisVHNun5EgfVo6jGbd5R76qMoY0whQeV0aD4oj525NuUVzAjak34xlk762cjBY4co7ZP4jsAcm03hOO8YDPMlmoFE0U9a9m4Dai/0qtrsxeIsEeKPO0MKQWN+0Aska3YOC3QjECxvkN7wVTpOUT3VSsNcIX2ODl3HzGeWDQ4s33HeXvmqyLeV6TvNysxtO1XYB6p7EKr7qaB6465QZ3XlCrLXsv1z25GQvYOQvY8NebLP2O3LOGSEiapuPfNtvHsnLe/eyQng+wfh+58JvjvpCn8P9jj7NGD7LbD9E8AeHYQ9+lSw/VPCPnirOBL2+CDs8f8JG9fC/hP4L1jpm1DjjpNZPzT18R71999BRi0oR0ehfE5nqpVm1fGhgXpuL6In/OuCayl22BBey03SO3CTLH/Jy79QSwcI2niuUysDAADPEgAAUEsDBBQACAgIAEkqVFcAAAAAAAAAAAAAAAARAAAAd29yZC9kb2N1bWVudC54bWyllV1u2zAMx0+wOwR6bx0H6VYYTfrQoMOAbQja7QCKJNtCJVGg5GTZ6Ud/t2lRuJlfZIrij3/JNHVz+8ea2V5h0OBWLL2cs5lyAqR2xYr9/nV/cc1mIXInuQGnVuyoArtdf7o5ZBJEZZWLMyK4kFmxYmWMPkuSIEplebgErxw5c0DLI5lYJJbjU+UvBFjPo95po+MxWcznn1mHgRWr0GUd4sJqgRAgj3VIBnmuheqGPgKn5G1DNp3kJmOCypAGcKHUPvQ0ey6NnGUP2b+3ib01/bqDn5JNIj/Q57CmTXQAlB5BqBBodtM6B2I6n3CANWKImCLhZc5eieXaDZi6OE5AQ+5Lyt0dWoMaNzKeRTBThLSu73qHHI+vVfAzzv
N5vNeTqviEQFGxwqEgz0GIkmPsAeYcggHxpOQdd3s+FLMsJpXzCUlqXiC3Y5GGD33ZdH5SLo8l92qkFf9H+4pQ+bHcl+fQnv2B6dXHAIsesKYWuOPiqSA9Ts4OmQAD1Ivum4cljR/ksR49uanDyocVm3cP66Y2yrye3L6eetionFcmvuHZ4ovJdJl5jvybHGbTRqzfYj3gFklbMtrvCXlD8Mt0HbEZoqEle15jWJui8zRXRBY8F9QjPKqgcK/Y+g5cpPZZL4zt8lZXHRKUiG2wLx7/Ereky+nqetnIoJaVLhbtO6AmBmEBI3Id24P3xQ9eb2wHMQL9A+myXR3Bj4ZReRwt1EX5zCwVl4p2+mXRmDlA7M0uw8/K/jp6RU66H7EO7Xbda0/6AkjGy3L9D1BLBwi8KP69SwIAAHEHAABQSwMEFAAICAgASSpUVwAAAAAAAAAAAAAAABwAAAB3b3JkL19yZWxzL2RvY3VtZW50LnhtbC5yZWxzrZJNasMwEIVP0DuI2dey0x9KiZxNCGRb3AMo8viHWiMhTUp9+4qUJA4E04WX74l5882M1psfO4hvDLF3pKDIchBIxtU9tQo+q93jG4jImmo9OEIFI0bYlA/rDxw0p5rY9T6KFEJRQcfs36WMpkOrY+Y8UnppXLCakwyt9Np86RblKs9fZZhmQHmTKfa1grCvCxDV6PE/2a5peoNbZ44Wie+0kJxqMQXq0CIrOMk/s8hSGMj7DKslGSIyp+XGK8bZmUN4WhKhccSVPgyTVVysOYjnJSHoaA8Y0txXiIs1B/Gy6DF4HHB6ipM+t5c3n7z8BVBLBwiQAKvr8QAAACwDAABQSwMEFAAICAgASSpUVwAAAAAAAAAAAAAAAAsAAABfcmVscy8ucmVsc43POw7CMAwG4BNwh8g7TcuAEGrSBSF1ReUAUeKmEc1DSXj09mRgAMTAaPv3Z7ntHnYmN4zJeMegqWog6KRXxmkG5+G43gFJWTglZu+QwYIJOr5qTziLXHbSZEIiBXGJwZRz2FOa5IRWpMoHdGUy+mhFLmXUNAh5ERrppq63NL4bwD9M0isGsVcNkGEJ+I/tx9FIPHh5tejyjxNfiSKLqDEzuPuoqHq1q8IC5S39eJE/AVBLBwgtaM8isQAAACoBAABQSwMEFAAICAgASSpUVwAAAAAAAAAAAAAAABUAAAB3b3JkL3RoZW1lL3RoZW1lMS54bWztWUtv2zYcvw/YdyB0b2XZVuoEdYrYsdutTRskboceaYmW2FCiQNJJfBva44ABw7phhxXYbYdhW4EW2KX7NNk6bB3Qr7C/HpYpm86jTbcOrQ82Sf3+7wdJ+fKVw4ihfSIk5XHbci7WLERij/s0DtrW7UH/QstCUuHYx4zHpG1NiLSurH/4wWW8pkISEQT0sVzDbStUKlmzbenBMpYXeUJieDbiIsIKpiKwfYEPgG/E7HqttmJHmMYWinEEbG+NRtQjaJCytNanzHsMvmIl0wWPiV0vk6hTZFh/z0l/5ER2mUD7mLUtkOPzgwE5VBZiWCp40LZq2cey1y/bJRFTS2g1un72KegKAn+vntGJYFgSOv3m6qXNkn8957+I6/V63Z5T8ssA2PPAUmcB2+y3nM6UpwbKh4u8uzW31qziNf6NBfxqp9NxVyv4xgzfXMC3aivNjXoF35zh3UX9Oxvd7koF787wKwv4/qXVlWYVn4FCRuO9BXQazzIyJWTE2TUjvAXw1jQBZihby66cPlbLci3C97joAyALLlY0RmqSkBH2ANfFjA4FTQXgNYK1J/mSJxeWUllIeoImqm19nGCoiBnk5bMfXz57go7uPz26/8vRgwdH9382UF3DcaBTvfj+i78ffYr+evLdi4dfmfFSx//+02e//fqlGah04POvH//x9PHzbz7/84eHBviGwEMdPqARkegmOUA7PALDDALIUJyNYhBiqlNsxIHEMU5pDOieCivomxPMsAHXIVUP3hHQAkzAq+N7FYV3QzFW1AC8HkYV4BbnrMOF0abrqSzdC+M4MAsXYx23g/G+SXZ3Lr69cQK5TE0suyGpqLnNIOQ4IDFRKH3G9wgxkN2ltOLXLeoJLvlIobsUdTA1umRAh8pMdI1GEJeJSUGId8U3W3dQhzMT+02yX0VCVWBmYklYxY1X8VjhyKgxjpiOvIFVaFJydyK8isOlgkgHhHHU84mUJppbYlJR9zq0DnPYt9gkqiKFonsm5A3MuY7c5HvdEEeJUWcahzr2I7kHKYrRNldGJXi1QtI5xAHHS8N9hxJ1ttq+TYPQnCDpk7EwlQTh1XqcsBEmcdHhK706ovFxjTuCvo3Pu3FDq3z+7aP/UcveACeYama+US/DzbfnLhc+ffu78yYex9sECuJ9c37fnN/F5rysns+/Jc+6sK0ftDM20dJT94gytqsmjNyQWf+WYJ7fh8VskhGVh/wkhGEhroILBM7GSHD1CVXhbogTEONkEgJZsA4kSriEq4W1lHd2P6Vgc7bmTi+VgMZqi/v5ckO/bJZsslkgdUGNlMFphTUuvZ4wJweeUprjmqW5x0qzNW9C3SCcvkpwVuq5aEgUzIif+j1nMA3LGwyRU9NiFGKfGJY1+5zGG/GmeyYlzsfJtQUn24vVxOLqDB20rVW37lrIw0nbGsFpCYZRAvxk2mkwC+K25ancwJNrcc7iVXNWOTV3mcEVEYmQahPLMKfKHk1fpcQz/etuM/XD+RhgaCan06LRcv5DLez50JLRiHhqycpsWjzjY0XEbugfoCEbix0Mejfz7PKphE5fn04E5HazSLxq4Ra1Mf/KpqgZzJIQF9ne0mKfw7NxqUM209Szl+j+iqY0ztEU9901Jc1cOJ82/OzSBLu4wCjN0bbFhQo5dKEkpF5fwL6fyQK9EJRFqhJi6QvoVFeyP+tbOY+8yQWh2qEBEhQ6nQoFIduqsPMEZk5d3x6njIo+U6ork/x3SPYJG6TVu5Lab6Fw2k0KR2S4+aDZpuoaBv23+ODSfKWNZyaoeZbNr6k1fW0rWH09FU6zAWvi6maL6+7SnWd+q03gloHSL2jcVHhsdjwd8B2IPir3eQSJeKFVlF+5OASdW5pxKat/6xTUWhLv8zw7as5uLHH28eJe3dmuwdfu8a62F0vU1u4h2Wzhjyg+vAeyN+F6M2b5ikxglg+2RWbwkPuTYshk3hJyR0xbOot3yAhR/3Aa1jmPFv/0lJv5Ti4gtb0kbJxMWOBnm0hJXD+ZuKSY3vFK4uwWZ2LAZpJzfB7lskWWnmLx67jsFMqbXWbM3tO67BSBegWXqcPjXVZ4yjYlHjlUAnenf11B/tqzlF3/B1BLBwghWqKELAYAANsdAABQSwMEFAAICAgASSpUVwAAAAAAAAAAAAAAABMAAABbQ29udGVudF9UeXBlc10ueG1stZNNbsIwEIVP0DtE3lbE0EVVVQQW/Vm2XdADDM4ErPpPnoHC7TsJkAUCqZWajWX7zbz3eSRP5zvvii1msjFUalKOVYHBxNqGVaU+F6+jB1UQQ6jBxYCV2iOp+exmutgnpEKaA1Vqz
ZwetSazRg9UxoRBlCZmDyzHvNIJzBesUN+Nx/faxMAYeMSth5pNn7GBjePi6XDfWlcKUnLWAAuXFjNVvOxEPGC2Z/2Lvm2oz2BGR5Ayo+tqaG0T3Z4HiEptwrtMJtsa/xQRm8YarKPZeGkpv2OuU44GiWSo3pWEzLI7pn5A5jfwYqvbSn1Sy+Mjh0HgvcNrAJ02aHwjXgtYOrxM0MuDQoSNX2KW/WWIXh4Uolc82HAZpC/5Rw6Wj3pl+J10WCenSN399tkPUEsHCDOvD7csAQAALQQAAFBLAQIUABQACAgIAEkqVFdJE0N/aAEAAD0FAAASAAAAAAAAAAAAAAAAAAAAAAB3b3JkL251bWJlcmluZy54bWxQSwECFAAUAAgICABJKlRXBtJgVRYCAAAPBwAAEQAAAAAAAAAAAAAAAACoAQAAd29yZC9zZXR0aW5ncy54bWxQSwECFAAUAAgICABJKlRXrYdtAHkBAABaBQAAEgAAAAAAAAAAAAAAAAD9AwAAd29yZC9mb250VGFibGUueG1sUEsBAhQAFAAICAgASSpUV9p4rlMrAwAAzxIAAA8AAAAAAAAAAAAAAAAAtgUAAHdvcmQvc3R5bGVzLnhtbFBLAQIUABQACAgIAEkqVFe8KP69SwIAAHEHAAARAAAAAAAAAAAAAAAAAB4JAAB3b3JkL2RvY3VtZW50LnhtbFBLAQIUABQACAgIAEkqVFeQAKvr8QAAACwDAAAcAAAAAAAAAAAAAAAAAKgLAAB3b3JkL19yZWxzL2RvY3VtZW50LnhtbC5yZWxzUEsBAhQAFAAICAgASSpUVy1ozyKxAAAAKgEAAAsAAAAAAAAAAAAAAAAA4wwAAF9yZWxzLy5yZWxzUEsBAhQAFAAICAgASSpUVyFaooQsBgAA2x0AABUAAAAAAAAAAAAAAAAAzQ0AAHdvcmQvdGhlbWUvdGhlbWUxLnhtbFBLAQIUABQACAgIAEkqVFczrw+3LAEAAC0EAAATAAAAAAAAAAAAAAAAADwUAABbQ29udGVudF9UeXBlc10ueG1sUEsFBgAAAAAJAAkAQgIAAKkVAAAAAA==" +) + +pptx_file = base64.b64decode( + "UEsDBBQAAAAIAHFwW1fGr8RntAEAALoMAAATAAAAW0NvbnRlbnRfVHlwZXNdLnhtbM2XyU7DMBCG7zxFlEsOqHHZFzXlwHJiqQQ8gEmmrcGxLc+00Ldnki6q2FKWCl8S2TPz/58nUTTpnLyUOhqDR2VNlmyl7SQCk9tCmUGW3N9dtA6TCEmaQmprIEsmgMlJd6NzN3GAERcbzOIhkTsWAvMhlBJT68BwpG99KYmXfiCczJ/kAMR2u70vcmsIDLWo0oi7nTPoy5Gm6PyFt2uQ+EGZODqd5lVWWSyd0yqXxGExNsUbk5bt91UOhc1HJZekzgPyvU4vNS8VS/lbIOKDYSw+NH10MHjjqsqKug58XONB4/dIZ61IubLOwaFyuMkJnzhUkc8NZnU3/Ai9KiDqSU/XsuQswc3oeetQcH76tUpzQ6ECKqBoOZYETwoWzF9659bD983nPaqqV3R0jkT11GvbXx/33fszE16FYF63DoiFdimVaYJBzZuXcmJHhMuLrb8mW9L+MVM7RKgQO7UdINNOgEy7ATLtBci0HyDTQYBMhwEyHf0305VEnqtwebGeb+ZUeyWmGc16OJoISD5ouKWJhj8fQpakGyl4EIfp9fdtqGWaHMcKntcyei2E5wSi/vXovgJQSwMEFAAAAAgAcXBbV/ENN+wAAQAA4QIAAAsAAABfcmVscy8ucmVsc62Sz04DIRCH7z4F2QunLttqjDFlezEmvRlTH2CE6S51gQlMTfv2ool/arZNDz3C/PjmG2C+2PlBvGPKLgYtp3UjBQYTrQudli+rx8mdFJkhWBhiQC33mOWivZo/4wBczuTeURYFErKuema6VyqbHj3kOhKGUlnH5IHLMnWKwLxBh2rWNLcq/WVU7QFTLK2u0tJOK7HaE57Djuu1M/gQzdZj4JEW/xKFDKlD1hURK0qYy+ZXui7kSo0Lzc4XOj6s8shggUFxv/WvAdzwa2OjeUqxhH5q9YawOyZ0fVkhExNOqPTHxA7ziNZn4tQN3VzyyXDHGCza00pA9G2kDn5m+wFQSwMEFAAAAAgAcXBbVwV3nA87AgAAtAwAABQAAABwcHQvcHJlc2VudGF0aW9uLnhtbO2X327aMBTG7/cUlm+4mGj+EJI0wlRaJ6RJnYQKfQDXOUBUx4lsh0GffnZwSGCa1AfIne1zvu+c/GxZzuLpVHJ0BKmKSpBJ8OBPEAhW5YXYk8nbdjVNJ0hpKnLKKwFkcgY1eVp+W9RZLUGB0FQbJTIuQmWU4IPWdeZ5ih2gpOqhqkGY2K6SJdVmKvdeLukf415yL/T92CtpIbDTy6/oq92uYPCzYk1pyl9MJPC2D3UoatW51V9xG37FbUuKHmHTvCvQq0poRXCAEW109VyVVqTWBdONGRDs46XhoXj+myoN8lf+ovTdCipygsMgSqJ0FkcpRjKzKyYSYG+58P4jvx1fTObxQJ306mHu5hOxE8GPQRT5vo8ROxMcp/O0nehzDQQrJgFEdJpZhzoTlQblZNdMK+s82qwcdrThegsnvdFnDssFtWvrtXSj17VEnJqzg0FM3zZtd8MUfuRBbXJKKl8sOET5XhDMMTI5W/q++SQ4miehrS41b1OAvogf8qPdALvNwk1N6GBKmbO0bgTTNj7oQhmnILU+HyBNicB62riqeJGvCs7biT0Z8MwlOlJTTZ8C1/JNVlu15bajzLD7Xoop1zaTZkDvAkAvAabuAkz1OF4tDu/Kw6EJezQdhJFP2POZ9Xwux3Lkc4Hi+EQ9n2CWBPEIqKPiAM0HgNIwTUdAHRUHKO4BhWEa+yOgjooDlAwAJdFsvKOvVBygtAdk6YyX9JWKA/Q4ABTPk/GSvlJpX7L/PjG923+N5V9QSwMEFAAAAAgAcXBbV1KcUMkcAQAAcQQAAB8AAABwcHQvX3JlbHMvcHJlc2VudGF0aW9uLnhtbC5yZWxzrZTBTsMwDIbvPEWUS0407YCB0NJdENIOSIiNB8hat41IkygOg709EUxbW20Vhx792/79yYqzWH63muzAo7JGsCxJGQFT2FKZWrD3zfP1AyMYpCmltgYE2wOyZX61eAMtQ+zBRjkk0cSgoE0I7pFzLBpoJSbWgYmZyvpWhhj6mjtZfMga+CxN59x3PWje8ySrUlC/KjNKNnsH//G2VaUKeLLFZwsmnBnBUasSXiQG8NFW+hqCoB2xV5El0Z/y81izKbGcVyYOXEMIce14QhskhoVZslXmEuHNtISAr966HttBGlvT7ZQQOwVfA4ijNAZxNyVEiL1wAvgN/8TR9zKflEFuNazDXkNnFR1xDOR+8nsaXNJBPW6D936K/AdQSwMEFAAAAAgAcXBbV6YtojXuBgAA0i4AACEAAABwcHQvc2xpZGVNYXN0ZXJzL3NsaWRlTWFzdGVyMS54bWztWu9u4zYS/35
PIeg+5MPBK4ki9cdYp4iddW+BdBs06QPQEm3rQks6ik6TPRTYd+gb9C3a+3aPsk9yQ0q0ZMeJE6zTru8MLCxqOBrOzG9mSE727Td3C27dMlFlRT448d64JxbLkyLN8tng5MfrcS86sSpJ85TyImeDk3tWnXxz+pe3Zb/i6Xe0kkxYICKv+nRgz6Us+45TJXO2oNWbomQ5zE0LsaASXsXMSQX9CUQvuINcN3AWNMvt5nvxnO+L6TRL2HmRLBcsl7UQwTiVoH41z8rKSCufI60UrAIx+us1lU7BvuSKp+o5mdW/P7CplaV3A9tzXQ84aF9LZiMurFvKB/Zk5tnO6VunYW5G6uOqvBaMqVF++60or8pLoVf4cHspQCaItK2cLtjAVgL0RMPm1B/pgbPx+cwMaf9uKhbqCe6xQEPXtu7Vr6No7E5aSU1MWmoy/34LbzJ/t4XbMQs4nUWVVbVyD81BxpzrTHJmXXKasHnBU4gVb2Wh0b0qL4rkprLyAmxTrqhNXXHU9qtnObfkfQlipRJrG5eoSaerSLXdK5iEgLA2F4U48KN1/0QIxYHb2O152HfddetpvxSV/JYVC0sNBrZgidSBQG8vKlmzGhatUtUoJO+GRXqvOCfwBCdBwsH380J8tC3+Pq8GduxhDGtL/aI1tS3RnZmszUg+KrhGieYJyBnYiRRalxzi+2wpi2nWaFQvqaZ4Ja/kPWfa7FL9aLIAhTiFfLdZ3vvxyraqhRxxRvNVWMjTEc+SG0sWFkszaTV5r2GA6gAi1UJSL6dFsjy9pIL+sCG5cZH2jfGJYwLp8XDyV+GksOpGE9pHNCkH2U1qf0lQeRA9yHWfiCpMEIkD/+uPqhcHUqmQvuWriPnCwFLe03FVrQWWY1ZbW9J74ZJXLCny1OLslvFniEcvFH89z8TzpfsvlD4ulkLOny0ev1R8Nt0qfd8pjU1Kn1O5vkH4+0jpVIJ1HyEXKJ82qY2+JLUDn8C/jdRGnu+vUtsPiIfI15/Za/uF001mPb7lnoodymcQFVwrm7KpAl2501P+0JAUPEvHGedbjkHyrj4dySyXNSUk7Va6Yq7fWjmOWUkPG0XqcUdBHd1Tnuog+hcZjs7O3Yj03kVnQS+KMOkNz/G73miIR6Mzl8TjEf7ZNjEBkSazBRtns6Vg3y9rKJ6TFJ6DQsfz24SYqpPhvlOCmJQYF4Uqgt2kwPtIiikgrmH855IKWKFJDP/FieF7CD+dGVFM/qczwxy2vr7c2G9MBiYmr0AXZn1YLiYbkUn2EZlwlQTR24ITvzg4A0L8/++y/bWG5qpsj7zxODg/i3uuG4170RBHvRhBAR8GBE7LEQ6j4XhVtisVeTlEx3Or9edPv/3186ff91Ctne7NHcIH0G9G1lJkYMhwGAdoFA17Qw+Pe/g8Dntn44D0xsTHeDSMzkb+u59VM8HD/UQw3Wd4n5oOhYcf9CgWWSKKqpjKN0mxaJodTln8xERZZLrf4blN00RDhJAbx2FIvLjJE9DNPLW2TtvHSLj4jpbWZObBzi498O8djNIbGE1mSNGQoiFFgxFNEpZL4GgGhoIMZcXjG4pvKNhQsKEQQyGGEhgK1Jg5z/IbcIZ62Na04H+vCWZU1xioEhf0vljK92mDRIdS9x08HOLID3AMudNXFPE+9R58vcZL3A4v2sHrdXj9Hbyow4t38PodXrKDF3d4gx28pMMb7uANOrzRDt6wwxvv4I26WLg7mNeAM1vHQ+DlnS4tlR6rLsQT+7QF9emaTq4+tid6qKu6qDJ6kQ/Fje6/qR5i3rzC1BxKRJbPLpd5ItV8vbMlQ9XX06PLpCmTqxK5mp0sPxR5fTnuVGEo7yD3hon8BRXZ2ay3YKFSVBfHKWzDA/tvi3/0uGz2OLoxwWjT2Ks2JpKqkb21eq97tdT72QMXL6i4gB0Uo1gZluVQpsFVPUMwd4jX9j9IdLdhMC5gI2uNPhMZ5bUzJsvRnAorgZ+B/fnTr/YmVPUB4jWgyh+DKn8MqvxpqPQQtXCE4H3ShQNFJCSHBMcvD+BA0QHAgVo4/BYO00fu4IGi4MDTA71aJdsjHn6LB+7g0fRoDxiPLfnhHgAeuMWDtHggl4T4kPH4z78PEw7SwhF04CAeDg4Zjq3l6hDwCFo8wg4ecehFRzz+BDzCFo9o87B7xOOPxyNq8Yg7eERRcODb+YHiEZuLYudqWPYLOWdidVGELy5r1BrrHvbdWpb1W+WrINhtiR7ClWL7Dc844eif7Vcu3Ug/+ufxK5Afeq9UIg/NQdvvJF6EoujooCduCXqPPTro8WN7iP1jjX7qHA3qHov0UwfbgITHIr1+0uweLp3u34Cczn9GP/0vUEsDBBQAAAAIAHFwW1e+a0K9DQEAAMYHAAAsAAAAcHB0L3NsaWRlTWFzdGVycy9fcmVscy9zbGlkZU1hc3RlcjEueG1sLnJlbHPF1d1qwyAUB/D7PYV449VikrZpWmp6MwaFXY3uAURPPliionYsbz/ZGDSwyQYFbwQ/zv/8ODceju/TiN7AukErRoosJwiU0HJQHSMv58f7miDnuZJ81AoYmcGRY3N3eIaR+1Dj+sE4FEKUY7j33uwpdaKHibtMG1DhptV24j5sbUcNF6+8A1rmeUXtdQZuFpnoJBm2J1lgdJ4N/CVbt+0g4EGLywTK/9CCunGQ8MRnffEhltsOPMNZdn2+eFRkoQWmv8jypLQ8aks7tvjcylvafKiFherz5GuNOm7K+O+IyphslVK2isnWKWXrmGyTUraJyaqUsiom26aUbWOyOqWsjsl2KWW7bxldfL/NB1BLAwQUAAAACABxcFtXAP3sDSoEAAAFEQAAIQAAAHBwdC9zbGlkZUxheW91dHMvc2xpZGVMYXlvdXQxLnhtbM1YXY7bNhB+7ykI9cFPCvVDSbQRb2DJq6LAZncRbw7AlWhbCCWqJO3YKQLkWu1xcpJSlGR5f9o6gAP4xaKomeE3882QHL99tysZ2FIhC15NR+4bZwRolfG8qFbT0ceH1MYjIBWpcsJ4RaejPZWjd1e/vK0nkuU3ZM83CmgTlZyQqbVWqp5AKLM1LYl8w2ta6W9LLkqi9KtYwVyQz9p0yaDnOCEsSVFZnb44RZ8vl0VG5zzblLRSrRFBGVEavlwXteyt1adYqwWV2ozRfgpJ7Ws6tVShGLWAERNbPeFaV9rzbMFyUJFSTzw0EmDBipyaT7J+EJQ2o2r7m6gX9b0wGrfbewGKvLHQaVqw+9CJwVbJDOAz9VU/JJPdUpTNUwcC7KaWY4F98wubObpTIGsns2E2W9+9Iputr1+Rhv0C8GjRxqsW3Et3POtJINyDVz1eWd/w7JMEFdf+NO637h0kWp+bZ73uop4pYaxZfSSa7/B4ffl6MEIcYKf10nN9B3nB07hEUeQhp/PXRZHjtBLHXstuCbWLeb5vtB/107BCJkyqhdozal7q5sfAEDoYjOiCsWhlf1xYQJYqYZRUh2irq4QV2SegOKB5ocB7IhUVwOSXLi9tsgGhDBRjklb5PRHkwzPLLdjaIO0Rwp6ff2fJ71
labB7bNb1zECU3jy1RepHdoHI6Ya4fuWHHmI9xqAvwKWOhpgsfGIsCL3Re5OlJjJnxlrlaFpRE3Ji0L6pcV78ZEraqTOZZxsDmVm92xkBOlx+6AHFd5WnBmHlpNhWaMAG2hOmNYucaRVVUqp2JAucA9SDcvg124GAfHvB1UL0BKgqiJjIXiNcb8PoD3rGL0GXi9Qe8aMB7SMPLA4wGwMERYOxhfJmAgwFwOAD2PBw6lwk4HABHR4Aj5F9ozUUDYDwAbtBeaNHhAfD4CHAYRBdadOO6Hx+dHmc47mV/+v78Ex/1J/6cKAruGcnomrNcg/DPcfLnSnv9RV+xCVv2p7/z38c//IFb1VLfrxsv/gziZDZ3cGBf41loY4wCO56jazuJUZLMnGCcJuhrf1vPtauqKGlarDaC3m2UdSpbLvQi6PoDIxrA+TkJek5Szpt0OGYFnYOVpS4cQ8sfGyL0Cj0z/3Mx+xFmzhuR8HAvbRoocLspH5/FJTjLPZXl2vSrofF+QtImbpqG89nY1ndX3T/HCNtjT6dvHAaeN8YownF6SFrZeF5pdKfm6vdvf/36/dvfZ8hVeNyu6hv3jVTdCGxEoR2J43HoJTi2YxelNpqPI3uWhoGdBj5CSYxniX/9tWl7XTTJBDVt9O9534C76EULXhaZ4JIv1ZuMl10vD2v+mYqaF6add52uATfbt++G2ImCAPsdTRpb/zRoYduMmxRh4j2p77YmSUqz4SZmqi6qVZcjgwg8+v/i6h9QSwMEFAAAAAgAcXBbV4Bl4Yi3AAAANgEAACwAAABwcHQvc2xpZGVMYXlvdXRzL19yZWxzL3NsaWRlTGF5b3V0MS54bWwucmVsc43PvQ7CIBAH8N2nICxMQutgjCntYkwcXIw+wAWuLbEFwqHRt5fRJg6O9/X755ruNU/siYlc8FrUshIMvQnW+UGL2/W43glGGbyFKXjU4o0kunbVXHCCXG5odJFYQTxpPuYc90qRGXEGkiGiL5M+pBlyKdOgIpg7DKg2VbVV6dvg7cJkJ6t5Otmas+s74j926Htn8BDMY0aff0QompzFM1DGVFhIA2bNpfzuL5ZqWSK4ahu1eLf9AFBLAwQUAAAACABxcFtXN8Y1+I0DAADNCwAAIgAAAHBwdC9zbGlkZUxheW91dHMvc2xpZGVMYXlvdXQxMC54bWy1VsGO2zYQvfcrCPXgk5aSLHtlI97AkldFgU12UTu9MxK9JkKJLEk7dooA+a32c/IlHVKS197sAnbrXkSKGr5582Yozpu324qjDVWaiXrSC6+CHqJ1IUpWP056Hxa5n/SQNqQuCRc1nfR2VPfe3vz0Ro41L+/ITqwNAohaj8nEWxkjxxjrYkUroq+EpDV8WwpVEQOv6hGXinwG6IrjKAiGuCKs9tr96pT9YrlkBZ2JYl3R2jQginJigL5eMak7NHkKmlRUA4zbfUzJ7CSdeKCLWWw95OzUBlZC7wZCL+a8RDWpYGHBDKcI9EG/gzErCEcLujXOTMuFotTO6s0vSs7lg3K7328eFGKlRWtRPNx+aM1ws8lN8LPtj92UjLdLVdkRVEHbiRd4aGef2K4BCVQ0i8XTarG6f8G2WN2+YI07B/jAqY2qIfdjOJF3JEq4j6rjq+WdKD5pVAuIx4bfhLe3aGK2o1y1KTAWyutksB/xoXPdiWW2qSh31slHGN0iGXNt5mbHqXuR9uFoKODLCRS4R2v/w9xDujIZp6TeC2JuMs6KT8gIREtm0DuiDVXIkYHjAJBWHeM0cpC0Lh+IIr89Q25UlI50xxB3Er4uZL8T8qim0AMnBV0JXgKV6BLiWqk8JBSDQ9BUuwf+t0+bz1Hc/kUAhRJL2ntFf2kF2vC90P8xH1YVlw59lA/ceTtyGZ7pck4LAeea0w3lJ8BHZ8IvVkydjt4/Ez0Xa2VWJ8PH58Kz5Yvolz4JcXcSZsTQowPQv8QBKKHg9Re4KghfdqUfXO5vs4Rrwkbx5yDNprMgGfi3yXToJ0k88NNZfOtnaZxl02AwyrP4a3frlBCqYRXN2eNa0fu1vUxOy0qIo2sc9p8yAgQun5NBl5NcCHsKD7MSXyIrS6OatPyxJgo8dJn5N3+lVzJzWUWGnSJzzkqK3q+rj890GVxCF+i4APpFaaL/oWizMM+Hs+nID4IE+sA0TvxRBOWbDgdRNEri6yTN90WrbeQ1sDu1Vr9/++vn79/+vkCt4sNOC26EO23aGVorBoGk6WgYZUnqp2Gc+/FsdO1P8+HAzwf9OM7SZJr1b7/aji2Mx4Wirh38tewayTD+oZWsWKGEFktzVYiq7UmxFJ+pkoK5tjQM2kZyQ+zVMAqDUXQ9GsZtmoBbNzq2uOkpXYlw9Y7I+40rksrdc5lbktA3tzXyZIIP+vCbfwBQSwMEFAAAAAgAcXBbV4Bl4Yi3AAAANgEAAC0AAABwcHQvc2xpZGVMYXlvdXRzL19yZWxzL3NsaWRlTGF5b3V0MTAueG1sLnJlbHONz70OwiAQB/DdpyAsTELrYIwp7WJMHFyMPsAFri2xBcKh0beX0SYOjvf1++ea7jVP7ImJXPBa1LISDL0J1vlBi9v1uN4JRhm8hSl41OKNJLp21VxwglxuaHSRWEE8aT7mHPdKkRlxBpIhoi+TPqQZcinToCKYOwyoNlW1Venb4O3CZCereTrZmrPrO+I/duh7Z/AQzGNGn39EKJqcxTNQxlRYSANmzaX87i+WalkiuGobtXi3/QBQSwMEFAAAAAgAcXBbV0uJUFfAAwAArQwAACIAAABwcHQvc2xpZGVMYXlvdXRzL3NsaWRlTGF5b3V0MTEueG1stVfRkps2FH3vV2jog59YAQaMPfFmDF46ndlkd2on7wrIayYCUUl27HQyk99qPydf0isBXtvrpPbUeTEgro7OPecKXb96vSkZWlMhC16Ne+6N00O0ynheVE/j3rt5akc9JBWpcsJ4Rce9LZW917e/vKpHkuX3ZMtXCgFEJUdkbC2VqkcYy2xJSyJveE0reLfgoiQKHsUTzgX5BNAlw57jhLgkRWW188U58/liUWR0yrNVSSvVgAjKiAL6clnUskOrz0GrBZUAY2YfUlLbmo4t0EXNC8XopMrnGwuZeLGGN651CxJkM5ajipQw8B5Ci4wwZOIRCIbmdKNMmKznglJ9V61/E/WsfhRm9tv1o0BFrtFaFAu3L9ow3EwyN/ho+lN3S0abhSj1FdRBm7HlWGirf7EeAxIoawaz59Fs+XAiNlvenYjG3QJ4b1GdVUPuZTqedVoUd5deR1zW9zz7KFHFITGtQ5PnLqJJXl/rZeuJ0lAW4qIA5xqLrE4dHYr3OcnTAoWhN/SdJnVv4If96FArzwkG5r3WIIgCN/CCYyVku4TaxDzf6tkf4AoKaEZji5L3LTMyYlLN1JZR81DrH0NKQDAjsM8sWtnvZhaSpUoYJdXOD3WbsCL7iBRHNC8UekOkogIZCWBXAqSmpAwxA0mr/JEI8scRckO9Nrw7vrhz8Ps+9l/6qBV6ZCSjS85yoOJdw1It3JGjsP7mefL5zvrBw
PuBsaHjDqOfaWytlV+znYP/02jN2/gsD4zG3WoHS7oXLjmjGYfPFKNrys6A9y6Eny8LcT56/0L0lK+EWp4N718KXyxOol97i/ndFpsSRQ92Vv8aOyuHnSQ/w1FI2KLbU86PNxU+VfvfqfYFHH86i7+COJlMnSiw76JJaEeRH9jx1L+zk9hPkokTDNPE/9KdqjmkqoqSpsXTStCHlT4kz3PFxd4Au/1nR4DA9T0JOk9SzvUu3HfFv4YrCyUaW/5cEQErdM78x+fuEmeuq0jYKTJjRU7R21X54UiX4Bq6QEcJ0Cel8X5C0SZumobTydB2nAj63NiP7KEH5RuHgecNI38QxemuaKXOvAJ259bqt69///rt6z9XqFW830HCiXAvVXuHVqKAROJ4GHpJFNux66e2Px0O7EkaBnYa9H0/iaNJ0r/7ojtR1x9lgpp29/e8a5Rd/0WrXBaZ4JIv1E3Gy7bnxjX/REXNC9N2u07bKK+J/niHrud5/cGwswm4dVfDFje9sikRJt6Q+mFtiqQ051xihmr4X9DWyHMI3vufcfsvUEsDBBQAAAAIAHFwW1eAZeGItwAAADYBAAAtAAAAcHB0L3NsaWRlTGF5b3V0cy9fcmVscy9zbGlkZUxheW91dDExLnhtbC5yZWxzjc+9DsIgEAfw3acgLExC62CMKe1iTBxcjD7ABa4tsQXCodG3l9EmDo739fvnmu41T+yJiVzwWtSyEgy9Cdb5QYvb9bjeCUYZvIUpeNTijSS6dtVccIJcbmh0kVhBPGk+5hz3SpEZcQaSIaIvkz6kGXIp06AimDsMqDZVtVXp2+DtwmQnq3k62Zqz6zviP3boe2fwEMxjRp9/RCianMUzUMZUWEgDZs2l/O4vlmpZIrhqG7V4t/0AUEsDBBQAAAAIAHFwW1eTCm11IQYAAOcdAAAUAAAAcHB0L3RoZW1lL3RoZW1lMS54bWztWU1v2zYYvg/YfyB0b2XZVuoEdYrYsdutTRskboceaYmW2FCiQNJJfBva44ABw7phlwG77TBsK9ACu3S/JluHrQP6F/bqwzJl04nTZluB1gebpJ73+4OkfPXaccTQIRGS8rhtOZdrFiKxx30aB23r7qB/qWUhqXDsY8Zj0rYmRFrXNj/84CreUCGJCAL6WG7gthUqlWzYtvRgGcvLPCExPBtxEWEFUxHYvsBHwDdidr1WW7MjTGMLxTgCtndGI+oRNEhZWptT5j0GX7GS6YLHxL6XSdQpMqx/4KQ/ciK7TKBDzNoWyPH50YAcKwsxLBU8aFu17GPZm1ftkoipJbQaXT/7FHQFgX9Qz+hEMCwJnX5z/cp2yb+e81/E9Xq9bs8p+WUA7HlgqbOAbfZbTmfKUwPlw0Xe3Zpba1bxGv/GAn690+m46xV8Y4ZvLuBbtbXmVr2Cb87w7qL+na1ud62Cd2f4tQV8/8r6WrOKz0Aho/HBAjqNZxmZEjLi7IYR3gJ4a5oAM5StZVdOH6tluRbhB1z0AZAFFysaIzVJyAh7gOtiRoeCpgLwBsHak3zJkwtLqSwkPUET1bY+TjBUxAzy6vmPr54/Ra+ePzl5+Ozk4S8njx6dPPzZQHgDx4FO+PL7L/7+9lP019PvXj7+yoyXOv73nz777dcvzUClA198/eSPZ09efPP5nz88NsC3BB7q8AGNiES3yRHa4xHYZhBAhuJ8FIMQU51iKw4kjnFKY0D3VFhB355ghg24Dql68J6ALmACXh8/qCi8H4qxogbgzTCqAHc4Zx0ujDbdTGXpXhjHgVm4GOu4PYwPTbK7c/HtjRNIZ2pi2Q1JRc1dBiHHAYmJQukzfkCIgew+pRW/7lBPcMlHCt2nqIOp0SUDOlRmohs0grhMTApCvCu+2bmHOpyZ2G+TwyoSqgIzE0vCKm68jscKR0aNccR05C2sQpOS+xPhVRwuFUQ6IIyjnk+kNNHcEZOKujehe5jDvsMmURUpFD0wIW9hznXkNj/ohjhKjDrTONSxH8kDSFGMdrkyKsGrFZLOIQ44Xhrue5So89X2XRqE5gRJn4yFqSQIr9bjhI0wiYsmX2nXEY3f9+6Ve/eWoMbime/Yy3DzfbrLhU/f/ja9jcfxLoHKeN+l33fpd7FLL6vni+/Ns3Zs64fujE209AQ+ooztqwkjt2TWyCWY5/dhMZtkROWBPwlhWIir4AKBszESXH1CVbgf4gTEOJmEQBasA4kSLuGaYS3lnd1VKdicrbnTCyagsdrhfr7c0C+eJZtsFkhdUCNlsKqwxpU3E+bkwBWlOa5ZmnuqNFvzJtQNwulrBWetnouGRMGM+KnfcwbTsPyLIXJqWoxC7BPDsmaf0/hXvOmeS4mLcXJtwcn2YjWxuDpDR21r3a27FvJw0rZGcGyCYZQAP5l2GsyCuG15Kjfw7Fqcs3jdnFVOzV1mcEVEIqTaxjLMqbJH09cq8Uz/uttM/XAxBhiayWpaNFrO/6iFPR9aMhoRTy1ZmU2LZ3ysiNgP/SM0ZGOxh0HvZp5dPpXQ6evTiYDcbhaJVy3cojbmX98UNYNZEuIi21ta7HN4Ni51yGaaevYS3V/TlMYFmuK+u6akmQvn04af3Z5gFxcYpTnatrhQIYculITU6wvY9zNZoBeCskhVQix9GZ3qSg5nfSvnkTe5IFR7NECCQqdToSBkVxV2nsHMqevb45RR0WdKdWWS/w7JIWGDtHrXUvstFE67SeGIDDcfNNtUXcOg/xYfXJqvtfHMBDXPs/k1taavbQXrb6bCKhuwJq5utrjuLt155rfaBG4ZKP2Cxk2Fx2bH0wHfg+ijcp9HkIiXWkX5lYtD0LmlGZey+q9OQa0l8b7Is6Pm7MYSZ58u7vWd7Rp87Z7uanuxRG3tHpLNFv6U4sMHIHsbrjdjlq/IBGb5YFdkBg+5PymGTOYtIXfEtKWzeI+MEPWPp2Gd82jxr0+5me/lAlLbS8LG2YQFfraJlMT1s4lLiukdryTObnEmBmwmOcfnUS5bZOkpFr+Jy1ZQ3uwyY/au6rIVAvUaLlPHp7us8JRtSjxyrATuTv/Ggvy1Zym7+Q9QSwMEFAAAAAgAcXBbVwFX6IttAwAAlgsAACEAAABwcHQvc2xpZGVMYXlvdXRzL3NsaWRlTGF5b3V0Mi54bWy1VtFymzoQfb9foaEPfiICDA721OkYHO7cmbTJ1OkHKCCCWoF0Jdm12+lMf6v9nH5JJQGOnaYzzpS+ICFWZ3fPHqR9+WpbU7DBQhLWzEf+mTcCuMlZQZr7+ejdbebGIyAVagpEWYPnox2Wo1cX/7zkM0mLK7RjawU0RCNnaO5USvEZhDKvcI3kGeO40d9KJmqk9Ku4h4VAHzV0TWHgeRNYI9I43X5xyn5WliTHS5ava9yoFkRgipQOX1aEyx6Nn4LGBZYaxu4+DkntOJ477O69A6yR2OhX37nQeecrWoAG1XrhliiKgSYHpKxRGskaSH4rMDazZvOv4Ct+I+y+N5sbAUhhcLr9Duw+dGaw3WQn8NH2+36KZttS1GbUZIDt3PEcsDNPaNbwVoG8XcwfVvPq+gnbvLp8
whr2DuCBU5NVG9yv6QTOER3+Pqs+XsmvWP5BgobpfEz6bXp7izZnM/KqY14ZKKenwXyEh85lT5baJqzYGSd3erSLaEalWqkdxfaFm4cNQ+h4KdK6dnDjvls5QNYqpRg1e0LURUpJ/gEoBnBBFHiNpMIC2GD0X6AhDTvKcmQhcVPcIIHePkJuWeQ26D5C2FP4eyLHPZGdmsANRTmuGC10EMGf0UqK7YPJAIxyk/KG7qn7Q4aNbC3B8ohh2Hs7cuk/0+UK50z/oxRvMD0BPngm/G1FxOno42eiZ2wtVHUyfPhceFI+iT60tsNe20uk8JGwx0OcF4XS2X3SZz6ipdOJ3RtO7aU+8k0Wn6MkXSy9OHIv48XEjeMwcpNleOmmSZimCy+aZmn4pb8+Cp2qIjXOyP1a4Ou1uR5Oq4oPg3Pojx8qogMYviZRX5OMMfMXHlYlHKIqpRJtWf5fI6E99JUZ8BwalpFJz8iKkgKDN+v67hEv0RC86NZJQz9JTfAXRJv6WTZZLqau58W6oUvC2J0GWr7JJAqCaRyex0m2F600mTc6ulO1+uPrtxc/vn4fQKvwsHfSN8KVVN0MrAXRiSTJdBKkceImfpi54XJ67i6ySeRm0TgM0yRepOPLL6YH88NZLrDt6/4r+o7QD3/pCWuSCyZZqc5yVnfNJeTsIxacEdtf+l7XEW6QuRomfjj2wyCKuzLp2PrRRgvb/tBKhIrXiF9vrEhqe8+ldonrBrjTyIMJPGioL34CUEsDBBQAAAAIAHFwW1eAZeGItwAAADYBAAAsAAAAcHB0L3NsaWRlTGF5b3V0cy9fcmVscy9zbGlkZUxheW91dDIueG1sLnJlbHONz70OwiAQB/DdpyAsTELrYIwp7WJMHFyMPsAFri2xBcKh0beX0SYOjvf1++ea7jVP7ImJXPBa1LISDL0J1vlBi9v1uN4JRhm8hSl41OKNJLp21VxwglxuaHSRWEE8aT7mHPdKkRlxBpIhoi+TPqQZcinToCKYOwyoNlW1Venb4O3CZCereTrZmrPrO+I/duh7Z/AQzGNGn39EKJqcxTNQxlRYSANmzaX87i+WalkiuGobtXi3/QBQSwMEFAAAAAgAcXBbV4tg7VpjBAAAWBEAACEAAABwcHQvc2xpZGVMYXlvdXRzL3NsaWRlTGF5b3V0My54bWzNWNtu2zYYvt9TCOqFrxRSEnUK6hSWHG0D0iSo0wdgJNoWSh1G0q69oUBfa3ucPslISrIcN2ndzgtyI1LUf/j+A/nz1+s3m5Iaa8J4UVfjkX0GRwapsjovqsV49P4utcKRwQWuckzrioxHW8JHby5+ed2cc5pf4W29EoYUUfFzPDaXQjTnAPBsSUrMz+qGVPLbvGYlFvKVLUDO8EcpuqTAgdAHJS4qs+Nnx/DX83mRkWmdrUpSiVYIIxQLCZ8vi4b30ppjpDWMcClGcz+EJLYNGZucZL8RnJuGJmRruWSbF9L2bEZzo8KlXJiRTLEbipAw/ZU3d4wQNavWv7Jm1twyzXS9vmVGkSshHbMJug8dGWiZ9AQcsC/6KT7fzFmpRukNYzM2oWls1ROoNbIRRtYuZsNqtrx5hDZbXj5CDXoFYE+psqoF97U5Tm/OXSEoMeydVT1e3lzV2QduVLW0R5nfmrejaG1WY7PsXC+UKLN3g/oI9pXzxz0ROI5ru9pEhKAfwQOnBEHgINgZa7u+AwPv0GTeqRCbuM63ivtejtJUXGXLWmapaGVSLmZiS4mer6ndKBK6qMYmNdVaTubv5BL/U2KBSue9DnyGpQcwpZ3ajrOd70ls1EObyKQQiuV2NEllvZ+ZBi9FQgmudmEUFwktsg+GqA2SF8J4i7kgzNAulJtXSlTShdahRZIqv8UMvzuQ3CJqtBd660Ef+KfD7+7Cr9x8S3FGljWVm8FwTpEJyvumVLQZyH8qIZwI+oGcfyMhPAjtMPjhhLh/OiFKzK707iqqXJ40aqoFrK7laQoO0sRRaaK9VNMiTwtK9Ys6v0hCmbHGVGbfxtY0oqhEuxJ4EPYbd0fcvg1yQK/pYdbpqTMgRV7gwCPh2uEzwnUGuO4AN7IROhqu/4xw3QEuGuDabqBRHIcXPSNeNOD19vCGThi+SLzegNcf8DpO6MMXidcf8AZ7eAPkHr/dnhNvMOANB7wK7PH77TnxhgPeaA+v7wUvc79FT9Z8hV4S7Ir7f7wDqEKnrwD8wR3gZ+o86uv8FAvyoM67p6jzuTB1HJaYzvt6D79d8MFjZflBLQY7v87ljV1Z8ZcXJ5MpDD3rMpz4Vhgiz4qn6NJKYpQkE+hFaYI+9R1ALk0VRUnSYrFi5GYlzGPDYQMnALY7eF0COP3dy+tjkta1ivd+VNApojIXrA3LHyvMpIY+Mt+5iv1IZE7rEb/3yEzuPmJcr8r7A794p/CL7H6l6Edd4/wPSZvYaepPJ5EFYSh78hiFVuTI9I19z3GiEAVhnO6SlivLK4nu2Fz98vnvV18+/3OCXAX73a88e6646GbGihXSkDiOfCcJYyu2UWqhaRRYk9T3rNRzEUricJK4l59UF22j84wR3Zr/nvdNvY2+auvLImM1r+fiLKvL7v8AaOqPhDV1oX8R2LBr6vV5HfnQR6Hb9X0aWj9qsKDt7nWGUPYWNzdrnSOlPlATvdQU1aJLkYEE7P0SufgXUEsDBBQAAAAIAHFwW1eAZeGItwAAADYBAAAsAAAAcHB0L3NsaWRlTGF5b3V0cy9fcmVscy9zbGlkZUxheW91dDMueG1sLnJlbHONz70OwiAQB/DdpyAsTELrYIwp7WJMHFyMPsAFri2xBcKh0beX0SYOjvf1++ea7jVP7ImJXPBa1LISDL0J1vlBi9v1uN4JRhm8hSl41OKNJLp21VxwglxuaHSRWEE8aT7mHPdKkRlxBpIhoi+TPqQZcinToCKYOwyoNlW1Venb4O3CZCereTrZmrPrO+I/duh7Z/AQzGNGn39EKJqcxTNQxlRYSANmzaX87i+WalkiuGobtXi3/QBQSwMEFAAAAAgAcXBbV0/KghwIBAAAaBIAACEAAABwcHQvc2xpZGVMYXlvdXRzL3NsaWRlTGF5b3V0NC54bWztWN1y2jgUvt+n0LgXXDmyjWwMU9LBJt7ZmbTJFPoAii2Ct7LllQSB7nSmr7X7OH2SlYSNIaEFtlzmBgv503f+j+3z9t2qoGBJuMhZOey4V04HkDJlWV4+DjufpokddoCQuMwwZSUZdtZEdN5d//a2Ggia3eI1W0igKEoxwENrLmU1gFCkc1JgccUqUqp7M8YLLNVf/ggzjp8UdUGh5zgBLHBeWvV5fsp5NpvlKRmzdFGQUm5IOKFYKvXFPK9Ew1adwlZxIhSNOb2vklxXZGjJJ3b38KcFDI4v1Y5rXSvT0wnNQIkLtTF9YiBmpVQ05paoppwQvSqXv/NqUt1zc+LD8p6DPNMM9UkL1jdqGNwcMgv47Phjs8SD1YwX+qo8AVZDy7HAWv9CvUdWEqSbzbTdTed3B7Dp/OYAGjYC4I5QbdVGuZfmeI0501xSAtytVY2
+orpl6WcBSqbs0eZvzNsiNjbrazVv3K6prMYN+ibcFS4aZ8lVxLK1FvKgrmYTD6iQE7mmxPyp9I9Rgyt9KVZJbZHS/jSxgChkTAkutw6R1zHN089AMkCyXIL3WEjCgVFGlYCi1N6RxkeGkpTZPeb44zPmjRcro3SjIWxc+GNHdhtH1tkE7ilOyZzRTCnh/ZpbxRdVDZjOLCVp1YJ/4NsDWYb8nioOkz5u4Dh6vZdwyOmGgVMnEvI9vx90n6eTqEX8NGpmvaRurUZGZtq9Wn8vdJoM3QGopXcAi3axXovtHsA6u9hui0Uvse6eDqjF+sewfosNjmGDFts7hu212PAYNmyx/WPYDQDuB8ZUU6XTfUm3ZfOL1aUzyBSX2Ksu2EjbE+meKXJCUlZmgJIloSfQe2fST+c5P529eyZ7whZczk+mR+fS57OD7Jfua+hnfa170b7mnd/XAhS+NrbXxvba2F4b27mNzW8a2xhLstfV0CVegjNpvXhvcy73UjxTXzDair/9KB6NndC3b8JRYIch8u1ojG7sOEJxPHL8fhKjr80HUaZMlXlBkvxxwcndQn/znBYVF3o96HbbiCgFLh+ToIlJwpiuwt2o+JeIykzyTVj+WmCuJDSROfJKfU5kLuuRXuORCc0zAj4siodnfgku4RdBM0V90DVHnsr/K2ljN0mC8ahvO06Y2GGEQrvvqfSNAt/z+iHqhVGyTVqhLS+Vdqfm6vdv/7z5/u3fC+Qq3B0IqCfCrZD1Cix4rgyJon7gxWFkRy5KbDTu9+xREvh24ncRiqNwFHdvvurBgosGKSdmUvFH1sw4XPRiylHkKWeCzeRVyop6XAIr9kR4xXIzMXGdesaxxPrR0As9D6E+6tVhUro1V6Mt3Iw7TIpQ/h5Xd0uTJIV5zsVmq8rLxzpHWgjcGRFd/wdQSwMEFAAAAAgAcXBbV4Bl4Yi3AAAANgEAACwAAABwcHQvc2xpZGVMYXlvdXRzL19yZWxzL3NsaWRlTGF5b3V0NC54bWwucmVsc43PvQ7CIBAH8N2nICxMQutgjCntYkwcXIw+wAWuLbEFwqHRt5fRJg6O9/X755ruNU/siYlc8FrUshIMvQnW+UGL2/W43glGGbyFKXjU4o0kunbVXHCCXG5odJFYQTxpPuYc90qRGXEGkiGiL5M+pBlyKdOgIpg7DKg2VbVV6dvg7cJkJ6t5Otmas+s74j926Htn8BDMY0aff0QompzFM1DGVFhIA2bNpfzuL5ZqWSK4ahu1eLf9AFBLAwQUAAAACABxcFtX6aTEj+MEAAA2HAAAIQAAAHBwdC9zbGlkZUxheW91dHMvc2xpZGVMYXlvdXQ1LnhtbO1Z3ZKiOBS+36eg2AuvGAgECNbYUy3dbm1VT3fX6DxAGmLLDhA2ibbO1lTNa+0+zjzJJgiitto4erFV6w3EcPLl/H4cyfsP8yzVZoTxhOa9DnhndTSSRzRO8ude5/NoYKCOxgXOY5zSnPQ6C8I7H65+eV90eRrf4QWdCk1C5LyLe/pEiKJrmjyakAzzd7QguXw2pizDQv5kz2bM8IuEzlLTtizPzHCS69V61mY9HY+TiNzQaJqRXCxBGEmxkOrzSVLwGq1og1YwwiVMuXpTJbEoSE8XL3Q0H73Qh6c/dK0UZjM5DfQraX80TGMtx5mcCGlWYJZwmpdPeDFihKhRPvuNFcPikZUL7mePTEtiBVAt1M3qQSVmLheVA3Nr+XM9xN35mGXqLr2hzXu6pWsLdTXVHJkLLVpORs1sNHnYIRtNbndIm/UG5tqmyqqlcq/NsWtzRolIiQZWVtX68uKORl+4llNpjzJ/ad5KYmmzuheT2vUKSq/doB6a65vz2lli3qfxQm3yJO/lJO6mXAzFIiXleJaCSo2YjD8tXbs2bW6KF+pSSjNpXYplGegkNz4PdY1nIkwJzlfuE1dhmkRfNEE1EidC+4i5IEwrVZdFIxEVuij3KCFJHj9ihj9tIS81KkoTa3vM2uH73e6s3K5i/pjiiExoGksN7HNEQPlTlxvNG/E9gdiRktD1ZTWVuQZcxwXA2cxOaEELILTMOs8JfM/eTj1e7bAdYQ3n0YRKtnjS9wVbyzC7K5M6yWNZ4GpYAkzvJYmZTS5o/KtMX6g0farN3EgZObQbwNqqVqjWa1S7QXUa1ABA2BYVoNeoToMKG1Tg+MBrDeu9hoUNrLsGi2yEToF1G1ivgbVt5FmnwHoNrL8G60OndcR2wfoNLGpgFWb7kO2ARQ1ssAbruf5JIQv2MpraRAqsqOtEhlNlXBIc32C4n2ExqK9eormQVm8QmXMakSk/TXA6rmjMPoXGbOBD5LsHaMwJXCCLoy2Pvf2mathpHy/t4px9bLOLSfZxyK5c20cMB2W3qv2g7FYJH5TdqsuDslvFdlD2v1FB21uCI7cckojmsZaSGUlbwNtHwo8mCWuP7hyJPqBTJiat4eGx8Ml4J/q5uzN3b3cGz9edqQT+c4qZTKmK45zjOc6DrmW7B3s14Evmu/Rql17t0qv9n3s171Cv5p7eq21SGTyJyvb1aw2VXfq1S7926dcu/dqS2/ya226wIBvE5p2jX4uFvv13FFinft80V+4dp3FpxV9uP7y+sZBr3KJrz0AIukb/Bt4aYR+G4bXlBoMQfqu/b8fSVJFkZJA8Txl5mAq9bVSAafsmcJqISAXOHxNUx2RAqarC9aj454jKWLBdTTR444PnMZE5r0eC2iPDNImJdj/Nnrb8gs7hF57GEnqna974iPJTSRuCwcC7uQ4My0IDA/UhMgJbpm/fc207QNBH/cEqabmyPJfatc3VH9///vXH93/OkKvm+tmOfCPccVGNtClLpCH9fuDZIeobfQAHBrwJfON64LnGwHUgDPvoOnRuv6kzIgC7ESPlwdPvcX1kBeCrQ6ssiRjldCzeRTSrTr/Mgr4QVtCkPAADVnVkNcOSXYPAAi7yHa+KklStvpfKmstzqzJDUvYRFw+zMkey8jUXllNFkj9XKdKImGsHflf/AlBLAwQUAAAACABxcFtXgGXhiLcAAAA2AQAALAAAAHBwdC9zbGlkZUxheW91dHMvX3JlbHMvc2xpZGVMYXlvdXQ1LnhtbC5yZWxzjc+9DsIgEAfw3acgLExC62CMKe1iTBxcjD7ABa4tsQXCodG3l9EmDo739fvnmu41T+yJiVzwWtSyEgy9Cdb5QYvb9bjeCUYZvIUpeNTijSS6dtVccIJcbmh0kVhBPGk+5hz3SpEZcQaSIaIvkz6kGXIp06AimDsMqDZVtVXp2+DtwmQnq3k62Zqz6zviP3boe2fwEMxjRp9/RCianMUzUMZUWEgDZs2l/O4vlmpZIrhqG7V4t/0AUEsDBBQAAAAIAHFwW1cttCb1EgMAALgIAAAhAAAAcHB0L3NsaWRlTGF5b3V0cy9zbGlkZUxheW91dDYueG1stVbdbtowFL7fU1jZBVepkxAgoMFEQjNNakc12gfwEgPRHNuzDYNNlfZa2+P0SXbsEMq6TuoFu4md4/Pzne8c5+TN213N0JYqXQk+7oQXQQ
dRXoiy4qtx5+4295MO0obwkjDB6bizp7rzdvLqjRxpVl6RvdgYBC64HpGxtzZGjjDWxZrWRF8ISTmcLYWqiYFXtcKlIl/Bdc1wFAR9XJOKewd79RJ7sVxWBZ2JYlNTbhonijJiAL5eV1K33uRLvElFNbhx1n9CMntJx56pDKNzzvYecqpqC8LQm0D2xYKViJMaBLdWCzk1e6LlraLU7vj2nZILeaOcwYftjUJVaR0cDD18ODio4cbIbfAT81W7JaPdUtV2BS7QbuwFHtrbJ7YyujOoaITFo7RYz5/RLdaXz2jjNgA+CWqzasD9nU7k/cFDeMyqxavllSg+a8QF5GPTb9I7ajQ521WuT4n3WhrsIT4NrluyzC4V5d4G+QSrE5IR02Zh9oy6F2kfDoYCvIxAW3uU+3cLD+naZIwSfiTETDJWFZ+REYiWlUHXRBuqkAMDlwBcWnaM48i5pLy8IYp8fOK5YVE60C1C3FL4byK7LZEzYii6YaSga8FKQBCdg9PSQMrf4FoQtvQgINQ9DM7H8RLug83iey/NprMg6fmXybTvJ0nc89NZfOlnaZxl06A3zLP4vr1hJaRqqprm1Wqj6HxjvJeWKsTRAIfdx4oAgPPXJG5rkgthe+G0Kt1zVGVpVFOWLxuiIEJbmfB8lTkvI72WkQWrSoo+bOpPT3iJz8ELTBdw/Sw10X9o2izM8/5sOvSDIIGZl8aJP4ygfdN+L4qGSTxI0vzYtNpmzgHdS3v14cfP1w8/fp2hV/HpfIGP/ZU2hx3aqAoSSdNhP8qS1E/DOPfj2XDgT/N+z8973TjO0mSadS/v7ZwK41GhqBt978t2aIbxX2OzrgoltFiai0LUh/mLpfhKlRSVG8FhcBiaW8LG3iAaBNFgcGxggNauDixuZqfrEKauiZxvXY/U7mObOZGEX4RDizyq4JNfjslvUEsDBBQAAAAIAHFwW1eAZeGItwAAADYBAAAsAAAAcHB0L3NsaWRlTGF5b3V0cy9fcmVscy9zbGlkZUxheW91dDYueG1sLnJlbHONz70OwiAQB/DdpyAsTELrYIwp7WJMHFyMPsAFri2xBcKh0beX0SYOjvf1++ea7jVP7ImJXPBa1LISDL0J1vlBi9v1uN4JRhm8hSl41OKNJLp21VxwglxuaHSRWEE8aT7mHPdKkRlxBpIhoi+TPqQZcinToCKYOwyoNlW1Venb4O3CZCereTrZmrPrO+I/duh7Z/AQzGNGn39EKJqcxTNQxlRYSANmzaX87i+WalkiuGobtXi3/QBQSwMEFAAAAAgAcXBbV+sXn3fmAgAAZwcAACEAAABwcHQvc2xpZGVMYXlvdXRzL3NsaWRlTGF5b3V0Ny54bWy1VdFumzAUfd9XIPaQJ2ogJIWoSRVImSZ1bbS0H+CCSVDB9mwnSzZV6m9tn9Mv2bWBNGs7qQ/ZC7Yv917fc87V9dn5tq6sDRGyZHTc807cnkVoxvKSLse925vUCXuWVJjmuGKUjHs7Invnkw9nfCSr/BLv2FpZkILKER7bK6X4CCGZrUiN5QnjhMK/gokaKziKJcoF/g6p6wr5rjtENS6p3caL98SzoigzMmPZuiZUNUkEqbCC8uWq5LLLxt+TjQsiIY2J/rskteNkbN9VmN7blnETGzB49gSQZ4sqtyiuwRAbD22U/EYQond080nwBZ8L43u1mQurzHVsG2Oj9kfrhpogs0EvwpfdFo+2haj1ChRY27Ht2tZOf5G2ka2yssaYPVuz1fUbvtnq4g1v1F2ADi7VqJriXsPxOzgzrIg1r3BGVqzKibC8PcCudMkvWXYvLcoAmmaiQbr3aODrla9a6nNlW/IHiIirwoYLoVzPtTuGtDM6rEt2PKptzPKdvvQOVmPEo0qqhdpVxBy4/hSgoEbxcxAn05kbDpyLcDp0wjAYOPEsuHCSOEiSqTuI0iR46PohB6iqrElaLteCXK+VrXMJYATaYDm2CXVuF1B3rZKKYLqnXE085J8ir69pVoZsKMAIR/M5FvjrixSNINyA7BChTo1/a9LvNEkZU6DEoSr+MVQplGhk+bbGAm7olPGOp8xxGQk6RhZVmRPral3fveClfwxeYBZC6jep8f9D0yZemg5n08hx3RAmdByETuRD+8bDge9HYXAaxum+aaVGTqG69/bq0+Ovj0+Pv4/Qq+hwLMKMupSq3VlrUQKQOI6GfhLGTuwFqRPMolNnmg4HTjroB0ESh9Okf/Ggx6sXjDJBzKD+nHcj3gteDfm6zASTrFAnGavb1wJx9p0IzkrzYHhuO+I3uNLyeH4URaEXtjJBbd1qqkXNuDctUokvmF9vTJPAZSByYkwcXrS2R55d0MELOfkDUEsDBBQAAAAIAHFwW1eAZeGItwAAADYBAAAsAAAAcHB0L3NsaWRlTGF5b3V0cy9fcmVscy9zbGlkZUxheW91dDcueG1sLnJlbHONz70OwiAQB/DdpyAsTELrYIwp7WJMHFyMPsAFri2xBcKh0beX0SYOjvf1++ea7jVP7ImJXPBa1LISDL0J1vlBi9v1uN4JRhm8hSl41OKNJLp21VxwglxuaHSRWEE8aT7mHPdKkRlxBpIhoi+TPqQZcinToCKYOwyoNlW1Venb4O3CZCereTrZmrPrO+I/duh7Z/AQzGNGn39EKJqcxTNQxlRYSANmzaX87i+WalkiuGobtXi3/QBQSwMEFAAAAAgAcXBbV83KitWyBAAAwhIAACEAAABwcHQvc2xpZGVMYXlvdXRzL3NsaWRlTGF5b3V0OC54bWzNWN1yozYYve9TMPTCVwQE4i+zzo4hodOZbJJZZx9AAdmmC4hKstduZ2f2tdrH2SepJMB2HMfGiS96Y2T56Ejfdz4dYX34uCwLbYEpy0k1HIALa6DhKiVZXk2Hgy+PiREMNMZRlaGCVHg4WGE2+Hj1y4f6khXZLVqROdcERcUu0VCfcV5fmiZLZ7hE7ILUuBK/TQgtERdf6dTMKPomqMvCtC3LM0uUV3o7nvYZTyaTPMXXJJ2XuOINCcUF4mL5bJbXrGOr+7DVFDNBo0Y/XxJf1Xiok6c/Hpe6pmB0ITqAfiUiT8dFplWoFB0xqbhg0L7lfKbFqJZMCsPqR4qxbFWL32g9rh+oGnq3eKBankmqlkI32x9amNkMUg1zZ/i0a6LL5YSW8ikyoi2HuqVrK/lpyj685FradKab3nR2vwebzm72oM1uAnNrUhlVs7iX4dhdOI85L7AG1lF162X1LUm/Mq0iIh4ZfhPeGtHELJ/1rE0/l1R6lwb5o7k9OdufCej6QkgVou07lruTE8eyAgc4TawAeHaL2I6YtTPwZUSylRz9JJ4iUlSlMyIK9anhLBgf81WBVXtRgFpCimk11Atd9mV48ll0sb/EUiy5pqcu8DW+aW/x1PJDxUXF0AKJfajjyvgy1jVW8rjAqFprx6/iIk+/apxoOMu59gkxjqmm8iZ2rWCU7FzNoShxlT0gij7vMDcrqlXsXcxmp/brmjv6zi54KFCKZ6TIxCLs91VAni03kP7iO67vSkFfU98FAPhuW
+lu4DpAlEJP9V+TfEdpR1bfjsaqab/E2sE21t5gnT1YuI11Nli4B2ttY+EG6x7DuhusdwzrbbD+May/wQbHsMEGGx7Dhq/uIbkZBWC9Wd65p2QFqS3Fnu0ps5vt2ZTgxCnHOCVVphV4gYse9PaJ9I+znPZnd05kT8icitOvLz08lT6f7GU/t5vB9Qkmpd62Mucch5n0EF0V8AwVE70xOPs9pxuAjgusQ8cb9EJgee82OK1E9Fa9H+RVJnxeNtWo+Z14JzR39ieAB/yvpeqi6MVnH/DIli8EEPbmsw74aMsHHB94fQnDA17b8QV2ELyJb8ePWz7bDjzrTXw7nt3x+dDpLUh4wNdbPknWW5DwgPd3fJ7rv02P/8f5cJoTuZ0TXSOOnzkRPIcTZfyFDwHrsBGZR+3CXOd1Iv4cySj+dqN4dG0FrnETjDwjCKBrRNfwxogjGMcjyw2TGH7v/mplIlSelzjJp3OK7+dc7ysHMG3fBM4m62IB5z8dvE6ThBCp97Yq7jlUmXDayPLnHFExQ6fMkXfgU5Q5b0b8LiPjIs+wdjcvn3by4p0jL6zIBPXe1Bw5Pd9UtDFIEu96FBriHE2MIIKBEdqifCPPte0wgH4QJeuiZTLySqyub63+/PHPrz9//HuGWjW3rxiE99wy3ra0Oc1FIFEUenYcREYEYGLA69A3RonnGonrQBhHwSh2br7LqwoAL1OK1R3I71l3ewLgi/uTMk8pYWTCL1JSthcxZk2+YVqTXN3FAKu9PVkg+Q4cQMu3PdfrvEWsrXuq1ZrNTYoqkYJ+QvX9QhVJqRw1Vl11Xk3bGtlAzK3Lp6v/AFBLAwQUAAAACABxcFtXgGXhiLcAAAA2AQAALAAAAHBwdC9zbGlkZUxheW91dHMvX3JlbHMvc2xpZGVMYXlvdXQ4LnhtbC5yZWxzjc+9DsIgEAfw3acgLExC62CMKe1iTBxcjD7ABa4tsQXCodG3l9EmDo739fvnmu41T+yJiVzwWtSyEgy9Cdb5QYvb9bjeCUYZvIUpeNTijSS6dtVccIJcbmh0kVhBPGk+5hz3SpEZcQaSIaIvkz6kGXIp06AimDsMqDZVtVXp2+DtwmQnq3k62Zqz6zviP3boe2fwEMxjRp9/RCianMUzUMZUWEgDZs2l/O4vlmpZIrhqG7V4t/0AUEsDBBQAAAAIAHFwW1da07SSeQQAADESAAAhAAAAcHB0L3NsaWRlTGF5b3V0cy9zbGlkZUxheW91dDkueG1svVjdcps4FL7fp2Doha+I+BEgMnU6Bsc7O5MmmSZ9AAVkmyl/K8mOvTud6WvtPk6fpJIAQ5ykYV1mb4wsjj6d75yjT0LvP+zyTNsSytKymE6sM3OikSIuk7RYTSef7xcGmmiM4yLBWVmQ6WRP2OTDxW/vq3OWJVd4X264JiAKdo6n+prz6hwAFq9JjtlZWZFCvFuWNMdc/KUrkFD8KKDzDNim6YEcp4XejKdDxpfLZRqTeRlvclLwGoSSDHPhPlunFWvRqiFoFSVMwKjRT13i+4pM9SqN73e6pszoVnRY+oVgHt9liVbgXHTcpjHfUKI9pnytRbiSSMqGVfeUENkqtr/T6q66pWro9faWamkioRoIHTQvGjNQD1INcDR81Tbx+W5Jc/kUEdF2U93Utb38BbKP7LgW151x1xuvb16wjdeXL1iDdgLQm1Syqp17Tsdu6dynPCOadWDV+suqqzL+wrSiFHwk/ZrewaLmLJ/Vugk/l1B6Gwb5EvQnZy9HwvID20ZIcYRIpNQ8iooLkQfNhq3reb6DjimzZgq+C8tkLwc/iKegiot4XYpKfaghM8bv+D4jqr3NrEqaZKtiqme67EvI8pPoYn+JAJlyyoeW+cG+bvdwKvmjiFExNMNiIeqkMD7f6RrLeZQRXBySxy+iLI2/aLzUSJJy7SNmnFBNBU4sW4Eo0bmaQ0GSIrnFFH86Qq49qhT3ljNo0/160h39aBncZjgm6zJLhBP2GCUgVqAuptp11qcVgmfZvu/+pA6gZcliGVoIr2Y/x/RKLaW0SIS0yKYatbkW8gmOasKxDzMeqkE17Q4Kur60GoRnoz6e3eE5HV5gQTgYD/bxnA4PdniW41veYECzDwg7QLcHiETSTgN0O0CvAxRF4JmnAXodoN8D9KEzPCdPAP0OEHWAEm14Up4Aog4w6AF6rn9iUoJXNWlc7YCHDUOux75wOGMIh1ymuqK3xtmy0RD7lzTEdcRWUe8Vr4gIMsU/+//VEAuOqyGWPa6GWObIGhKMLCHByAoSjCwgwcj6EYwsH8Ew9ZDowuBwdPnFE45cf+qAw56ccE5RIrdVojnmT48wcAwlSvgzHbLMnwsReFMuwCGuS/EtIln87YbRbG4i17hEM89ACLpGOIeXRhTCKJqZbrCI4Nf2yyYRVHmak0W6Eue2mw3Xh6bDArYPLKeLunBg/N3Ba3OyKEuZ735W3DGysuS0TsufG0zFDG1m3jhm/pfMjBsRv43IXZYmRLve5A9HcfHGiIv4qhfQL4bmjd3zpKKNrMXCm88CwzTRwkAhREZgi/INPde2AwR9FC4ORcsk80J4N7RWv3/75933b/+OUKug/0UvtOeK8aalbWgqiIRh4NkRCo3QggsDzgPfmC0811i4DoRRiGaRc/lV3gxY8DymRF05/JG0lxUWfHZdkacxLVm55GdxmTf3HqAqHwmtylRdfVhmc1mxxUJWHYQC2/ECJ2jSJHxrn8pbUF9cqBLJ6Edc3WxVkeRKUSPVVaXFqqmRzgT07noufgBQSwMEFAAAAAgAcXBbV4Bl4Yi3AAAANgEAACwAAABwcHQvc2xpZGVMYXlvdXRzL19yZWxzL3NsaWRlTGF5b3V0OS54bWwucmVsc43PvQ7CIBAH8N2nICxMQutgjCntYkwcXIw+wAWuLbEFwqHRt5fRJg6O9/X755ruNU/siYlc8FrUshIMvQnW+UGL2/W43glGGbyFKXjU4o0kunbVXHCCXG5odJFYQTxpPuYc90qRGXEGkiGiL5M+pBlyKdOgIpg7DKg2VbVV6dvg7cJkJ6t5Otmas+s74j926Htn8BDMY0aff0QompzFM1DGVFhIA2bNpfzuL5ZqWSK4ahu1eLf9AFBLAwQUAAAACABxcFtX6ORJ0TkDAACzJAAAKAAAAHBwdC9wcmludGVyU2V0dGluZ3MvcHJpbnRlclNldHRpbmdzMS5iaW7tWc9u2jAYz3orb7BbljsxUFbYlFIxKBoSbaMSKu1UuYnL3IY4cswYe6S93+5zAgETMIQd1iTqoVVw7C+/P/YX+8uJoijv+N/v94piXP6cuOoPRANMvAutqlc0FXk2cbA3vtBGVq/c1C5bJeND97ZjfTOvVN/FAVPN0ZdBv6NqZQDavu8iALpWVzUH/aGl8hgAXN1oqvadMf8zALPZTIdhL90mk7BjAExKfETZfMCDlfkA3WGOxh+ziL4Bh7c62Gat0qnxguYtHmIZzKfYY7oJx6hH6ATyy+uvhOJfxGPQvUOBAcL+fNhy+O7xDNsviOk2RZARGo85NQLG
b4+F7s/kcdHXAMt7B0JihiZtSuF8HRSGP8OrNShJjMO0wpEctNtq1AwQXcijLREFDDLUc+FYjMHvozGirYoB4ssIIFjJBmLYq7bDkG8pRhww4zYWx4cdpEQFq5sKZsWKoQ1dLlNxbEgQWi2EagbXwT3PctguWD7aQSrb2SgGXLilICGWtSURTB+txXN8yN/7D9h7Ig+xZru8MK9Ns2uGfTvEQTdwgtZSrfQ5xrW0th3pm2ic6NxBFgKiAWIM0Q0Qx3slNUtwS7BL9HCF1KLQC9zo9TaMsETQcy1+CkoCvNFQzYYZFiZjmHP1JRwEPB4suxmQe2+CbTvP04AhJ2y8QzbLoxf/RjARdY/K+28tdgVndfFNFDd/bJxvNAsmZXYe8Ald8ImQZLg9EyLLytXmLk8lzY3G7hnwqZ7lGcCl6PO9Cpcn19n4OGJ5SNFr/CMPFjJHp2L4lqQlOhUqS6ej+Jamdd93ipuqZeQEoFk4vUie1Db794uy/lYlpVLRa5W0tRM299FWBKloVtKs/XUKKVYZ1LRI5UDjHVgSaQzUANE3kVbpRFGUP6UCfLHpEns6Qd6ScVjP9QlxFyrkujKXhpiwWMOh2I5qE8B3njZX7SsWTsP/Q55IOJaAk+gQH+e9eL2Xkqhehj7hbGOed4jr8mcWzYskr3Aoo1MEsuZBD9OAhSm7UA5sscrHghjAAnqRJCUqWKvWG/Xm2Xm9kVlPovMp9Apmyhar5ElLulrSmCeepF7Pyf+/8xVFPrj5/QtQSwMEFAAAAAgAcXBbV1ycRxREAQAAiQIAABEAAABwcHQvcHJlc1Byb3BzLnhtbLWSy07DMBBF90j8Q+S9aztJ81KTKmmChMSCBXyAlTitpfgh230gxL8TQgoUNt2wm9Ho3jl3NKv1SQzegRnLlcwBWWDgMdmqjsttDp6f7mACPOuo7OigJMvBC7NgXdzerHSmDbNMOupG6aPxRiNpM5qDnXM6Q8i2OyaoXSjN5DjrlRHUja3Zos7Q47hADMjHOEKCcglmvblGr/qet6xW7V6MAJ8mhg0Tid1xbc9u+hq3nzkukIoxJDu5B+vmytsbnoPXJo42TRqWMMLBBoYk9GGVNhWMahLEGBNc+vHbh5qEWcdtS013L+iWNR13NXX0DEfCP3iCt0ZZ1btFq8ScE2l1ZEYrPkUleL7XgQ45wAAVKzTBXTLWASlx5JcwTpMShoGfwrKqa1hVZbKMIh8vCf5iZD3dD25irDX/Lzz0fU30+3uKd1BLAwQUAAAACABxcFtXZzMmjZsBAACCAwAAEQAAAHBwdC92aWV3UHJvcHMueG1sjVPBTuMwEL2vxD9YvoOTCEKJmnJBcEFapIa9G2eaGjm25XFLy9fvJG5pCz1wmzfjeX5vxp7eb3rD1hBQO1vz/CrjDKxyrbZdzV+bx8sJZxilbaVxFmq+BeT3s4s/U1+tNXy8BEYEFitZ82WMvhIC1RJ6iVfOg6XawoVeRoKhE22QH0TcG1FkWSl6qS3f9Yff9LvFQit4cGrVg42JJICRkcTjUnvcs/nfsPkASDRj96kkIzH+I3c1R9M2y1X/ZqU2Q4bPyLgdSEb4EgZMPNEFaJ9hERl+0hhvyiLj4rjWOD+W7q7LciyJnzxodAsHqOamTYihlb5xT0G3NacNJfj37R1URLpuVKV2Z9cyzJU0sM/jAGZTWeGGDSsurjkjmjwbZVB6eyYtvvp85YLutGWbml/mN3nB2XaIKEjn1EFxtyIDzxi/Yka9NGLahgufnHlHaou83M0mHUnJyWR/74FEHM8gaTqdkHURsIFNPBra0Ti/GSdn54yfps8bz0bT2XfH4qyEjtY091LRS2eKmm/pMRCB2u7DxJK+z+w/UEsDBBQAAAAIAHFwW1fY/Y2PpQAAALYAAAATAAAAcHB0L3RhYmxlU3R5bGVzLnhtbA3MSQ6CMBhA4b2Jd2j+fS1DUSQUwiArd+oBKpQh6UBooxLj3WX58pIvzT9KopdY7GQ0A//gARK6Nd2kBwaPe4NjQNZx3XFptGCwCgt5tt+lPHFPeXOrFFfr0KZom3AGo3NzQohtR6G4PZhZ6O31ZlHcbbkMpFv4e9OVJIHnHYnikwbUiZ7BN6qCIKK0wKfL5YhpSANcejTGcVTW1bmp/SosfkCyP1BLAwQUAAAACABxcFtXN2scvHQBAACZAwAAFQAAAHBwdC9zbGlkZXMvc2xpZGUxLnhtbK2T30rDMBTG732KkJteuWwTRMragYre+GfQ+QBZe7YW0yTkZHV9e5O0tUMnDPQmJ8k53++cD5LF8lAL0oDBSskkmk2mEQGZq6KSuyR6Wz9c3kQELZcFF0pCErWA0TK9WOgYRUGcWGLME1paq2PGMC+h5jhRGqTLbZWpuXVHs2OF4R8OWgs2n06vWc0rSXu9PkevDSBIy60b9BTEnANR222Vw73K97VjdRADIkCxrDTS1DnLM1H4iHptAPxONo9GZ3plQvqlWRlSFQmdUSJ5DQmlrE/0ZawThQ37Jt8dlaDuCn+i5wN6XVkBZPbVoSvlTvqk8nckUjm2H6Vr9VXR9fdRl8S22qFyawKNDlP5PDvuj8Ng9nCritb32bgYLnks0Ga2FRAO2i9hEpsG6oL5rV9NWHVgDyA2mP3d8tVgOdtvbHA9/w/XuN90rl2Twyj5o3t2yh0b3wwbn1EuzDPXr00w4B6mBXMXrrT7D/38YwkLPyv9BFBLAwQUAAAACABxcFtXNuhQzbcAAAA2AQAAIAAAAHBwdC9zbGlkZXMvX3JlbHMvc2xpZGUxLnhtbC5yZWxzjc+9CsIwEAfw3acIWTKZtA4i0tRFBMFJ9AGO5NoG2yTkoti3N6MFB8f7+v255vCeRvbCRC54LWpZCYbeBOt8r8X9dlrvBKMM3sIYPGoxI4lDu2quOEIuNzS4SKwgnjQfco57pcgMOAHJENGXSRfSBLmUqVcRzAN6VJuq2qr0bfB2YbKz1Tydbc3ZbY74jx26zhk8BvOc0OcfEYpGZ/ECc3jmwkLqMWsu5Xd/sVTLEsFV26jFu+0HUEsDBBQAAAAIAHFwW1daoA6towUAAOMPAAAXAAAAZG9jUHJvcHMvdGh1bWJuYWlsLmpwZWftVmtwE1UUPrt7NyltzRAoLRQHwrsywKQtQisCJmnappQ2pC2vcYZJk00TmiZhd9OWTp2R+kD9Iw/ffywFFR1nHFS0oI6tIqCjA4gFCgxjEbX4Gh6Kr4F47m5eQBCUv707e++Xc7577vnOvXM3kWORr2F4RamtFBiGgXJ8IHJa222zWFbZHdWltkorOgC0252hkJ81ADQFZNFRZjYsX7HSoO0HFsZABuRChtMlhUx2eyVgo1y4rl06AgwdD89M7f/XluEWJBcAk4Y46JZcTYhbAXi/KyTKAJozaC9qkUOItXcizhIxQcRGihtUXEJxvYqXK5xahwUxzUXn8jrdiNsRz6hPsjckYTUHpWWVCQFB9LkMtBZ2Mejx+YWkdG/ivsXW5A/H1huHb6bUWLMIxzyq3SuWO6K40+W01iCejHh/SDZT+1TEP4U
b60yIpwOwIzxiaZ3KZ+9t89YuQ5yN2O2TbbVRe1ugvqpanct2NQYXOaKc/S7JgjWDiYhPeQVbpZoPB26hxErrhXicN1wejc9VSM011licNq+lSo3DiaudFXbEuYgfE4OOajVnrkvwlznU+NzekGyP5sANBvxVlWpMohMkRaNil7215epcMkfGTVTnkpUeX6ktym8P+ZWziLmRbWLYURflHHSK1jI1DrkgBOqiMfnRbmcJre0sxAtgKeMEAYJQj70LAnAZDOCAMjDjGAIRPR7wgR8tAnoFtPiYO6ARbal5doWj4gSjQZk9SGfjKqk56gpno5wgySFGUojvPFJJ5pMiUgwGspDcRxaQErQWk3nxufak9elaZ+Nx1kAYo1LeUjBvyA3nJdbrEFf5XAeePHfV7OB1OQuxfJIrABJWIMacmax/X/v7oxMx+kj3/Ycz97VD9c3qy5/hB/k+7Pv5kwkGf4I/iU8/mDA3v5JRE74+JQ8pKYNkDb34yuDEfgB5wSTeVSt6AhtyEx5aCWF91aUq6JiRsBqPGn829hm3GLcZf7ymyimrxG3mdnIfcLu43dznYOB6uF7uQ24v9wb3XtJe3fh8xPde0RtTSz2pai2AX2fWjdVN0pXoxuum6CoT8XQ5unxduW4aesbG9y15vWQtPliBfayqqddSeXXo9UGLokBSKhyAtdec/+hsMo7kE9s1p7aInuUYQ2PVlGhMYNBM1xRr8jUVFMfy00xDXzH21qtOnesGCoQkVrLOmcqpo2eVzm5WfBIIstAq04vWEgytFX0NXtlQYDTONZjwUyUYbAHXrBkGp99vUFySQRQkQWwW3LOAfgfVK/qiQ/m+MdkHEjZ5McD8X/DOOpiwrQwDvC4B5MxO2PLwThz1IkD3HFdYbI7e+QzzBYDkKSxQf2Wa8W46FYlcxPtKuwng8sZI5O+uSOTyVox/EqDHHxkA2drq8wAsXkxvfUgDwuQCT2fju4AZG8elTB5e4BSzAOt9QKL2quja5dHf6sh2sjEGA51cnN1DqZETYKH/Hm6r0SC3G4OJ9IA+DXoY4Bg9sHqG0zORPTAec+VVQuzDyrAc4TXatGHpGUjYORxYhuNYwvE8QWnMA+gHoudHTMg3aUYucWonrskqWLdxS9ok847eUY5D5yYX1osdw9Kzc0aPyZ0ydVreXdNn3z1nblHxPZYSa2lZua2iprZu6TLcXpdb8DR4faslOdzc0rq27aGHH3l0/WOPP7Fp81NPP/Psc8+/0LV120svv7L91dfefOvtne+8271r90cf7/lk7779n3725eGv+o4cPdZ/fOD0N2e+/e77wbM/nL9w8dffLv3+x59/UV1UZ6yl1IVFYFhCOKKluhi2hRL0hJ+QrxlhWqJ1rhk5sWBdWpZ545YdvcMmFTrOjaoXD6VnT549MOU8laYouzVhHf9LWVxYQtdxyOTwwOk5PSyEK1fyoJN9MB2GhqFhaBgahob/OET6/wFQSwMEFAAAAAgAcXBbV4sU/ON5AQAA2wIAABEAAABkb2NQcm9wcy9jb3JlLnhtbI2SzU7DMBCE7zxF1EtOqeMWSomSIAHiBBJSi0DcjL1NDYlt2dumeXucpE356YFbVjP7aTyb9HpXlcEWrJNaZSEdx2EAimshVZGFz8v7aB4GDpkSrNQKsrABF17nZyk3CdcWnqw2YFGCCzxIuYSbbLRGNAkhjq+hYm7sHcqLK20rhn60BTGMf7ICyCSOZ6QCZIIhIy0wMgNxtEcKPiDNxpYdQHACJVSg0BE6puToRbCVO7nQKd+clcTGwEnrQRzcOycHY13X43raWX1+Sl4fHxbdUyOp2qo4jPJU8AQllkC6T7d5/wCO/cAtMNTWD77ET2hqbYXrJQGOW2nQHyMvQIFlCCLYOH+NwDS41ioyBncp+eVtSSVz+OgPt5Igbpp8gbCF4JYp1aTkr9xuWNjK9u457RzDmO5b7JP6AP71Sd/VQXmZ3t4t70f5JKbTKKbR5HIZXyX0PKGztzbdj/0jsNoH+D/xIrmYfyMeAF1+7uGFto3vjvz5H/MvUEsDBBQAAAAIAHFwW1ee0I557wEAAG0EAAAQAAAAZG9jUHJvcHMvYXBwLnhtbJ1UwY7TMBC9I/EPlk9waJNChVDlZgVdrXqgNFKzy3mwJ42FY0e26W75eiYJyaZQIUFO7808vRnP2BE3T7VhJ/RBO7vmi3nKGVrplLbHNb8v7mbvOQsRrALjLK75GQO/yV6+ELl3DfqoMTCysGHNqxibVZIEWWENYU5pS5nS+RoiUX9MXFlqibdOfq/RxuRNmr5L8CmiVahmzWjIe8fVKf6vqXKy7S88FOeG/DJRuAim0DVmC5E8E/HFeRWyVCQ9EB+axmgJkaaR7bT0Lrgysh1IbaMLFcvdI/rcERPJVEvjwEDlO3bXdZft7SxIj2jZoXKP7NVy9fa1SK4IRQ4ejh6aqmtlwsTBaIVd9BcSn13sAz0QW60U2mfdBRe73cbopksMUBwkGNzQeLISTECyHgNii9CuPgftSXmKqxPK6DwL+gctf8nZVwjYDnXNT+A12Mh7WU86bJoQfVbQwsh75B2cyqZYL9u99OCvwt6rOx0rdDQY/qFEer1EMh6T8OUA+hL7klYSr8xjMZ1H1wOfdLnvLia7Poih3m8VdmDhiG1iRBtXN2DPFBrRJ22/hfumcLcQcdjiZVAcKvCo6FmMWx4DYksNe0P6j9R9e+hLPtKwqcAeUQ0WfybaB/PQ/z2yxXKe0tc9jCHW3vfhWWc/AVBLAQIUAxQAAAAIAHFwW1fGr8RntAEAALoMAAATAAAAAAAAAAAAAACAAQAAAABbQ29udGVudF9UeXBlc10ueG1sUEsBAhQDFAAAAAgAcXBbV/ENN+wAAQAA4QIAAAsAAAAAAAAAAAAAAIAB5QEAAF9yZWxzLy5yZWxzUEsBAhQDFAAAAAgAcXBbVwV3nA87AgAAtAwAABQAAAAAAAAAAAAAAIABDgMAAHBwdC9wcmVzZW50YXRpb24ueG1sUEsBAhQDFAAAAAgAcXBbV1KcUMkcAQAAcQQAAB8AAAAAAAAAAAAAAIABewUAAHBwdC9fcmVscy9wcmVzZW50YXRpb24ueG1sLnJlbHNQSwECFAMUAAAACABxcFtXpi2iNe4GAADSLgAAIQAAAAAAAAAAAAAAgAHUBgAAcHB0L3NsaWRlTWFzdGVycy9zbGlkZU1hc3RlcjEueG1sUEsBAhQDFAAAAAgAcXBbV75rQr0NAQAAxgcAACwAAAAAAAAAAAAAAIABAQ4AAHBwdC9zbGlkZU1hc3RlcnMvX3JlbHMvc2xpZGVNYXN0ZXIxLnhtbC5yZWxzUEsBAhQDFAAAAAgAcXBbVwD97A0qBAAABREAACEAAAAAAAAAAAAAAIABWA8AAHBwdC9zbGlkZUxheW91dHMvc2xpZGVMYXlvdXQxLnhtbFBLAQIUAxQAAAAIAHFwW1eAZeGItwAAADYBAAAsAAAAAAAAAAAAAACAAcETAABwcHQvc2xpZGVMYXlvdXRzL19yZWxzL3NsaW
RlTGF5b3V0MS54bWwucmVsc1BLAQIUAxQAAAAIAHFwW1c3xjX4jQMAAM0LAAAiAAAAAAAAAAAAAACAAcIUAABwcHQvc2xpZGVMYXlvdXRzL3NsaWRlTGF5b3V0MTAueG1sUEsBAhQDFAAAAAgAcXBbV4Bl4Yi3AAAANgEAAC0AAAAAAAAAAAAAAIABjxgAAHBwdC9zbGlkZUxheW91dHMvX3JlbHMvc2xpZGVMYXlvdXQxMC54bWwucmVsc1BLAQIUAxQAAAAIAHFwW1dLiVBXwAMAAK0MAAAiAAAAAAAAAAAAAACAAZEZAABwcHQvc2xpZGVMYXlvdXRzL3NsaWRlTGF5b3V0MTEueG1sUEsBAhQDFAAAAAgAcXBbV4Bl4Yi3AAAANgEAAC0AAAAAAAAAAAAAAIABkR0AAHBwdC9zbGlkZUxheW91dHMvX3JlbHMvc2xpZGVMYXlvdXQxMS54bWwucmVsc1BLAQIUAxQAAAAIAHFwW1eTCm11IQYAAOcdAAAUAAAAAAAAAAAAAACAAZMeAABwcHQvdGhlbWUvdGhlbWUxLnhtbFBLAQIUAxQAAAAIAHFwW1cBV+iLbQMAAJYLAAAhAAAAAAAAAAAAAACAAeYkAABwcHQvc2xpZGVMYXlvdXRzL3NsaWRlTGF5b3V0Mi54bWxQSwECFAMUAAAACABxcFtXgGXhiLcAAAA2AQAALAAAAAAAAAAAAAAAgAGSKAAAcHB0L3NsaWRlTGF5b3V0cy9fcmVscy9zbGlkZUxheW91dDIueG1sLnJlbHNQSwECFAMUAAAACABxcFtXi2DtWmMEAABYEQAAIQAAAAAAAAAAAAAAgAGTKQAAcHB0L3NsaWRlTGF5b3V0cy9zbGlkZUxheW91dDMueG1sUEsBAhQDFAAAAAgAcXBbV4Bl4Yi3AAAANgEAACwAAAAAAAAAAAAAAIABNS4AAHBwdC9zbGlkZUxheW91dHMvX3JlbHMvc2xpZGVMYXlvdXQzLnhtbC5yZWxzUEsBAhQDFAAAAAgAcXBbV0/KghwIBAAAaBIAACEAAAAAAAAAAAAAAIABNi8AAHBwdC9zbGlkZUxheW91dHMvc2xpZGVMYXlvdXQ0LnhtbFBLAQIUAxQAAAAIAHFwW1eAZeGItwAAADYBAAAsAAAAAAAAAAAAAACAAX0zAABwcHQvc2xpZGVMYXlvdXRzL19yZWxzL3NsaWRlTGF5b3V0NC54bWwucmVsc1BLAQIUAxQAAAAIAHFwW1fppMSP4wQAADYcAAAhAAAAAAAAAAAAAACAAX40AABwcHQvc2xpZGVMYXlvdXRzL3NsaWRlTGF5b3V0NS54bWxQSwECFAMUAAAACABxcFtXgGXhiLcAAAA2AQAALAAAAAAAAAAAAAAAgAGgOQAAcHB0L3NsaWRlTGF5b3V0cy9fcmVscy9zbGlkZUxheW91dDUueG1sLnJlbHNQSwECFAMUAAAACABxcFtXLbQm9RIDAAC4CAAAIQAAAAAAAAAAAAAAgAGhOgAAcHB0L3NsaWRlTGF5b3V0cy9zbGlkZUxheW91dDYueG1sUEsBAhQDFAAAAAgAcXBbV4Bl4Yi3AAAANgEAACwAAAAAAAAAAAAAAIAB8j0AAHBwdC9zbGlkZUxheW91dHMvX3JlbHMvc2xpZGVMYXlvdXQ2LnhtbC5yZWxzUEsBAhQDFAAAAAgAcXBbV+sXn3fmAgAAZwcAACEAAAAAAAAAAAAAAIAB8z4AAHBwdC9zbGlkZUxheW91dHMvc2xpZGVMYXlvdXQ3LnhtbFBLAQIUAxQAAAAIAHFwW1eAZeGItwAAADYBAAAsAAAAAAAAAAAAAACAARhCAABwcHQvc2xpZGVMYXlvdXRzL19yZWxzL3NsaWRlTGF5b3V0Ny54bWwucmVsc1BLAQIUAxQAAAAIAHFwW1fNyorVsgQAAMISAAAhAAAAAAAAAAAAAACAARlDAABwcHQvc2xpZGVMYXlvdXRzL3NsaWRlTGF5b3V0OC54bWxQSwECFAMUAAAACABxcFtXgGXhiLcAAAA2AQAALAAAAAAAAAAAAAAAgAEKSAAAcHB0L3NsaWRlTGF5b3V0cy9fcmVscy9zbGlkZUxheW91dDgueG1sLnJlbHNQSwECFAMUAAAACABxcFtXWtO0knkEAAAxEgAAIQAAAAAAAAAAAAAAgAELSQAAcHB0L3NsaWRlTGF5b3V0cy9zbGlkZUxheW91dDkueG1sUEsBAhQDFAAAAAgAcXBbV4Bl4Yi3AAAANgEAACwAAAAAAAAAAAAAAIABw00AAHBwdC9zbGlkZUxheW91dHMvX3JlbHMvc2xpZGVMYXlvdXQ5LnhtbC5yZWxzUEsBAhQDFAAAAAgAcXBbV+jkSdE5AwAAsyQAACgAAAAAAAAAAAAAAIABxE4AAHBwdC9wcmludGVyU2V0dGluZ3MvcHJpbnRlclNldHRpbmdzMS5iaW5QSwECFAMUAAAACABxcFtXXJxHFEQBAACJAgAAEQAAAAAAAAAAAAAAgAFDUgAAcHB0L3ByZXNQcm9wcy54bWxQSwECFAMUAAAACABxcFtXZzMmjZsBAACCAwAAEQAAAAAAAAAAAAAAgAG2UwAAcHB0L3ZpZXdQcm9wcy54bWxQSwECFAMUAAAACABxcFtX2P2Nj6UAAAC2AAAAEwAAAAAAAAAAAAAAgAGAVQAAcHB0L3RhYmxlU3R5bGVzLnhtbFBLAQIUAxQAAAAIAHFwW1c3axy8dAEAAJkDAAAVAAAAAAAAAAAAAACAAVZWAABwcHQvc2xpZGVzL3NsaWRlMS54bWxQSwECFAMUAAAACABxcFtXNuhQzbcAAAA2AQAAIAAAAAAAAAAAAAAAgAH9VwAAcHB0L3NsaWRlcy9fcmVscy9zbGlkZTEueG1sLnJlbHNQSwECFAMUAAAACABxcFtXWqAOraMFAADjDwAAFwAAAAAAAAAAAAAAgAHyWAAAZG9jUHJvcHMvdGh1bWJuYWlsLmpwZWdQSwECFAMUAAAACABxcFtXixT843kBAADbAgAAEQAAAAAAAAAAAAAAgAHKXgAAZG9jUHJvcHMvY29yZS54bWxQSwECFAMUAAAACABxcFtXntCOee8BAABtBAAAEAAAAAAAAAAAAAAAgAFyYAAAZG9jUHJvcHMvYXBwLnhtbFBLBQYAAAAAJgAmAKMLAACPYgAAAAA=" +) + +simple_unstructured_scenario = ( + TestScenarioBuilder() + .set_name("simple_unstructured_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "unstructured"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() 
+ .set_files( + { + "sample.pdf": { + # minimal pdf file inlined as base 64 + "contents": pdf_file, + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "sample.docx": { + # minimal docx file inlined as base 64 + "contents": docx_file, + "last_modified": "2023-06-06T03:54:07.000Z", + }, + "sample.pptx": { + # minimal pptx file inlined as base 64 + "contents": pptx_file, + "last_modified": "2023-06-07T03:54:07.000Z", + }, + } + ) + .set_file_type("unstructured") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": json_schema, + "name": "stream1", + "source_defined_cursor": True, + "source_defined_primary_key": [["document_key"]], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "document_key": "sample.pdf", + "content": "# Hello World", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "sample.pdf", + }, + "stream": "stream1", + }, + { + "data": { + "document_key": "sample.docx", + "content": "# Content", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "sample.docx", + }, + "stream": "stream1", + }, + { + "data": { + "document_key": "sample.pptx", + "content": "# Title", + "_ab_source_file_last_modified": "2023-06-07T03:54:07.000000Z", + "_ab_source_file_url": "sample.pptx", + }, + "stream": "stream1", + }, + ] + ) +).build() + +corrupted_file_scenario = ( + TestScenarioBuilder() + .set_name("corrupted_file_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "unstructured"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "sample.pdf": { + # bytes that can't be parsed as pdf + "contents": bytes("___ corrupted file ___", "utf-8"), + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("unstructured") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": json_schema, + "name": "stream1", + "source_defined_cursor": True, + "source_defined_primary_key": [["document_key"]], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "document_key": "sample.pdf", + "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=sample.pdf message=No /Root object! 
- Is this really a PDF?", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "sample.pdf", + }, + "stream": "stream1", + }, + ] + ) +).build() + + +no_file_extension_unstructured_scenario = ( + TestScenarioBuilder() + .set_name("no_file_extension_unstructured_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "unstructured"}, + "globs": ["*"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "pdf_without_extension": { + # same file, but can't be detected via file extension + "contents": pdf_file, + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "docx_without_extension": { + # same file, but can't be detected via file extension + "contents": docx_file, + "last_modified": "2023-06-06T03:54:07.000Z", + }, + "pptx_without_extension": { + # minimal pptx file inlined as base 64 + "contents": pptx_file, + "last_modified": "2023-06-07T03:54:07.000Z", + }, + } + ) + .set_file_type("unstructured") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": json_schema, + "name": "stream1", + "source_defined_cursor": True, + "source_defined_primary_key": [["document_key"]], + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "document_key": "pdf_without_extension", + "content": "# Hello World", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "pdf_without_extension", + }, + "stream": "stream1", + }, + { + "data": { + "document_key": "docx_without_extension", + "content": "# Content", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", + "_ab_source_file_url": "docx_without_extension", + }, + "stream": "stream1", + }, + { + "data": { + "document_key": "pptx_without_extension", + "content": "# Title", + "_ab_source_file_last_modified": "2023-06-07T03:54:07.000000Z", + "_ab_source_file_url": "pptx_without_extension", + }, + "stream": "stream1", + }, + ] + ) +).build() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py new file mode 100644 index 000000000000..3c10e701c629 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py @@ -0,0 +1,758 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError +from airbyte_cdk.test.catalog_builder import CatalogBuilder +from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder +from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder + +""" +User input schema rules: + - `check`: Successful if the schema conforms to a record, otherwise ConfigValidationError. + - `discover`: User-input schema is output if the schema is valid, otherwise ConfigValidationError. + - `read`: If the schema is valid, record values are cast to types in the schema; if this is successful + the records are emitted; otherwise an error is logged. If the schema is not valid, ConfigValidationError. 
+""" + + +_base_user_input_schema_scenario = ( + TestScenarioBuilder() + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11", "val12"), + ("val21", "val22"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + } + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": "string"}, + "col2": {"type": "string"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val21", + "col2": "val22", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +) + + +valid_single_stream_user_input_schema_scenario = ( + _base_user_input_schema_scenario.copy() + .set_name("valid_single_stream_user_input_schema_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "string", "col2": "string"}', + } + ] + } + ) + .set_expected_check_status("SUCCEEDED") +).build() + + +single_stream_user_input_schema_scenario_schema_is_invalid = ( + _base_user_input_schema_scenario.copy() + .set_name("single_stream_user_input_schema_scenario_schema_is_invalid") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "x", "col2": "string"}', + } + ] + } + ) + .set_catalog(CatalogBuilder().with_stream("stream1", SyncMode.full_refresh).build()) + .set_expected_check_status("FAILED") + .set_expected_check_error(None, FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA.value) + .set_expected_discover_error(ConfigValidationError, FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA.value) + .set_expected_read_error(ConfigValidationError, FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA.value) +).build() + + +single_stream_user_input_schema_scenario_emit_nonconforming_records = ( + _base_user_input_schema_scenario.copy() + .set_name("single_stream_user_input_schema_scenario_emit_nonconforming_records") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "integer", "col2": "string"}', + } + ] + } + ) + .set_expected_check_status("FAILED") + .set_expected_check_error(None, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": "integer"}, + "col2": {"type": "string"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } 
+ ] + } + ) +).build() + + +single_stream_user_input_schema_scenario_skip_nonconforming_records = ( + _base_user_input_schema_scenario.copy() + .set_name("single_stream_user_input_schema_scenario_skip_nonconforming_records") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*"], + "validation_policy": "Skip Record", + "input_schema": '{"col1": "integer", "col2": "string"}', + } + ] + } + ) + .set_expected_check_status("FAILED") + .set_expected_check_error(None, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": "integer"}, + "col2": {"type": "string"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) + .set_expected_records([]) + .set_expected_logs( + { + "read": [ + { + "level": "WARN", + "message": "Records in file did not pass validation policy. stream=stream1 file=a.csv n_skipped=2 validation_policy=skip_record", + }, + { + "level": "WARN", + "message": "Could not cast the value to the expected type.: col1: value=val11,expected_type=integer", + }, + { + "level": "WARN", + "message": "Could not cast the value to the expected type.: col1: value=val21,expected_type=integer", + }, + ] + } + ) +).build() + + +_base_multi_stream_user_input_schema_scenario = ( + TestScenarioBuilder() + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val11a", 21), + ("val12a", 22), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.csv": { + "contents": [ + ("col1", "col2", "col3"), + ("val11b", "val12b", "val13b"), + ("val21b", "val22b", "val23b"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "c.csv": { + "contents": [ + ("col1",), + ("val11c",), + ("val21c",), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": "string", + }, + "col2": { + "type": "integer", + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": "string", + }, + "col2": { + "type": "string", + }, + "col3": { + "type": "string", + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream2", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream3", + "source_defined_cursor": True, + "supported_sync_modes": 
["full_refresh", "incremental"], + "is_resumable": True, + }, + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": 21, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val12a", + "col2": 22, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + # The files in b.csv are emitted despite having an invalid schema + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream2", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream2", + }, + { + "data": {"col1": "val11c", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, + "stream": "stream3", + }, + { + "data": {"col1": "val21c", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, + "stream": "stream3", + }, + ] + ) +) + + +valid_multi_stream_user_input_schema_scenario = ( + _base_multi_stream_user_input_schema_scenario.copy() + .set_name("valid_multi_stream_user_input_schema_scenario") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["a.csv"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "string", "col2": "integer"}', + }, + { + "name": "stream2", + "format": {"filetype": "csv"}, + "globs": ["b.csv"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "string", "col2": "string", "col3": "string"}', + }, + { + "name": "stream3", + "format": {"filetype": "csv"}, + "globs": ["c.csv"], + "validation_policy": "Emit Record", + }, + ] + } + ) + .set_expected_check_status("SUCCEEDED") +).build() + + +multi_stream_user_input_schema_scenario_schema_is_invalid = ( + _base_multi_stream_user_input_schema_scenario.copy() + .set_name("multi_stream_user_input_schema_scenario_schema_is_invalid") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["a.csv"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "string", "col2": "integer"}', + }, + { + "name": "stream2", + "format": {"filetype": "csv"}, + "globs": ["b.csv"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "x", "col2": "string", "col3": "string"}', # this stream's schema is invalid + }, + { + "name": "stream3", + "format": {"filetype": "csv"}, + "globs": ["c.csv"], + "validation_policy": "Emit Record", + }, + ] + } + ) + .set_catalog( + CatalogBuilder() + .with_stream("stream1", SyncMode.full_refresh) + .with_stream("stream2", SyncMode.full_refresh) + .with_stream("stream3", SyncMode.full_refresh) + .build() + ) + .set_expected_check_status("FAILED") + .set_expected_check_error(None, FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA.value) + .set_expected_discover_error(ConfigValidationError, FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA.value) + .set_expected_read_error(ConfigValidationError, FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA.value) +).build() + + +multi_stream_user_input_schema_scenario_emit_nonconforming_records = ( + _base_multi_stream_user_input_schema_scenario.copy() + 
.set_name("multi_stream_user_input_schema_scenario_emit_nonconforming_records") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["a.csv"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "string", "col2": "integer"}', + }, + { + "name": "stream2", + "format": {"filetype": "csv"}, + "globs": ["b.csv"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "string", "col2": "integer", "col3": "string"}', # this stream's records do not conform to the schema + }, + { + "name": "stream3", + "format": {"filetype": "csv"}, + "globs": ["c.csv"], + "validation_policy": "Emit Record", + }, + ] + } + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": "string"}, + "col2": {"type": "integer"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": "string"}, + "col2": {"type": "integer"}, + "col3": {"type": "string"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream2", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream3", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + ] + } + ) + .set_expected_check_status("FAILED") + .set_expected_check_error(None, FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA.value) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": 21, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val12a", + "col2": 22, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val11b", + "col2": "val12b", + "col3": "val13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream2", + }, + { + "data": { + "col1": "val21b", + "col2": "val22b", + "col3": "val23b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream2", + }, + { + "data": {"col1": "val11c", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, + "stream": "stream3", + }, + { + "data": {"col1": "val21c", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, + "stream": "stream3", + }, + ] + ) + .set_expected_logs( + { + "read": [ + { + "level": "WARN", + "message": "Could not cast the value to the expected type.: col2: value=val12b,expected_type=integer", + }, + { + "level": "WARN", + "message": "Could not 
cast the value to the expected type.: col2: value=val22b,expected_type=integer", + }, + ] + } + ) +).build() + + +multi_stream_user_input_schema_scenario_skip_nonconforming_records = ( + _base_multi_stream_user_input_schema_scenario.copy() + .set_name("multi_stream_user_input_schema_scenario_skip_nonconforming_records") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["a.csv"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "string", "col2": "integer"}', + }, + { + "name": "stream2", + "format": {"filetype": "csv"}, + "globs": ["b.csv"], + "validation_policy": "Skip Record", + "input_schema": '{"col1": "string", "col2": "integer", "col3": "string"}', # this stream's records do not conform to the schema + }, + { + "name": "stream3", + "format": {"filetype": "csv"}, + "globs": ["c.csv"], + "validation_policy": "Emit Record", + }, + ] + } + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": "string"}, + "col2": {"type": "integer"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": "string"}, + "col2": {"type": "integer"}, + "col3": {"type": "string"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream2", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": ["null", "string"], + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream3", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + ] + } + ) + .set_expected_check_status("FAILED") + .set_expected_check_error(None, FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA.value) + .set_expected_records( + [ + { + "data": { + "col1": "val11a", + "col2": 21, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val12a", + "col2": 22, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + # {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + # "_ab_source_file_url": "b.csv"}, "stream": "stream2"}, + # {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + # "_ab_source_file_url": "b.csv"}, "stream": "stream2"}, + { + "data": {"col1": "val11c", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, + "stream": "stream3", + }, + { + "data": {"col1": "val21c", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, + "stream": "stream3", + }, + ] + ) + 
.set_expected_logs( + { + "read": [ + { + "level": "WARN", + "message": "Records in file did not pass validation policy. stream=stream2 file=b.csv n_skipped=2 validation_policy=skip_record", + }, + { + "level": "WARN", + "message": "Could not cast the value to the expected type.: col2: value=val12b,expected_type=integer", + }, + { + "level": "WARN", + "message": "Could not cast the value to the expected type.: col2: value=val22b,expected_type=integer", + }, + ] + } + ) +).build() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py new file mode 100644 index 000000000000..47e37dffd9c4 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py @@ -0,0 +1,729 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder +from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder + +_base_single_stream_scenario = ( + TestScenarioBuilder() + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("val_a_11", "1"), + ("val_a_12", "2"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b.csv": { # The records in this file do not conform to the schema + "contents": [ + ("col1", "col2"), + ("val_b_11", "this is text that will trigger validation policy"), + ("val_b_12", "2"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "c.csv": { + "contents": [ + ("col1",), + ("val_c_11",), + ("val_c_12", "val_c_22"), # This record is not parsable + ("val_c_13",), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "d.csv": { + "contents": [ + ("col1",), + ("val_d_11",), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": "string", + }, + "col2": { + "type": "integer", + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + } + ] + } + ) +) + + +_base_multi_stream_scenario = ( + TestScenarioBuilder() + .set_source_builder( + FileBasedSourceBuilder() + .set_files( + { + "a/a1.csv": { + "contents": [ + ("col1", "col2"), + ("val_aa1_11", "1"), + ("val_aa1_12", "2"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "a/a2.csv": { + "contents": [ + ("col1", "col2"), + ("val_aa2_11", "this is text that will trigger validation policy"), + ("val_aa2_12", "2"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "a/a3.csv": { + "contents": [ + ("col1",), + ("val_aa3_11",), + ("val_aa3_12", "val_aa3_22"), # This record is not parsable + ("val_aa3_13",), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "a/a4.csv": { + "contents": [ + ("col1",), + ("val_aa4_11",), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b/b1.csv": { # The records in this file do not conform to the schema + "contents": [ + ("col1", "col2"), + ("val_bb1_11", "1"), + ("val_bb1_12", "2"), + ], + 
"last_modified": "2023-06-05T03:54:07.000Z", + }, + "b/b2.csv": { + "contents": [ + ("col1", "col2"), + ("val_bb2_11", "this is text that will trigger validation policy"), + ("val_bb2_12", "2"), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + "b/b3.csv": { + "contents": [ + ("col1",), + ("val_bb3_11",), + ("val_bb3_12",), + ], + "last_modified": "2023-06-05T03:54:07.000Z", + }, + } + ) + .set_file_type("csv") + ) + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": "string", + }, + "col2": { + "type": "integer", + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + { + "json_schema": { + "default_cursor_field": ["_ab_source_file_last_modified"], + "type": "object", + "properties": { + "col1": { + "type": "string", + }, + "col2": { + "type": "integer", + }, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream2", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + "is_resumable": True, + }, + ] + } + ) +) + + +skip_record_scenario_single_stream = ( + _base_single_stream_scenario.copy() + .set_name("skip_record_scenario_single_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Skip Record", + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val_a_11", + "col2": 1, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val_a_12", + "col2": 2, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + # {"data": {"col1": "val_b_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform + { + "data": { + "col1": "val_b_12", + "col2": 2, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val_c_11", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + # {"data": {"col1": "val_c_12", None: "val_c_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted + # {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # Skipped since previous record is malformed + { + "data": { + "col1": "val_d_11", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "d.csv", + }, + "stream": "stream1", + }, + ] + ) + .set_expected_logs( + { + "read": [ + { + "level": "WARN", + "message": "Records in file did not pass validation policy. 
stream=stream1 file=b.csv n_skipped=1 validation_policy=skip_record", + }, + { + "level": "ERROR", + "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=c.csv line_no=2 n_skipped=0", + }, + { + "level": "WARN", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, + ] + } + ) + .set_expected_read_error( + AirbyteTracedException, + "Please check the logged errors for more information.", + ) +).build() + + +skip_record_scenario_multi_stream = ( + _base_multi_stream_scenario.copy() + .set_name("skip_record_scenario_multi_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["a/*.csv"], + "validation_policy": "Skip Record", + }, + { + "name": "stream2", + "format": {"filetype": "csv"}, + "globs": ["b/*.csv"], + "validation_policy": "Skip Record", + }, + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val_aa1_11", + "col2": 1, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a/a1.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val_aa1_12", + "col2": 2, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a/a1.csv", + }, + "stream": "stream1", + }, + # {"data": {"col1": "val_aa2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform + { + "data": { + "col1": "val_aa2_12", + "col2": 2, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a/a2.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val_aa3_11", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a/a3.csv", + }, + "stream": "stream1", + }, + # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted + # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # Skipped since previous record is malformed + { + "data": { + "col1": "val_aa4_11", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a/a4.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val_bb1_11", + "col2": 1, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b/b1.csv", + }, + "stream": "stream2", + }, + { + "data": { + "col1": "val_bb1_12", + "col2": 2, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b/b1.csv", + }, + "stream": "stream2", + }, + # {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform + { + "data": { + "col1": "val_bb2_12", + "col2": 2, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b/b2.csv", + }, + "stream": "stream2", + }, + { + "data": { + "col1": 
"val_bb3_11", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b/b3.csv", + }, + "stream": "stream2", + }, + { + "data": { + "col1": "val_bb3_12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b/b3.csv", + }, + "stream": "stream2", + }, + ] + ) + .set_expected_logs( + { + "read": [ + { + "level": "WARN", + "message": "Records in file did not pass validation policy. stream=stream1 file=a/a2.csv n_skipped=1 validation_policy=skip_record", + }, + { + "level": "ERROR", + "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=a/a3.csv line_no=2 n_skipped=0", + }, + { + "level": "WARN", + "message": "Records in file did not pass validation policy. stream=stream2 file=b/b2.csv n_skipped=1 validation_policy=skip_record", + }, + { + "level": "WARN", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, + { + "level": "WARN", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, + ] + } + ) + .set_expected_read_error( + AirbyteTracedException, + "Please check the logged errors for more information.", + ) +).build() + + +emit_record_scenario_single_stream = ( + _base_single_stream_scenario.copy() + .set_name("emit_record_scenario_single_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Emit Record", + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val_a_11", + "col2": 1, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val_a_12", + "col2": 2, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val_b_11", + "col2": "this is text that will trigger validation policy", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, # This record is skipped because it does not conform + { + "data": { + "col1": "val_b_12", + "col2": 2, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val_c_11", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "c.csv", + }, + "stream": "stream1", + }, + # {"data": {"col1": "val_c_12", None: "val_c_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted + # {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # No more records from this stream are emitted after we hit a parse error + { + "data": { + "col1": "val_d_11", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "d.csv", + }, + "stream": "stream1", + }, + ] + ) + .set_expected_read_error( + AirbyteTracedException, + "Please check the logged errors for more information.", + ) +).build() + + 
+emit_record_scenario_multi_stream = ( + _base_multi_stream_scenario.copy() + .set_name("emit_record_scenario_multi_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["a/*.csv"], + "validation_policy": "Emit Record", + }, + { + "name": "stream2", + "format": {"filetype": "csv"}, + "globs": ["b/*.csv"], + "validation_policy": "Emit Record", + }, + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "val_aa1_11", + "col2": 1, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a/a1.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val_aa1_12", + "col2": 2, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a/a1.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val_aa2_11", + "col2": "this is text that will trigger validation policy", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a/a2.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val_aa2_12", + "col2": 2, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a/a2.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val_aa3_11", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a/a3.csv", + }, + "stream": "stream1", + }, + # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted + # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # Skipped since previous record is malformed + { + "data": { + "col1": "val_aa4_11", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a/a4.csv", + }, + "stream": "stream1", + }, + { + "data": { + "col1": "val_bb1_11", + "col2": 1, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b/b1.csv", + }, + "stream": "stream2", + }, + { + "data": { + "col1": "val_bb1_12", + "col2": 2, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b/b1.csv", + }, + "stream": "stream2", + }, + { + "data": { + "col1": "val_bb2_11", + "col2": "this is text that will trigger validation policy", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b/b2.csv", + }, + "stream": "stream2", + }, + { + "data": { + "col1": "val_bb2_12", + "col2": 2, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b/b2.csv", + }, + "stream": "stream2", + }, + { + "data": { + "col1": "val_bb3_11", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b/b3.csv", + }, + "stream": "stream2", + }, + { + "data": { + "col1": "val_bb3_12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b/b3.csv", + }, + "stream": "stream2", + }, + ] + ) + .set_expected_read_error( + AirbyteTracedException, + "Please check the logged errors for more information.", + ) +).build() + + +wait_for_rediscovery_scenario_single_stream = ( + _base_single_stream_scenario.copy() + .set_name("wait_for_rediscovery_scenario_single_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": 
{"filetype": "csv"}, + "globs": ["*.csv"], + "validation_policy": "Wait for Discover", + } + ] + } + ) + .set_expected_records(None) # When syncing streams concurrently we don't know how many records will be emitted before the sync stops + .set_expected_logs( + { + "read": [ + { + "level": "WARN", + "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream=stream1 file=b.csv validation_policy=Wait for Discover n_skipped=0", + }, + { + "level": "WARN", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, + ] + } + ) +).build() + + +wait_for_rediscovery_scenario_multi_stream = ( + _base_multi_stream_scenario.copy() + .set_name("wait_for_rediscovery_scenario_multi_stream") + .set_config( + { + "streams": [ + { + "name": "stream1", + "format": {"filetype": "csv"}, + "globs": ["a/*.csv"], + "validation_policy": "Wait for Discover", + }, + { + "name": "stream2", + "format": {"filetype": "csv"}, + "globs": ["b/*.csv"], + "validation_policy": "Wait for Discover", + }, + ] + } + ) + .set_expected_records(None) # When syncing streams concurrently we don't know how many records will be emitted before the sync stops + .set_expected_logs( + { + "read": [ + { + "level": "WARN", + "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream=stream1 file=a/a2.csv validation_policy=Wait for Discover n_skipped=0", + }, + { + "level": "WARN", + "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream=stream2 file=b/b2.csv validation_policy=Wait for Discover n_skipped=0", + }, + { + "level": "WARN", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, + { + "level": "WARN", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, + ] + } + ) +).build() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/schema_validation_policies/test_default_schema_validation_policy.py b/airbyte-cdk/python/unit_tests/sources/file_based/schema_validation_policies/test_default_schema_validation_policy.py new file mode 100644 index 000000000000..a7d2bfb7c019 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/schema_validation_policies/test_default_schema_validation_policy.py @@ -0,0 +1,50 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from typing import Any, Mapping + +import pytest +from airbyte_cdk.sources.file_based.config.file_based_stream_config import ValidationPolicy +from airbyte_cdk.sources.file_based.exceptions import StopSyncPerValidationPolicy +from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES + +CONFORMING_RECORD = { + "col1": "val1", + "col2": 1, +} + +NONCONFORMING_RECORD = { + "col1": "val1", + "extra_col": "x", +} + + +SCHEMA = { + "type": "object", + "properties": { + "col1": {"type": "string"}, + "col2": {"type": "integer"}, + }, +} + + +@pytest.mark.parametrize( + "record,schema,validation_policy,expected_result", + [ + pytest.param(CONFORMING_RECORD, SCHEMA, ValidationPolicy.emit_record, True, id="record-conforms_emit_record"), + pytest.param(NONCONFORMING_RECORD, SCHEMA, ValidationPolicy.emit_record, True, id="nonconforming_emit_record"), + pytest.param(CONFORMING_RECORD, SCHEMA, ValidationPolicy.skip_record, True, id="record-conforms_skip_record"), + pytest.param(NONCONFORMING_RECORD, SCHEMA, ValidationPolicy.skip_record, False, id="nonconforming_skip_record"), + pytest.param(CONFORMING_RECORD, SCHEMA, ValidationPolicy.wait_for_discover, True, id="record-conforms_wait_for_discover"), + pytest.param(NONCONFORMING_RECORD, SCHEMA, ValidationPolicy.wait_for_discover, False, id="nonconforming_wait_for_discover"), + ], +) +def test_record_passes_validation_policy( + record: Mapping[str, Any], schema: Mapping[str, Any], validation_policy: ValidationPolicy, expected_result: bool +) -> None: + if validation_policy == ValidationPolicy.wait_for_discover and expected_result is False: + with pytest.raises(StopSyncPerValidationPolicy): + DEFAULT_SCHEMA_VALIDATION_POLICIES[validation_policy].record_passes_validation_policy(record, schema) + else: + assert DEFAULT_SCHEMA_VALIDATION_POLICIES[validation_policy].record_passes_validation_policy(record, schema) == expected_result diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/stream/__init__.py b/airbyte-cdk/python/unit_tests/sources/file_based/stream/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/stream/concurrent/__init__.py b/airbyte-cdk/python/unit_tests/sources/file_based/stream/concurrent/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/stream/concurrent/test_adapters.py b/airbyte-cdk/python/unit_tests/sources/file_based/stream/concurrent/test_adapters.py new file mode 100644 index 000000000000..9d10ff5e051b --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/stream/concurrent/test_adapters.py @@ -0,0 +1,362 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
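# Editorial sketch of the adapter flow exercised in this module (behaviour inferred from the
# assertions below, not from separate documentation): FileBasedStreamPartitionGenerator turns each
# slice returned by stream.stream_slices() into a FileBasedStreamPartition, partition.read()
# delegates to stream.read_records() while routing AirbyteMessage log entries to the message
# repository, and FileBasedStreamFacade.create_from_stream(...) wraps a DefaultFileBasedStream so
# the concurrent framework can drive it. Roughly:
#
#     generator = FileBasedStreamPartitionGenerator(stream, message_repository, sync_mode, cursor_field, state, cursor)
#     for partition in generator.generate():   # one partition per stream slice
#         for record in partition.read():      # yields records; log messages go to the repository
#             ...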
+# +import logging +import unittest +from datetime import datetime +from unittest.mock import MagicMock, Mock + +import pytest +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, Level, SyncMode +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.file_based.availability_strategy import DefaultFileBasedAvailabilityStrategy +from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.discovery_policy import DefaultDiscoveryPolicy +from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector +from airbyte_cdk.sources.file_based.file_types import default_parsers +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_validation_policies import EmitRecordPolicy +from airbyte_cdk.sources.file_based.stream import DefaultFileBasedStream +from airbyte_cdk.sources.file_based.stream.concurrent.adapters import ( + FileBasedStreamFacade, + FileBasedStreamPartition, + FileBasedStreamPartitionGenerator, +) +from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedFinalStateCursor +from airbyte_cdk.sources.message import InMemoryMessageRepository +from airbyte_cdk.sources.streams.concurrent.cursor import Cursor +from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record +from airbyte_cdk.sources.utils.slice_logger import SliceLogger +from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer +from freezegun import freeze_time + +_ANY_SYNC_MODE = SyncMode.full_refresh +_ANY_STATE = {"state_key": "state_value"} +_ANY_CURSOR_FIELD = ["a", "cursor", "key"] +_STREAM_NAME = "stream" +_ANY_CURSOR = Mock(spec=FileBasedFinalStateCursor) + + +@pytest.mark.parametrize( + "sync_mode", + [ + pytest.param(SyncMode.full_refresh, id="test_full_refresh"), + pytest.param(SyncMode.incremental, id="test_incremental"), + ], +) +def test_file_based_stream_partition_generator(sync_mode): + stream = Mock() + message_repository = Mock() + stream_slices = [ + {"files": [RemoteFile(uri="1", last_modified=datetime.now())]}, + {"files": [RemoteFile(uri="2", last_modified=datetime.now())]}, + ] + stream.stream_slices.return_value = stream_slices + + partition_generator = FileBasedStreamPartitionGenerator( + stream, message_repository, _ANY_SYNC_MODE, _ANY_CURSOR_FIELD, _ANY_STATE, _ANY_CURSOR + ) + + partitions = list(partition_generator.generate()) + slices = [partition.to_slice() for partition in partitions] + assert slices == stream_slices + stream.stream_slices.assert_called_once_with(sync_mode=_ANY_SYNC_MODE, cursor_field=_ANY_CURSOR_FIELD, stream_state=_ANY_STATE) + + +@pytest.mark.parametrize( + "transformer, expected_records", + [ + pytest.param( + TypeTransformer(TransformConfig.NoTransform), + [Record({"data": "1"}, Mock(spec=FileBasedStreamPartition, stream_name=Mock(return_value=_STREAM_NAME))), Record({"data": "2"}, Mock(spec=FileBasedStreamPartition, stream_name=Mock(return_value=_STREAM_NAME)))], + id="test_no_transform", + ), + pytest.param( + TypeTransformer(TransformConfig.DefaultSchemaNormalization), + [Record({"data": 1}, Mock(spec=FileBasedStreamPartition, stream_name=Mock(return_value=_STREAM_NAME))), Record({"data": 2}, Mock(spec=FileBasedStreamPartition, 
stream_name=Mock(return_value=_STREAM_NAME)))], + id="test_default_transform", + ), + ], +) +def test_file_based_stream_partition(transformer, expected_records): + stream = Mock() + stream.name = _STREAM_NAME + stream.get_json_schema.return_value = {"type": "object", "properties": {"data": {"type": ["integer"]}}} + stream.transformer = transformer + message_repository = InMemoryMessageRepository() + _slice = None + sync_mode = SyncMode.full_refresh + cursor_field = None + state = None + partition = FileBasedStreamPartition(stream, _slice, message_repository, sync_mode, cursor_field, state, _ANY_CURSOR) + + a_log_message = AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.INFO, + message='slice:{"partition": 1}', + ), + ) + + stream_data = [a_log_message, {"data": "1"}, {"data": "2"}] + stream.read_records.return_value = stream_data + + records = list(partition.read()) + messages = list(message_repository.consume_queue()) + + assert records == expected_records + assert messages == [a_log_message] + + +@pytest.mark.parametrize( + "exception_type, expected_display_message", + [ + pytest.param(Exception, None, id="test_exception_no_display_message"), + pytest.param(ExceptionWithDisplayMessage, "display_message", id="test_exception_no_display_message"), + ], +) +def test_file_based_stream_partition_raising_exception(exception_type, expected_display_message): + stream = Mock() + stream.get_error_display_message.return_value = expected_display_message + + message_repository = InMemoryMessageRepository() + _slice = None + + partition = FileBasedStreamPartition(stream, _slice, message_repository, _ANY_SYNC_MODE, _ANY_CURSOR_FIELD, _ANY_STATE, _ANY_CURSOR) + + stream.read_records.side_effect = Exception() + + with pytest.raises(exception_type) as e: + list(partition.read()) + if isinstance(e, ExceptionWithDisplayMessage): + assert e.display_message == "display message" + + +@freeze_time("2023-06-09T00:00:00Z") +@pytest.mark.parametrize( + "_slice, expected_hash", + [ + pytest.param( + {"files": [RemoteFile(uri="1", last_modified=datetime.strptime("2023-06-09T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ"))]}, + hash(("stream", "2023-06-09T00:00:00.000000Z_1")), + id="test_hash_with_slice", + ), + pytest.param(None, hash("stream"), id="test_hash_no_slice"), + ], +) +def test_file_based_stream_partition_hash(_slice, expected_hash): + stream = Mock() + stream.name = "stream" + partition = FileBasedStreamPartition(stream, _slice, Mock(), _ANY_SYNC_MODE, _ANY_CURSOR_FIELD, _ANY_STATE, _ANY_CURSOR) + + _hash = partition.__hash__() + assert _hash == expected_hash + + +class StreamFacadeTest(unittest.TestCase): + def setUp(self): + self._abstract_stream = Mock() + self._abstract_stream.name = "stream" + self._abstract_stream.as_airbyte_stream.return_value = AirbyteStream( + name="stream", + json_schema={"type": "object"}, + supported_sync_modes=[SyncMode.full_refresh], + ) + self._legacy_stream = DefaultFileBasedStream( + cursor=FileBasedFinalStateCursor(stream_config=MagicMock(), stream_namespace=None, message_repository=Mock()), + config=FileBasedStreamConfig(name="stream", format=CsvFormat()), + catalog_schema={}, + stream_reader=MagicMock(), + availability_strategy=DefaultFileBasedAvailabilityStrategy(MagicMock()), + discovery_policy=DefaultDiscoveryPolicy(), + parsers=default_parsers, + validation_policy=EmitRecordPolicy(), + errors_collector=FileBasedErrorsCollector(), + ) + self._cursor = Mock(spec=Cursor) + self._logger = Mock() + self._slice_logger = Mock() + 
self._slice_logger.should_log_slice_message.return_value = False + self._facade = FileBasedStreamFacade(self._abstract_stream, self._legacy_stream, self._cursor, self._slice_logger, self._logger) + self._source = Mock() + + self._stream = Mock() + self._stream.primary_key = "id" + + def test_name_is_delegated_to_wrapped_stream(self): + assert self._facade.name == self._abstract_stream.name + + def test_cursor_field_is_a_string(self): + self._abstract_stream.cursor_field = "cursor_field" + assert self._facade.cursor_field == "cursor_field" + + def test_source_defined_cursor_is_true(self): + assert self._facade.source_defined_cursor + + def test_json_schema_is_delegated_to_wrapped_stream(self): + json_schema = {"type": "object"} + self._abstract_stream.get_json_schema.return_value = json_schema + assert self._facade.get_json_schema() == json_schema + self._abstract_stream.get_json_schema.assert_called_once_with() + + def test_given_cursor_is_noop_when_supports_incremental_then_return_legacy_stream_response(self): + assert ( + FileBasedStreamFacade( + self._abstract_stream, self._legacy_stream, _ANY_CURSOR, Mock(spec=SliceLogger), Mock(spec=logging.Logger) + ).supports_incremental + == self._legacy_stream.supports_incremental + ) + + def test_given_cursor_is_not_noop_when_supports_incremental_then_return_true(self): + assert FileBasedStreamFacade( + self._abstract_stream, self._legacy_stream, Mock(spec=Cursor), Mock(spec=SliceLogger), Mock(spec=logging.Logger) + ).supports_incremental + + def test_full_refresh(self): + expected_stream_data = [{"data": 1}, {"data": 2}] + + partition = Mock() + records = [Record(data, partition) for data in expected_stream_data] + partition.read.return_value = records + self._abstract_stream.generate_partitions.return_value = [partition] + + actual_stream_data = list(self._facade.read_records(SyncMode.full_refresh, None, {}, None)) + + assert actual_stream_data == expected_stream_data + + def test_read_records(self): + expected_stream_data = [{"data": 1}, {"data": 2}] + records = [Record(data, "stream") for data in expected_stream_data] + partition = Mock() + partition.read.return_value = records + self._abstract_stream.generate_partitions.return_value = [partition] + + actual_stream_data = list(self._facade.read(None, None, None, None, None, None)) + + assert actual_stream_data == expected_stream_data + + def test_create_from_stream_stream(self): + stream = Mock() + stream.name = "stream" + stream.primary_key = "id" + stream.cursor_field = "cursor" + + facade = FileBasedStreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + + assert facade.name == "stream" + assert facade.cursor_field == "cursor" + assert facade._abstract_stream._primary_key == ["id"] + + def test_create_from_stream_stream_with_none_primary_key(self): + stream = Mock() + stream.name = "stream" + stream.primary_key = None + stream.cursor_field = [] + + facade = FileBasedStreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + assert facade._abstract_stream._primary_key == [] + + def test_create_from_stream_with_composite_primary_key(self): + stream = Mock() + stream.name = "stream" + stream.primary_key = ["id", "name"] + stream.cursor_field = [] + + facade = FileBasedStreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + assert facade._abstract_stream._primary_key == ["id", "name"] + + def test_create_from_stream_with_empty_list_cursor(self): + stream = Mock() + 
stream.primary_key = "id" + stream.cursor_field = [] + + facade = FileBasedStreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + + assert facade.cursor_field == [] + + def test_create_from_stream_raises_exception_if_primary_key_is_nested(self): + stream = Mock() + stream.name = "stream" + stream.primary_key = [["field", "id"]] + + with self.assertRaises(ValueError): + FileBasedStreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + + def test_create_from_stream_raises_exception_if_primary_key_has_invalid_type(self): + stream = Mock() + stream.name = "stream" + stream.primary_key = 123 + + with self.assertRaises(ValueError): + FileBasedStreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + + def test_create_from_stream_raises_exception_if_cursor_field_is_nested(self): + stream = Mock() + stream.name = "stream" + stream.primary_key = "id" + stream.cursor_field = ["field", "cursor"] + + with self.assertRaises(ValueError): + FileBasedStreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + + def test_create_from_stream_with_cursor_field_as_list(self): + stream = Mock() + stream.name = "stream" + stream.primary_key = "id" + stream.cursor_field = ["cursor"] + + facade = FileBasedStreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + assert facade.cursor_field == "cursor" + + def test_create_from_stream_none_message_repository(self): + self._stream.name = "stream" + self._stream.primary_key = "id" + self._stream.cursor_field = "cursor" + self._source.message_repository = None + + with self.assertRaises(ValueError): + FileBasedStreamFacade.create_from_stream(self._stream, self._source, self._logger, {}, self._cursor) + + def test_get_error_display_message_no_display_message(self): + self._stream.get_error_display_message.return_value = "display_message" + + facade = FileBasedStreamFacade.create_from_stream(self._stream, self._source, self._logger, _ANY_STATE, self._cursor) + + expected_display_message = None + e = Exception() + + display_message = facade.get_error_display_message(e) + + assert display_message == expected_display_message + + def test_get_error_display_message_with_display_message(self): + self._stream.get_error_display_message.return_value = "display_message" + + facade = FileBasedStreamFacade.create_from_stream(self._stream, self._source, self._logger, _ANY_STATE, self._cursor) + + expected_display_message = "display_message" + e = ExceptionWithDisplayMessage("display_message") + + display_message = facade.get_error_display_message(e) + + assert display_message == expected_display_message + + +@pytest.mark.parametrize( + "exception, expected_display_message", + [ + pytest.param(Exception("message"), None, id="test_no_display_message"), + pytest.param(ExceptionWithDisplayMessage("message"), "message", id="test_no_display_message"), + ], +) +def test_get_error_display_message(exception, expected_display_message): + stream = Mock() + legacy_stream = Mock() + cursor = Mock(spec=Cursor) + facade = FileBasedStreamFacade(stream, legacy_stream, cursor, Mock().Mock(), Mock()) + + display_message = facade.get_error_display_message(exception) + + assert display_message == expected_display_message diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py 
b/airbyte-cdk/python/unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py new file mode 100644 index 000000000000..96c907901a38 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py @@ -0,0 +1,455 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + + +from datetime import datetime +from typing import Any, Dict, List, MutableMapping, Optional, Tuple +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.models import AirbyteStateMessage, SyncMode +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition +from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedConcurrentCursor +from airbyte_cdk.sources.streams.concurrent.cursor import CursorField +from freezegun import freeze_time + +DATE_TIME_FORMAT = FileBasedConcurrentCursor.DATE_TIME_FORMAT +MOCK_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = 3 + + +def _make_cursor(input_state: Optional[MutableMapping[str, Any]]) -> FileBasedConcurrentCursor: + stream = MagicMock() + stream.name = "test" + stream.namespace = None + stream_config = MagicMock() + stream_config.days_to_sync_if_history_is_full = MOCK_DAYS_TO_SYNC_IF_HISTORY_IS_FULL + cursor = FileBasedConcurrentCursor( + stream_config, + stream.name, + None, + input_state, + MagicMock(), + ConnectorStateManager(state=[AirbyteStateMessage(input_state)] if input_state is not None else None), + CursorField(FileBasedConcurrentCursor.CURSOR_FIELD), + ) + return cursor + + +@pytest.mark.parametrize( + "input_state, expected_cursor_value", + [ + pytest.param({}, (datetime.min, ""), id="no-state-gives-min-cursor"), + pytest.param({"history": {}}, (datetime.min, ""), id="missing-cursor-field-gives-min-cursor"), + pytest.param( + {"history": {"a.csv": "2021-01-01T00:00:00.000000Z"}, "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"}, + (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"), + id="cursor-value-matches-earliest-file", + ), + pytest.param( + {"history": {"a.csv": "2021-01-01T00:00:00.000000Z"}, "_ab_source_file_last_modified": "2020-01-01T00:00:00.000000Z_a.csv"}, + (datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"), + id="cursor-value-is-earlier", + ), + pytest.param( + {"history": {"a.csv": "2022-01-01T00:00:00.000000Z"}, "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"}, + (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"), + id="cursor-value-is-later", + ), + pytest.param( + { + "history": { + "a.csv": "2021-01-01T00:00:00.000000Z", + "b.csv": "2021-01-02T00:00:00.000000Z", + "c.csv": "2021-01-03T00:00:00.000000Z", + }, + "_ab_source_file_last_modified": "2021-01-04T00:00:00.000000Z_d.csv", + }, + (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"), + id="cursor-not-earliest", + ), + pytest.param( + {"history": {"b.csv": "2020-12-31T00:00:00.000000Z"}, "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"}, + (datetime.strptime("2020-12-31T00:00:00.000000Z", DATE_TIME_FORMAT), "b.csv"), + id="state-with-cursor-and-earlier-history", + ), + pytest.param( + {"history": {"b.csv": "2021-01-02T00:00:00.000000Z"}, "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"}, + 
(datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"), + id="state-with-cursor-and-later-history", + ), + ], +) +def test_compute_prev_sync_cursor(input_state: MutableMapping[str, Any], expected_cursor_value: Tuple[datetime, str]): + cursor = _make_cursor(input_state) + assert cursor._compute_prev_sync_cursor(input_state) == expected_cursor_value + + +@pytest.mark.parametrize( + "initial_state, pending_files, file_to_add, expected_history, expected_pending_files, expected_cursor_value", + [ + pytest.param( + {"history": {}}, + [("newfile.csv", "2021-01-05T00:00:00.000000Z")], + ("newfile.csv", "2021-01-05T00:00:00.000000Z"), + {"newfile.csv": "2021-01-05T00:00:00.000000Z"}, + [], + "2021-01-05T00:00:00.000000Z_newfile.csv", + id="add-to-empty-history-single-pending-file", + ), + pytest.param( + {"history": {}}, + [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2020-01-05T00:00:00.000000Z")], + ("newfile.csv", "2021-01-05T00:00:00.000000Z"), + {"newfile.csv": "2021-01-05T00:00:00.000000Z"}, + [("pending.csv", "2020-01-05T00:00:00.000000Z")], + "2020-01-05T00:00:00.000000Z_pending.csv", + id="add-to-empty-history-pending-file-is-older", + ), + pytest.param( + {"history": {}}, + [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2022-01-05T00:00:00.000000Z")], + ("newfile.csv", "2021-01-05T00:00:00.000000Z"), + {"newfile.csv": "2021-01-05T00:00:00.000000Z"}, + [("pending.csv", "2022-01-05T00:00:00.000000Z")], + "2022-01-05T00:00:00.000000Z_pending.csv", + id="add-to-empty-history-pending-file-is-newer", + ), + pytest.param( + {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}}, + [("newfile.csv", "2021-01-05T00:00:00.000000Z")], + ("newfile.csv", "2021-01-05T00:00:00.000000Z"), + {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"}, + [], + "2021-01-05T00:00:00.000000Z_newfile.csv", + id="add-to-nonempty-history-single-pending-file", + ), + pytest.param( + {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}}, + [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2020-01-05T00:00:00.000000Z")], + ("newfile.csv", "2021-01-05T00:00:00.000000Z"), + {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"}, + [("pending.csv", "2020-01-05T00:00:00.000000Z")], + "2020-01-05T00:00:00.000000Z_pending.csv", + id="add-to-nonempty-history-pending-file-is-older", + ), + pytest.param( + {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}}, + [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2022-01-05T00:00:00.000000Z")], + ("newfile.csv", "2021-01-05T00:00:00.000000Z"), + {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"}, + [("pending.csv", "2022-01-05T00:00:00.000000Z")], + "2022-01-05T00:00:00.000000Z_pending.csv", + id="add-to-nonempty-history-pending-file-is-newer", + ), + ], +) +def test_add_file( + initial_state: MutableMapping[str, Any], + pending_files: List[Tuple[str, str]], + file_to_add: Tuple[str, str], + expected_history: Dict[str, Any], + expected_pending_files: List[Tuple[str, str]], + expected_cursor_value: str, +): + cursor = _make_cursor(initial_state) + mock_message_repository = MagicMock() + cursor._message_repository = mock_message_repository + stream = MagicMock() + + cursor.set_pending_partitions( + [ + FileBasedStreamPartition( + stream, + {"files": [RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT))]}, + 
mock_message_repository, + SyncMode.full_refresh, + FileBasedConcurrentCursor.CURSOR_FIELD, + initial_state, + cursor, + ) + for uri, timestamp in pending_files + ] + ) + + uri, timestamp = file_to_add + cursor.add_file(RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT))) + assert cursor._file_to_datetime_history == expected_history + assert cursor._pending_files == { + uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in expected_pending_files + } + assert ( + mock_message_repository.emit_message.call_args_list[0].args[0].state.stream.stream_state._ab_source_file_last_modified + == expected_cursor_value + ) + + +@pytest.mark.parametrize( + "initial_state, pending_files, file_to_add, expected_history, expected_pending_files, expected_cursor_value", + [ + pytest.param( + {"history": {}}, + [], + ("newfile.csv", "2021-01-05T00:00:00.000000Z"), + {"newfile.csv": "2021-01-05T00:00:00.000000Z"}, + [], + "2021-01-05T00:00:00.000000Z_newfile.csv", + id="add-to-empty-history-no-pending-files", + ), + pytest.param( + {"history": {}}, + [("pending.csv", "2021-01-05T00:00:00.000000Z")], + ("newfile.csv", "2021-01-05T00:00:00.000000Z"), + {"newfile.csv": "2021-01-05T00:00:00.000000Z"}, + [("pending.csv", "2021-01-05T00:00:00.000000Z")], + "2021-01-05T00:00:00.000000Z_pending.csv", + id="add-to-empty-history-file-not-in-pending-files", + ), + ], +) +def test_add_file_invalid( + initial_state: MutableMapping[str, Any], + pending_files: List[Tuple[str, str]], + file_to_add: Tuple[str, str], + expected_history: Dict[str, Any], + expected_pending_files: List[Tuple[str, str]], + expected_cursor_value: str, +): + cursor = _make_cursor(initial_state) + cursor._pending_files = { + uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in pending_files + } + mock_message_repository = MagicMock() + cursor._message_repository = mock_message_repository + + uri, timestamp = file_to_add + cursor.add_file(RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT))) + assert cursor._file_to_datetime_history == expected_history + assert cursor._pending_files == { + uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in expected_pending_files + } + assert mock_message_repository.emit_message.call_args_list[0].args[0].log.level.value == "WARN" + assert ( + mock_message_repository.emit_message.call_args_list[1].args[0].state.stream.stream_state._ab_source_file_last_modified + == expected_cursor_value + ) + + +@pytest.mark.parametrize( + "input_state, pending_files, expected_cursor_value", + [ + pytest.param({}, [], f"{datetime.min.strftime('%Y-%m-%dT%H:%M:%S.%fZ')}_", id="no-state-no-pending"), + pytest.param( + {"history": {"a.csv": "2021-01-01T00:00:00.000000Z"}}, [], "2021-01-01T00:00:00.000000Z_a.csv", id="no-pending-with-history" + ), + pytest.param( + {"history": {}}, [("b.csv", "2021-01-02T00:00:00.000000Z")], "2021-01-02T00:00:00.000000Z_b.csv", id="pending-no-history" + ), + pytest.param( + {"history": {"a.csv": "2022-01-01T00:00:00.000000Z"}}, + [("b.csv", "2021-01-02T00:00:00.000000Z")], + "2021-01-01T00:00:00.000000Z_a.csv", + id="with-pending-before-history", + ), + pytest.param( + {"history": {"a.csv": "2021-01-01T00:00:00.000000Z"}}, + [("b.csv", "2022-01-02T00:00:00.000000Z")], + "2022-01-01T00:00:00.000000Z_a.csv", + id="with-pending-after-history", + ), + ], +) +def 
test_get_new_cursor_value(input_state: MutableMapping[str, Any], pending_files: List[Tuple[str, str]], expected_cursor_value: str): + cursor = _make_cursor(input_state) + pending_partitions = [] + for url, timestamp in pending_files: + partition = MagicMock() + partition.to_slice = lambda *args, **kwargs: { + "files": [RemoteFile(uri=url, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT))] + } + pending_partitions.append(partition) + + cursor.set_pending_partitions(pending_partitions) + + +@pytest.mark.parametrize( + "all_files, history, is_history_full, prev_cursor_value, expected_files_to_sync", + [ + pytest.param( + [RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))], + {}, + False, + (datetime.min, ""), + ["new.csv"], + id="empty-history-one-new-file", + ), + pytest.param( + [RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-02T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))], + {"a.csv": "2021-01-01T00:00:00.000000Z"}, + False, + (datetime.min, ""), + ["a.csv"], + id="non-empty-history-file-in-history-modified", + ), + pytest.param( + [RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))], + {"a.csv": "2021-01-01T00:00:00.000000Z"}, + False, + (datetime.min, ""), + [], + id="non-empty-history-file-in-history-not-modified", + ), + ], +) +def test_get_files_to_sync(all_files, history, is_history_full, prev_cursor_value, expected_files_to_sync): + cursor = _make_cursor({}) + cursor._file_to_datetime_history = history + cursor._prev_cursor_value = prev_cursor_value + cursor._is_history_full = MagicMock(return_value=is_history_full) + files_to_sync = list(cursor.get_files_to_sync(all_files, MagicMock())) + assert [f.uri for f in files_to_sync] == expected_files_to_sync + + +@freeze_time("2023-06-16T00:00:00Z") +@pytest.mark.parametrize( + "file_to_check, history, is_history_full, prev_cursor_value, sync_start, expected_should_sync", + [ + pytest.param( + RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", DATE_TIME_FORMAT)), + {}, + False, + (datetime.min, ""), + datetime.min, + True, + id="file-not-in-history-not-full-old-cursor", + ), + pytest.param( + RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", DATE_TIME_FORMAT)), + {}, + False, + (datetime.strptime("2024-01-02T00:00:00.000000Z", DATE_TIME_FORMAT), ""), + datetime.min, + True, + id="file-not-in-history-not-full-new-cursor", + ), + pytest.param( + RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)), + {"a.csv": "2021-01-01T00:00:00.000000Z"}, + False, + (datetime.min, ""), + datetime.min, + False, + id="file-in-history-not-modified", + ), + pytest.param( + RemoteFile(uri="a.csv", last_modified=datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)), + {"a.csv": "2021-01-01T00:00:00.000000Z"}, + False, + (datetime.min, ""), + datetime.min, + False, + id="file-in-history-modified-before", + ), + pytest.param( + RemoteFile(uri="a.csv", last_modified=datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)), + {"a.csv": "2021-01-01T00:00:00.000000Z"}, + False, + (datetime.min, ""), + datetime.min, + True, + id="file-in-history-modified-after", + ), + pytest.param( + RemoteFile(uri="new.csv", last_modified=datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)), + {}, + True, + (datetime.strptime("2021-01-02T00:00:00.000000Z", 
DATE_TIME_FORMAT), "a.csv"), + datetime.min, + True, + id="history-full-file-modified-after-cursor", + ), + pytest.param( + RemoteFile(uri="new1.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)), + {}, + True, + (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "new0.csv"), + datetime.min, + True, + id="history-full-modified-eq-cursor-uri-gt", + ), + pytest.param( + RemoteFile(uri="new0.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)), + {}, + True, + (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "new1.csv"), + datetime.min, + False, + id="history-full-modified-eq-cursor-uri-lt", + ), + pytest.param( + RemoteFile(uri="new.csv", last_modified=datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)), + {}, + True, + (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"), + datetime.min, + True, + id="history-full-modified-before-cursor-and-after-sync-start", + ), + pytest.param( + RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)), + {}, + True, + (datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"), + datetime.strptime("2024-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), + False, + id="history-full-modified-before-cursor-and-before-sync-start", + ), + ], +) +def test_should_sync_file( + file_to_check: RemoteFile, + history: Dict[str, Any], + is_history_full: bool, + prev_cursor_value: Tuple[datetime, str], + sync_start: datetime, + expected_should_sync: bool, +): + cursor = _make_cursor({}) + cursor._file_to_datetime_history = history + cursor._prev_cursor_value = prev_cursor_value + cursor._sync_start = sync_start + cursor._is_history_full = MagicMock(return_value=is_history_full) + should_sync = cursor._should_sync_file(file_to_check, MagicMock()) + assert should_sync == expected_should_sync + + +@freeze_time("2023-06-16T00:00:00Z") +@pytest.mark.parametrize( + "input_history, is_history_full, expected_start_time", + [ + pytest.param({}, False, datetime.min, id="empty-history"), + pytest.param( + {"a.csv": "2021-01-01T00:00:00.000000Z"}, + False, + datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), + id="non-full-history", + ), + pytest.param( + {f"file{i}.csv": f"2021-01-0{i}T00:00:00.000000Z" for i in range(1, 4)}, # all before the time window + True, + datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), # Time window start time + id="full-history-earliest-before-window", + ), + pytest.param( + {f"file{i}.csv": f"2024-01-0{i}T00:00:00.000000Z" for i in range(1, 4)}, # all after the time window + True, + datetime.strptime("2023-06-13T00:00:00.000000Z", DATE_TIME_FORMAT), # Earliest file time + id="full-history-earliest-after-window", + ), + ], +) +def test_compute_start_time(input_history, is_history_full, expected_start_time, monkeypatch): + cursor = _make_cursor({"history": input_history}) + cursor._file_to_datetime_history = input_history + cursor._is_history_full = MagicMock(return_value=is_history_full) + assert cursor._compute_start_time() == expected_start_time diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/stream/test_default_file_based_cursor.py b/airbyte-cdk/python/unit_tests/sources/file_based/stream/test_default_file_based_cursor.py new file mode 100644 index 000000000000..957ed912aa4b --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/stream/test_default_file_based_cursor.py @@ -0,0 
+1,310 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from datetime import datetime, timedelta +from typing import Any, List, Mapping +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat +from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor +from freezegun import freeze_time + + +@pytest.mark.parametrize( + "files_to_add, expected_start_time, expected_state_dict", + [ + pytest.param( + [ + RemoteFile( + uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="b.csv", last_modified=datetime.strptime("2021-01-02T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="c.csv", last_modified=datetime.strptime("2020-12-31T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + ], + [datetime(2021, 1, 1), datetime(2021, 1, 1), datetime(2020, 12, 31)], + { + "history": { + "a.csv": "2021-01-01T00:00:00.000000Z", + "b.csv": "2021-01-02T00:00:00.000000Z", + "c.csv": "2020-12-31T00:00:00.000000Z", + }, + "_ab_source_file_last_modified": "2021-01-02T00:00:00.000000Z_b.csv", + }, + id="test_file_start_time_is_earliest_time_in_history", + ), + pytest.param( + [ + RemoteFile( + uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="b.csv", last_modified=datetime.strptime("2021-01-02T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="c.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="d.csv", last_modified=datetime.strptime("2021-01-04T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + ], + [datetime(2021, 1, 1), datetime(2021, 1, 1), datetime(2021, 1, 1), datetime(2021, 1, 2)], + { + "history": { + "b.csv": "2021-01-02T00:00:00.000000Z", + "c.csv": "2021-01-03T00:00:00.000000Z", + "d.csv": "2021-01-04T00:00:00.000000Z", + }, + "_ab_source_file_last_modified": "2021-01-04T00:00:00.000000Z_d.csv", + }, + id="test_earliest_file_is_removed_from_history_if_history_is_full", + ), + pytest.param( + [ + RemoteFile( + uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="file_with_same_timestamp_as_b.csv", + last_modified=datetime.strptime("2021-01-02T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), + file_type="csv", + ), + RemoteFile( + uri="b.csv", last_modified=datetime.strptime("2021-01-02T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="c.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="d.csv", last_modified=datetime.strptime("2021-01-04T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + ], + [ + datetime(2021, 1, 1), + datetime(2021, 1, 1), + datetime(2021, 1, 1), + datetime(2021, 1, 2), + datetime(2021, 1, 2), + ], + { + "history": { + "file_with_same_timestamp_as_b.csv": "2021-01-02T00:00:00.000000Z", + "c.csv": "2021-01-03T00:00:00.000000Z", + "d.csv": "2021-01-04T00:00:00.000000Z", + }, + "_ab_source_file_last_modified": 
"2021-01-04T00:00:00.000000Z_d.csv", + }, + id="test_files_are_sorted_by_timestamp_and_by_name", + ), + ], +) +def test_add_file(files_to_add: List[RemoteFile], expected_start_time: List[datetime], expected_state_dict: Mapping[str, Any]) -> None: + cursor = get_cursor(max_history_size=3, days_to_sync_if_history_is_full=3) + assert cursor._compute_start_time() == datetime.min + + for index, f in enumerate(files_to_add): + cursor.add_file(f) + assert cursor._compute_start_time() == expected_start_time[index] + assert cursor.get_state() == expected_state_dict + + +@pytest.mark.parametrize( + "files, expected_files_to_sync, max_history_size, history_is_partial", + [ + pytest.param( + [ + RemoteFile( + uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="b.csv", last_modified=datetime.strptime("2021-01-02T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="c.csv", last_modified=datetime.strptime("2020-12-31T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + ], + [ + RemoteFile( + uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="b.csv", last_modified=datetime.strptime("2021-01-02T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="c.csv", last_modified=datetime.strptime("2020-12-31T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + ], + 3, + True, + id="test_all_files_should_be_synced", + ), + pytest.param( + [ + RemoteFile( + uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="b.csv", last_modified=datetime.strptime("2021-01-02T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="c.csv", last_modified=datetime.strptime("2020-12-31T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + ], + [ + RemoteFile( + uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="b.csv", last_modified=datetime.strptime("2021-01-02T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + RemoteFile( + uri="c.csv", last_modified=datetime.strptime("2020-12-31T00:00:00.000Z", "%Y-%m-%dT%H:%M:%S.%fZ"), file_type="csv" + ), + ], + 2, + True, + id="test_sync_more_files_than_history_size", + ), + ], +) +def test_get_files_to_sync( + files: List[RemoteFile], expected_files_to_sync: List[RemoteFile], max_history_size: int, history_is_partial: bool +) -> None: + logger = MagicMock() + cursor = get_cursor(max_history_size, 3) + + files_to_sync = list(cursor.get_files_to_sync(files, logger)) + for f in files_to_sync: + cursor.add_file(f) + + assert files_to_sync == expected_files_to_sync + assert cursor._is_history_full() == history_is_partial + + +@freeze_time("2023-06-16T00:00:00Z") +def test_only_recent_files_are_synced_if_history_is_full() -> None: + logger = MagicMock() + cursor = get_cursor(2, 3) + + files_in_history = [ + RemoteFile(uri="b1.csv", last_modified=datetime(2021, 1, 2), file_type="csv"), + RemoteFile(uri="b2.csv", last_modified=datetime(2021, 1, 3), file_type="csv"), + ] + + state = { + "history": {f.uri: f.last_modified.strftime(DefaultFileBasedCursor.DATE_TIME_FORMAT) for f in files_in_history}, + } + cursor.set_initial_state(state) + + files = [ + RemoteFile(uri="a.csv", last_modified=datetime(2021, 1, 1), 
file_type="csv"), + RemoteFile(uri="c.csv", last_modified=datetime(2021, 1, 2), file_type="csv"), + RemoteFile(uri="d.csv", last_modified=datetime(2021, 1, 4), file_type="csv"), + ] + + expected_files_to_sync = [ + RemoteFile(uri="c.csv", last_modified=datetime(2021, 1, 2), file_type="csv"), + RemoteFile(uri="d.csv", last_modified=datetime(2021, 1, 4), file_type="csv"), + ] + + files_to_sync = list(cursor.get_files_to_sync(files, logger)) + assert files_to_sync == expected_files_to_sync + logger.warning.assert_called_once() + + +@pytest.mark.parametrize( + "modified_at_delta, should_sync_file", + [ + pytest.param(timedelta(days=-1), False, id="test_modified_at_is_earlier"), + pytest.param(timedelta(days=0), False, id="test_modified_at_is_equal"), + pytest.param(timedelta(days=1), True, id="test_modified_at_is_more_recent"), + ], +) +def test_sync_file_already_present_in_history(modified_at_delta: timedelta, should_sync_file: bool) -> None: + logger = MagicMock() + cursor = get_cursor(2, 3) + original_modified_at = datetime(2021, 1, 2) + filename = "a.csv" + files_in_history = [ + RemoteFile(uri=filename, last_modified=original_modified_at, file_type="csv"), + ] + + state = { + "history": {f.uri: f.last_modified.strftime(DefaultFileBasedCursor.DATE_TIME_FORMAT) for f in files_in_history}, + } + cursor.set_initial_state(state) + + files = [ + RemoteFile(uri=filename, last_modified=original_modified_at + modified_at_delta, file_type="csv"), + ] + + files_to_sync = list(cursor.get_files_to_sync(files, logger)) + assert bool(files_to_sync) == should_sync_file + + +@freeze_time("2023-06-06T00:00:00Z") +@pytest.mark.parametrize( + "file_name, last_modified, earliest_dt_in_history, should_sync_file", + [ + pytest.param("a.csv", datetime(2023, 6, 3), datetime(2023, 6, 6), True, id="test_last_modified_is_equal_to_time_buffer"), + pytest.param("b.csv", datetime(2023, 6, 6), datetime(2023, 6, 6), False, id="test_file_was_already_synced"), + pytest.param("b.csv", datetime(2023, 6, 7), datetime(2023, 6, 6), True, id="test_file_was_synced_in_the_past"), + pytest.param( + "b.csv", + datetime(2023, 6, 3), + datetime(2023, 6, 6), + False, + id="test_file_was_synced_in_the_past_but_last_modified_is_earlier_in_history", + ), + pytest.param( + "a.csv", + datetime(2023, 6, 3), + datetime(2023, 6, 3), + False, + id="test_last_modified_is_equal_to_earliest_dt_in_history_and_lexicographically_smaller", + ), + pytest.param( + "c.csv", + datetime(2023, 6, 3), + datetime(2023, 6, 3), + True, + id="test_last_modified_is_equal_to_earliest_dt_in_history_and_lexicographically_greater", + ), + ], +) +def test_should_sync_file(file_name: str, last_modified: datetime, earliest_dt_in_history: datetime, should_sync_file: bool) -> None: + logger = MagicMock() + cursor = get_cursor(1, 3) + + cursor.add_file(RemoteFile(uri="b.csv", last_modified=earliest_dt_in_history, file_type="csv")) + cursor._start_time = cursor._compute_start_time() + cursor._initial_earliest_file_in_history = cursor._compute_earliest_file_in_history() + + assert ( + bool(list(cursor.get_files_to_sync([RemoteFile(uri=file_name, last_modified=last_modified, file_type="csv")], logger))) + == should_sync_file + ) + + +def test_set_initial_state_no_history() -> None: + cursor = get_cursor(1, 3) + cursor.set_initial_state({}) + + +def get_cursor(max_history_size: int, days_to_sync_if_history_is_full: int) -> DefaultFileBasedCursor: + cursor_cls = DefaultFileBasedCursor + cursor_cls.DEFAULT_MAX_HISTORY_SIZE = max_history_size + config = FileBasedStreamConfig( + 
format=CsvFormat(), + name="test", + validation_policy=ValidationPolicy.emit_record, + days_to_sync_if_history_is_full=days_to_sync_if_history_is_full, + ) + return cursor_cls(config) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/stream/test_default_file_based_stream.py b/airbyte-cdk/python/unit_tests/sources/file_based/stream/test_default_file_based_stream.py new file mode 100644 index 000000000000..f6c3192734bc --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/stream/test_default_file_based_stream.py @@ -0,0 +1,299 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import traceback +import unittest +from datetime import datetime, timezone +from typing import Any, Iterable, Iterator, Mapping +from unittest import mock +from unittest.mock import Mock + +import pytest +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy +from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy +from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader +from airbyte_cdk.sources.file_based.file_types import FileTransfer +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy +from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor +from airbyte_cdk.sources.file_based.stream.default_file_based_stream import DefaultFileBasedStream +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + + +class MockFormat: + pass + + +@pytest.mark.parametrize( + "input_schema, expected_output", + [ + pytest.param({}, {}, id="empty-schema"), + pytest.param( + {"type": "string"}, + {"type": ["null", "string"]}, + id="simple-schema", + ), + pytest.param( + {"type": ["string"]}, + {"type": ["null", "string"]}, + id="simple-schema-list-type", + ), + pytest.param( + {"type": ["null", "string"]}, + {"type": ["null", "string"]}, + id="simple-schema-already-has-null", + ), + pytest.param( + {"properties": {"type": "string"}}, + {"properties": {"type": ["null", "string"]}}, + id="nested-schema", + ), + pytest.param( + {"items": {"type": "string"}}, + {"items": {"type": ["null", "string"]}}, + id="array-schema", + ), + pytest.param( + {"type": "object", "properties": {"prop": {"type": "string"}}}, + { + "type": ["null", "object"], + "properties": {"prop": {"type": ["null", "string"]}}, + }, + id="deeply-nested-schema", + ), + ], +) +def test_fill_nulls(input_schema: Mapping[str, Any], expected_output: Mapping[str, Any]) -> None: + assert DefaultFileBasedStream._fill_nulls(input_schema) == expected_output + + +class DefaultFileBasedStreamTest(unittest.TestCase): + _NOW = datetime(2022, 10, 22, tzinfo=timezone.utc) + _A_RECORD = {"a_record": 1} + + def setUp(self) -> None: + self._stream_config = Mock() + self._stream_config.format = MockFormat() + self._stream_config.name = "a stream name" + self._catalog_schema = Mock() + self._stream_reader = Mock(spec=AbstractFileBasedStreamReader) + self._availability_strategy = Mock(spec=AbstractFileBasedAvailabilityStrategy) + self._discovery_policy = 
Mock(spec=AbstractDiscoveryPolicy) + self._parser = Mock(spec=FileTypeParser) + self._validation_policy = Mock(spec=AbstractSchemaValidationPolicy) + self._validation_policy.name = "validation policy name" + self._cursor = Mock(spec=AbstractFileBasedCursor) + + self._stream = DefaultFileBasedStream( + config=self._stream_config, + catalog_schema=self._catalog_schema, + stream_reader=self._stream_reader, + availability_strategy=self._availability_strategy, + discovery_policy=self._discovery_policy, + parsers={MockFormat: self._parser}, + validation_policy=self._validation_policy, + cursor=self._cursor, + errors_collector=FileBasedErrorsCollector(), + ) + + def test_when_read_records_from_slice_then_return_records(self) -> None: + self._parser.parse_records.return_value = [self._A_RECORD] + messages = list(self._stream.read_records_from_slice({"files": [RemoteFile(uri="uri", last_modified=self._NOW)]})) + assert list(map(lambda message: message.record.data["data"], messages)) == [self._A_RECORD] + + def test_when_transform_record_then_return_updated_record(self) -> None: + file = RemoteFile(uri="uri", last_modified=self._NOW) + last_updated = self._NOW.isoformat() + transformed_record = self._stream.transform_record(self._A_RECORD, file, last_updated) + assert transformed_record[self._stream.ab_last_mod_col] == last_updated + assert transformed_record[self._stream.ab_file_name_col] == file.uri + + def test_given_exception_when_read_records_from_slice_then_do_process_other_files( + self, + ) -> None: + """ + The current behavior for source-s3 v3 does not fail sync on some errors and hence, we will keep this behaviour for now. One example + we can easily reproduce this is by having a file with gzip extension that is not actually a gzip file. The reader will fail to open + the file but the sync won't fail. + Ticket: https://github.com/airbytehq/airbyte/issues/29680 + """ + self._parser.parse_records.side_effect = [ + ValueError("An error"), + [self._A_RECORD], + ] + + messages = list( + self._stream.read_records_from_slice( + { + "files": [ + RemoteFile(uri="invalid_file", last_modified=self._NOW), + RemoteFile(uri="valid_file", last_modified=self._NOW), + ] + } + ) + ) + + assert messages[0].log.level == Level.ERROR + assert messages[1].record.data["data"] == self._A_RECORD + + def test_given_traced_exception_when_read_records_from_slice_then_fail( + self, + ) -> None: + """ + When a traced exception is raised, the stream shouldn't try to handle but pass it on to the caller. 
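+ Concretely, the parser side effect below raises an AirbyteTracedException, and the read is
+ expected to propagate it unchanged instead of converting it into an ERROR log message.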
+ """ + self._parser.parse_records.side_effect = [AirbyteTracedException("An error")] + + with pytest.raises(AirbyteTracedException): + list( + self._stream.read_records_from_slice( + { + "files": [ + RemoteFile(uri="invalid_file", last_modified=self._NOW), + RemoteFile(uri="valid_file", last_modified=self._NOW), + ] + } + ) + ) + + def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning( + self, + ) -> None: + self._stream_config.schemaless = False + self._validation_policy.record_passes_validation_policy.return_value = False + self._parser.parse_records.side_effect = [self._iter([self._A_RECORD, ValueError("An error")])] + + messages = list( + self._stream.read_records_from_slice( + { + "files": [ + RemoteFile(uri="invalid_file", last_modified=self._NOW), + RemoteFile(uri="valid_file", last_modified=self._NOW), + ] + } + ) + ) + + assert messages[0].log.level == Level.ERROR + assert messages[1].log.level == Level.WARN + + def test_override_max_n_files_for_schema_inference_is_respected(self) -> None: + self._discovery_policy.n_concurrent_requests = 1 + self._discovery_policy.get_max_n_files_for_schema_inference.return_value = 3 + self._stream.config.input_schema = None + self._stream.config.schemaless = None + self._parser.infer_schema.return_value = {"data": {"type": "string"}} + files = [RemoteFile(uri=f"file{i}", last_modified=self._NOW) for i in range(10)] + self._stream_reader.get_matching_files.return_value = files + self._stream.config.recent_n_files_to_read_for_schema_discovery = 3 + + schema = self._stream.get_json_schema() + + assert schema == { + "type": "object", + "properties": { + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + "data": {"type": ["null", "string"]}, + }, + } + assert self._parser.infer_schema.call_count == 3 + + def _iter(self, x: Iterable[Any]) -> Iterator[Any]: + for item in x: + if isinstance(item, Exception): + raise item + yield item + + +class TestFileBasedErrorCollector: + test_error_collector: FileBasedErrorsCollector = FileBasedErrorsCollector() + + @pytest.mark.parametrize( + "stream, file, line_no, n_skipped, collector_expected_len", + ( + ("stream_1", "test.csv", 1, 1, 1), + ("stream_2", "test2.csv", 2, 2, 2), + ), + ids=[ + "Single error", + "Multiple errors", + ], + ) + def test_collect_parsing_error(self, stream, file, line_no, n_skipped, collector_expected_len) -> None: + test_error_pattern = "Error parsing record." + # format the error body + test_error = ( + AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.ERROR, + message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={stream} file={file} line_no={line_no} n_skipped={n_skipped}", + stack_trace=traceback.format_exc(), + ), + ), + ) + # collecting the error + self.test_error_collector.collect(test_error) + # check the error has been collected + assert len(self.test_error_collector.errors) == collector_expected_len + # check for the patern presence for the collected errors + for error in self.test_error_collector.errors: + assert test_error_pattern in error[0].log.message + + def test_yield_and_raise_collected(self) -> None: + # we expect the following method will raise the AirbyteTracedException + with pytest.raises(AirbyteTracedException) as parse_error: + list(self.test_error_collector.yield_and_raise_collected()) + assert parse_error.value.message == "Some errors occured while reading from the source." 
+ assert parse_error.value.internal_message == "Please check the logged errors for more information." + + +class DefaultFileBasedStreamFileTransferTest(unittest.TestCase): + _NOW = datetime(2022, 10, 22, tzinfo=timezone.utc) + _A_RECORD = {'bytes': 10, 'file_relative_path': 'relative/path/file.csv', 'file_url': '/absolute/path/file.csv'} + + def setUp(self) -> None: + self._stream_config = Mock() + self._stream_config.format = MockFormat() + self._stream_config.name = "a stream name" + self._catalog_schema = Mock() + self._stream_reader = Mock(spec=AbstractFileBasedStreamReader) + self._availability_strategy = Mock(spec=AbstractFileBasedAvailabilityStrategy) + self._discovery_policy = Mock(spec=AbstractDiscoveryPolicy) + self._parser = Mock(spec=FileTypeParser) + self._validation_policy = Mock(spec=AbstractSchemaValidationPolicy) + self._validation_policy.name = "validation policy name" + self._cursor = Mock(spec=AbstractFileBasedCursor) + + self._stream = DefaultFileBasedStream( + config=self._stream_config, + catalog_schema=self._catalog_schema, + stream_reader=self._stream_reader, + availability_strategy=self._availability_strategy, + discovery_policy=self._discovery_policy, + parsers={MockFormat: self._parser}, + validation_policy=self._validation_policy, + cursor=self._cursor, + errors_collector=FileBasedErrorsCollector(), + use_file_transfer=True + ) + + def test_when_read_records_from_slice_then_return_records(self) -> None: + """Verify that we have the new file method and data is empty""" + with mock.patch.object(FileTransfer, "get_file", return_value=[self._A_RECORD]): + messages = list(self._stream.read_records_from_slice({"files": [RemoteFile(uri="uri", last_modified=self._NOW)]})) + assert list(map(lambda message: message.record.file, messages)) == [self._A_RECORD] + assert list(map(lambda message: message.record.data, messages)) == [{}] + + def test_when_transform_record_then_return_updated_record(self) -> None: + file = RemoteFile(uri="uri", last_modified=self._NOW) + last_updated = int(self._NOW.timestamp()) * 1000 + transformed_record = self._stream.transform_record_for_file_transfer(self._A_RECORD, file) + assert transformed_record[self._stream.modified] == last_updated + assert transformed_record[self._stream.source_file_url] == file.uri diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/test_file_based_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/test_file_based_scenarios.py new file mode 100644 index 000000000000..247a9f349a13 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/test_file_based_scenarios.py @@ -0,0 +1,336 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
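+#
+# This module is scenario-driven: each imported scenario bundles a source config, input files and
+# state, and the expected catalog, records, logs or errors. Scenarios are grouped into the
+# discover/read/spec/check lists below, and each parametrized test simply delegates to the matching
+# verify_* helper from test_scenarios.py. Illustrative sketch of how a new scenario would typically
+# be wired in (my_new_csv_scenario is a hypothetical name):
+#
+#     from unit_tests.sources.file_based.scenarios.csv_scenarios import my_new_csv_scenario
+#     discover_success_scenarios.append(my_new_csv_scenario)
+#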
+# + +from pathlib import PosixPath + +import pytest +from _pytest.capture import CaptureFixture +from airbyte_cdk.sources.abstract_source import AbstractSource +from freezegun import freeze_time +from unit_tests.sources.file_based.scenarios.avro_scenarios import ( + avro_all_types_scenario, + avro_file_with_double_as_number_scenario, + multiple_avro_combine_schema_scenario, + multiple_streams_avro_scenario, + single_avro_scenario, +) +from unit_tests.sources.file_based.scenarios.check_scenarios import ( + error_empty_stream_scenario, + error_listing_files_scenario, + error_multi_stream_scenario, + error_reading_file_scenario, + error_record_validation_user_provided_schema_scenario, + success_csv_scenario, + success_extensionless_scenario, + success_multi_stream_scenario, + success_user_provided_schema_scenario, +) +from unit_tests.sources.file_based.scenarios.concurrent_incremental_scenarios import ( + multi_csv_different_timestamps_scenario_concurrent, + multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_newer, + multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_older, + multi_csv_per_timestamp_scenario_concurrent, + multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_newer, + multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_older, + multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_newer, + multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_older, + multi_csv_same_timestamp_scenario_concurrent, + multi_csv_skip_file_if_already_in_history_concurrent, + multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_newer, + multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_older, + multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_newer, + multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_older, + multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_newer, + multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_older, + single_csv_file_is_skipped_if_same_modified_at_as_in_history_concurrent, + single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history_concurrent, + single_csv_input_state_is_earlier_scenario_concurrent, + single_csv_input_state_is_later_scenario_concurrent, + single_csv_no_input_state_scenario_concurrent, +) +from unit_tests.sources.file_based.scenarios.csv_scenarios import ( + csv_analytics_scenario, + csv_autogenerate_column_names_scenario, + csv_custom_bool_values_scenario, + csv_custom_delimiter_in_double_quotes_scenario, + csv_custom_delimiter_with_escape_char_scenario, + csv_custom_format_scenario, + csv_custom_null_values_scenario, + csv_double_quote_is_set_scenario, + csv_escape_char_is_set_scenario, + csv_multi_stream_scenario, + csv_newline_in_values_not_quoted_scenario, + csv_newline_in_values_quoted_value_scenario, + csv_no_files_scenario, + csv_no_records_scenario, + csv_single_stream_scenario, + csv_skip_after_header_scenario, + csv_skip_before_and_after_header_scenario, + csv_skip_before_header_scenario, + csv_string_are_not_null_if_strings_can_be_null_is_false_scenario, + csv_string_can_be_null_with_input_schemas_scenario, + 
csv_string_not_null_if_no_null_values_scenario, + csv_strings_can_be_null_not_quoted_scenario, + earlier_csv_scenario, + empty_schema_inference_scenario, + invalid_csv_multi_scenario, + invalid_csv_scenario, + multi_csv_scenario, + multi_csv_stream_n_file_exceeds_config_limit_for_inference, + multi_csv_stream_n_file_exceeds_limit_for_inference, + multi_stream_custom_format, + schemaless_csv_multi_stream_scenario, + schemaless_csv_scenario, + schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario, + schemaless_with_user_input_schema_fails_connection_check_scenario, + single_csv_scenario, +) +from unit_tests.sources.file_based.scenarios.excel_scenarios import ( + excel_all_types_scenario, + multiple_excel_combine_schema_scenario, + multiple_streams_excel_scenario, + single_excel_scenario, +) +from unit_tests.sources.file_based.scenarios.incremental_scenarios import ( + multi_csv_different_timestamps_scenario, + multi_csv_include_missing_files_within_history_range, + multi_csv_per_timestamp_scenario, + multi_csv_remove_old_files_if_history_is_full_scenario, + multi_csv_same_timestamp_more_files_than_history_size_scenario, + multi_csv_same_timestamp_scenario, + multi_csv_skip_file_if_already_in_history, + multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario, + multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario, + multi_csv_sync_recent_files_if_history_is_incomplete_scenario, + single_csv_file_is_skipped_if_same_modified_at_as_in_history, + single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history, + single_csv_input_state_is_earlier_scenario, + single_csv_input_state_is_later_scenario, + single_csv_no_input_state_scenario, +) +from unit_tests.sources.file_based.scenarios.jsonl_scenarios import ( + invalid_jsonl_scenario, + jsonl_multi_stream_scenario, + jsonl_user_input_schema_scenario, + multi_jsonl_stream_n_bytes_exceeds_limit_for_inference, + multi_jsonl_stream_n_file_exceeds_limit_for_inference, + multi_jsonl_with_different_keys_scenario, + schemaless_jsonl_multi_stream_scenario, + schemaless_jsonl_scenario, + single_jsonl_scenario, +) +from unit_tests.sources.file_based.scenarios.parquet_scenarios import ( + multi_parquet_scenario, + parquet_file_with_decimal_as_float_scenario, + parquet_file_with_decimal_as_string_scenario, + parquet_file_with_decimal_no_config_scenario, + parquet_various_types_scenario, + parquet_with_invalid_config_scenario, + single_parquet_scenario, + single_partitioned_parquet_scenario, +) +from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenario +from unit_tests.sources.file_based.scenarios.unstructured_scenarios import ( + corrupted_file_scenario, + no_file_extension_unstructured_scenario, + simple_markdown_scenario, + simple_txt_scenario, + simple_unstructured_scenario, + unstructured_invalid_file_type_discover_scenario_no_skip, + unstructured_invalid_file_type_discover_scenario_skip, + unstructured_invalid_file_type_read_scenario, +) +from unit_tests.sources.file_based.scenarios.user_input_schema_scenarios import ( + multi_stream_user_input_schema_scenario_emit_nonconforming_records, + multi_stream_user_input_schema_scenario_schema_is_invalid, + multi_stream_user_input_schema_scenario_skip_nonconforming_records, + single_stream_user_input_schema_scenario_emit_nonconforming_records, + single_stream_user_input_schema_scenario_schema_is_invalid, + 
single_stream_user_input_schema_scenario_skip_nonconforming_records, + valid_multi_stream_user_input_schema_scenario, + valid_single_stream_user_input_schema_scenario, +) +from unit_tests.sources.file_based.scenarios.validation_policy_scenarios import ( + emit_record_scenario_multi_stream, + emit_record_scenario_single_stream, + skip_record_scenario_multi_stream, + skip_record_scenario_single_stream, + wait_for_rediscovery_scenario_multi_stream, + wait_for_rediscovery_scenario_single_stream, +) +from unit_tests.sources.file_based.test_scenarios import verify_check, verify_discover, verify_read, verify_spec + +discover_failure_scenarios = [ + empty_schema_inference_scenario, +] + +discover_success_scenarios = [ + csv_no_records_scenario, + csv_multi_stream_scenario, + csv_single_stream_scenario, + invalid_csv_scenario, + invalid_csv_multi_scenario, + single_csv_scenario, + multi_csv_scenario, + multi_csv_stream_n_file_exceeds_limit_for_inference, + multi_csv_stream_n_file_exceeds_config_limit_for_inference, + single_csv_input_state_is_earlier_scenario, + single_csv_no_input_state_scenario, + single_csv_input_state_is_later_scenario, + multi_csv_same_timestamp_scenario, + multi_csv_different_timestamps_scenario, + multi_csv_per_timestamp_scenario, + multi_csv_skip_file_if_already_in_history, + multi_csv_include_missing_files_within_history_range, + multi_csv_remove_old_files_if_history_is_full_scenario, + multi_csv_same_timestamp_more_files_than_history_size_scenario, + multi_csv_sync_recent_files_if_history_is_incomplete_scenario, + multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario, + multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario, + single_csv_file_is_skipped_if_same_modified_at_as_in_history, + single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history, + csv_custom_format_scenario, + multi_stream_custom_format, + single_parquet_scenario, + multi_parquet_scenario, + parquet_various_types_scenario, + parquet_file_with_decimal_no_config_scenario, + parquet_file_with_decimal_as_string_scenario, + parquet_file_with_decimal_as_float_scenario, + schemaless_csv_scenario, + schemaless_csv_multi_stream_scenario, + schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario, + schemaless_with_user_input_schema_fails_connection_check_scenario, + single_stream_user_input_schema_scenario_schema_is_invalid, + single_stream_user_input_schema_scenario_emit_nonconforming_records, + single_stream_user_input_schema_scenario_skip_nonconforming_records, + multi_stream_user_input_schema_scenario_emit_nonconforming_records, + multi_stream_user_input_schema_scenario_skip_nonconforming_records, + multi_stream_user_input_schema_scenario_schema_is_invalid, + valid_multi_stream_user_input_schema_scenario, + valid_single_stream_user_input_schema_scenario, + single_jsonl_scenario, + multi_jsonl_with_different_keys_scenario, + multi_jsonl_stream_n_file_exceeds_limit_for_inference, + multi_jsonl_stream_n_bytes_exceeds_limit_for_inference, + invalid_jsonl_scenario, + jsonl_multi_stream_scenario, + jsonl_user_input_schema_scenario, + schemaless_jsonl_scenario, + schemaless_jsonl_multi_stream_scenario, + csv_string_can_be_null_with_input_schemas_scenario, + csv_string_are_not_null_if_strings_can_be_null_is_false_scenario, + csv_string_not_null_if_no_null_values_scenario, + csv_strings_can_be_null_not_quoted_scenario, + csv_newline_in_values_quoted_value_scenario, + 
csv_escape_char_is_set_scenario, + csv_double_quote_is_set_scenario, + csv_custom_delimiter_with_escape_char_scenario, + csv_custom_delimiter_in_double_quotes_scenario, + csv_skip_before_header_scenario, + csv_skip_after_header_scenario, + csv_skip_before_and_after_header_scenario, + csv_custom_bool_values_scenario, + csv_custom_null_values_scenario, + single_avro_scenario, + avro_all_types_scenario, + multiple_avro_combine_schema_scenario, + multiple_streams_avro_scenario, + avro_file_with_double_as_number_scenario, + excel_all_types_scenario, + multiple_excel_combine_schema_scenario, + multiple_streams_excel_scenario, + single_excel_scenario, + csv_newline_in_values_not_quoted_scenario, + csv_autogenerate_column_names_scenario, + parquet_with_invalid_config_scenario, + single_partitioned_parquet_scenario, + simple_markdown_scenario, + simple_txt_scenario, + simple_unstructured_scenario, + corrupted_file_scenario, + no_file_extension_unstructured_scenario, + unstructured_invalid_file_type_discover_scenario_no_skip, + unstructured_invalid_file_type_discover_scenario_skip, + unstructured_invalid_file_type_read_scenario, + multi_csv_different_timestamps_scenario_concurrent, + multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_newer, + multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_older, + multi_csv_per_timestamp_scenario_concurrent, + multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_newer, + multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_older, + multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_newer, + multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_older, + multi_csv_same_timestamp_scenario_concurrent, + multi_csv_skip_file_if_already_in_history_concurrent, + multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_newer, + multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_older, + multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_newer, + multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_older, + multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_newer, + multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_older, + single_csv_file_is_skipped_if_same_modified_at_as_in_history_concurrent, + single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history_concurrent, + single_csv_input_state_is_earlier_scenario_concurrent, + single_csv_input_state_is_later_scenario_concurrent, + single_csv_no_input_state_scenario_concurrent, + earlier_csv_scenario, + csv_no_files_scenario, +] + +discover_scenarios = discover_failure_scenarios + discover_success_scenarios + +read_scenarios = discover_success_scenarios + [ + emit_record_scenario_multi_stream, + emit_record_scenario_single_stream, + skip_record_scenario_multi_stream, + skip_record_scenario_single_stream, + csv_analytics_scenario, + wait_for_rediscovery_scenario_multi_stream, + wait_for_rediscovery_scenario_single_stream, +] + +spec_scenarios = [ + single_csv_scenario, +] + +check_scenarios = [ + error_empty_stream_scenario, + error_listing_files_scenario, + error_reading_file_scenario, + 
error_record_validation_user_provided_schema_scenario, + error_multi_stream_scenario, + success_csv_scenario, + success_extensionless_scenario, + success_multi_stream_scenario, + success_user_provided_schema_scenario, + schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario, + schemaless_with_user_input_schema_fails_connection_check_scenario, + valid_single_stream_user_input_schema_scenario, + single_avro_scenario, + single_excel_scenario, + earlier_csv_scenario, + csv_no_files_scenario, +] + + +@pytest.mark.parametrize("scenario", discover_scenarios, ids=[s.name for s in discover_scenarios]) +def test_file_based_discover(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: TestScenario[AbstractSource]) -> None: + verify_discover(capsys, tmp_path, scenario) + + +@pytest.mark.parametrize("scenario", read_scenarios, ids=[s.name for s in read_scenarios]) +@freeze_time("2023-06-09T00:00:00Z") +def test_file_based_read(scenario: TestScenario[AbstractSource]) -> None: + verify_read(scenario) + + +@pytest.mark.parametrize("scenario", spec_scenarios, ids=[c.name for c in spec_scenarios]) +def test_file_based_spec(capsys: CaptureFixture[str], scenario: TestScenario[AbstractSource]) -> None: + verify_spec(capsys, scenario) + + +@pytest.mark.parametrize("scenario", check_scenarios, ids=[c.name for c in check_scenarios]) +def test_file_based_check(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: TestScenario[AbstractSource]) -> None: + verify_check(capsys, tmp_path, scenario) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/test_file_based_stream_reader.py b/airbyte-cdk/python/unit_tests/sources/file_based/test_file_based_stream_reader.py new file mode 100644 index 000000000000..b77bf2fd969d --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/test_file_based_stream_reader.py @@ -0,0 +1,279 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
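+#
+# These tests exercise two AbstractFileBasedStreamReader helpers against the fixed FILEPATHS
+# fixture defined below: filter_files_by_globs_and_start_date (which files match a set of globs,
+# optionally bounded by the spec's start_date) and get_prefixes_from_globs (the literal path
+# prefixes usable to narrow a remote listing). Illustrative sketch, with values taken from the
+# parametrized cases below:
+#
+#     reader = TestStreamReader()
+#     reader.config = TestSpec(streams=[])
+#     sorted(f.uri for f in reader.filter_files_by_globs_and_start_date(FILES, ["a/*.csv"]))
+#     # -> ["a/b.csv", "a/c.csv"]
+#     set(reader.get_prefixes_from_globs(["a/*.csv"]))
+#     # -> {"a/"}
+#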
+# + +import logging +from io import IOBase +from typing import Any, Dict, Iterable, List, Mapping, Optional, Set + +import pytest +from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from pydantic.v1 import AnyUrl +from unit_tests.sources.file_based.helpers import make_remote_files + +reader = AbstractFileBasedStreamReader + +""" +The rules are: + +- All files at top-level: /* +- All files at top-level of mydir: mydir/* +- All files anywhere under mydir: mydir/**/* +- All files in any directory: **/* +- All files in any directory that end in .csv: **/*.csv +- All files in any directory that have a .csv extension: **/*.csv* +""" + +FILEPATHS = [ + "a", + "a.csv", + "a.csv.gz", + "a.jsonl", + "a/b", + "a/b.csv", + "a/b.csv.gz", + "a/b.jsonl", + "a/c", + "a/c.csv", + "a/c.csv.gz", + "a/c.jsonl", + "a/b/c", + "a/b/c.csv", + "a/b/c.csv.gz", + "a/b/c.jsonl", + "a/c/c", + "a/c/c.csv", + "a/c/c.csv.gz", + "a/c/c.jsonl", + "a/b/c/d", + "a/b/c/d.csv", + "a/b/c/d.csv.gz", + "a/b/c/d.jsonl", +] +FILES = make_remote_files(FILEPATHS) + +DEFAULT_CONFIG = { + "streams": [], +} + + +class TestStreamReader(AbstractFileBasedStreamReader): + @property + def config(self) -> Optional[AbstractFileBasedSpec]: + return self._config + + @config.setter + def config(self, value: AbstractFileBasedSpec) -> None: + self._config = value + + def get_matching_files(self, globs: List[str]) -> Iterable[RemoteFile]: + pass + + def open_file(self, file: RemoteFile) -> IOBase: + pass + + def file_size(self, file: RemoteFile) -> int: + return 0 + + def get_file(self, file: RemoteFile, local_directory: str, logger: logging.Logger) -> Dict[str, Any]: + return {} + + +class TestSpec(AbstractFileBasedSpec): + @classmethod + def documentation_url(cls) -> AnyUrl: + return AnyUrl(scheme="https", url="https://docs.airbyte.com/integrations/sources/test") # type: ignore + + +@pytest.mark.parametrize( + "globs,config,expected_matches,expected_path_prefixes", + [ + pytest.param([], DEFAULT_CONFIG, set(), set(), id="no-globs"), + pytest.param([""], DEFAULT_CONFIG, set(), set(), id="empty-string"), + pytest.param(["**"], DEFAULT_CONFIG, set(FILEPATHS), set(), id="**"), + pytest.param( + ["**/*.csv"], DEFAULT_CONFIG, {"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"}, set(), id="**/*.csv" + ), + pytest.param( + ["**/*.csv*"], + DEFAULT_CONFIG, + { + "a.csv", + "a.csv.gz", + "a/b.csv", + "a/b.csv.gz", + "a/c.csv", + "a/c.csv.gz", + "a/b/c.csv", + "a/b/c.csv.gz", + "a/c/c.csv", + "a/c/c.csv.gz", + "a/b/c/d.csv", + "a/b/c/d.csv.gz", + }, + set(), + id="**/*.csv*", + ), + pytest.param(["*"], DEFAULT_CONFIG, {"a", "a.csv", "a.csv.gz", "a.jsonl"}, set(), id="*"), + pytest.param(["*.csv"], DEFAULT_CONFIG, {"a.csv"}, set(), id="*.csv"), + pytest.param(["*.csv*"], DEFAULT_CONFIG, {"a.csv", "a.csv.gz"}, set(), id="*.csv*"), + pytest.param( + ["*/*"], + DEFAULT_CONFIG, + {"a/b", "a/b.csv", "a/b.csv.gz", "a/b.jsonl", "a/c", "a/c.csv", "a/c.csv.gz", "a/c.jsonl"}, + set(), + id="*/*", + ), + pytest.param(["*/*.csv"], DEFAULT_CONFIG, {"a/b.csv", "a/c.csv"}, set(), id="*/*.csv"), + pytest.param(["*/*.csv*"], DEFAULT_CONFIG, {"a/b.csv", "a/b.csv.gz", "a/c.csv", "a/c.csv.gz"}, set(), id="*/*.csv*"), + pytest.param( + ["*/**"], + DEFAULT_CONFIG, + { + "a/b", + "a/b.csv", + "a/b.csv.gz", + "a/b.jsonl", + "a/c", + "a/c.csv", + 
"a/c.csv.gz", + "a/c.jsonl", + "a/b/c", + "a/b/c.csv", + "a/b/c.csv.gz", + "a/b/c.jsonl", + "a/c/c", + "a/c/c.csv", + "a/c/c.csv.gz", + "a/c/c.jsonl", + "a/b/c/d", + "a/b/c/d.csv", + "a/b/c/d.csv.gz", + "a/b/c/d.jsonl", + }, + set(), + id="*/**", + ), + pytest.param( + ["a/*"], + DEFAULT_CONFIG, + {"a/b", "a/b.csv", "a/b.csv.gz", "a/b.jsonl", "a/c", "a/c.csv", "a/c.csv.gz", "a/c.jsonl"}, + {"a/"}, + id="a/*", + ), + pytest.param(["a/*.csv"], DEFAULT_CONFIG, {"a/b.csv", "a/c.csv"}, {"a/"}, id="a/*.csv"), + pytest.param(["a/*.csv*"], DEFAULT_CONFIG, {"a/b.csv", "a/b.csv.gz", "a/c.csv", "a/c.csv.gz"}, {"a/"}, id="a/*.csv*"), + pytest.param(["a/b/*"], DEFAULT_CONFIG, {"a/b/c", "a/b/c.csv", "a/b/c.csv.gz", "a/b/c.jsonl"}, {"a/b/"}, id="a/b/*"), + pytest.param(["a/b/*.csv"], DEFAULT_CONFIG, {"a/b/c.csv"}, {"a/b/"}, id="a/b/*.csv"), + pytest.param(["a/b/*.csv*"], DEFAULT_CONFIG, {"a/b/c.csv", "a/b/c.csv.gz"}, {"a/b/"}, id="a/b/*.csv*"), + pytest.param( + ["a/*/*"], + DEFAULT_CONFIG, + {"a/b/c", "a/b/c.csv", "a/b/c.csv.gz", "a/b/c.jsonl", "a/c/c", "a/c/c.csv", "a/c/c.csv.gz", "a/c/c.jsonl"}, + {"a/"}, + id="a/*/*", + ), + pytest.param(["a/*/*.csv"], DEFAULT_CONFIG, {"a/b/c.csv", "a/c/c.csv"}, {"a/"}, id="a/*/*.csv"), + pytest.param(["a/*/*.csv*"], DEFAULT_CONFIG, {"a/b/c.csv", "a/b/c.csv.gz", "a/c/c.csv", "a/c/c.csv.gz"}, {"a/"}, id="a/*/*.csv*"), + pytest.param( + ["a/**/*"], + DEFAULT_CONFIG, + { + "a/b", + "a/b.csv", + "a/b.csv.gz", + "a/b.jsonl", + "a/c", + "a/c.csv", + "a/c.csv.gz", + "a/c.jsonl", + "a/b/c", + "a/b/c.csv", + "a/b/c.csv.gz", + "a/b/c.jsonl", + "a/c/c", + "a/c/c.csv", + "a/c/c.csv.gz", + "a/c/c.jsonl", + "a/b/c/d", + "a/b/c/d.csv", + "a/b/c/d.csv.gz", + "a/b/c/d.jsonl", + }, + {"a/"}, + id="a/**/*", + ), + pytest.param( + ["a/**/*.csv"], DEFAULT_CONFIG, {"a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"}, {"a/"}, id="a/**/*.csv" + ), + pytest.param( + ["a/**/*.csv*"], + DEFAULT_CONFIG, + { + "a/b.csv", + "a/b.csv.gz", + "a/c.csv", + "a/c.csv.gz", + "a/b/c.csv", + "a/b/c.csv.gz", + "a/c/c.csv", + "a/c/c.csv.gz", + "a/b/c/d.csv", + "a/b/c/d.csv.gz", + }, + {"a/"}, + id="a/**/*.csv*", + ), + pytest.param( + ["**/*.csv", "**/*.gz"], + DEFAULT_CONFIG, + { + "a.csv", + "a.csv.gz", + "a/b.csv", + "a/b.csv.gz", + "a/c.csv", + "a/c.csv.gz", + "a/b/c.csv", + "a/b/c.csv.gz", + "a/c/c.csv", + "a/c/c.csv.gz", + "a/b/c/d.csv", + "a/b/c/d.csv.gz", + }, + set(), + id="**/*.csv,**/*.gz", + ), + pytest.param(["*.csv", "*.gz"], DEFAULT_CONFIG, {"a.csv", "a.csv.gz"}, set(), id="*.csv,*.gz"), + pytest.param( + ["a/*.csv", "a/*/*.csv"], DEFAULT_CONFIG, {"a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv"}, {"a/"}, id="a/*.csv,a/*/*.csv" + ), + pytest.param(["a/*.csv", "a/b/*.csv"], DEFAULT_CONFIG, {"a/b.csv", "a/c.csv", "a/b/c.csv"}, {"a/", "a/b/"}, id="a/*.csv,a/b/*.csv"), + pytest.param( + ["**/*.csv"], + {"start_date": "2023-06-01T03:54:07.000Z", "streams": []}, + {"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"}, + set(), + id="all_csvs_modified_after_start_date", + ), + pytest.param( + ["**/*.csv"], {"start_date": "2023-06-10T03:54:07.000Z", "streams": []}, set(), set(), id="all_csvs_modified_before_start_date" + ), + pytest.param( + ["**/*.csv"], + {"start_date": "2023-06-05T03:54:07.000Z", "streams": []}, + {"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"}, + set(), + id="all_csvs_modified_exactly_on_start_date", + ), + ], +) +def test_globs_and_prefixes_from_globs( + globs: List[str], config: Mapping[str, Any], expected_matches: Set[str], 
expected_path_prefixes: Set[str] +) -> None: + reader = TestStreamReader() + reader.config = TestSpec(**config) + assert set([f.uri for f in reader.filter_files_by_globs_and_start_date(FILES, globs)]) == expected_matches + assert set(reader.get_prefixes_from_globs(globs)) == expected_path_prefixes diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py new file mode 100644 index 000000000000..b381e6886c3e --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py @@ -0,0 +1,258 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json +import math +from pathlib import Path, PosixPath +from typing import Any, Dict, List, Mapping, Optional, Union + +import pytest +from _pytest.capture import CaptureFixture +from _pytest.reports import ExceptionInfo +from airbyte_cdk.entrypoint import launch +from airbyte_cdk.models import AirbyteAnalyticsTraceMessage, AirbyteLogMessage, AirbyteMessage, ConfiguredAirbyteCatalogSerializer, SyncMode +from airbyte_cdk.sources import AbstractSource +from airbyte_cdk.sources.file_based.stream.concurrent.cursor import AbstractConcurrentFileBasedCursor +from airbyte_cdk.test.entrypoint_wrapper import EntrypointOutput +from airbyte_cdk.test.entrypoint_wrapper import read as entrypoint_read +from airbyte_cdk.utils import message_utils +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenario + + +def verify_discover(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: TestScenario[AbstractSource]) -> None: + expected_exc, expected_msg = scenario.expected_discover_error + expected_logs = scenario.expected_logs + if expected_exc: + with pytest.raises(expected_exc) as exc: + discover(capsys, tmp_path, scenario) + if expected_msg: + assert expected_msg in get_error_message_from_exc(exc) + elif scenario.expected_catalog: + output = discover(capsys, tmp_path, scenario) + catalog, logs = output["catalog"], output["logs"] + assert catalog == scenario.expected_catalog + if expected_logs: + discover_logs = expected_logs.get("discover") + logs = [log for log in logs if log.get("log", {}).get("level") in ("ERROR", "WARN")] + _verify_expected_logs(logs, discover_logs) + + +def verify_read(scenario: TestScenario[AbstractSource]) -> None: + if scenario.incremental_scenario_config: + run_test_read_incremental(scenario) + else: + run_test_read_full_refresh(scenario) + + +def run_test_read_full_refresh(scenario: TestScenario[AbstractSource]) -> None: + expected_exc, expected_msg = scenario.expected_read_error + output = read(scenario) + if expected_exc: + assert_exception(expected_exc, output) + if expected_msg: + assert expected_msg in output.errors[-1].trace.error.internal_message + else: + _verify_read_output(output, scenario) + + +def run_test_read_incremental(scenario: TestScenario[AbstractSource]) -> None: + expected_exc, expected_msg = scenario.expected_read_error + output = read_with_state(scenario) + if expected_exc: + assert_exception(expected_exc, output) + else: + _verify_read_output(output, scenario) + + +def assert_exception(expected_exception: type[BaseException], output: EntrypointOutput) -> None: + assert expected_exception.__name__ in output.errors[-1].trace.error.stack_trace + + +def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[AbstractSource]) -> None: + records_and_state_messages, log_messages = 
output.records_and_state_messages, output.logs + logs = [message.log for message in log_messages if message.log.level.value in scenario.log_levels] + if scenario.expected_records is None: + return + + expected_records = [r for r in scenario.expected_records] if scenario.expected_records else [] + + sorted_expected_records = sorted( + filter(lambda e: "data" in e, expected_records), + key=lambda record: ",".join( + f"{k}={v}" for k, v in sorted(record["data"].items(), key=lambda items: (items[0], items[1])) if k != "emitted_at" + ), + ) + sorted_records = sorted( + filter(lambda r: r.record, records_and_state_messages), + key=lambda record: ",".join( + f"{k}={v}" for k, v in sorted(record.record.data.items(), key=lambda items: (items[0], items[1])) if k != "emitted_at" + ), + ) + + assert len(sorted_records) == len(sorted_expected_records) + + for actual, expected in zip(sorted_records, sorted_expected_records): + if actual.record: + assert len(actual.record.data) == len(expected["data"]) + for key, value in actual.record.data.items(): + if isinstance(value, float): + assert math.isclose(value, expected["data"][key], abs_tol=1e-04) + else: + assert value == expected["data"][key] + assert actual.record.stream == expected["stream"] + + expected_states = list(filter(lambda e: "data" not in e, expected_records)) + states = list(filter(lambda r: r.state, records_and_state_messages)) + assert len(states) > 0, "No state messages emitted. Successful syncs should emit at least one stream state." + _verify_state_record_counts(sorted_records, states) + + if hasattr(scenario.source, "cursor_cls") and issubclass(scenario.source.cursor_cls, AbstractConcurrentFileBasedCursor): + # Only check the last state emitted because we don't know the order the others will be in. + # This may be needed for non-file-based concurrent scenarios too. 
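+ # stream_state is a model object, so its __dict__ is compared as a plain mapping against the
+ # final expected state entry.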
+ assert {k: v for k, v in states[-1].state.stream.stream_state.__dict__.items()} == expected_states[-1] + else: + for actual, expected in zip(states, expected_states): # states should be emitted in sorted order + assert {k: v for k, v in actual.state.stream.stream_state.__dict__.items()} == expected + + if scenario.expected_logs: + read_logs = scenario.expected_logs.get("read") + assert len(logs) == (len(read_logs) if read_logs else 0) + _verify_expected_logs(logs, read_logs) + + if scenario.expected_analytics: + analytics = output.analytics_messages + + _verify_analytics(analytics, scenario.expected_analytics) + + +def _verify_state_record_counts(records: List[AirbyteMessage], states: List[AirbyteMessage]) -> None: + actual_record_counts = {} + for record in records: + stream_descriptor = message_utils.get_stream_descriptor(record) + actual_record_counts[stream_descriptor] = actual_record_counts.get(stream_descriptor, 0) + 1 + + state_record_count_sums = {} + for state_message in states: + stream_descriptor = message_utils.get_stream_descriptor(state_message) + state_record_count_sums[stream_descriptor] = ( + state_record_count_sums.get(stream_descriptor, 0) + state_message.state.sourceStats.recordCount + ) + + for stream, actual_count in actual_record_counts.items(): + assert actual_count == state_record_count_sums.get(stream) + + # We can have extra keys in state_record_count_sums if we processed a stream and reported 0 records + extra_keys = state_record_count_sums.keys() - actual_record_counts.keys() + for stream in extra_keys: + assert state_record_count_sums[stream] == 0 + + +def _verify_analytics(analytics: List[AirbyteMessage], expected_analytics: Optional[List[AirbyteAnalyticsTraceMessage]]) -> None: + if expected_analytics: + assert len(analytics) == len( + expected_analytics + ), f"Number of actual analytics messages ({len(analytics)}) did not match expected ({len(expected_analytics)})" + for actual, expected in zip(analytics, expected_analytics): + actual_type, actual_value = actual.trace.analytics.type, actual.trace.analytics.value + expected_type = expected.type + expected_value = expected.value + assert actual_type == expected_type + assert actual_value == expected_value + + +def _verify_expected_logs(logs: List[AirbyteLogMessage], expected_logs: Optional[List[Mapping[str, Any]]]) -> None: + if expected_logs: + for actual, expected in zip(logs, expected_logs): + actual_level, actual_message = actual.level.value, actual.message + expected_level = expected["level"] + expected_message = expected["message"] + assert actual_level == expected_level + assert expected_message in actual_message + + +def verify_spec(capsys: CaptureFixture[str], scenario: TestScenario[AbstractSource]) -> None: + assert spec(capsys, scenario) == scenario.expected_spec + + +def verify_check(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: TestScenario[AbstractSource]) -> None: + expected_exc, expected_msg = scenario.expected_check_error + + if expected_exc: + with pytest.raises(expected_exc) as exc: + check(capsys, tmp_path, scenario) + + if expected_msg: + assert expected_msg in exc.value.message + + else: + output = check(capsys, tmp_path, scenario) + assert output["status"] == scenario.expected_check_status + + +def spec(capsys: CaptureFixture[str], scenario: TestScenario[AbstractSource]) -> Mapping[str, Any]: + launch( + scenario.source, + ["spec"], + ) + captured = capsys.readouterr() + return json.loads(captured.out.splitlines()[0])["spec"] # type: ignore + + +def check(capsys: 
CaptureFixture[str], tmp_path: PosixPath, scenario: TestScenario[AbstractSource]) -> Dict[str, Any]: + launch( + scenario.source, + ["check", "--config", make_file(tmp_path / "config.json", scenario.config)], + ) + captured = capsys.readouterr() + return _find_connection_status(captured.out.splitlines()) + + +def _find_connection_status(output: List[str]) -> Mapping[str, Any]: + for line in output: + json_line = json.loads(line) + if "connectionStatus" in json_line: + return json_line["connectionStatus"] + raise ValueError("No valid connectionStatus found in output") + + +def discover(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: TestScenario[AbstractSource]) -> Dict[str, Any]: + launch( + scenario.source, + ["discover", "--config", make_file(tmp_path / "config.json", scenario.config)], + ) + output = [json.loads(line) for line in capsys.readouterr().out.splitlines()] + [catalog] = [o["catalog"] for o in output if o.get("catalog")] # type: ignore + return { + "catalog": catalog, + "logs": [o["log"] for o in output if o.get("log")], + } + + +def read(scenario: TestScenario[AbstractSource]) -> EntrypointOutput: + return entrypoint_read( + scenario.source, + scenario.config, + ConfiguredAirbyteCatalogSerializer.load(scenario.configured_catalog(SyncMode.full_refresh)), + ) + + +def read_with_state(scenario: TestScenario[AbstractSource]) -> EntrypointOutput: + return entrypoint_read( + scenario.source, + scenario.config, + ConfiguredAirbyteCatalogSerializer.load(scenario.configured_catalog(SyncMode.incremental)), + scenario.input_state(), + ) + + +def make_file(path: Path, file_contents: Optional[Union[Mapping[str, Any], List[Mapping[str, Any]]]]) -> str: + path.write_text(json.dumps(file_contents)) + return str(path) + + +def get_error_message_from_exc(exc: ExceptionInfo[Any]) -> str: + if isinstance(exc.value, AirbyteTracedException): + return exc.value.message + return str(exc.value.args[0]) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/test_schema_helpers.py b/airbyte-cdk/python/unit_tests/sources/file_based/test_schema_helpers.py new file mode 100644 index 000000000000..90e01942d98f --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/file_based/test_schema_helpers.py @@ -0,0 +1,364 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
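+#
+# The record fixtures below probe conforms_to_schema: a record conforms when every value's type is
+# equal to or narrower than the declared type (per the ComparableType ordering
+# BOOLEAN < INTEGER < NUMBER < STRING < OBJECT), missing columns are tolerated, and unknown extra
+# columns are not. Minimal sketch using the SCHEMA fixture defined below:
+#
+#     conforms_to_schema(COMPLETE_CONFORMING_RECORD, SCHEMA)         # True
+#     conforms_to_schema(NONCONFORMING_EXTRA_COLUMN_RECORD, SCHEMA)  # False: unexpected column_x
+#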
+# + +from typing import Any, Mapping, Optional + +import pytest +from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, SchemaInferenceError +from airbyte_cdk.sources.file_based.schema_helpers import ( + ComparableType, + SchemaType, + conforms_to_schema, + merge_schemas, + type_mapping_to_jsonschema, +) + +COMPLETE_CONFORMING_RECORD = { + "null_field": None, + "boolean_field": True, + "integer_field": 1, + "number_field": 1.5, + "string_field": "val1", + "array_field": [1.1, 2.2], + "object_field": {"col": "val"}, +} + + +NONCONFORMING_EXTRA_COLUMN_RECORD = { + "null_field": None, + "boolean_field": True, + "integer_field": 1, + "number_field": 1.5, + "string_field": "val1", + "array_field": [1.1, 2.2], + "object_field": {"col": "val"}, + "column_x": "extra", +} + +CONFORMING_WITH_MISSING_COLUMN_RECORD = { + "null_field": None, + "boolean_field": True, + "integer_field": 1, + "number_field": 1.5, + "string_field": "val1", + "array_field": [1.1, 2.2], +} + +CONFORMING_WITH_NARROWER_TYPE_RECORD = { + "null_field": None, + "boolean_field": True, + "integer_field": True, + "number_field": True, + "string_field": True, + "array_field": [1.1, 2.2], + "object_field": {"col": "val"}, +} + +NONCONFORMING_WIDER_TYPE_RECORD = { + "null_field": "not None", + "boolean_field": True, + "integer_field": 1, + "number_field": 1.5, + "string_field": "val1", + "array_field": [1.1, 2.2], + "object_field": {"col": "val"}, +} + +NONCONFORMING_NON_OBJECT_RECORD = { + "null_field": None, + "boolean_field": True, + "integer_field": 1, + "number_field": 1.5, + "string_field": "val1", + "array_field": [1.1, 2.2], + "object_field": "not an object", +} + +NONCONFORMING_NON_ARRAY_RECORD = { + "null_field": None, + "boolean_field": True, + "integer_field": 1, + "number_field": 1.5, + "string_field": "val1", + "array_field": "not an array", + "object_field": {"col": "val"}, +} + +CONFORMING_MIXED_TYPE_NARROWER_RECORD = { + "null_field": None, + "boolean_field": True, + "integer_field": 1, + "number_field": 1.5, + "string_field": "val1", + "array_field": [1.1, 2.2], + "object_field": {"col": "val"}, +} + +NONCONFORMING_MIXED_TYPE_WIDER_RECORD = { + "null_field": None, + "boolean_field": True, + "integer_field": 1, + "number_field": 1.5, + "string_field": "val1", + "array_field": [1.1, 2.2], + "object_field": {"col": "val"}, +} + +CONFORMING_MIXED_TYPE_WITHIN_TYPE_RANGE_RECORD = { + "null_field": None, + "boolean_field": True, + "integer_field": 1, + "number_field": 1.5, + "string_field": "val1", + "array_field": [1.1, 2.2], + "object_field": {"col": "val"}, +} + +NONCONFORMING_INVALID_ARRAY_RECORD = { + "null_field": None, + "boolean_field": True, + "integer_field": 1, + "number_field": 1.5, + "string_field": ["this should not be an array"], + "array_field": [1.1, 2.2], + "object_field": {"col": "val"}, +} + +NONCONFORMING_TOO_WIDE_ARRAY_RECORD = { + "null_field": None, + "boolean_field": True, + "integer_field": 1, + "number_field": 1.5, + "string_field": "okay", + "array_field": ["val1", "val2"], + "object_field": {"col": "val"}, +} + + +CONFORMING_NARROWER_ARRAY_RECORD = { + "null_field": None, + "boolean_field": True, + "integer_field": 1, + "number_field": 1.5, + "string_field": "okay", + "array_field": [1, 2], + "object_field": {"col": "val"}, +} + + +NONCONFORMING_INVALID_OBJECT_RECORD = { + "null_field": None, + "boolean_field": True, + "integer_field": 1, + "number_field": 1.5, + "string_field": {"this": "should not be an object"}, + "array_field": [1.1, 2.2], + "object_field": {"col": 
"val"}, +} + + +SCHEMA = { + "type": "object", + "properties": { + "null_field": {"type": "null"}, + "boolean_field": {"type": "boolean"}, + "integer_field": {"type": "integer"}, + "number_field": {"type": "number"}, + "string_field": {"type": "string"}, + "array_field": { + "type": "array", + "items": { + "type": "number", + }, + }, + "object_field": {"type": "object"}, + }, +} + + +@pytest.mark.parametrize( + "record,schema,expected_result", + [ + pytest.param(COMPLETE_CONFORMING_RECORD, SCHEMA, True, id="record-conforms"), + pytest.param(NONCONFORMING_EXTRA_COLUMN_RECORD, SCHEMA, False, id="nonconforming-extra-column"), + pytest.param(CONFORMING_WITH_MISSING_COLUMN_RECORD, SCHEMA, True, id="record-conforms-with-missing-column"), + pytest.param(CONFORMING_WITH_NARROWER_TYPE_RECORD, SCHEMA, True, id="record-conforms-with-narrower-type"), + pytest.param(NONCONFORMING_WIDER_TYPE_RECORD, SCHEMA, False, id="nonconforming-wider-type"), + pytest.param(NONCONFORMING_NON_OBJECT_RECORD, SCHEMA, False, id="nonconforming-string-is-not-an-object"), + pytest.param(NONCONFORMING_NON_ARRAY_RECORD, SCHEMA, False, id="nonconforming-string-is-not-an-array"), + pytest.param(NONCONFORMING_TOO_WIDE_ARRAY_RECORD, SCHEMA, False, id="nonconforming-array-values-too-wide"), + pytest.param(CONFORMING_NARROWER_ARRAY_RECORD, SCHEMA, True, id="conforming-array-values-narrower-than-schema"), + pytest.param(NONCONFORMING_INVALID_ARRAY_RECORD, SCHEMA, False, id="nonconforming-array-is-not-a-string"), + pytest.param(NONCONFORMING_INVALID_OBJECT_RECORD, SCHEMA, False, id="nonconforming-object-is-not-a-string"), + ], +) +def test_conforms_to_schema(record: Mapping[str, Any], schema: Mapping[str, Any], expected_result: bool) -> None: + assert conforms_to_schema(record, schema) == expected_result + + +def test_comparable_types() -> None: + assert ComparableType.OBJECT > ComparableType.STRING + assert ComparableType.STRING > ComparableType.NUMBER + assert ComparableType.NUMBER > ComparableType.INTEGER + assert ComparableType.INTEGER > ComparableType.BOOLEAN + assert ComparableType["OBJECT"] == ComparableType.OBJECT + + +@pytest.mark.parametrize( + "schema1,schema2,expected_result", + [ + pytest.param({}, {}, {}, id="empty-schemas"), + pytest.param({"a": None}, {}, None, id="null-value-in-schema"), + pytest.param({"a": {"type": "integer"}}, {}, {"a": {"type": "integer"}}, id="single-key-schema1"), + pytest.param({}, {"a": {"type": "integer"}}, {"a": {"type": "integer"}}, id="single-key-schema2"), + pytest.param({"a": {"type": "integer"}}, {"a": {"type": "integer"}}, {"a": {"type": "integer"}}, id="single-key-both-schemas"), + pytest.param({"a": {"type": "integer"}}, {"a": {"type": "number"}}, {"a": {"type": "number"}}, id="single-key-schema2-is-wider"), + pytest.param({"a": {"type": "number"}}, {"a": {"type": "integer"}}, {"a": {"type": "number"}}, id="single-key-schema1-is-wider"), + pytest.param({"a": {"type": "array"}}, {"a": {"type": "integer"}}, None, id="single-key-with-array-schema1"), + pytest.param({"a": {"type": "integer"}}, {"a": {"type": "array"}}, None, id="single-key-with-array-schema2"), + pytest.param( + {"a": {"type": "object", "properties": {"b": {"type": "integer"}}}}, + {"a": {"type": "object", "properties": {"b": {"type": "integer"}}}}, + {"a": {"type": "object", "properties": {"b": {"type": "integer"}}}}, + id="single-key-same-object", + ), + pytest.param( + {"a": {"type": "object", "properties": {"b": {"type": "integer"}}}}, + {"a": {"type": "object", "properties": {"b": {"type": "string"}}}}, + None, + 
id="single-key-different-objects", + ), + pytest.param( + {"a": {"type": "object", "properties": {"b": {"type": "integer"}}}}, + {"a": {"type": "number"}}, + None, + id="single-key-with-object-schema1", + ), + pytest.param( + {"a": {"type": "number"}}, + {"a": {"type": "object", "properties": {"b": {"type": "integer"}}}}, + None, + id="single-key-with-object-schema2", + ), + pytest.param( + {"a": {"type": "array", "items": {"type": "number"}}}, + {"a": {"type": "array", "items": {"type": "number"}}}, + {"a": {"type": "array", "items": {"type": "number"}}}, + id="equal-arrays-in-both-schemas", + ), + pytest.param( + {"a": {"type": "array", "items": {"type": "integer"}}}, + {"a": {"type": "array", "items": {"type": "number"}}}, + None, + id="different-arrays-in-both-schemas", + ), + pytest.param( + {"a": {"type": "integer"}, "b": {"type": "string"}}, + {"c": {"type": "number"}}, + {"a": {"type": "integer"}, "b": {"type": "string"}, "c": {"type": "number"}}, + id="", + ), + pytest.param({"a": {"type": "invalid_type"}}, {"b": {"type": "integer"}}, None, id="invalid-type"), + pytest.param( + {"a": {"type": "object"}}, + {"a": {"type": "null"}}, + {"a": {"type": "object"}}, + id="single-key-with-null-object-schema2", + ), + pytest.param( + {"a": {"type": "object"}}, + {"b": {"type": "null"}}, + {"a": {"type": "object"}, "b": {"type": "null"}}, + id="new-key-with-null-type", + ), + pytest.param( + {"a": {"type": "null"}}, + {"a": {"type": "object"}}, + {"a": {"type": "object"}}, + id="single-key-with-null-object-schema1", + ), + ], +) +def test_merge_schemas(schema1: SchemaType, schema2: SchemaType, expected_result: Optional[SchemaType]) -> None: + if expected_result is not None: + assert merge_schemas(schema1, schema2) == expected_result + else: + with pytest.raises(SchemaInferenceError): + merge_schemas(schema1, schema2) + + +@pytest.mark.parametrize( + "type_mapping,expected_schema,expected_exc_msg", + [ + pytest.param( + '{"col1": "null", "col2": "array", "col3": "boolean", "col4": "float", "col5": "integer", "col6": "number", "col7": "object", "col8": "string"}', + { + "type": "object", + "properties": { + "col1": {"type": "null"}, + "col2": {"type": "array"}, + "col3": {"type": "boolean"}, + "col4": {"type": "number"}, + "col5": {"type": "integer"}, + "col6": {"type": "number"}, + "col7": {"type": "object"}, + "col8": {"type": "string"}, + }, + }, + None, + id="valid_all_types", + ), + pytest.param( + '{"col1 ": " string", "col2": " integer"}', + {"type": "object", "properties": {"col1": {"type": "string"}, "col2": {"type": "integer"}}}, + None, + id="valid_extra_spaces", + ), + pytest.param( + "", + None, + None, + id="valid_empty_string", + ), + pytest.param( + '{"col1": "x", "col2": "integer"}', + None, + "Invalid type 'x' for property 'col1'", + id="invalid_type", + ), + pytest.param( + '{"col1": "", "col2": "integer"}', + None, + "Invalid input schema", + id="invalid_missing_type", + ), + pytest.param( + '{"": "string", "col2": "integer"}', + None, + "Invalid input schema", + id="invalid_missing_name", + ), + pytest.param( + '{"type": "object", "properties": {"col1": {"type": "string"}, "col2": {"type": "integer"}}}', + None, + "Invalid input schema; nested schemas are not supported.", + id="invalid_nested_input_string", + ), + pytest.param( + '{"type": "object", "properties": {"col1": {"type": "string"}, "col2": {"type": "integer"}}}', + None, + "Invalid input schema; nested schemas are not supported.", + id="invalid_nested_input_json", + ), + ], +) +def 
test_type_mapping_to_jsonschema( + type_mapping: Mapping[str, Any], expected_schema: Optional[Mapping[str, Any]], expected_exc_msg: Optional[str] +) -> None: + if expected_exc_msg: + with pytest.raises(ConfigValidationError) as exc: + type_mapping_to_jsonschema(type_mapping) + assert expected_exc_msg in exc.value.args[0] + else: + assert type_mapping_to_jsonschema(type_mapping) == expected_schema diff --git a/airbyte-cdk/python/unit_tests/sources/fixtures/__init__.py b/airbyte-cdk/python/unit_tests/sources/fixtures/__init__.py new file mode 100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/fixtures/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/fixtures/source_test_fixture.py b/airbyte-cdk/python/unit_tests/sources/fixtures/source_test_fixture.py new file mode 100644 index 000000000000..6f3cd57b1ccc --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/fixtures/source_test_fixture.py @@ -0,0 +1,154 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json +import logging +from abc import ABC +from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union + +import requests +from airbyte_cdk.models import ( + AirbyteStream, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + ConnectorSpecification, + DestinationSyncMode, + SyncMode, +) +from airbyte_cdk.sources import AbstractSource +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.http import HttpStream +from airbyte_cdk.sources.streams.http.requests_native_auth import Oauth2Authenticator +from requests.auth import AuthBase + + +class SourceTestFixture(AbstractSource): + """ + This is a concrete implementation of a Source connector that provides implementations of all the methods needed to run sync + operations. For simplicity, it also overrides functions that read from files in favor of returning the data directly avoiding + the need to load static files (ex. spec.yaml, config.json, configured_catalog.json) into the unit-test package. 
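+
+ For example, a test can patch requests.Session.send with fixture_mock_send (defined at the bottom of this
+ module) so that sync operations exercised in unit tests receive a canned response instead of making real
+ HTTP calls.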
+ """ + + def __init__(self, streams: Optional[List[Stream]] = None, authenticator: Optional[AuthBase] = None): + self._streams = streams + self._authenticator = authenticator + + def spec(self, logger: logging.Logger) -> ConnectorSpecification: + return ConnectorSpecification( + connectionSpecification={ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Test Fixture Spec", + "type": "object", + "required": ["api_token"], + "properties": { + "api_token": { + "type": "string", + "title": "API token", + "description": "The token used to authenticate requests to the API.", + "airbyte_secret": True, + } + }, + } + ) + + def read_config(self, config_path: str) -> Mapping[str, Any]: + return {"api_token": "just_some_token"} + + @classmethod + def read_catalog(cls, catalog_path: str) -> ConfiguredAirbyteCatalog: + return ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream( + name="http_test_stream", + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], + default_cursor_field=["updated_at"], + source_defined_cursor=True, + source_defined_primary_key=[["id"]], + ), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.overwrite, + ) + ] + ) + + def check_connection(self, *args, **kwargs) -> Tuple[bool, Optional[Any]]: + return True, "" + + def streams(self, *args, **kwargs) -> List[Stream]: + return [HttpTestStream(authenticator=self._authenticator)] + + +class HttpTestStream(HttpStream, ABC): + url_base = "https://airbyte.com/api/v1/" + + @property + def cursor_field(self) -> Union[str, List[str]]: + return ["updated_at"] + + @property + def availability_strategy(self): + return None + + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + return "id" + + def path( + self, + *, + stream_state: Mapping[str, Any] = None, + stream_slice: Mapping[str, Any] = None, + next_page_token: Mapping[str, Any] = None, + ) -> str: + return "cast" + + def parse_response( + self, + response: requests.Response, + *, + stream_state: Mapping[str, Any], + stream_slice: Mapping[str, Any] = None, + next_page_token: Mapping[str, Any] = None, + ) -> Iterable[Mapping]: + body = response.json() or {} + return body["records"] + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + return None + + def get_json_schema(self) -> Mapping[str, Any]: + return {} + + +def fixture_mock_send(self, request, **kwargs) -> requests.Response: + """ + Helper method that can be used by a test to patch the Session.send() function and mock the outbound send operation to provide + faster and more reliable responses compared to actual API requests + """ + response = requests.Response() + response.request = request + response.status_code = 200 + response.headers = {"header": "value"} + response_body = { + "records": [ + {"id": 1, "name": "Celine Song", "position": "director"}, + {"id": 2, "name": "Shabier Kirchner", "position": "cinematographer"}, + {"id": 3, "name": "Christopher Bear", "position": "composer"}, + {"id": 4, "name": "Daniel Rossen", "position": "composer"}, + ] + } + response._content = json.dumps(response_body).encode("utf-8") + return response + + +class SourceFixtureOauthAuthenticator(Oauth2Authenticator): + """ + Test OAuth authenticator that only overrides the request and response aspect of the authenticator flow + """ + + def refresh_access_token(self) -> Tuple[str, int]: + response = requests.request(method="POST", url=self.get_token_refresh_endpoint(), 
params={}) + response.raise_for_status() + return "some_access_token", 1800 # Mock oauth response values to be used during the data retrieval step diff --git a/airbyte-cdk/python/unit_tests/sources/message/__init__.py b/airbyte-cdk/python/unit_tests/sources/message/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/message/test_repository.py b/airbyte-cdk/python/unit_tests/sources/message/test_repository.py new file mode 100644 index 000000000000..48778b657cb8 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/message/test_repository.py @@ -0,0 +1,144 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from unittest.mock import Mock + +import pytest +from airbyte_cdk.models import AirbyteControlConnectorConfigMessage, AirbyteControlMessage, AirbyteMessage, Level, OrchestratorType, Type +from airbyte_cdk.sources.message import ( + InMemoryMessageRepository, + LogAppenderMessageRepositoryDecorator, + MessageRepository, + NoopMessageRepository, +) + +A_CONTROL = AirbyteControlMessage( + type=OrchestratorType.CONNECTOR_CONFIG, + emitted_at=0, + connectorConfig=AirbyteControlConnectorConfigMessage(config={"a config": "value"}), +) +ANY_MESSAGE = AirbyteMessage( + type=Type.CONTROL, + control=AirbyteControlMessage( + type=OrchestratorType.CONNECTOR_CONFIG, + emitted_at=0, + connectorConfig=AirbyteControlConnectorConfigMessage(config={"any message": "value"}), + ), +) +ANOTHER_CONTROL = AirbyteControlMessage( + type=OrchestratorType.CONNECTOR_CONFIG, + emitted_at=0, + connectorConfig=AirbyteControlConnectorConfigMessage(config={"another config": "another value"}), +) +UNKNOWN_LEVEL = "potato" + + +class TestInMemoryMessageRepository: + def test_given_no_messages_when_consume_queue_then_return_empty(self): + repo = InMemoryMessageRepository() + messages = list(repo.consume_queue()) + assert messages == [] + + def test_given_messages_when_consume_queue_then_return_messages(self): + repo = InMemoryMessageRepository() + first_message = AirbyteMessage(type=Type.CONTROL, control=A_CONTROL) + repo.emit_message(first_message) + second_message = AirbyteMessage(type=Type.CONTROL, control=ANOTHER_CONTROL) + repo.emit_message(second_message) + + messages = repo.consume_queue() + + assert list(messages) == [first_message, second_message] + + def test_given_message_is_consumed_when_consume_queue_then_remove_message_from_queue(self): + repo = InMemoryMessageRepository() + first_message = AirbyteMessage(type=Type.CONTROL, control=A_CONTROL) + repo.emit_message(first_message) + second_message = AirbyteMessage(type=Type.CONTROL, control=ANOTHER_CONTROL) + repo.emit_message(second_message) + + message_generator = repo.consume_queue() + consumed_message = next(message_generator) + assert consumed_message == first_message + + second_message_generator = repo.consume_queue() + assert list(second_message_generator) == [second_message] + + def test_given_log_level_is_severe_enough_when_log_message_then_allow_message_to_be_consumed(self): + repo = InMemoryMessageRepository(Level.DEBUG) + repo.log_message(Level.INFO, lambda: {"message": "this is a log message"}) + assert list(repo.consume_queue()) + + def test_given_log_level_is_severe_enough_when_log_message_then_filter_secrets(self, mocker): + filtered_message = "a filtered message" + mocker.patch("airbyte_cdk.sources.message.repository.filter_secrets", return_value=filtered_message) + repo = InMemoryMessageRepository(Level.DEBUG) + + repo.log_message(Level.INFO, lambda: 
{"message": "this is a log message"}) + + assert list(repo.consume_queue())[0].log.message == filtered_message + + def test_given_log_level_not_severe_enough_when_log_message_then_do_not_allow_message_to_be_consumed(self): + repo = InMemoryMessageRepository(Level.ERROR) + repo.log_message(Level.INFO, lambda: {"message": "this is a log message"}) + assert not list(repo.consume_queue()) + + def test_given_unknown_log_level_as_threshold_when_log_message_then_allow_message_to_be_consumed(self): + repo = InMemoryMessageRepository(UNKNOWN_LEVEL) + repo.log_message(Level.DEBUG, lambda: {"message": "this is a log message"}) + assert list(repo.consume_queue()) + + +class TestNoopMessageRepository: + def test_given_message_emitted_when_consume_queue_then_return_empty(self): + repo = NoopMessageRepository() + repo.emit_message(AirbyteMessage(type=Type.CONTROL, control=A_CONTROL)) + repo.log_message(Level.INFO, lambda: {"message": "this is a log message"}) + + assert not list(repo.consume_queue()) + + +class TestLogAppenderMessageRepositoryDecorator: + + _DICT_TO_APPEND = {"airbyte_cdk": {"stream": {"is_substream": False}}} + + @pytest.fixture() + def decorated(self): + return Mock(spec=MessageRepository) + + def test_when_emit_message_then_delegate_call(self, decorated): + repo = LogAppenderMessageRepositoryDecorator(self._DICT_TO_APPEND, decorated, Level.DEBUG) + repo.emit_message(ANY_MESSAGE) + decorated.emit_message.assert_called_once_with(ANY_MESSAGE) + + def test_when_log_message_then_append(self, decorated): + repo = LogAppenderMessageRepositoryDecorator({"a": {"dict_to_append": "appended value"}}, decorated, Level.DEBUG) + repo.log_message(Level.INFO, lambda: {"a": {"original": "original value"}}) + assert decorated.log_message.call_args_list[0].args[1]() == { + "a": {"dict_to_append": "appended value", "original": "original value"} + } + + def test_given_value_clash_when_log_message_then_overwrite_value(self, decorated): + repo = LogAppenderMessageRepositoryDecorator({"clash": "appended value"}, decorated, Level.DEBUG) + repo.log_message(Level.INFO, lambda: {"clash": "original value"}) + assert decorated.log_message.call_args_list[0].args[1]() == {"clash": "appended value"} + + def test_given_log_level_is_severe_enough_when_log_message_then_allow_message_to_be_consumed(self, decorated): + repo = LogAppenderMessageRepositoryDecorator(self._DICT_TO_APPEND, decorated, Level.DEBUG) + repo.log_message(Level.INFO, lambda: {}) + assert decorated.log_message.call_count == 1 + + def test_given_log_level_not_severe_enough_when_log_message_then_do_not_allow_message_to_be_consumed(self, decorated): + repo = LogAppenderMessageRepositoryDecorator(self._DICT_TO_APPEND, decorated, Level.ERROR) + repo.log_message(Level.INFO, lambda: {}) + assert decorated.log_message.call_count == 0 + + def test_when_consume_queue_then_return_delegate_queue(self, decorated): + repo = LogAppenderMessageRepositoryDecorator(self._DICT_TO_APPEND, decorated, Level.DEBUG) + queue = [ANY_MESSAGE, ANY_MESSAGE, ANY_MESSAGE] + decorated.consume_queue.return_value = iter(queue) + + result = list(repo.consume_queue()) + + assert result == queue diff --git a/airbyte-cdk/python/unit_tests/sources/mock_server_tests/mock_source_fixture.py b/airbyte-cdk/python/unit_tests/sources/mock_server_tests/mock_source_fixture.py new file mode 100644 index 000000000000..ece5039ba465 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/mock_server_tests/mock_source_fixture.py @@ -0,0 +1,390 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights 
reserved. +# + +import logging +from abc import ABC +from datetime import datetime, timezone +from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Tuple + +import pendulum +import requests +from airbyte_cdk.models import ConnectorSpecification, SyncMode +from airbyte_cdk.sources import AbstractSource, Source +from airbyte_cdk.sources.streams import CheckpointMixin, IncrementalMixin, Stream +from airbyte_cdk.sources.streams.core import StreamData +from airbyte_cdk.sources.streams.http import HttpStream +from airbyte_cdk.sources.streams.http.availability_strategy import HttpAvailabilityStrategy +from requests import HTTPError + + +class FixtureAvailabilityStrategy(HttpAvailabilityStrategy): + """ + Inherit from HttpAvailabilityStrategy with slight modification to 403 error message. + """ + + def reasons_for_unavailable_status_codes( + self, stream: Stream, logger: logging.Logger, source: Source, error: HTTPError + ) -> Dict[int, str]: + reasons_for_codes: Dict[int, str] = { + requests.codes.FORBIDDEN: "This is likely due to insufficient permissions for your Notion integration. " + "Please make sure your integration has read access for the resources you are trying to sync" + } + return reasons_for_codes + + +class IntegrationStream(HttpStream, ABC): + + url_base = "https://api.airbyte-test.com/v1/" + primary_key = "id" + page_size = 100 + raise_on_http_errors = True + current_page = 0 + + def __init__(self, config: Mapping[str, Any], **kwargs): + super().__init__(**kwargs) + self.start_date = config.get("start_date") + + @property + def availability_strategy(self) -> HttpAvailabilityStrategy: + return FixtureAvailabilityStrategy() + + def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: + data = response.json().get("data", []) + yield from data + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + has_more = response.json().get("has_more") + if has_more: + self.current_page += 1 + return {"page": self.current_page} + else: + return None + + +class IncrementalIntegrationStream(IntegrationStream, IncrementalMixin, ABC): + cursor_field = "created_at" + _state = {} + + @property + def state(self) -> MutableMapping[str, Any]: + return self._state + + @state.setter + def state(self, value: MutableMapping[str, Any]) -> None: + self._state = value + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + for record in super().read_records(sync_mode, cursor_field, stream_slice, stream_state): + self.state = {self.cursor_field: record.get(self.cursor_field)} + yield record + + +class Users(IntegrationStream): + def path(self, **kwargs) -> str: + return "users" + + def get_json_schema(self) -> Mapping[str, Any]: + return { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "additionalProperties": True, + "properties": { + "type": {"type": "string"}, + "id": {"type": "string"}, + "created_at": {"type": "string", "format": "date-time"}, + "first_name": {"type": "string"}, + "last_name": {"type": "string"}, + }, + } + + +class Planets(IncrementalIntegrationStream): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._state: MutableMapping[str, Any] = {} + + def path(self, **kwargs) -> str: + return "planets" + + def get_json_schema(self) -> Mapping[str, Any]: + return { + "$schema": 
"http://json-schema.org/draft-07/schema#", + "type": "object", + "additionalProperties": True, + "properties": { + "type": {"type": "string"}, + "id": {"type": "string"}, + "created_at": {"type": "string", "format": "date-time"}, + "name": {"type": "string"}, + }, + } + + def request_params( + self, + stream_state: Optional[Mapping[str, Any]], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + return {"start_date": stream_slice.get("start_date"), "end_date": stream_slice.get("end_date")} + + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + start_date = pendulum.parse(self.start_date) + + if stream_state: + start_date = pendulum.parse(stream_state.get(self.cursor_field)) + + date_slices = [] + + end_date = datetime.now(timezone.utc).replace(microsecond=0) + while start_date < end_date: + end_date_slice = min(start_date.add(days=7), end_date) + + date_slice = { + "start_date": start_date.strftime("%Y-%m-%dT%H:%M:%SZ"), + "end_date": end_date_slice.strftime("%Y-%m-%dT%H:%M:%SZ"), + } + + date_slices.append(date_slice) + start_date = end_date_slice + + return date_slices + + +class Legacies(IntegrationStream): + """ + Incremental stream that uses the legacy method get_updated_state() to manage stream state. New connectors use the state + property and setter methods. + """ + + cursor_field = "created_at" + + def path(self, **kwargs) -> str: + return "legacies" + + def get_json_schema(self) -> Mapping[str, Any]: + return { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "additionalProperties": True, + "properties": { + "type": {"type": "string"}, + "id": {"type": "string"}, + "created_at": {"type": "string", "format": "date-time"}, + "quote": {"type": "string"}, + }, + } + + def get_updated_state( + self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any] + ) -> MutableMapping[str, Any]: + latest_state = latest_record.get(self.cursor_field) + current_state = current_stream_state.get(self.cursor_field) or latest_state + if current_state: + return {self.cursor_field: max(latest_state, current_state)} + return {} + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + yield from super().read_records(sync_mode, cursor_field, stream_slice, stream_state) + + def request_params( + self, + stream_state: Optional[Mapping[str, Any]], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + return {"start_date": stream_slice.get("start_date"), "end_date": stream_slice.get("end_date")} + + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + start_date = pendulum.parse(self.start_date) + + if stream_state: + start_date = pendulum.parse(stream_state.get(self.cursor_field)) + + date_slices = [] + + end_date = datetime.now(timezone.utc).replace(microsecond=0) + while start_date < end_date: + end_date_slice = min(start_date.add(days=7), end_date) + + date_slice = { + "start_date": start_date.strftime("%Y-%m-%dT%H:%M:%SZ"), + 
"end_date": end_date_slice.strftime("%Y-%m-%dT%H:%M:%SZ"), + } + + date_slices.append(date_slice) + start_date = end_date_slice + + return date_slices + + +class Dividers(IntegrationStream): + def path(self, **kwargs) -> str: + return "dividers" + + def get_json_schema(self) -> Mapping[str, Any]: + return { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "additionalProperties": True, + "properties": { + "type": {"type": "string"}, + "id": {"type": "string"}, + "created_at": {"type": "string", "format": "date-time"}, + "divide_category": {"type": "string"}, + }, + } + + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + return [{"divide_category": "dukes"}, {"divide_category": "mentats"}] + + def request_params( + self, + stream_state: Optional[Mapping[str, Any]], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + return {"category": stream_slice.get("divide_category")} + + +class JusticeSongs(HttpStream, CheckpointMixin, ABC): + url_base = "https://api.airbyte-test.com/v1/" + primary_key = "id" + + def __init__(self, config: Mapping[str, Any], **kwargs): + super().__init__(**kwargs) + self._state: MutableMapping[str, Any] = {} + + def path(self, **kwargs) -> str: + return "justice_songs" + + def get_json_schema(self) -> Mapping[str, Any]: + return { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "additionalProperties": True, + "properties": { + "type": {"type": "string"}, + "id": {"type": "string"}, + "created_at": {"type": "string", "format": "date-time"}, + "name": {"type": "string"}, + "album": {"type": "string"}, + }, + } + + @property + def availability_strategy(self) -> HttpAvailabilityStrategy: + return FixtureAvailabilityStrategy() + + def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: + data = response.json().get("data", []) + yield from data + + @property + def state(self) -> MutableMapping[str, Any]: + return self._state + + @state.setter + def state(self, value: MutableMapping[str, Any]) -> None: + self._state = value + + def request_params( + self, + stream_state: Optional[Mapping[str, Any]], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + return {"page": next_page_token.get("page")} + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + yield from self._read_single_page(cursor_field, stream_slice, stream_state) + + def _read_single_page( + self, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + next_page_token = stream_slice + request_headers = self.request_headers(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + request_params = self.request_params(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + + request, response = self._http_client.send_request( + http_method=self.http_method, + url=self._join_url( + self.url_base, + self.path(stream_state=stream_state, stream_slice=stream_slice, 
next_page_token=next_page_token), + ), + request_kwargs=self.request_kwargs(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + headers=request_headers, + params=request_params, + json=self.request_body_json(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + data=self.request_body_data(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + dedupe_query_params=True, + ) + yield from self.parse_response(response=response) + + self.next_page_token(response) + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + current_page = self._state.get("page") or 0 + has_more = response.json().get("has_more") + if has_more: + self._state = {"page": current_page + 1} + else: + self._state = {"__ab_full_refresh_sync_complete": True} + + +class SourceFixture(AbstractSource): + def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, any]: + return True, None + + def streams(self, config: Mapping[str, Any]) -> List[Stream]: + return [ + Dividers(config=config), + JusticeSongs(config=config), + Legacies(config=config), + Planets(config=config), + Users(config=config), + ] + + def spec(self, logger: logging.Logger) -> ConnectorSpecification: + return ConnectorSpecification( + connectionSpecification={ + "properties": { + "start_date": { + "title": "Start Date", + "description": "UTC date and time in the format YYYY-MM-DDTHH:MM:SS.000Z. During incremental sync, any data generated before this date will not be replicated. If left blank, the start date will be set to 2 years before the present date.", + "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z$", + "pattern_descriptor": "YYYY-MM-DDTHH:MM:SS.000Z", + "examples": ["2020-11-16T00:00:00.000Z"], + "type": "string", + "format": "date-time", + } + } + } + ) diff --git a/airbyte-cdk/python/unit_tests/sources/mock_server_tests/test_helpers/__init__.py b/airbyte-cdk/python/unit_tests/sources/mock_server_tests/test_helpers/__init__.py new file mode 100644 index 000000000000..7cad347c9fde --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/mock_server_tests/test_helpers/__init__.py @@ -0,0 +1,7 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +from .airbyte_message_assertions import emits_successful_sync_status_messages, validate_message_order + +__all__ = ["emits_successful_sync_status_messages", "validate_message_order"] diff --git a/airbyte-cdk/python/unit_tests/sources/mock_server_tests/test_helpers/airbyte_message_assertions.py b/airbyte-cdk/python/unit_tests/sources/mock_server_tests/test_helpers/airbyte_message_assertions.py new file mode 100644 index 000000000000..04b65594cf01 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/mock_server_tests/test_helpers/airbyte_message_assertions.py @@ -0,0 +1,28 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
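+#
+# Assertion helpers shared by the mock server tests: emits_successful_sync_status_messages verifies the
+# STARTED -> RUNNING -> COMPLETE stream status sequence, and validate_message_order verifies that RECORD
+# and STATE messages were emitted in the expected order.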
+# + +from typing import List + +import pytest +from airbyte_cdk.models import AirbyteMessage, AirbyteStreamStatus, Type + + +def emits_successful_sync_status_messages(status_messages: List[AirbyteStreamStatus]) -> bool: + return ( + len(status_messages) == 3 + and status_messages[0] == AirbyteStreamStatus.STARTED + and status_messages[1] == AirbyteStreamStatus.RUNNING + and status_messages[2] == AirbyteStreamStatus.COMPLETE + ) + + +def validate_message_order(expected_message_order: List[Type], messages: List[AirbyteMessage]): + if len(expected_message_order) != len(messages): + pytest.fail(f"Expected message order count {len(expected_message_order)} did not match actual messages {len(messages)}") + + for i, message in enumerate(messages): + if message.type != expected_message_order[i]: + pytest.fail( + f"At index {i} actual message type {message.type.name} did not match expected message type {expected_message_order[i].name}" + ) diff --git a/airbyte-cdk/python/unit_tests/sources/mock_server_tests/test_mock_server_abstract_source.py b/airbyte-cdk/python/unit_tests/sources/mock_server_tests/test_mock_server_abstract_source.py new file mode 100644 index 000000000000..c7fd2cef433e --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/mock_server_tests/test_mock_server_abstract_source.py @@ -0,0 +1,552 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +from datetime import datetime, timedelta, timezone +from typing import List, Optional +from unittest import TestCase + +import freezegun +from airbyte_cdk.models import AirbyteStateBlob, ConfiguredAirbyteCatalog, SyncMode, Type +from airbyte_cdk.test.catalog_builder import CatalogBuilder +from airbyte_cdk.test.entrypoint_wrapper import read +from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest +from airbyte_cdk.test.mock_http.response_builder import ( + FieldPath, + FieldUpdatePaginationStrategy, + HttpResponseBuilder, + RecordBuilder, + create_record_builder, + create_response_builder, +) +from airbyte_cdk.test.state_builder import StateBuilder +from unit_tests.sources.mock_server_tests.mock_source_fixture import SourceFixture +from unit_tests.sources.mock_server_tests.test_helpers import emits_successful_sync_status_messages, validate_message_order + +_NOW = datetime.now(timezone.utc) + + +class RequestBuilder: + @classmethod + def dividers_endpoint(cls) -> "RequestBuilder": + return cls("dividers") + + @classmethod + def justice_songs_endpoint(cls) -> "RequestBuilder": + return cls("justice_songs") + + @classmethod + def legacies_endpoint(cls) -> "RequestBuilder": + return cls("legacies") + + @classmethod + def planets_endpoint(cls) -> "RequestBuilder": + return cls("planets") + + @classmethod + def users_endpoint(cls) -> "RequestBuilder": + return cls("users") + + def __init__(self, resource: str) -> None: + self._resource = resource + self._start_date: Optional[datetime] = None + self._end_date: Optional[datetime] = None + self._category: Optional[str] = None + self._page: Optional[int] = None + + def with_start_date(self, start_date: datetime) -> "RequestBuilder": + self._start_date = start_date + return self + + def with_end_date(self, end_date: datetime) -> "RequestBuilder": + self._end_date = end_date + return self + + def with_category(self, category: str) -> "RequestBuilder": + self._category = category + return self + + def with_page(self, page: int) -> "RequestBuilder": + self._page = page + return self + + def build(self) -> HttpRequest: + query_params = {} + if self._start_date: + 
query_params["start_date"] = self._start_date.strftime("%Y-%m-%dT%H:%M:%SZ") + if self._end_date: + query_params["end_date"] = self._end_date.strftime("%Y-%m-%dT%H:%M:%SZ") + if self._category: + query_params["category"] = self._category + if self._page: + query_params["page"] = self._page + + return HttpRequest( + url=f"https://api.airbyte-test.com/v1/{self._resource}", + query_params=query_params, + ) + + +def _create_catalog(names_and_sync_modes: List[tuple[str, SyncMode]]) -> ConfiguredAirbyteCatalog: + catalog_builder = CatalogBuilder() + for stream_name, sync_mode in names_and_sync_modes: + catalog_builder.with_stream(name=stream_name, sync_mode=sync_mode) + return catalog_builder.build() + + +def _create_dividers_request() -> RequestBuilder: + return RequestBuilder.dividers_endpoint() + + +def _create_legacies_request() -> RequestBuilder: + return RequestBuilder.legacies_endpoint() + + +def _create_planets_request() -> RequestBuilder: + return RequestBuilder.planets_endpoint() + + +def _create_users_request() -> RequestBuilder: + return RequestBuilder.users_endpoint() + + +def _create_justice_songs_request() -> RequestBuilder: + return RequestBuilder.justice_songs_endpoint() + + +RESPONSE_TEMPLATE = {"object": "list", "has_more": False, "data": [{"id": "123", "created_at": "2024-01-01T07:04:28.000Z"}]} + +USER_TEMPLATE = { + "object": "list", + "has_more": False, + "data": [ + { + "id": "123", + "created_at": "2024-01-01T07:04:28", + "first_name": "Paul", + "last_name": "Atreides", + } + ], +} + +PLANET_TEMPLATE = { + "object": "list", + "has_more": False, + "data": [ + { + "id": "456", + "created_at": "2024-01-01T07:04:28.000Z", + "name": "Giedi Prime", + } + ], +} + +LEGACY_TEMPLATE = { + "object": "list", + "has_more": False, + "data": [ + { + "id": "l3g4cy", + "created_at": "2024-02-01T07:04:28.000Z", + "quote": "What do you leave behind?", + } + ], +} + +DIVIDER_TEMPLATE = { + "object": "list", + "has_more": False, + "data": [ + { + "id": "l3t0", + "created_at": "2024-02-01T07:04:28.000Z", + "divide_category": "dukes", + } + ], +} + + +JUSTICE_SONGS_TEMPLATE = { + "object": "list", + "has_more": False, + "data": [ + { + "id": "cross_01", + "created_at": "2024-02-01T07:04:28.000Z", + "name": "Genesis", + "album": "Cross", + }, + { + "id": "hyperdrama_01", + "created_at": "2024-02-01T07:04:28.000Z", + "name": "dukes", + "album": "", + }, + ], +} + + +RESOURCE_TO_TEMPLATE = { + "dividers": DIVIDER_TEMPLATE, + "justice_songs": JUSTICE_SONGS_TEMPLATE, + "legacies": LEGACY_TEMPLATE, + "planets": PLANET_TEMPLATE, + "users": USER_TEMPLATE, +} + + +def _create_response(pagination_has_more: bool = False) -> HttpResponseBuilder: + return create_response_builder( + response_template=RESPONSE_TEMPLATE, + records_path=FieldPath("data"), + pagination_strategy=FieldUpdatePaginationStrategy(FieldPath("has_more"), pagination_has_more), + ) + + +def _create_record(resource: str) -> RecordBuilder: + return create_record_builder( + response_template=RESOURCE_TO_TEMPLATE.get(resource), + records_path=FieldPath("data"), + record_id_path=FieldPath("id"), + record_cursor_path=FieldPath("created_at"), + ) + + +class FullRefreshStreamTest(TestCase): + @HttpMocker() + def test_full_refresh_sync(self, http_mocker): + start_datetime = _NOW - timedelta(days=14) + config = {"start_date": start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")} + + http_mocker.get( + _create_users_request().build(), + _create_response().with_record(record=_create_record("users")).with_record(record=_create_record("users")).build(), + ) 
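+ # The mocked /users endpoint returns two user records with has_more=False, so the read below is expected
+ # to emit two RECORD messages followed by a single STATE message with a recordCount of 2.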
+ + source = SourceFixture() + actual_messages = read(source, config=config, catalog=_create_catalog([("users", SyncMode.full_refresh)])) + + assert emits_successful_sync_status_messages(actual_messages.get_stream_statuses("users")) + assert len(actual_messages.records) == 2 + assert len(actual_messages.state_messages) == 1 + validate_message_order([Type.RECORD, Type.RECORD, Type.STATE], actual_messages.records_and_state_messages) + assert actual_messages.state_messages[0].state.stream.stream_descriptor.name == "users" + assert actual_messages.state_messages[0].state.stream.stream_state == AirbyteStateBlob(__ab_full_refresh_sync_complete=True) + assert actual_messages.state_messages[0].state.sourceStats.recordCount == 2.0 + + @HttpMocker() + def test_substream_resumable_full_refresh_with_parent_slices(self, http_mocker): + start_datetime = _NOW - timedelta(days=14) + config = {"start_date": start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")} + + expected_first_substream_per_stream_state = [ + {"partition": {"divide_category": "dukes"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + ] + + expected_second_substream_per_stream_state = [ + {"partition": {"divide_category": "dukes"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"divide_category": "mentats"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + ] + + http_mocker.get( + _create_dividers_request().with_category("dukes").build(), + _create_response().with_record(record=_create_record("dividers")).with_record(record=_create_record("dividers")).build(), + ) + + http_mocker.get( + _create_dividers_request().with_category("mentats").build(), + _create_response().with_record(record=_create_record("dividers")).with_record(record=_create_record("dividers")).build(), + ) + + source = SourceFixture() + actual_messages = read(source, config=config, catalog=_create_catalog([("dividers", SyncMode.full_refresh)])) + + assert emits_successful_sync_status_messages(actual_messages.get_stream_statuses("dividers")) + assert len(actual_messages.records) == 4 + assert len(actual_messages.state_messages) == 2 + validate_message_order( + [Type.RECORD, Type.RECORD, Type.STATE, Type.RECORD, Type.RECORD, Type.STATE], actual_messages.records_and_state_messages + ) + assert actual_messages.state_messages[0].state.stream.stream_state == AirbyteStateBlob( + states=expected_first_substream_per_stream_state + ) + assert actual_messages.state_messages[0].state.sourceStats.recordCount == 2.0 + assert actual_messages.state_messages[1].state.stream.stream_state == AirbyteStateBlob( + states=expected_second_substream_per_stream_state + ) + assert actual_messages.state_messages[1].state.sourceStats.recordCount == 2.0 + + +@freezegun.freeze_time(_NOW) +class IncrementalStreamTest(TestCase): + @HttpMocker() + def test_incremental_sync(self, http_mocker): + start_datetime = _NOW - timedelta(days=14) + config = {"start_date": start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")} + + last_record_date_0 = (start_datetime + timedelta(days=4)).strftime("%Y-%m-%dT%H:%M:%SZ") + http_mocker.get( + _create_planets_request().with_start_date(start_datetime).with_end_date(start_datetime + timedelta(days=7)).build(), + _create_response() + .with_record(record=_create_record("planets").with_cursor(last_record_date_0)) + .with_record(record=_create_record("planets").with_cursor(last_record_date_0)) + .with_record(record=_create_record("planets").with_cursor(last_record_date_0)) + .build(), + ) + + last_record_date_1 = (_NOW - 
timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%SZ") + http_mocker.get( + _create_planets_request().with_start_date(start_datetime + timedelta(days=7)).with_end_date(_NOW).build(), + _create_response() + .with_record(record=_create_record("planets").with_cursor(last_record_date_1)) + .with_record(record=_create_record("planets").with_cursor(last_record_date_1)) + .build(), + ) + + source = SourceFixture() + actual_messages = read(source, config=config, catalog=_create_catalog([("planets", SyncMode.incremental)])) + + assert emits_successful_sync_status_messages(actual_messages.get_stream_statuses("planets")) + assert len(actual_messages.records) == 5 + assert len(actual_messages.state_messages) == 2 + validate_message_order( + [Type.RECORD, Type.RECORD, Type.RECORD, Type.STATE, Type.RECORD, Type.RECORD, Type.STATE], + actual_messages.records_and_state_messages, + ) + assert actual_messages.state_messages[0].state.stream.stream_descriptor.name == "planets" + assert actual_messages.state_messages[0].state.stream.stream_state == AirbyteStateBlob(created_at=last_record_date_0) + assert actual_messages.state_messages[0].state.sourceStats.recordCount == 3.0 + assert actual_messages.state_messages[1].state.stream.stream_descriptor.name == "planets" + assert actual_messages.state_messages[1].state.stream.stream_state == AirbyteStateBlob(created_at=last_record_date_1) + assert actual_messages.state_messages[1].state.sourceStats.recordCount == 2.0 + + @HttpMocker() + def test_incremental_running_as_full_refresh(self, http_mocker): + start_datetime = _NOW - timedelta(days=14) + config = {"start_date": start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")} + + last_record_date_0 = (start_datetime + timedelta(days=4)).strftime("%Y-%m-%dT%H:%M:%SZ") + http_mocker.get( + _create_planets_request().with_start_date(start_datetime).with_end_date(start_datetime + timedelta(days=7)).build(), + _create_response() + .with_record(record=_create_record("planets").with_cursor(last_record_date_0)) + .with_record(record=_create_record("planets").with_cursor(last_record_date_0)) + .with_record(record=_create_record("planets").with_cursor(last_record_date_0)) + .build(), + ) + + last_record_date_1 = (_NOW - timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%SZ") + http_mocker.get( + _create_planets_request().with_start_date(start_datetime + timedelta(days=7)).with_end_date(_NOW).build(), + _create_response() + .with_record(record=_create_record("planets").with_cursor(last_record_date_1)) + .with_record(record=_create_record("planets").with_cursor(last_record_date_1)) + .build(), + ) + + source = SourceFixture() + actual_messages = read(source, config=config, catalog=_create_catalog([("planets", SyncMode.full_refresh)])) + + assert emits_successful_sync_status_messages(actual_messages.get_stream_statuses("planets")) + assert len(actual_messages.records) == 5 + assert len(actual_messages.state_messages) == 2 + validate_message_order( + [Type.RECORD, Type.RECORD, Type.RECORD, Type.STATE, Type.RECORD, Type.RECORD, Type.STATE], + actual_messages.records_and_state_messages, + ) + + assert actual_messages.state_messages[0].state.stream.stream_descriptor.name == "planets" + assert actual_messages.state_messages[0].state.stream.stream_state == AirbyteStateBlob(created_at=last_record_date_0) + assert actual_messages.state_messages[0].state.sourceStats.recordCount == 3.0 + assert actual_messages.state_messages[1].state.stream.stream_descriptor.name == "planets" + assert actual_messages.state_messages[1].state.stream.stream_state == 
AirbyteStateBlob(created_at=last_record_date_1) + assert actual_messages.state_messages[1].state.sourceStats.recordCount == 2.0 + + @HttpMocker() + def test_legacy_incremental_sync(self, http_mocker): + start_datetime = _NOW - timedelta(days=14) + config = {"start_date": start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")} + + last_record_date_0 = (start_datetime + timedelta(days=4)).strftime("%Y-%m-%dT%H:%M:%SZ") + http_mocker.get( + _create_legacies_request().with_start_date(start_datetime).with_end_date(start_datetime + timedelta(days=7)).build(), + _create_response() + .with_record(record=_create_record("legacies").with_cursor(last_record_date_0)) + .with_record(record=_create_record("legacies").with_cursor(last_record_date_0)) + .with_record(record=_create_record("legacies").with_cursor(last_record_date_0)) + .build(), + ) + + last_record_date_1 = (_NOW - timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%SZ") + http_mocker.get( + _create_legacies_request().with_start_date(start_datetime + timedelta(days=7)).with_end_date(_NOW).build(), + _create_response() + .with_record(record=_create_record("legacies").with_cursor(last_record_date_1)) + .with_record(record=_create_record("legacies").with_cursor(last_record_date_1)) + .build(), + ) + + source = SourceFixture() + actual_messages = read(source, config=config, catalog=_create_catalog([("legacies", SyncMode.incremental)])) + + assert emits_successful_sync_status_messages(actual_messages.get_stream_statuses("legacies")) + assert len(actual_messages.records) == 5 + assert len(actual_messages.state_messages) == 2 + validate_message_order( + [Type.RECORD, Type.RECORD, Type.RECORD, Type.STATE, Type.RECORD, Type.RECORD, Type.STATE], + actual_messages.records_and_state_messages, + ) + assert actual_messages.state_messages[0].state.stream.stream_descriptor.name == "legacies" + assert actual_messages.state_messages[0].state.stream.stream_state == AirbyteStateBlob(created_at=last_record_date_0) + assert actual_messages.state_messages[0].state.sourceStats.recordCount == 3.0 + assert actual_messages.state_messages[1].state.stream.stream_descriptor.name == "legacies" + assert actual_messages.state_messages[1].state.stream.stream_state == AirbyteStateBlob(created_at=last_record_date_1) + assert actual_messages.state_messages[1].state.sourceStats.recordCount == 2.0 + + @HttpMocker() + def test_legacy_no_records_retains_incoming_state(self, http_mocker): + start_datetime = _NOW - timedelta(days=14) + config = {"start_date": start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")} + + last_record_date_1 = (_NOW - timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%SZ") + http_mocker.get( + _create_legacies_request().with_start_date(_NOW - timedelta(days=1)).with_end_date(_NOW).build(), + _create_response().build(), + ) + + incoming_state = AirbyteStateBlob(created_at=last_record_date_1) + state = StateBuilder().with_stream_state("legacies", incoming_state).build() + + source = SourceFixture() + actual_messages = read(source, config=config, catalog=_create_catalog([("legacies", SyncMode.incremental)]), state=state) + + assert actual_messages.state_messages[0].state.stream.stream_descriptor.name == "legacies" + assert actual_messages.state_messages[0].state.stream.stream_state == incoming_state + assert actual_messages.state_messages[0].state.sourceStats.recordCount == 0.0 + + @HttpMocker() + def test_legacy_no_slices_retains_incoming_state(self, http_mocker): + start_datetime = _NOW - timedelta(days=14) + config = {"start_date": start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")} + + 
last_record_date_1 = _NOW.strftime("%Y-%m-%dT%H:%M:%SZ") + + incoming_state = AirbyteStateBlob(created_at=last_record_date_1) + state = StateBuilder().with_stream_state("legacies", incoming_state).build() + + source = SourceFixture() + actual_messages = read(source, config=config, catalog=_create_catalog([("legacies", SyncMode.incremental)]), state=state) + + assert actual_messages.state_messages[0].state.stream.stream_descriptor.name == "legacies" + assert actual_messages.state_messages[0].state.stream.stream_state == incoming_state + assert actual_messages.state_messages[0].state.sourceStats.recordCount == 0.0 + + +@freezegun.freeze_time(_NOW) +class MultipleStreamTest(TestCase): + @HttpMocker() + def test_incremental_and_full_refresh_streams(self, http_mocker): + start_datetime = _NOW - timedelta(days=14) + config = {"start_date": start_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")} + + expected_first_substream_per_stream_state = [ + {"partition": {"divide_category": "dukes"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + ] + + expected_second_substream_per_stream_state = [ + {"partition": {"divide_category": "dukes"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"divide_category": "mentats"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + ] + + # Mocks for users full refresh stream + http_mocker.get( + _create_users_request().build(), + _create_response().with_record(record=_create_record("users")).with_record(record=_create_record("users")).build(), + ) + + # Mocks for planets incremental stream + last_record_date_0 = (start_datetime + timedelta(days=4)).strftime("%Y-%m-%dT%H:%M:%SZ") + http_mocker.get( + _create_planets_request().with_start_date(start_datetime).with_end_date(start_datetime + timedelta(days=7)).build(), + _create_response() + .with_record(record=_create_record("planets").with_cursor(last_record_date_0)) + .with_record(record=_create_record("planets").with_cursor(last_record_date_0)) + .with_record(record=_create_record("planets").with_cursor(last_record_date_0)) + .build(), + ) + + last_record_date_1 = (_NOW - timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%SZ") + http_mocker.get( + _create_planets_request().with_start_date(start_datetime + timedelta(days=7)).with_end_date(_NOW).build(), + _create_response() + .with_record(record=_create_record("planets").with_cursor(last_record_date_1)) + .with_record(record=_create_record("planets").with_cursor(last_record_date_1)) + .build(), + ) + + # Mocks for dividers full refresh stream + http_mocker.get( + _create_dividers_request().with_category("dukes").build(), + _create_response().with_record(record=_create_record("dividers")).with_record(record=_create_record("dividers")).build(), + ) + + http_mocker.get( + _create_dividers_request().with_category("mentats").build(), + _create_response().with_record(record=_create_record("dividers")).with_record(record=_create_record("dividers")).build(), + ) + + source = SourceFixture() + actual_messages = read( + source, + config=config, + catalog=_create_catalog( + [("users", SyncMode.full_refresh), ("planets", SyncMode.incremental), ("dividers", SyncMode.full_refresh)] + ), + ) + + assert emits_successful_sync_status_messages(actual_messages.get_stream_statuses("users")) + assert emits_successful_sync_status_messages(actual_messages.get_stream_statuses("planets")) + assert emits_successful_sync_status_messages(actual_messages.get_stream_statuses("dividers")) + + assert len(actual_messages.records) == 11 + assert len(actual_messages.state_messages) 
== 5 + validate_message_order( + [ + Type.RECORD, + Type.RECORD, + Type.STATE, + Type.RECORD, + Type.RECORD, + Type.RECORD, + Type.STATE, + Type.RECORD, + Type.RECORD, + Type.STATE, + Type.RECORD, + Type.RECORD, + Type.STATE, + Type.RECORD, + Type.RECORD, + Type.STATE, + ], + actual_messages.records_and_state_messages, + ) + assert actual_messages.state_messages[0].state.stream.stream_descriptor.name == "users" + assert actual_messages.state_messages[0].state.stream.stream_state == AirbyteStateBlob(__ab_full_refresh_sync_complete=True) + assert actual_messages.state_messages[0].state.sourceStats.recordCount == 2.0 + assert actual_messages.state_messages[1].state.stream.stream_descriptor.name == "planets" + assert actual_messages.state_messages[1].state.stream.stream_state == AirbyteStateBlob(created_at=last_record_date_0) + assert actual_messages.state_messages[1].state.sourceStats.recordCount == 3.0 + assert actual_messages.state_messages[2].state.stream.stream_descriptor.name == "planets" + assert actual_messages.state_messages[2].state.stream.stream_state == AirbyteStateBlob(created_at=last_record_date_1) + assert actual_messages.state_messages[2].state.sourceStats.recordCount == 2.0 + assert actual_messages.state_messages[3].state.stream.stream_descriptor.name == "dividers" + assert actual_messages.state_messages[3].state.stream.stream_state == AirbyteStateBlob( + states=expected_first_substream_per_stream_state + ) + assert actual_messages.state_messages[3].state.sourceStats.recordCount == 2.0 + assert actual_messages.state_messages[4].state.stream.stream_descriptor.name == "dividers" + assert actual_messages.state_messages[4].state.stream.stream_state == AirbyteStateBlob( + states=expected_second_substream_per_stream_state + ) + assert actual_messages.state_messages[4].state.sourceStats.recordCount == 2.0 diff --git a/airbyte-cdk/python/unit_tests/sources/mock_server_tests/test_resumable_full_refresh.py b/airbyte-cdk/python/unit_tests/sources/mock_server_tests/test_resumable_full_refresh.py new file mode 100644 index 000000000000..f5a9e8578ab9 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/mock_server_tests/test_resumable_full_refresh.py @@ -0,0 +1,278 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
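+# These tests drive the mock SourceFixture through paginated reads of the justice_songs stream and verify the
+# page-level state checkpoints emitted for resumable full refresh, including resuming from a prior page and a mid-sync failure.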
+# + +from datetime import datetime, timezone +from typing import Any, Dict, List, Optional +from unittest import TestCase + +import freezegun +from airbyte_cdk.models import AirbyteStateBlob, AirbyteStreamStatus, ConfiguredAirbyteCatalog, FailureType, SyncMode, Type +from airbyte_cdk.test.catalog_builder import ConfiguredAirbyteStreamBuilder +from airbyte_cdk.test.entrypoint_wrapper import read +from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest +from airbyte_cdk.test.mock_http.response_builder import ( + FieldPath, + FieldUpdatePaginationStrategy, + HttpResponseBuilder, + RecordBuilder, + create_record_builder, + create_response_builder, +) +from airbyte_cdk.test.state_builder import StateBuilder +from unit_tests.sources.mock_server_tests.mock_source_fixture import SourceFixture +from unit_tests.sources.mock_server_tests.test_helpers import emits_successful_sync_status_messages, validate_message_order + +_NOW = datetime.now(timezone.utc) + + +class RequestBuilder: + @classmethod + def justice_songs_endpoint(cls) -> "RequestBuilder": + return cls("justice_songs") + + def __init__(self, resource: str) -> None: + self._resource = resource + self._page: Optional[int] = None + + def with_page(self, page: int) -> "RequestBuilder": + self._page = page + return self + + def build(self) -> HttpRequest: + query_params = {} + if self._page: + query_params["page"] = self._page + + return HttpRequest( + url=f"https://api.airbyte-test.com/v1/{self._resource}", + query_params=query_params, + ) + + +def _create_catalog(names_and_sync_modes: List[tuple[str, SyncMode, Dict[str, Any]]]) -> ConfiguredAirbyteCatalog: + stream_builder = ConfiguredAirbyteStreamBuilder() + streams = [] + for stream_name, sync_mode, json_schema in names_and_sync_modes: + streams.append(stream_builder.with_name(stream_name).with_sync_mode(sync_mode).with_json_schema(json_schema or {})) + + return ConfiguredAirbyteCatalog(streams=list(map(lambda builder: builder.build(), streams))) + + +def _create_justice_songs_request() -> RequestBuilder: + return RequestBuilder.justice_songs_endpoint() + + +RESPONSE_TEMPLATE = {"object": "list", "has_more": False, "data": [{"id": "123", "created_at": "2024-01-01T07:04:28.000Z"}]} + + +JUSTICE_SONGS_TEMPLATE = { + "object": "list", + "has_more": False, + "data": [ + { + "id": "cross_01", + "created_at": "2024-02-01T07:04:28.000Z", + "name": "Genesis", + "album": "Cross", + }, + { + "id": "hyperdrama_01", + "created_at": "2024-02-01T07:04:28.000Z", + "name": "dukes", + "album": "", + }, + ], +} + + +RESOURCE_TO_TEMPLATE = { + "justice_songs": JUSTICE_SONGS_TEMPLATE, +} + + +def _create_response(pagination_has_more: bool = False) -> HttpResponseBuilder: + return create_response_builder( + response_template=RESPONSE_TEMPLATE, + records_path=FieldPath("data"), + pagination_strategy=FieldUpdatePaginationStrategy(FieldPath("has_more"), pagination_has_more), + ) + + +def _create_record(resource: str) -> RecordBuilder: + return create_record_builder( + response_template=RESOURCE_TO_TEMPLATE.get(resource), + records_path=FieldPath("data"), + record_id_path=FieldPath("id"), + record_cursor_path=FieldPath("created_at"), + ) + + +@freezegun.freeze_time(_NOW) +class ResumableFullRefreshStreamTest(TestCase): + @HttpMocker() + def test_resumable_full_refresh_sync(self, http_mocker): + config = {} + + http_mocker.get( + _create_justice_songs_request().build(), + _create_response(pagination_has_more=True) + .with_pagination() + .with_record(record=_create_record("justice_songs")) + 
.with_record(record=_create_record("justice_songs")) + .build(), + ) + + http_mocker.get( + _create_justice_songs_request().with_page(1).build(), + _create_response(pagination_has_more=True) + .with_pagination() + .with_record(record=_create_record("justice_songs")) + .with_record(record=_create_record("justice_songs")) + .build(), + ) + + http_mocker.get( + _create_justice_songs_request().with_page(2).build(), + _create_response(pagination_has_more=False).with_pagination().with_record(record=_create_record("justice_songs")).build(), + ) + + source = SourceFixture() + actual_messages = read(source, config=config, catalog=_create_catalog([("justice_songs", SyncMode.full_refresh, {})])) + + assert emits_successful_sync_status_messages(actual_messages.get_stream_statuses("justice_songs")) + assert len(actual_messages.records) == 5 + assert len(actual_messages.state_messages) == 4 + validate_message_order( + [Type.RECORD, Type.RECORD, Type.STATE, Type.RECORD, Type.RECORD, Type.STATE, Type.RECORD, Type.STATE, Type.STATE], + actual_messages.records_and_state_messages, + ) + assert actual_messages.state_messages[0].state.stream.stream_descriptor.name == "justice_songs" + assert actual_messages.state_messages[0].state.stream.stream_state == AirbyteStateBlob(page=1) + assert actual_messages.state_messages[0].state.sourceStats.recordCount == 2.0 + assert actual_messages.state_messages[1].state.stream.stream_descriptor.name == "justice_songs" + assert actual_messages.state_messages[1].state.stream.stream_state == AirbyteStateBlob(page=2) + assert actual_messages.state_messages[1].state.sourceStats.recordCount == 2.0 + assert actual_messages.state_messages[2].state.stream.stream_descriptor.name == "justice_songs" + assert actual_messages.state_messages[2].state.stream.stream_state == AirbyteStateBlob(__ab_full_refresh_sync_complete=True) + assert actual_messages.state_messages[2].state.sourceStats.recordCount == 1.0 + assert actual_messages.state_messages[3].state.stream.stream_descriptor.name == "justice_songs" + assert actual_messages.state_messages[3].state.stream.stream_state == AirbyteStateBlob(__ab_full_refresh_sync_complete=True) + assert actual_messages.state_messages[3].state.sourceStats.recordCount == 0.0 + + @HttpMocker() + def test_resumable_full_refresh_second_attempt(self, http_mocker): + config = {} + + state = StateBuilder().with_stream_state("justice_songs", {"page": 100}).build() + + http_mocker.get( + _create_justice_songs_request().with_page(100).build(), + _create_response(pagination_has_more=True) + .with_pagination() + .with_record(record=_create_record("justice_songs")) + .with_record(record=_create_record("justice_songs")) + .with_record(record=_create_record("justice_songs")) + .build(), + ) + + http_mocker.get( + _create_justice_songs_request().with_page(101).build(), + _create_response(pagination_has_more=True) + .with_pagination() + .with_record(record=_create_record("justice_songs")) + .with_record(record=_create_record("justice_songs")) + .with_record(record=_create_record("justice_songs")) + .build(), + ) + + http_mocker.get( + _create_justice_songs_request().with_page(102).build(), + _create_response(pagination_has_more=False) + .with_pagination() + .with_record(record=_create_record("justice_songs")) + .with_record(record=_create_record("justice_songs")) + .build(), + ) + + source = SourceFixture() + actual_messages = read(source, config=config, catalog=_create_catalog([("justice_songs", SyncMode.full_refresh, {})]), state=state) + + assert 
emits_successful_sync_status_messages(actual_messages.get_stream_statuses("justice_songs")) + assert len(actual_messages.records) == 8 + assert len(actual_messages.state_messages) == 4 + validate_message_order( + [ + Type.RECORD, + Type.RECORD, + Type.RECORD, + Type.STATE, + Type.RECORD, + Type.RECORD, + Type.RECORD, + Type.STATE, + Type.RECORD, + Type.RECORD, + Type.STATE, + Type.STATE, + ], + actual_messages.records_and_state_messages, + ) + assert actual_messages.state_messages[0].state.stream.stream_descriptor.name == "justice_songs" + assert actual_messages.state_messages[0].state.stream.stream_state == AirbyteStateBlob(page=101) + assert actual_messages.state_messages[0].state.sourceStats.recordCount == 3.0 + assert actual_messages.state_messages[1].state.stream.stream_descriptor.name == "justice_songs" + assert actual_messages.state_messages[1].state.stream.stream_state == AirbyteStateBlob(page=102) + assert actual_messages.state_messages[1].state.sourceStats.recordCount == 3.0 + assert actual_messages.state_messages[2].state.stream.stream_descriptor.name == "justice_songs" + assert actual_messages.state_messages[2].state.stream.stream_state == AirbyteStateBlob(__ab_full_refresh_sync_complete=True) + assert actual_messages.state_messages[2].state.sourceStats.recordCount == 2.0 + assert actual_messages.state_messages[3].state.stream.stream_descriptor.name == "justice_songs" + assert actual_messages.state_messages[3].state.stream.stream_state == AirbyteStateBlob(__ab_full_refresh_sync_complete=True) + assert actual_messages.state_messages[3].state.sourceStats.recordCount == 0.0 + + @HttpMocker() + def test_resumable_full_refresh_failure(self, http_mocker): + config = {} + + http_mocker.get( + _create_justice_songs_request().build(), + _create_response(pagination_has_more=True) + .with_pagination() + .with_record(record=_create_record("justice_songs")) + .with_record(record=_create_record("justice_songs")) + .build(), + ) + + http_mocker.get( + _create_justice_songs_request().with_page(1).build(), + _create_response(pagination_has_more=True) + .with_pagination() + .with_record(record=_create_record("justice_songs")) + .with_record(record=_create_record("justice_songs")) + .build(), + ) + + http_mocker.get(_create_justice_songs_request().with_page(2).build(), _create_response().with_status_code(status_code=400).build()) + + source = SourceFixture() + actual_messages = read( + source, config=config, catalog=_create_catalog([("justice_songs", SyncMode.full_refresh, {})]), expecting_exception=True + ) + + status_messages = actual_messages.get_stream_statuses("justice_songs") + assert status_messages[-1] == AirbyteStreamStatus.INCOMPLETE + assert len(actual_messages.records) == 4 + assert len(actual_messages.state_messages) == 2 + + validate_message_order( + [Type.RECORD, Type.RECORD, Type.STATE, Type.RECORD, Type.RECORD, Type.STATE], actual_messages.records_and_state_messages + ) + assert actual_messages.state_messages[0].state.stream.stream_descriptor.name == "justice_songs" + assert actual_messages.state_messages[0].state.stream.stream_state == AirbyteStateBlob(page=1) + assert actual_messages.state_messages[1].state.stream.stream_descriptor.name == "justice_songs" + assert actual_messages.state_messages[1].state.stream.stream_state == AirbyteStateBlob(page=2) + + assert actual_messages.errors[0].trace.error.failure_type == FailureType.system_error + assert actual_messages.errors[0].trace.error.stream_descriptor.name == "justice_songs" + assert "Bad request" in 
actual_messages.errors[0].trace.error.message diff --git a/airbyte-cdk/python/unit_tests/sources/streams/__init__.py b/airbyte-cdk/python/unit_tests/sources/streams/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/streams/checkpoint/__init__.py b/airbyte-cdk/python/unit_tests/sources/streams/checkpoint/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/streams/checkpoint/test_checkpoint_reader.py b/airbyte-cdk/python/unit_tests/sources/streams/checkpoint/test_checkpoint_reader.py new file mode 100644 index 000000000000..01ddd363b0d3 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/checkpoint/test_checkpoint_reader.py @@ -0,0 +1,350 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from unittest.mock import Mock + +import pytest +from airbyte_cdk.sources.streams.checkpoint import ( + CursorBasedCheckpointReader, + FullRefreshCheckpointReader, + IncrementalCheckpointReader, + LegacyCursorBasedCheckpointReader, + ResumableFullRefreshCheckpointReader, +) +from airbyte_cdk.sources.types import StreamSlice + + +def test_incremental_checkpoint_reader_next_slice(): + stream_slices = [ + {"start_date": "2024-01-01", "end_date": "2024-02-01"}, + {"start_date": "2024-02-01", "end_date": "2024-03-01"}, + {"start_date": "2024-03-01", "end_date": "2024-04-01"}, + ] + checkpoint_reader = IncrementalCheckpointReader(stream_slices=stream_slices, stream_state={}) + + assert checkpoint_reader.next() == stream_slices[0] + checkpoint_reader.observe({"updated_at": "2024-01-15"}) + assert checkpoint_reader.get_checkpoint() == {"updated_at": "2024-01-15"} + assert checkpoint_reader.next() == stream_slices[1] + checkpoint_reader.observe({"updated_at": "2024-02-15"}) + assert checkpoint_reader.get_checkpoint() == {"updated_at": "2024-02-15"} + assert checkpoint_reader.next() == stream_slices[2] + checkpoint_reader.observe({"updated_at": "2024-03-15"}) + assert checkpoint_reader.get_checkpoint() == {"updated_at": "2024-03-15"} + + # Validate that after iterating over every slice, the final get_checkpoint() call is None so that + # no duplicate final state message is emitted + assert checkpoint_reader.next() is None + assert checkpoint_reader.get_checkpoint() is None + + +def test_incremental_checkpoint_reader_incoming_state(): + incoming_state = {"updated_at": "2024-04-01"} + checkpoint_reader = IncrementalCheckpointReader(stream_slices=[], stream_state=incoming_state) + + assert checkpoint_reader.get_checkpoint() == incoming_state + + expected_state = {"cursor": "new_state_value"} + checkpoint_reader.observe(expected_state) + + assert checkpoint_reader.get_checkpoint() == expected_state + + +def test_resumable_full_refresh_checkpoint_reader_next(): + checkpoint_reader = ResumableFullRefreshCheckpointReader(stream_state={"synthetic_page_number": 55}) + + checkpoint_reader.observe({"synthetic_page_number": 56}) + assert checkpoint_reader.next() == {"synthetic_page_number": 56} + + checkpoint_reader.observe({"synthetic_page_number": 57}) + assert checkpoint_reader.next() == {"synthetic_page_number": 57} + + checkpoint_reader.observe({"__ab_full_refresh_sync_complete": True}) + assert checkpoint_reader.next() is None + + +def test_resumable_full_refresh_checkpoint_reader_no_incoming_state(): + checkpoint_reader = ResumableFullRefreshCheckpointReader(stream_state={}) + + checkpoint_reader.observe({"synthetic_page_number": 1}) + assert 
checkpoint_reader.next() == {"synthetic_page_number": 1} + + checkpoint_reader.observe({"synthetic_page_number": 2}) + assert checkpoint_reader.next() == {"synthetic_page_number": 2} + + checkpoint_reader.observe({"__ab_full_refresh_sync_complete": True}) + assert checkpoint_reader.next() is None + + +def test_full_refresh_checkpoint_reader_next(): + checkpoint_reader = FullRefreshCheckpointReader([{}]) + + assert checkpoint_reader.next() == {} + assert checkpoint_reader.get_checkpoint() is None + assert checkpoint_reader.next() is None + assert checkpoint_reader.get_checkpoint() == {"__ab_no_cursor_state_message": True} + + +def test_full_refresh_checkpoint_reader_substream(): + checkpoint_reader = FullRefreshCheckpointReader([{"partition": 1}, {"partition": 2}]) + + assert checkpoint_reader.next() == {"partition": 1} + assert checkpoint_reader.get_checkpoint() is None + assert checkpoint_reader.next() == {"partition": 2} + assert checkpoint_reader.get_checkpoint() is None + assert checkpoint_reader.next() is None + assert checkpoint_reader.get_checkpoint() == {"__ab_no_cursor_state_message": True} + + +def test_cursor_based_checkpoint_reader_incremental(): + expected_slices = [ + StreamSlice(cursor_slice={"start_date": "2024-01-01", "end_date": "2024-02-01"}, partition={}), + StreamSlice(cursor_slice={"start_date": "2024-02-01", "end_date": "2024-03-01"}, partition={}), + StreamSlice(cursor_slice={"start_date": "2024-03-01", "end_date": "2024-04-01"}, partition={}), + ] + + expected_stream_state = {"end_date": "2024-02-01"} + + incremental_cursor = Mock() + incremental_cursor.stream_slices.return_value = expected_slices + incremental_cursor.select_state.return_value = expected_stream_state + incremental_cursor.get_stream_state.return_value = expected_stream_state + + checkpoint_reader = CursorBasedCheckpointReader( + cursor=incremental_cursor, stream_slices=incremental_cursor.stream_slices(), read_state_from_cursor=False + ) + + assert checkpoint_reader.next() == expected_slices[0] + actual_state = checkpoint_reader.get_checkpoint() + assert actual_state == expected_stream_state + assert checkpoint_reader.next() == expected_slices[1] + assert checkpoint_reader.next() == expected_slices[2] + finished = checkpoint_reader.next() + assert finished is None + + # A finished checkpoint_reader should return None for the final checkpoint to avoid emitting duplicate state + assert checkpoint_reader.get_checkpoint() is None + + +def test_cursor_based_checkpoint_reader_resumable_full_refresh(): + expected_slices = [ + StreamSlice(cursor_slice={}, partition={}), + StreamSlice(cursor_slice={"next_page_token": 2}, partition={}), + StreamSlice(cursor_slice={"next_page_token": 3}, partition={}), + StreamSlice(cursor_slice={"next_page_token": 4}, partition={}), + StreamSlice(cursor_slice={"__ab_full_refresh_sync_complete": True}, partition={}), + ] + + expected_stream_state = {"next_page_token": 2} + + rfr_cursor = Mock() + rfr_cursor.stream_slices.return_value = [StreamSlice(cursor_slice={}, partition={})] + rfr_cursor.select_state.side_effect = expected_slices + rfr_cursor.get_stream_state.return_value = expected_stream_state + + checkpoint_reader = CursorBasedCheckpointReader( + cursor=rfr_cursor, stream_slices=rfr_cursor.stream_slices(), read_state_from_cursor=True + ) + + assert checkpoint_reader.next() == expected_slices[0] + actual_state = checkpoint_reader.get_checkpoint() + assert actual_state == expected_stream_state + assert checkpoint_reader.next() == expected_slices[1] + assert 
checkpoint_reader.next() == expected_slices[2] + assert checkpoint_reader.next() == expected_slices[3] + finished = checkpoint_reader.next() + assert finished is None + + # A finished checkpoint_reader should return None for the final checkpoint to avoid emitting duplicate state + assert checkpoint_reader.get_checkpoint() is None + + +def test_cursor_based_checkpoint_reader_resumable_full_refresh_parents(): + expected_slices = [ + StreamSlice(cursor_slice={"next_page_token": 2}, partition={"parent_id": "zaheer"}), + StreamSlice(cursor_slice={"next_page_token": 3}, partition={"parent_id": "zaheer"}), + StreamSlice(cursor_slice={"next_page_token": 2}, partition={"parent_id": "pli"}), + StreamSlice(cursor_slice={"next_page_token": 3}, partition={"parent_id": "pli"}), + ] + + expected_stream_state = {"next_page_token": 2} + + rfr_cursor = Mock() + rfr_cursor.stream_slices.return_value = [ + StreamSlice(cursor_slice={}, partition={"parent_id": "zaheer"}), + StreamSlice(cursor_slice={}, partition={"parent_id": "pli"}), + ] + rfr_cursor.select_state.side_effect = [ + {"next_page_token": 2}, + {"next_page_token": 3}, + {"__ab_full_refresh_sync_complete": True}, + {"next_page_token": 2}, + {"next_page_token": 3}, + {"__ab_full_refresh_sync_complete": True}, + ] + rfr_cursor.get_stream_state.return_value = expected_stream_state + + checkpoint_reader = CursorBasedCheckpointReader( + cursor=rfr_cursor, stream_slices=rfr_cursor.stream_slices(), read_state_from_cursor=True + ) + + assert checkpoint_reader.next() == expected_slices[0] + actual_state = checkpoint_reader.get_checkpoint() + assert actual_state == expected_stream_state + assert checkpoint_reader.next() == expected_slices[1] + assert checkpoint_reader.next() == expected_slices[2] + assert checkpoint_reader.next() == expected_slices[3] + finished = checkpoint_reader.next() + assert finished is None + + # A finished checkpoint_reader should return None for the final checkpoint to avoid emitting duplicate state + assert checkpoint_reader.get_checkpoint() is None + + +def test_cursor_based_checkpoint_reader_skip_completed_parent_slices(): + expected_slices = [ + StreamSlice(cursor_slice={"next_page_token": 2}, partition={"parent_id": "bolin"}), + StreamSlice(cursor_slice={"next_page_token": 3}, partition={"parent_id": "bolin"}), + StreamSlice(cursor_slice={"next_page_token": 7}, partition={"parent_id": "pabu"}), + StreamSlice(cursor_slice={"next_page_token": 8}, partition={"parent_id": "pabu"}), + ] + + expected_stream_state = {"next_page_token": 2} + + rfr_cursor = Mock() + rfr_cursor.stream_slices.return_value = [ + StreamSlice(cursor_slice={}, partition={"parent_id": "korra"}), + StreamSlice(cursor_slice={}, partition={"parent_id": "mako"}), + StreamSlice(cursor_slice={}, partition={"parent_id": "bolin"}), + StreamSlice(cursor_slice={}, partition={"parent_id": "asami"}), + StreamSlice(cursor_slice={}, partition={"parent_id": "naga"}), + StreamSlice(cursor_slice={}, partition={"parent_id": "pabu"}), + ] + rfr_cursor.select_state.side_effect = [ + {"__ab_full_refresh_sync_complete": True}, + {"__ab_full_refresh_sync_complete": True}, + {"next_page_token": 2}, + {"next_page_token": 3}, + {"__ab_full_refresh_sync_complete": True}, + {"__ab_full_refresh_sync_complete": True}, + {"__ab_full_refresh_sync_complete": True}, + {"next_page_token": 7}, + {"next_page_token": 8}, + ] + rfr_cursor.get_stream_state.return_value = expected_stream_state + + checkpoint_reader = CursorBasedCheckpointReader( + cursor=rfr_cursor, 
stream_slices=rfr_cursor.stream_slices(), read_state_from_cursor=True + ) + + assert checkpoint_reader.next() == expected_slices[0] + actual_state = checkpoint_reader.get_checkpoint() + assert actual_state == expected_stream_state + assert checkpoint_reader.next() == expected_slices[1] + assert checkpoint_reader.next() == expected_slices[2] + assert checkpoint_reader.next() == expected_slices[3] + finished = checkpoint_reader.next() + assert finished is None + + # A finished checkpoint_reader should return None for the final checkpoint to avoid emitting duplicate state + assert checkpoint_reader.get_checkpoint() is None + + +def test_cursor_based_checkpoint_reader_sync_first_parent_slice(): + expected_slices = [ + StreamSlice(cursor_slice={"next_page_token": 3}, partition={"parent_id": "bolin"}), + StreamSlice(cursor_slice={"next_page_token": 4}, partition={"parent_id": "bolin"}), + StreamSlice(cursor_slice={"next_page_token": 4}, partition={"parent_id": "bolin"}), + ] + + expected_stream_state = {"next_page_token": 3} + + rfr_cursor = Mock() + rfr_cursor.stream_slices.return_value = [ + StreamSlice(cursor_slice={}, partition={"parent_id": "bolin"}), + StreamSlice(cursor_slice={}, partition={"parent_id": "asami"}), + StreamSlice(cursor_slice={}, partition={"parent_id": "naga"}), + ] + rfr_cursor.select_state.side_effect = [ + {"next_page_token": 3}, # Accounts for the first invocation when checking if partition was already successful + {"next_page_token": 4}, + {"next_page_token": 4}, + {"__ab_full_refresh_sync_complete": True}, + {"__ab_full_refresh_sync_complete": True}, + {"__ab_full_refresh_sync_complete": True}, + ] + rfr_cursor.get_stream_state.return_value = expected_stream_state + + checkpoint_reader = CursorBasedCheckpointReader( + cursor=rfr_cursor, stream_slices=rfr_cursor.stream_slices(), read_state_from_cursor=True + ) + + assert checkpoint_reader.next() == expected_slices[0] + actual_state = checkpoint_reader.get_checkpoint() + assert actual_state == expected_stream_state + assert checkpoint_reader.next() == expected_slices[1] + assert checkpoint_reader.next() == expected_slices[2] + finished = checkpoint_reader.next() + assert finished is None + + # A finished checkpoint_reader should return None for the final checkpoint to avoid emitting duplicate state + assert checkpoint_reader.get_checkpoint() is None + + +def test_cursor_based_checkpoint_reader_resumable_full_refresh_invalid_slice(): + rfr_cursor = Mock() + rfr_cursor.stream_slices.return_value = [{"invalid": "stream_slice"}] + rfr_cursor.select_state.side_effect = [StreamSlice(cursor_slice={"invalid": "stream_slice"}, partition={})] + + checkpoint_reader = CursorBasedCheckpointReader( + cursor=rfr_cursor, stream_slices=rfr_cursor.stream_slices(), read_state_from_cursor=True + ) + + with pytest.raises(ValueError): + checkpoint_reader.next() + + +def test_legacy_cursor_based_checkpoint_reader_resumable_full_refresh(): + expected_mapping_slices = [ + {"parent_id": 400, "partition": {"parent_id": 400}, "cursor_slice": {}}, + {"parent_id": 400, "next_page_token": 2, "partition": {"parent_id": 400}, "cursor_slice": {"next_page_token": 2}}, + {"parent_id": 400, "next_page_token": 2, "partition": {"parent_id": 400}, "cursor_slice": {"next_page_token": 2}}, + {"parent_id": 400, "next_page_token": 3, "partition": {"parent_id": 400}, "cursor_slice": {"next_page_token": 3}}, + {"parent_id": 400, "next_page_token": 4, "partition": {"parent_id": 400}, "cursor_slice": {"next_page_token": 4}}, + { + "parent_id": 400, + 
"__ab_full_refresh_sync_complete": True, + "partition": {"parent_id": 400}, + "cursor_slice": {"__ab_full_refresh_sync_complete": True}, + }, + ] + + mocked_state = [ + {}, + {"next_page_token": 2}, + {"next_page_token": 3}, + {"next_page_token": 4}, + {"__ab_full_refresh_sync_complete": True}, + ] + + expected_stream_state = {"next_page_token": 2} + + rfr_cursor = Mock() + rfr_cursor.stream_slices.return_value = [{"parent_id": 400}] + rfr_cursor.select_state.side_effect = mocked_state + rfr_cursor.get_stream_state.return_value = expected_stream_state + + checkpoint_reader = LegacyCursorBasedCheckpointReader( + cursor=rfr_cursor, stream_slices=rfr_cursor.stream_slices(), read_state_from_cursor=True + ) + + assert checkpoint_reader.next() == expected_mapping_slices[0] + actual_state = checkpoint_reader.get_checkpoint() + assert actual_state == expected_stream_state + assert checkpoint_reader.next() == expected_mapping_slices[2] + assert checkpoint_reader.next() == expected_mapping_slices[3] + assert checkpoint_reader.next() == expected_mapping_slices[4] + finished = checkpoint_reader.next() + assert finished is None + + # A finished checkpoint_reader should return None for the final checkpoint to avoid emitting duplicate state + assert checkpoint_reader.get_checkpoint() is None diff --git a/airbyte-cdk/python/unit_tests/sources/streams/checkpoint/test_substream_resumable_full_refresh_cursor.py b/airbyte-cdk/python/unit_tests/sources/streams/checkpoint/test_substream_resumable_full_refresh_cursor.py new file mode 100644 index 000000000000..4944518535f9 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/checkpoint/test_substream_resumable_full_refresh_cursor.py @@ -0,0 +1,99 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +import pytest +from airbyte_cdk.sources.streams.checkpoint.substream_resumable_full_refresh_cursor import SubstreamResumableFullRefreshCursor +from airbyte_cdk.sources.types import StreamSlice +from airbyte_cdk.utils import AirbyteTracedException + + +def test_substream_resumable_full_refresh_cursor(): + """ + Test scenario where a set of parent record partitions are iterated over by the cursor resulting in a completed sync + """ + expected_starting_state = {"states": []} + + expected_ending_state = { + "states": [ + {"partition": {"musician_id": "kousei_arima"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"musician_id": "kaori_miyazono"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + ] + } + + partitions = [ + StreamSlice(partition={"musician_id": "kousei_arima"}, cursor_slice={}), + StreamSlice(partition={"musician_id": "kaori_miyazono"}, cursor_slice={}), + ] + + cursor = SubstreamResumableFullRefreshCursor() + + starting_state = cursor.get_stream_state() + assert starting_state == expected_starting_state + + for partition in partitions: + partition_state = cursor.select_state(partition) + assert partition_state is None + cursor.close_slice(partition) + + ending_state = cursor.get_stream_state() + assert ending_state == expected_ending_state + + +def test_substream_resumable_full_refresh_cursor_with_state(): + """ + Test scenario where a set of parent record partitions are iterated over and previously completed parents are skipped + """ + initial_state = { + "states": [ + {"partition": {"musician_id": "kousei_arima"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"musician_id": "kaori_miyazono"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"musician_id": 
"takeshi_aiza"}, "cursor": {}}, + ] + } + + expected_ending_state = { + "states": [ + {"partition": {"musician_id": "kousei_arima"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"musician_id": "kaori_miyazono"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"musician_id": "takeshi_aiza"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + {"partition": {"musician_id": "emi_igawa"}, "cursor": {"__ab_full_refresh_sync_complete": True}}, + ] + } + + partitions = [ + StreamSlice(partition={"musician_id": "kousei_arima"}, cursor_slice={}), + StreamSlice(partition={"musician_id": "kaori_miyazono"}, cursor_slice={}), + StreamSlice(partition={"musician_id": "takeshi_aiza"}, cursor_slice={}), + StreamSlice(partition={"musician_id": "emi_igawa"}, cursor_slice={}), + ] + + cursor = SubstreamResumableFullRefreshCursor() + cursor.set_initial_state(initial_state) + + starting_state = cursor.get_stream_state() + assert starting_state == initial_state + + for i, partition in enumerate(partitions): + partition_state = cursor.select_state(partition) + if i < len(initial_state.get("states")): + assert partition_state == initial_state.get("states")[i].get("cursor") + else: + assert partition_state is None + cursor.close_slice(partition) + + ending_state = cursor.get_stream_state() + assert ending_state == expected_ending_state + + +def test_set_initial_state_invalid_incoming_state(): + bad_state = {"next_page_token": 2} + cursor = SubstreamResumableFullRefreshCursor() + + with pytest.raises(AirbyteTracedException): + cursor.set_initial_state(bad_state) + + +def test_select_state_without_slice(): + cursor = SubstreamResumableFullRefreshCursor() + + with pytest.raises(ValueError): + cursor.select_state() diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/__init__.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/__init__.py new file mode 100644 index 000000000000..c941b3045795 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/__init__.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/__init__.py new file mode 100644 index 000000000000..c941b3045795 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py new file mode 100644 index 000000000000..f3a4df1433cf --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py @@ -0,0 +1,257 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# +from airbyte_cdk.sources.streams.concurrent.cursor import CursorField +from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import ConcurrencyCompatibleStateType +from airbyte_cdk.test.state_builder import StateBuilder +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from unit_tests.sources.file_based.scenarios.scenario_builder import IncrementalScenarioConfig, TestScenarioBuilder +from unit_tests.sources.streams.concurrent.scenarios.stream_facade_builder import StreamFacadeSourceBuilder +from unit_tests.sources.streams.concurrent.scenarios.utils import MockStream + +_NO_SLICE_BOUNDARIES = None +_NO_INPUT_STATE = [] +test_incremental_stream_without_slice_boundaries_no_input_state = ( + TestScenarioBuilder() + .set_name("test_incremental_stream_without_slice_boundaries_no_input_state") + .set_config({}) + .set_source_builder( + StreamFacadeSourceBuilder() + .set_streams( + [ + MockStream( + [ + ({"from": 0, "to": 1}, [{"id": "1", "cursor_field": 0}, {"id": "2", "cursor_field": 1}]), + ({"from": 1, "to": 2}, [{"id": "3", "cursor_field": 2}, {"id": "4", "cursor_field": 3}]), + ], + "stream1", + cursor_field="cursor_field", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + ) + ] + ) + .set_incremental(CursorField("cursor_field"), _NO_SLICE_BOUNDARIES) + .set_input_state(_NO_INPUT_STATE) + ) + .set_expected_read_error(AirbyteTracedException, "Concurrent read failure") + .set_log_levels({"ERROR", "WARN", "WARNING", "INFO", "DEBUG"}) + .set_incremental_scenario_config(IncrementalScenarioConfig(input_state=_NO_INPUT_STATE)) + .build() +) + + +test_incremental_stream_with_slice_boundaries_no_input_state = ( + TestScenarioBuilder() + .set_name("test_incremental_stream_with_slice_boundaries_no_input_state") + .set_config({}) + .set_source_builder( + StreamFacadeSourceBuilder() + .set_streams( + [ + MockStream( + [ + ({"from": 0, "to": 1}, [{"id": "1", "cursor_field": 0}, {"id": "2", "cursor_field": 1}]), + ({"from": 1, "to": 2}, [{"id": "3", "cursor_field": 2}, {"id": "4", "cursor_field": 3}]), + ], + "stream1", + cursor_field="cursor_field", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + ) + ] + ) + .set_incremental(CursorField("cursor_field"), ("from", "to")) + .set_input_state(_NO_INPUT_STATE) + ) + .set_expected_records( + [ + {"data": {"id": "1", "cursor_field": 0}, "stream": "stream1"}, + {"data": {"id": "2", "cursor_field": 1}, "stream": "stream1"}, + {"cursor_field": 1}, + {"data": {"id": "3", "cursor_field": 2}, "stream": "stream1"}, + {"data": {"id": "4", "cursor_field": 3}, "stream": "stream1"}, + {"cursor_field": 3}, + {"cursor_field": 3}, # see Cursor.ensure_at_least_one_state_emitted + ] + ) + .set_log_levels({"ERROR", "WARN", "WARNING", "INFO", "DEBUG"}) + .set_incremental_scenario_config(IncrementalScenarioConfig(input_state=_NO_INPUT_STATE)) + .build() +) + + +LEGACY_STATE = StateBuilder().with_stream_state("stream1", {"cursor_field": 0}).build() +test_incremental_stream_without_slice_boundaries_with_legacy_state = ( + TestScenarioBuilder() + .set_name("test_incremental_stream_without_slice_boundaries_with_legacy_state") + .set_config({}) + .set_source_builder( + StreamFacadeSourceBuilder() + .set_streams( + [ + MockStream( + [ + ({"from": 0, "to": 1}, [{"id": "1", "cursor_field": 0}, {"id": "2", "cursor_field": 1}]), + ({"from": 1, "to": 2}, [{"id": "3", "cursor_field": 2}, {"id": "4", "cursor_field": 3}]), 
+ ], + "stream1", + cursor_field="cursor_field", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + ) + ] + ) + .set_incremental(CursorField("cursor_field"), _NO_SLICE_BOUNDARIES) + .set_input_state(LEGACY_STATE) + ) + .set_expected_read_error(AirbyteTracedException, "Concurrent read failure") + .set_log_levels({"ERROR", "WARN", "WARNING", "INFO", "DEBUG"}) + .set_incremental_scenario_config(IncrementalScenarioConfig(input_state=LEGACY_STATE)) + .build() +) + + +test_incremental_stream_with_slice_boundaries_with_legacy_state = ( + TestScenarioBuilder() + .set_name("test_incremental_stream_with_slice_boundaries_with_legacy_state") + .set_config({}) + .set_source_builder( + StreamFacadeSourceBuilder() + .set_streams( + [ + MockStream( + [ + ({"from": 0, "to": 1}, [{"id": "1", "cursor_field": 0}, {"id": "2", "cursor_field": 1}]), + ({"from": 1, "to": 2}, [{"id": "3", "cursor_field": 2}, {"id": "4", "cursor_field": 3}]), + ], + "stream1", + cursor_field="cursor_field", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + ) + ] + ) + .set_incremental(CursorField("cursor_field"), ("from", "to")) + .set_input_state(LEGACY_STATE) + ) + .set_expected_records( + [ + {"data": {"id": "1", "cursor_field": 0}, "stream": "stream1"}, + {"data": {"id": "2", "cursor_field": 1}, "stream": "stream1"}, + {"cursor_field": 1}, + {"data": {"id": "3", "cursor_field": 2}, "stream": "stream1"}, + {"data": {"id": "4", "cursor_field": 3}, "stream": "stream1"}, + {"cursor_field": 3}, + {"cursor_field": 3}, # see Cursor.ensure_at_least_one_state_emitted + ] + ) + .set_log_levels({"ERROR", "WARN", "WARNING", "INFO", "DEBUG"}) + .set_incremental_scenario_config(IncrementalScenarioConfig(input_state=LEGACY_STATE)) + .build() +) + + +CONCURRENT_STATE = ( + StateBuilder() + .with_stream_state( + "stream1", + { + "slices": [{"start": 0, "end": 0}], + "state_type": ConcurrencyCompatibleStateType.date_range.value, + }, + ) + .build() +) +test_incremental_stream_without_slice_boundaries_with_concurrent_state = ( + TestScenarioBuilder() + .set_name("test_incremental_stream_without_slice_boundaries_with_concurrent_state") + .set_config({}) + .set_source_builder( + StreamFacadeSourceBuilder() + .set_streams( + [ + MockStream( + [ + ({"from": 0, "to": 1}, [{"id": "1", "cursor_field": 0}, {"id": "2", "cursor_field": 1}]), + ({"from": 1, "to": 2}, [{"id": "3", "cursor_field": 2}, {"id": "4", "cursor_field": 3}]), + ], + "stream1", + cursor_field="cursor_field", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + ) + ] + ) + .set_incremental(CursorField("cursor_field"), _NO_SLICE_BOUNDARIES) + .set_input_state(CONCURRENT_STATE) + ) + .set_expected_read_error(AirbyteTracedException, "Concurrent read failure") + .set_log_levels({"ERROR", "WARN", "WARNING", "INFO", "DEBUG"}) + .set_incremental_scenario_config(IncrementalScenarioConfig(input_state=CONCURRENT_STATE)) + .build() +) + + +test_incremental_stream_with_slice_boundaries_with_concurrent_state = ( + TestScenarioBuilder() + .set_name("test_incremental_stream_with_slice_boundaries_with_concurrent_state") + .set_config({}) + .set_source_builder( + StreamFacadeSourceBuilder() + .set_streams( + [ + MockStream( + [ + ({"from": 0, "to": 1}, [{"id": "1", "cursor_field": 0}, {"id": "2", "cursor_field": 1}]), + ({"from": 1, "to": 2}, [{"id": "3", "cursor_field": 2}, {"id": "4", "cursor_field": 3}]), + ], + "stream1", + 
cursor_field="cursor_field", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + ) + ] + ) + .set_incremental(CursorField("cursor_field"), ("from", "to")) + .set_input_state(CONCURRENT_STATE) + ) + .set_expected_records( + [ + {"data": {"id": "1", "cursor_field": 0}, "stream": "stream1"}, + {"data": {"id": "2", "cursor_field": 1}, "stream": "stream1"}, + {"cursor_field": 1}, + {"data": {"id": "3", "cursor_field": 2}, "stream": "stream1"}, + {"data": {"id": "4", "cursor_field": 3}, "stream": "stream1"}, + {"cursor_field": 3}, + {"cursor_field": 3}, # see Cursor.ensure_at_least_one_state_emitted + ] + ) + .set_log_levels({"ERROR", "WARN", "WARNING", "INFO", "DEBUG"}) + .set_incremental_scenario_config(IncrementalScenarioConfig(input_state=CONCURRENT_STATE)) + .build() +) diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py new file mode 100644 index 000000000000..4d4fb5c474f2 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py @@ -0,0 +1,123 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import concurrent +import logging +from typing import Any, List, Mapping, Optional, Tuple, Union + +from airbyte_cdk.models import ( + AirbyteStateMessage, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + ConnectorSpecification, + DestinationSyncMode, + SyncMode, +) +from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource +from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter +from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository +from airbyte_cdk.sources.source import TState +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.concurrent.cursor import CursorField +from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import EpochValueConcurrentStreamStateConverter +from unit_tests.sources.file_based.scenarios.scenario_builder import SourceBuilder +from unit_tests.sources.streams.concurrent.scenarios.thread_based_concurrent_stream_source_builder import NeverLogSliceLogger + +_CURSOR_FIELD = "cursor_field" +_NO_STATE = None + + +class StreamFacadeConcurrentConnectorStateConverter(EpochValueConcurrentStreamStateConverter): + pass + + +class StreamFacadeSource(ConcurrentSourceAdapter): + def __init__( + self, + streams: List[Stream], + threadpool: concurrent.futures.ThreadPoolExecutor, + cursor_field: Optional[CursorField] = None, + cursor_boundaries: Optional[Tuple[str, str]] = None, + input_state: Optional[List[Mapping[str, Any]]] = _NO_STATE, + ): + self._message_repository = InMemoryMessageRepository() + threadpool_manager = ThreadPoolManager(threadpool, streams[0].logger) + concurrent_source = ConcurrentSource(threadpool_manager, streams[0].logger, NeverLogSliceLogger(), self._message_repository) + super().__init__(concurrent_source) + self._streams = streams + self._threadpool = threadpool_manager + self._cursor_field = cursor_field + self._cursor_boundaries = cursor_boundaries + self._state = [AirbyteStateMessage(s) for s in input_state] if input_state else None + + def 
check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]: + return True, None + + def streams(self, config: Mapping[str, Any]) -> List[Stream]: + state_manager = ConnectorStateManager( + state=self._state, + ) # The input values into the AirbyteStream are dummy values; the connector state manager only uses `name` and `namespace` + state_converter = StreamFacadeConcurrentConnectorStateConverter() + + return [ + self.convert_to_concurrent_stream(stream.logger, stream, state_manager, self.initialize_cursor( + stream, state_manager, state_converter, self._cursor_boundaries, None, EpochValueConcurrentStreamStateConverter.get_end_provider()) + ) + for stream in self._streams + ] + + @property + def message_repository(self) -> Union[None, MessageRepository]: + return self._message_repository + + def spec(self, logger: logging.Logger) -> ConnectorSpecification: + return ConnectorSpecification(connectionSpecification={}) + + def read_catalog(self, catalog_path: str) -> ConfiguredAirbyteCatalog: + return ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=s.as_airbyte_stream(), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.overwrite, + ) + for s in self._streams + ] + ) + + +class StreamFacadeSourceBuilder(SourceBuilder[StreamFacadeSource]): + def __init__(self): + self._source = None + self._streams = [] + self._max_workers = 1 + self._cursor_field = None + self._cursor_boundaries = None + self._input_state = None + self._raw_input_state = None + + def set_streams(self, streams: List[Stream]) -> "StreamFacadeSourceBuilder": + self._streams = streams + return self + + def set_max_workers(self, max_workers: int) -> "StreamFacadeSourceBuilder": + self._max_workers = max_workers + return self + + def set_incremental(self, cursor_field: CursorField, cursor_boundaries: Optional[Tuple[str, str]]) -> "StreamFacadeSourceBuilder": + self._cursor_field = cursor_field + self._cursor_boundaries = cursor_boundaries + return self + + def set_input_state(self, state: List[Mapping[str, Any]]) -> "StreamFacadeSourceBuilder": + self._input_state = state + return self + + def build( + self, configured_catalog: Optional[Mapping[str, Any]], config: Optional[Mapping[str, Any]], state: Optional[TState] + ) -> StreamFacadeSource: + threadpool = concurrent.futures.ThreadPoolExecutor(max_workers=self._max_workers, thread_name_prefix="workerpool") + return StreamFacadeSource(self._streams, threadpool, self._cursor_field, self._cursor_boundaries, state) diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py new file mode 100644 index 000000000000..41483282821c --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py @@ -0,0 +1,456 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
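+# Scenario definitions for streams adapted through the concurrent stream facade: single and multiple streams,
+# single and multiple slices, primary keys, raised exceptions, and incremental cursors with and without slice boundaries.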
+# +from airbyte_cdk.sources.streams.concurrent.cursor import CursorField +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from unit_tests.sources.file_based.scenarios.scenario_builder import IncrementalScenarioConfig, TestScenarioBuilder +from unit_tests.sources.streams.concurrent.scenarios.stream_facade_builder import StreamFacadeSourceBuilder +from unit_tests.sources.streams.concurrent.scenarios.utils import MockStream + +_stream1 = MockStream( + [ + (None, [{"id": "1"}, {"id": "2"}]), + ], + "stream1", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, +) + +_stream_raising_exception = MockStream( + [ + (None, [{"id": "1"}, ValueError("test exception")]), + ], + "stream1", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, +) + +_stream_with_primary_key = MockStream( + [ + (None, [{"id": "1"}, {"id": "2"}]), + ], + "stream1", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + primary_key="id", +) + +_stream2 = MockStream( + [ + (None, [{"id": "A"}, {"id": "B"}]), + ], + "stream2", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, +) + +_stream_with_single_slice = MockStream( + [ + ({"slice_key": "s1"}, [{"id": "1"}, {"id": "2"}]), + ], + "stream1", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, +) + +_stream_with_multiple_slices = MockStream( + [ + ({"slice_key": "s1"}, [{"id": "1"}, {"id": "2"}]), + ({"slice_key": "s2"}, [{"id": "3"}, {"id": "4"}]), + ], + "stream1", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, +) + +test_stream_facade_single_stream = ( + TestScenarioBuilder() + .set_name("test_stream_facade_single_stream") + .set_config({}) + .set_source_builder(StreamFacadeSourceBuilder().set_streams([_stream1])) + .set_expected_records( + [ + {"data": {"id": "1"}, "stream": "stream1"}, + {"data": {"id": "2"}, "stream": "stream1"}, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh"], + } + ] + } + ) + .set_expected_logs( + { + "read": [ + {"level": "INFO", "message": "Starting syncing"}, + {"level": "INFO", "message": "Marking stream stream1 as STARTED"}, + {"level": "INFO", "message": "Syncing stream: stream1"}, + {"level": "INFO", "message": "Marking stream stream1 as RUNNING"}, + {"level": "INFO", "message": "Read 2 records from stream1 stream"}, + {"level": "INFO", "message": "Marking stream stream1 as STOPPED"}, + {"level": "INFO", "message": "Finished syncing stream1"}, + {"level": "INFO", "message": "Finished syncing"}, + ] + } + ) + .set_log_levels({"ERROR", "WARN", "WARNING", "INFO", "DEBUG"}) + .build() +) + +test_stream_facade_raises_exception = ( + TestScenarioBuilder() + .set_name("test_stream_facade_raises_exception") + .set_config({}) + .set_source_builder(StreamFacadeSourceBuilder().set_streams([_stream_raising_exception])) + .set_expected_records( + [ + {"data": {"id": "1"}, "stream": "stream1"}, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh"], + } + ] + } + ) + 
.set_expected_read_error(AirbyteTracedException, "Concurrent read failure") + .build() +) + +test_stream_facade_single_stream_with_primary_key = ( + TestScenarioBuilder() + .set_name("test_stream_facade_stream_with_primary_key") + .set_config({}) + .set_source_builder(StreamFacadeSourceBuilder().set_streams([_stream1])) + .set_expected_records( + [ + {"data": {"id": "1"}, "stream": "stream1"}, + {"data": {"id": "2"}, "stream": "stream1"}, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh"], + } + ] + } + ) + .build() +) + +test_stream_facade_multiple_streams = ( + TestScenarioBuilder() + .set_name("test_stream_facade_multiple_streams") + .set_config({}) + .set_source_builder(StreamFacadeSourceBuilder().set_streams([_stream1, _stream2])) + .set_expected_records( + [ + {"data": {"id": "1"}, "stream": "stream1"}, + {"data": {"id": "2"}, "stream": "stream1"}, + {"data": {"id": "A"}, "stream": "stream2"}, + {"data": {"id": "B"}, "stream": "stream2"}, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh"], + }, + { + "json_schema": { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + "name": "stream2", + "supported_sync_modes": ["full_refresh"], + }, + ] + } + ) + .build() +) + +test_stream_facade_single_stream_with_single_slice = ( + TestScenarioBuilder() + .set_name("test_stream_facade_single_stream_with_single_slice") + .set_config({}) + .set_source_builder(StreamFacadeSourceBuilder().set_streams([_stream1])) + .set_expected_records( + [ + {"data": {"id": "1"}, "stream": "stream1"}, + {"data": {"id": "2"}, "stream": "stream1"}, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh"], + } + ] + } + ) + .build() +) + +test_stream_facade_single_stream_with_multiple_slices = ( + TestScenarioBuilder() + .set_name("test_stream_facade_single_stream_with_multiple_slice") + .set_config({}) + .set_source_builder(StreamFacadeSourceBuilder().set_streams([_stream_with_multiple_slices])) + .set_expected_records( + [ + {"data": {"id": "1"}, "stream": "stream1"}, + {"data": {"id": "2"}, "stream": "stream1"}, + {"data": {"id": "3"}, "stream": "stream1"}, + {"data": {"id": "4"}, "stream": "stream1"}, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh"], + } + ] + } + ) + .build() +) + +test_stream_facade_single_stream_with_multiple_slices_with_concurrency_level_two = ( + TestScenarioBuilder() + .set_name("test_stream_facade_single_stream_with_multiple_slice_with_concurrency_level_two") + .set_config({}) + .set_source_builder(StreamFacadeSourceBuilder().set_streams([_stream_with_multiple_slices])) + .set_expected_records( + [ + {"data": {"id": "1"}, "stream": "stream1"}, + {"data": {"id": "2"}, "stream": "stream1"}, + {"data": {"id": "3"}, "stream": "stream1"}, + {"data": {"id": "4"}, "stream": "stream1"}, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + 
"properties": { + "id": {"type": ["null", "string"]}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh"], + } + ] + } + ) + .build() +) + + +test_incremental_stream_with_slice_boundaries = ( + TestScenarioBuilder() + .set_name("test_incremental_stream_with_slice_boundaries") + .set_config({}) + .set_source_builder( + StreamFacadeSourceBuilder() + .set_streams( + [ + MockStream( + [ + ({"from": 0, "to": 1}, [{"id": "1", "cursor_field": 0}, {"id": "2", "cursor_field": 1}]), + ({"from": 1, "to": 2}, [{"id": "3", "cursor_field": 2}, {"id": "4", "cursor_field": 3}]), + ], + "stream1", + cursor_field="cursor_field", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + ) + ] + ) + .set_incremental(CursorField("cursor_field"), ("from", "to")) + ) + .set_expected_records( + [ + {"data": {"id": "1", "cursor_field": 0}, "stream": "stream1"}, + {"data": {"id": "2", "cursor_field": 1}, "stream": "stream1"}, + {"cursor_field": 1}, + {"data": {"id": "3", "cursor_field": 2}, "stream": "stream1"}, + {"data": {"id": "4", "cursor_field": 3}, "stream": "stream1"}, + {"cursor_field": 3}, + {"cursor_field": 3}, # see Cursor.ensure_at_least_one_state_emitted + ] + ) + .set_log_levels({"ERROR", "WARN", "WARNING", "INFO", "DEBUG"}) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=[], + ) + ) + .build() +) + + +_NO_SLICE_BOUNDARIES = None +test_incremental_stream_without_slice_boundaries = ( + TestScenarioBuilder() + .set_name("test_incremental_stream_without_slice_boundaries") + .set_config({}) + .set_source_builder( + StreamFacadeSourceBuilder() + .set_streams( + [ + MockStream( + [ + (None, [{"id": "1", "cursor_field": 0}, {"id": "2", "cursor_field": 3}]), + ], + "stream1", + cursor_field="cursor_field", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + ) + ] + ) + .set_incremental(CursorField("cursor_field"), _NO_SLICE_BOUNDARIES) + ) + .set_expected_records( + [ + {"data": {"id": "1", "cursor_field": 0}, "stream": "stream1"}, + {"data": {"id": "2", "cursor_field": 3}, "stream": "stream1"}, + {"cursor_field": 3}, + {"cursor_field": 3}, # see Cursor.ensure_at_least_one_state_emitted + ] + ) + .set_log_levels({"ERROR", "WARN", "WARNING", "INFO", "DEBUG"}) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=[], + ) + ) + .build() +) + +test_incremental_stream_with_many_slices_but_without_slice_boundaries = ( + TestScenarioBuilder() + .set_name("test_incremental_stream_with_many_slices_but_without_slice_boundaries") + .set_config({}) + .set_source_builder( + StreamFacadeSourceBuilder() + .set_streams( + [ + MockStream( + [ + ({"parent_id": 1}, [{"id": "1", "cursor_field": 0}]), + ({"parent_id": 309}, [{"id": "3", "cursor_field": 0}]), + ], + "stream1", + cursor_field="cursor_field", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + ) + ] + ) + .set_incremental(CursorField("cursor_field"), _NO_SLICE_BOUNDARIES) + ) + .set_expected_read_error(AirbyteTracedException, "Concurrent read failure") + .set_log_levels({"ERROR", "WARN", "WARNING", "INFO", "DEBUG"}) + .set_incremental_scenario_config( + IncrementalScenarioConfig( + input_state=[], + ) + ) + .build() +) diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py new file mode 100644 
index 000000000000..af2249873035 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py @@ -0,0 +1,76 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from pathlib import PosixPath + +import pytest +from _pytest.capture import CaptureFixture +from freezegun import freeze_time +from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenario +from unit_tests.sources.file_based.test_scenarios import verify_discover, verify_read +from unit_tests.sources.streams.concurrent.scenarios.incremental_scenarios import ( + test_incremental_stream_with_slice_boundaries_no_input_state, + test_incremental_stream_with_slice_boundaries_with_concurrent_state, + test_incremental_stream_with_slice_boundaries_with_legacy_state, + test_incremental_stream_without_slice_boundaries_no_input_state, + test_incremental_stream_without_slice_boundaries_with_concurrent_state, + test_incremental_stream_without_slice_boundaries_with_legacy_state, +) +from unit_tests.sources.streams.concurrent.scenarios.stream_facade_scenarios import ( + test_incremental_stream_with_many_slices_but_without_slice_boundaries, + test_incremental_stream_with_slice_boundaries, + test_incremental_stream_without_slice_boundaries, + test_stream_facade_multiple_streams, + test_stream_facade_raises_exception, + test_stream_facade_single_stream, + test_stream_facade_single_stream_with_multiple_slices, + test_stream_facade_single_stream_with_multiple_slices_with_concurrency_level_two, + test_stream_facade_single_stream_with_primary_key, + test_stream_facade_single_stream_with_single_slice, +) +from unit_tests.sources.streams.concurrent.scenarios.thread_based_concurrent_stream_scenarios import ( + test_concurrent_cdk_multiple_streams, + test_concurrent_cdk_partition_raises_exception, + test_concurrent_cdk_single_stream, + test_concurrent_cdk_single_stream_multiple_partitions, + test_concurrent_cdk_single_stream_multiple_partitions_concurrency_level_two, + test_concurrent_cdk_single_stream_with_primary_key, +) + +scenarios = [ + test_concurrent_cdk_single_stream, + test_concurrent_cdk_multiple_streams, + test_concurrent_cdk_single_stream_multiple_partitions, + test_concurrent_cdk_single_stream_multiple_partitions_concurrency_level_two, + test_concurrent_cdk_single_stream_with_primary_key, + test_concurrent_cdk_partition_raises_exception, + # test streams built using the facade + test_stream_facade_single_stream, + test_stream_facade_multiple_streams, + test_stream_facade_single_stream_with_primary_key, + test_stream_facade_single_stream_with_single_slice, + test_stream_facade_single_stream_with_multiple_slices, + test_stream_facade_single_stream_with_multiple_slices_with_concurrency_level_two, + test_stream_facade_raises_exception, + test_incremental_stream_with_slice_boundaries, + test_incremental_stream_without_slice_boundaries, + test_incremental_stream_with_many_slices_but_without_slice_boundaries, + test_incremental_stream_with_slice_boundaries_no_input_state, + test_incremental_stream_with_slice_boundaries_with_concurrent_state, + test_incremental_stream_with_slice_boundaries_with_legacy_state, + test_incremental_stream_without_slice_boundaries_no_input_state, + test_incremental_stream_without_slice_boundaries_with_concurrent_state, + test_incremental_stream_without_slice_boundaries_with_legacy_state, +] + + +@pytest.mark.parametrize("scenario", scenarios, ids=[s.name for s in scenarios]) +@freeze_time("2023-06-09T00:00:00Z") +def 
test_concurrent_read(scenario: TestScenario) -> None: + verify_read(scenario) + + +@pytest.mark.parametrize("scenario", scenarios, ids=[s.name for s in scenarios]) +def test_concurrent_discover(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: TestScenario) -> None: + verify_discover(capsys, tmp_path, scenario) diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py new file mode 100644 index 000000000000..b8ed5e72ddd9 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py @@ -0,0 +1,421 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging + +from airbyte_cdk.sources.message import InMemoryMessageRepository +from airbyte_cdk.sources.streams.concurrent.availability_strategy import AlwaysAvailableAvailabilityStrategy +from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder +from unit_tests.sources.streams.concurrent.scenarios.thread_based_concurrent_stream_source_builder import ( + ConcurrentSourceBuilder, + InMemoryPartition, + InMemoryPartitionGenerator, +) + +_message_repository = InMemoryMessageRepository() + +_id_only_stream = DefaultStream( + partition_generator=InMemoryPartitionGenerator( + [InMemoryPartition("partition1", "stream1", None, [Record({"id": "1"}, InMemoryPartition("partition1", "stream1", None, [])), Record({"id": "2"}, InMemoryPartition("partition1", "stream1", None, []))])] + ), + name="stream1", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + availability_strategy=AlwaysAvailableAvailabilityStrategy(), + primary_key=[], + cursor_field=None, + logger=logging.getLogger("test_logger"), + cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository), +) + +_id_only_stream_with_slice_logger = DefaultStream( + partition_generator=InMemoryPartitionGenerator( + [InMemoryPartition("partition1", "stream1", None, [Record({"id": "1"}, "stream1"), Record({"id": "2"}, "stream1")])] + ), + name="stream1", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + availability_strategy=AlwaysAvailableAvailabilityStrategy(), + primary_key=[], + cursor_field=None, + logger=logging.getLogger("test_logger"), + cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository), +) + +_id_only_stream_with_primary_key = DefaultStream( + partition_generator=InMemoryPartitionGenerator( + [InMemoryPartition("partition1", "stream1", None, [Record({"id": "1"}, InMemoryPartition("partition1", "stream1", None, [])), Record({"id": "2"}, InMemoryPartition("partition1", "stream1", None, []))])] + ), + name="stream1", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + availability_strategy=AlwaysAvailableAvailabilityStrategy(), + primary_key=["id"], + cursor_field=None, + logger=logging.getLogger("test_logger"), + 
cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository), +) + +_id_only_stream_multiple_partitions = DefaultStream( + partition_generator=InMemoryPartitionGenerator( + [ + InMemoryPartition("partition1", "stream1", {"p": "1"}, [Record({"id": "1"}, InMemoryPartition("partition1", "stream1", None, [])), Record({"id": "2"}, InMemoryPartition("partition1", "stream1", None, []))]), + InMemoryPartition("partition2", "stream1", {"p": "2"}, [Record({"id": "3"}, InMemoryPartition("partition1", "stream1", None, [])), Record({"id": "4"}, InMemoryPartition("partition1", "stream1", None, []))]), + ] + ), + name="stream1", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + availability_strategy=AlwaysAvailableAvailabilityStrategy(), + primary_key=[], + cursor_field=None, + logger=logging.getLogger("test_logger"), + cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository), +) + +_id_only_stream_multiple_partitions_concurrency_level_two = DefaultStream( + partition_generator=InMemoryPartitionGenerator( + [ + InMemoryPartition("partition1", "stream1", {"p": "1"}, [Record({"id": "1"}, InMemoryPartition("partition1", "stream1", None, [])), Record({"id": "2"}, InMemoryPartition("partition1", "stream1", None, []))]), + InMemoryPartition("partition2", "stream1", {"p": "2"}, [Record({"id": "3"}, InMemoryPartition("partition1", "stream1", None, [])), Record({"id": "4"}, InMemoryPartition("partition1", "stream1", None, []))]), + ] + ), + name="stream1", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + availability_strategy=AlwaysAvailableAvailabilityStrategy(), + primary_key=[], + cursor_field=None, + logger=logging.getLogger("test_logger"), + cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository), +) + +_stream_raising_exception = DefaultStream( + partition_generator=InMemoryPartitionGenerator( + [InMemoryPartition("partition1", "stream1", None, [Record({"id": "1"}, InMemoryPartition("partition1", "stream1", None, [])), ValueError("test exception")])] + ), + name="stream1", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + availability_strategy=AlwaysAvailableAvailabilityStrategy(), + primary_key=[], + cursor_field=None, + logger=logging.getLogger("test_logger"), + cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository), +) + +test_concurrent_cdk_single_stream = ( + TestScenarioBuilder() + .set_name("test_concurrent_cdk_single_stream") + .set_config({}) + .set_source_builder( + ConcurrentSourceBuilder() + .set_streams( + [ + _id_only_stream, + ] + ) + .set_message_repository(_message_repository) + ) + .set_expected_records( + [ + {"data": {"id": "1"}, "stream": "stream1"}, + {"data": {"id": "2"}, "stream": "stream1"}, + ] + ) + .set_expected_logs( + { + "read": [ + {"level": "INFO", "message": "Starting syncing"}, + {"level": "INFO", "message": "Marking stream stream1 as STARTED"}, + {"level": "INFO", "message": "Syncing stream: stream1"}, + {"level": "INFO", "message": "Marking stream stream1 as RUNNING"}, + {"level": "INFO", "message": "Read 2 records from stream1 stream"}, + {"level": "INFO", "message": "Marking stream stream1 as STOPPED"}, + {"level": "INFO", "message": "Finished syncing stream1"}, + {"level": "INFO", "message": "Finished 
syncing"}, + ] + } + ) + .set_log_levels({"ERROR", "WARN", "WARNING", "INFO", "DEBUG"}) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh"], + } + ] + } + ) + .build() +) + +test_concurrent_cdk_single_stream_with_primary_key = ( + TestScenarioBuilder() + .set_name("test_concurrent_cdk_single_stream_with_primary_key") + .set_config({}) + .set_source_builder( + ConcurrentSourceBuilder() + .set_streams( + [ + _id_only_stream_with_primary_key, + ] + ) + .set_message_repository(_message_repository) + ) + .set_expected_records( + [ + {"data": {"id": "1"}, "stream": "stream1"}, + {"data": {"id": "2"}, "stream": "stream1"}, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh"], + "source_defined_primary_key": [["id"]], + } + ] + } + ) + .build() +) + +test_concurrent_cdk_multiple_streams = ( + TestScenarioBuilder() + .set_name("test_concurrent_cdk_multiple_streams") + .set_config({}) + .set_source_builder( + ConcurrentSourceBuilder() + .set_streams( + [ + _id_only_stream, + DefaultStream( + partition_generator=InMemoryPartitionGenerator( + [ + InMemoryPartition( + "partition1", + "stream2", + None, + [Record({"id": "10", "key": "v1"}, InMemoryPartition("partition1", "stream2", None, [])), + Record({"id": "20", "key": "v2"}, InMemoryPartition("partition1", "stream2", None, []))], + ) + ] + ), + name="stream2", + json_schema={ + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + "key": {"type": ["null", "string"]}, + }, + }, + availability_strategy=AlwaysAvailableAvailabilityStrategy(), + primary_key=[], + cursor_field=None, + logger=logging.getLogger("test_logger"), + cursor=FinalStateCursor(stream_name="stream2", stream_namespace=None, message_repository=_message_repository), + ), + ] + ) + .set_message_repository(_message_repository) + ) + .set_expected_records( + [ + {"data": {"id": "1"}, "stream": "stream1"}, + {"data": {"id": "2"}, "stream": "stream1"}, + {"data": {"id": "10", "key": "v1"}, "stream": "stream2"}, + {"data": {"id": "20", "key": "v2"}, "stream": "stream2"}, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh"], + }, + { + "json_schema": { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + "key": {"type": ["null", "string"]}, + }, + }, + "name": "stream2", + "supported_sync_modes": ["full_refresh"], + }, + ] + } + ) + .build() +) + +test_concurrent_cdk_partition_raises_exception = ( + TestScenarioBuilder() + .set_name("test_concurrent_cdk_partition_raises_exception") + .set_config({}) + .set_source_builder( + ConcurrentSourceBuilder() + .set_streams( + [ + _stream_raising_exception, + ] + ) + .set_message_repository(_message_repository) + ) + .set_expected_records( + [ + {"data": {"id": "1"}, "stream": "stream1"}, + ] + ) + .set_expected_read_error(AirbyteTracedException, "Concurrent read failure") + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh"], + } + ] + } + ) + 
.build() +) + +test_concurrent_cdk_single_stream_multiple_partitions = ( + TestScenarioBuilder() + .set_name("test_concurrent_cdk_single_stream_multiple_partitions") + .set_config({}) + .set_source_builder( + ConcurrentSourceBuilder() + .set_streams( + [ + _id_only_stream_multiple_partitions, + ] + ) + .set_message_repository(_message_repository) + ) + .set_expected_records( + [ + {"data": {"id": "1"}, "stream": "stream1"}, + {"data": {"id": "2"}, "stream": "stream1"}, + {"data": {"id": "3"}, "stream": "stream1"}, + {"data": {"id": "4"}, "stream": "stream1"}, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh"], + } + ] + } + ) + .build() +) + +test_concurrent_cdk_single_stream_multiple_partitions_concurrency_level_two = ( + TestScenarioBuilder() + .set_name("test_concurrent_cdk_single_stream_multiple_partitions_concurrency_level_2") + .set_config({}) + .set_source_builder( + ConcurrentSourceBuilder() + .set_streams( + [ + _id_only_stream_multiple_partitions_concurrency_level_two, + ] + ) + .set_message_repository(_message_repository) + ) + .set_expected_records( + [ + {"data": {"id": "1"}, "stream": "stream1"}, + {"data": {"id": "2"}, "stream": "stream1"}, + {"data": {"id": "3"}, "stream": "stream1"}, + {"data": {"id": "4"}, "stream": "stream1"}, + ] + ) + .set_expected_catalog( + { + "streams": [ + { + "json_schema": { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + }, + }, + "name": "stream1", + "supported_sync_modes": ["full_refresh"], + } + ] + } + ) + .build() +) diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py new file mode 100644 index 000000000000..17a4b39547ab --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py @@ -0,0 +1,150 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
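+# This module provides the in-memory test doubles used by the thread-based concurrent stream scenarios above: +# ConcurrentCdkSource adapts a list of DefaultStream instances through ConcurrentSourceAdapter, InMemoryPartition +# and InMemoryPartitionGenerator replay canned records (or raise a supplied exception), and NeverLogSliceLogger +# keeps slice log messages out of the expected scenario output.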
+# +import json +import logging +from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union + +from airbyte_cdk.models import ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, ConnectorSpecification, DestinationSyncMode, SyncMode +from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource +from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter +from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade +from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record +from airbyte_cdk.sources.streams.core import StreamData +from airbyte_cdk.sources.utils.slice_logger import SliceLogger +from unit_tests.sources.file_based.scenarios.scenario_builder import SourceBuilder + + +class LegacyStream(Stream): + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + return None + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + yield from [] + + +class ConcurrentCdkSource(ConcurrentSourceAdapter): + def __init__(self, streams: List[DefaultStream], message_repository: Optional[MessageRepository], max_workers, timeout_in_seconds): + concurrent_source = ConcurrentSource.create(1, 1, streams[0]._logger, NeverLogSliceLogger(), message_repository) + super().__init__(concurrent_source) + self._streams = streams + self._message_repository = message_repository + + def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]: + # Check is not verified because it is up to the source to implement this method + return True, None + + def streams(self, config: Mapping[str, Any]) -> List[Stream]: + return [ + StreamFacade( + s, + LegacyStream(), + FinalStateCursor(stream_name=s.name, stream_namespace=s.namespace, message_repository=self.message_repository), + NeverLogSliceLogger(), + s._logger, + ) + for s in self._streams + ] + + def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification: + return ConnectorSpecification(connectionSpecification={}) + + def read_catalog(self, catalog_path: str) -> ConfiguredAirbyteCatalog: + return ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=StreamFacade( + s, + LegacyStream(), + FinalStateCursor(stream_name=s.name, stream_namespace=s.namespace, message_repository=InMemoryMessageRepository()), + NeverLogSliceLogger(), + s._logger, + ).as_airbyte_stream(), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.overwrite, + ) + for s in self._streams + ] + ) + + @property + def message_repository(self) -> Union[None, MessageRepository]: + return self._message_repository + + +class InMemoryPartitionGenerator(PartitionGenerator): + def __init__(self, partitions: List[Partition]): + self._partitions = partitions + + def generate(self) -> Iterable[Partition]: + yield from self._partitions + + +class 
InMemoryPartition(Partition): + def stream_name(self) -> str: + return self._stream_name + + def __init__(self, name, stream_name, _slice, records): + self._name = name + self._stream_name = stream_name + self._slice = _slice + self._records = records + self._is_closed = False + + def read(self) -> Iterable[Record]: + for record_or_exception in self._records: + if isinstance(record_or_exception, Exception): + raise record_or_exception + else: + yield record_or_exception + + def to_slice(self) -> Optional[Mapping[str, Any]]: + return self._slice + + def __hash__(self) -> int: + if self._slice: + # Convert the slice to a string so that it can be hashed + s = json.dumps(self._slice, sort_keys=True) + return hash((self._name, s)) + else: + return hash(self._name) + + def close(self) -> None: + self._is_closed = True + + def is_closed(self) -> bool: + return self._is_closed + + +class ConcurrentSourceBuilder(SourceBuilder[ConcurrentCdkSource]): + def __init__(self): + self._streams: List[DefaultStream] = [] + self._message_repository = None + + def build(self, configured_catalog: Optional[Mapping[str, Any]], _, __) -> ConcurrentCdkSource: + return ConcurrentCdkSource(self._streams, self._message_repository, 1, 1) + + def set_streams(self, streams: List[DefaultStream]) -> "ConcurrentSourceBuilder": + self._streams = streams + return self + + def set_message_repository(self, message_repository: MessageRepository) -> "ConcurrentSourceBuilder": + self._message_repository = message_repository + return self + + +class NeverLogSliceLogger(SliceLogger): + def should_log_slice_message(self, logger: logging.Logger) -> bool: + return False diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/utils.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/utils.py new file mode 100644 index 000000000000..85f6a1f7c634 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/scenarios/utils.py @@ -0,0 +1,61 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
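+# MockStream is a minimal synchronous Stream used by the facade scenarios: it is configured with +# (slice, records-or-exception) pairs and replays the records whose slice matches the one passed to +# read_records, raising any Exception it encounters along the way. An illustrative sketch of how the +# scenario files above construct it (the slice keys and cursor field shown here mirror the incremental +# scenarios and are not required by MockStream itself): +# +# stream = MockStream( +# [({"from": 0, "to": 1}, [{"id": "1", "cursor_field": 0}])], +# "stream1", +# json_schema={"type": "object", "properties": {"id": {"type": ["null", "string"]}}}, +# cursor_field="cursor_field", +# )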
+# +from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union + +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.core import StreamData + + +class MockStream(Stream): + def __init__( + self, + slices_and_records_or_exception: Iterable[Tuple[Optional[Mapping[str, Any]], Iterable[Union[Exception, Mapping[str, Any]]]]], + name, + json_schema, + primary_key=None, + cursor_field=None, + ): + self._slices_and_records_or_exception = slices_and_records_or_exception + self._name = name + self._json_schema = json_schema + self._primary_key = primary_key + self._cursor_field = cursor_field + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + for _slice, records_or_exception in self._slices_and_records_or_exception: + if stream_slice == _slice: + for item in records_or_exception: + if isinstance(item, Exception): + raise item + yield item + + @property + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + return self._primary_key + + @property + def name(self) -> str: + return self._name + + @property + def cursor_field(self) -> Union[str, List[str]]: + return self._cursor_field or [] + + def get_json_schema(self) -> Mapping[str, Any]: + return self._json_schema + + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + if self._slices_and_records_or_exception: + yield from [_slice for _slice, records_or_exception in self._slices_and_records_or_exception] + else: + yield None diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_adapters.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_adapters.py new file mode 100644 index 000000000000..c5c44a780176 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_adapters.py @@ -0,0 +1,400 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
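+# Unit tests for the concurrent adapter layer: AvailabilityStrategyFacade, StreamPartition and +# StreamPartitionGenerator, the StreamFacade that exposes an AbstractStream through the legacy Stream +# interface, and CursorPartitionGenerator; behaviour is pinned down per method with mocks rather than +# through full scenario runs.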
+# +import datetime +import logging +import unittest +from unittest.mock import Mock + +import pytest +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, Level, SyncMode +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.message import InMemoryMessageRepository +from airbyte_cdk.sources.streams.concurrent.adapters import ( + AvailabilityStrategyFacade, + CursorPartitionGenerator, + StreamFacade, + StreamPartition, + StreamPartitionGenerator, +) +from airbyte_cdk.sources.streams.concurrent.availability_strategy import STREAM_AVAILABLE, StreamAvailable, StreamUnavailable +from airbyte_cdk.sources.streams.concurrent.cursor import Cursor +from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record +from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( + CustomFormatConcurrentStreamStateConverter, +) +from airbyte_cdk.sources.streams.core import Stream +from airbyte_cdk.sources.types import StreamSlice +from airbyte_cdk.sources.utils.slice_logger import SliceLogger +from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer + +_ANY_SYNC_MODE = SyncMode.full_refresh +_ANY_STATE = {"state_key": "state_value"} +_ANY_CURSOR_FIELD = ["a", "cursor", "key"] +_STREAM_NAME = "stream" +_ANY_CURSOR = Mock(spec=Cursor) + + +@pytest.mark.parametrize( + "stream_availability, expected_available, expected_message", + [ + pytest.param(StreamAvailable(), True, None, id="test_stream_is_available"), + pytest.param(STREAM_AVAILABLE, True, None, id="test_stream_is_available_using_singleton"), + pytest.param(StreamUnavailable("message"), False, "message", id="test_stream_is_available"), + ], +) +def test_availability_strategy_facade(stream_availability, expected_available, expected_message): + strategy = Mock() + strategy.check_availability.return_value = stream_availability + facade = AvailabilityStrategyFacade(strategy) + + logger = Mock() + available, message = facade.check_availability(Mock(), logger, Mock()) + + assert available == expected_available + assert message == expected_message + + strategy.check_availability.assert_called_once_with(logger) + + +@pytest.mark.parametrize( + "sync_mode", + [ + pytest.param(SyncMode.full_refresh, id="test_full_refresh"), + pytest.param(SyncMode.incremental, id="test_incremental"), + ], +) +def test_stream_partition_generator(sync_mode): + stream = Mock() + message_repository = Mock() + stream_slices = [{"slice": 1}, {"slice": 2}] + stream.stream_slices.return_value = stream_slices + + partition_generator = StreamPartitionGenerator(stream, message_repository, _ANY_SYNC_MODE, _ANY_CURSOR_FIELD, _ANY_STATE, _ANY_CURSOR) + + partitions = list(partition_generator.generate()) + slices = [partition.to_slice() for partition in partitions] + assert slices == stream_slices + stream.stream_slices.assert_called_once_with(sync_mode=_ANY_SYNC_MODE, cursor_field=_ANY_CURSOR_FIELD, stream_state=_ANY_STATE) + + +@pytest.mark.parametrize( + "transformer, expected_records", + [ + pytest.param( + TypeTransformer(TransformConfig.NoTransform), + [Record({"data": "1"}, None), Record({"data": "2"}, None)], + id="test_no_transform", + ), + pytest.param( + TypeTransformer(TransformConfig.DefaultSchemaNormalization), + [Record({"data": 1}, None), Record({"data": 2}, None)], + id="test_default_transform", + ), + ], +) +def test_stream_partition(transformer, 
expected_records): + stream = Mock() + stream.name = _STREAM_NAME + stream.get_json_schema.return_value = {"type": "object", "properties": {"data": {"type": ["integer"]}}} + stream.transformer = transformer + message_repository = InMemoryMessageRepository() + _slice = None + sync_mode = SyncMode.full_refresh + cursor_field = None + state = None + partition = StreamPartition(stream, _slice, message_repository, sync_mode, cursor_field, state, _ANY_CURSOR) + + a_log_message = AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.INFO, + message='slice:{"partition": 1}', + ), + ) + for record in expected_records: + record.partition = partition + + stream_data = [a_log_message, {"data": "1"}, {"data": "2"}] + stream.read_records.return_value = stream_data + + records = list(partition.read()) + messages = list(message_repository.consume_queue()) + + assert records == expected_records + assert messages == [a_log_message] + + +@pytest.mark.parametrize( + "exception_type, expected_display_message", + [ + pytest.param(Exception, None, id="test_exception_no_display_message"), + pytest.param(ExceptionWithDisplayMessage, "display_message", id="test_exception_no_display_message"), + ], +) +def test_stream_partition_raising_exception(exception_type, expected_display_message): + stream = Mock() + stream.get_error_display_message.return_value = expected_display_message + + message_repository = InMemoryMessageRepository() + _slice = None + + partition = StreamPartition(stream, _slice, message_repository, _ANY_SYNC_MODE, _ANY_CURSOR_FIELD, _ANY_STATE, _ANY_CURSOR) + + stream.read_records.side_effect = Exception() + + with pytest.raises(exception_type) as e: + list(partition.read()) + if isinstance(e, ExceptionWithDisplayMessage): + assert e.display_message == "display message" + + +@pytest.mark.parametrize( + "_slice, expected_hash", + [ + pytest.param({"partition": 1, "k": "v"}, hash(("stream", '{"k": "v", "partition": 1}')), id="test_hash_with_slice"), + pytest.param(None, hash("stream"), id="test_hash_no_slice"), + ], +) +def test_stream_partition_hash(_slice, expected_hash): + stream = Mock() + stream.name = "stream" + partition = StreamPartition(stream, _slice, Mock(), _ANY_SYNC_MODE, _ANY_CURSOR_FIELD, _ANY_STATE, _ANY_CURSOR) + + _hash = partition.__hash__() + assert _hash == expected_hash + + +class StreamFacadeTest(unittest.TestCase): + def setUp(self): + self._abstract_stream = Mock() + self._abstract_stream.name = "stream" + self._abstract_stream.as_airbyte_stream.return_value = AirbyteStream( + name="stream", + json_schema={"type": "object"}, + supported_sync_modes=[SyncMode.full_refresh], + ) + self._legacy_stream = Mock(spec=Stream) + self._cursor = Mock(spec=Cursor) + self._logger = Mock() + self._slice_logger = Mock() + self._slice_logger.should_log_slice_message.return_value = False + self._facade = StreamFacade(self._abstract_stream, self._legacy_stream, self._cursor, self._slice_logger, self._logger) + self._source = Mock() + + self._stream = Mock() + self._stream.primary_key = "id" + + def test_name_is_delegated_to_wrapped_stream(self): + assert self._facade.name == self._abstract_stream.name + + def test_cursor_field_is_a_string(self): + self._abstract_stream.cursor_field = "cursor_field" + assert self._facade.cursor_field == "cursor_field" + + def test_none_cursor_field_is_converted_to_an_empty_list(self): + self._abstract_stream.cursor_field = None + assert self._facade.cursor_field == [] + + def test_source_defined_cursor_is_true(self): + assert 
self._facade.source_defined_cursor + + def test_json_schema_is_delegated_to_wrapped_stream(self): + json_schema = {"type": "object"} + self._abstract_stream.get_json_schema.return_value = json_schema + assert self._facade.get_json_schema() == json_schema + self._abstract_stream.get_json_schema.assert_called_once_with() + + def test_given_cursor_is_noop_when_supports_incremental_then_return_legacy_stream_response(self): + assert ( + StreamFacade( + self._abstract_stream, self._legacy_stream, _ANY_CURSOR, Mock(spec=SliceLogger), Mock(spec=logging.Logger) + ).supports_incremental + == self._legacy_stream.supports_incremental + ) + + def test_given_cursor_is_not_noop_when_supports_incremental_then_return_true(self): + assert StreamFacade( + self._abstract_stream, self._legacy_stream, Mock(spec=Cursor), Mock(spec=SliceLogger), Mock(spec=logging.Logger) + ).supports_incremental + + def test_check_availability_is_delegated_to_wrapped_stream(self): + availability = StreamAvailable() + self._abstract_stream.check_availability.return_value = availability + assert self._facade.check_availability(Mock(), Mock()) == (availability.is_available(), availability.message()) + self._abstract_stream.check_availability.assert_called_once_with() + + def test_full_refresh(self): + expected_stream_data = [{"data": 1}, {"data": 2}] + records = [Record(data, "stream") for data in expected_stream_data] + + partition = Mock() + partition.read.return_value = records + self._abstract_stream.generate_partitions.return_value = [partition] + + actual_stream_data = list(self._facade.read_records(SyncMode.full_refresh, None, None, None)) + + assert actual_stream_data == expected_stream_data + + def test_read_records(self): + expected_stream_data = [{"data": 1}, {"data": 2}] + records = [Record(data, "stream") for data in expected_stream_data] + partition = Mock() + partition.read.return_value = records + self._abstract_stream.generate_partitions.return_value = [partition] + + actual_stream_data = list(self._facade.read(None, None, None, None, None, None)) + + assert actual_stream_data == expected_stream_data + + def test_create_from_stream_stream(self): + stream = Mock() + stream.name = "stream" + stream.primary_key = "id" + stream.cursor_field = "cursor" + + facade = StreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + + assert facade.name == "stream" + assert facade.cursor_field == "cursor" + assert facade._abstract_stream._primary_key == ["id"] + + def test_create_from_stream_stream_with_none_primary_key(self): + stream = Mock() + stream.name = "stream" + stream.primary_key = None + stream.cursor_field = [] + + facade = StreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + assert facade._abstract_stream._primary_key == [] + + def test_create_from_stream_with_composite_primary_key(self): + stream = Mock() + stream.name = "stream" + stream.primary_key = ["id", "name"] + stream.cursor_field = [] + + facade = StreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + assert facade._abstract_stream._primary_key == ["id", "name"] + + def test_create_from_stream_with_empty_list_cursor(self): + stream = Mock() + stream.primary_key = "id" + stream.cursor_field = [] + + facade = StreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + + assert facade.cursor_field == [] + + def test_create_from_stream_raises_exception_if_primary_key_is_nested(self): + stream = Mock() + 
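+ # create_from_stream accepts a string or a flat list of field names as the primary key; a nested + # key such as [["field", "id"]] is expected to raise ValueError.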
stream.name = "stream" + stream.primary_key = [["field", "id"]] + + with self.assertRaises(ValueError): + StreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + + def test_create_from_stream_raises_exception_if_primary_key_has_invalid_type(self): + stream = Mock() + stream.name = "stream" + stream.primary_key = 123 + + with self.assertRaises(ValueError): + StreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + + def test_create_from_stream_raises_exception_if_cursor_field_is_nested(self): + stream = Mock() + stream.name = "stream" + stream.primary_key = "id" + stream.cursor_field = ["field", "cursor"] + + with self.assertRaises(ValueError): + StreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + + def test_create_from_stream_with_cursor_field_as_list(self): + stream = Mock() + stream.name = "stream" + stream.primary_key = "id" + stream.cursor_field = ["cursor"] + + facade = StreamFacade.create_from_stream(stream, self._source, self._logger, _ANY_STATE, self._cursor) + assert facade.cursor_field == "cursor" + + def test_create_from_stream_none_message_repository(self): + self._stream.name = "stream" + self._stream.primary_key = "id" + self._stream.cursor_field = "cursor" + self._source.message_repository = None + + with self.assertRaises(ValueError): + StreamFacade.create_from_stream(self._stream, self._source, self._logger, {}, self._cursor) + + def test_get_error_display_message_no_display_message(self): + self._stream.get_error_display_message.return_value = "display_message" + + facade = StreamFacade.create_from_stream(self._stream, self._source, self._logger, _ANY_STATE, self._cursor) + + expected_display_message = None + e = Exception() + + display_message = facade.get_error_display_message(e) + + assert display_message == expected_display_message + + def test_get_error_display_message_with_display_message(self): + self._stream.get_error_display_message.return_value = "display_message" + + facade = StreamFacade.create_from_stream(self._stream, self._source, self._logger, _ANY_STATE, self._cursor) + + expected_display_message = "display_message" + e = ExceptionWithDisplayMessage("display_message") + + display_message = facade.get_error_display_message(e) + + assert display_message == expected_display_message + + +@pytest.mark.parametrize( + "exception, expected_display_message", + [ + pytest.param(Exception("message"), None, id="test_no_display_message"), + pytest.param(ExceptionWithDisplayMessage("message"), "message", id="test_no_display_message"), + ], +) +def test_get_error_display_message(exception, expected_display_message): + stream = Mock() + legacy_stream = Mock() + cursor = Mock(spec=Cursor) + facade = StreamFacade(stream, legacy_stream, cursor, Mock().Mock(), Mock()) + + display_message = facade.get_error_display_message(exception) + + assert display_message == expected_display_message + + +def test_cursor_partition_generator(): + stream = Mock() + cursor = Mock() + message_repository = Mock() + connector_state_converter = CustomFormatConcurrentStreamStateConverter(datetime_format="%Y-%m-%dT%H:%M:%S") + cursor_field = Mock() + slice_boundary_fields = ("start", "end") + + expected_slices = [StreamSlice(partition={}, cursor_slice={"start": "2024-01-01T00:00:00", "end": "2024-01-02T00:00:00"})] + cursor.generate_slices.return_value = [(datetime.datetime(year=2024, month=1, day=1), datetime.datetime(year=2024, month=1, day=2))] + + partition_generator = 
CursorPartitionGenerator( + stream, + message_repository, + cursor, + connector_state_converter, + cursor_field, + slice_boundary_fields + ) + + partitions = list(partition_generator.generate()) + generated_slices = [partition.to_slice() for partition in partitions] + + assert all(isinstance(partition, StreamPartition) for partition in partitions), "Not all partitions are instances of StreamPartition" + assert generated_slices == expected_slices, f"Expected {expected_slices}, but got {generated_slices}" diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py new file mode 100644 index 000000000000..bd2d4b1ec5bf --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -0,0 +1,757 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import logging +import unittest +from unittest.mock import Mock, call + +import freezegun +import pytest +from airbyte_cdk.models import ( + AirbyteLogMessage, + AirbyteMessage, + AirbyteRecordMessage, + AirbyteStream, + AirbyteStreamStatus, + AirbyteStreamStatusTraceMessage, + AirbyteTraceMessage, +) +from airbyte_cdk.models import Level as LogLevel +from airbyte_cdk.models import StreamDescriptor, SyncMode, TraceType +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.concurrent_source.concurrent_read_processor import ConcurrentReadProcessor +from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel +from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException +from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.message import LogMessage, MessageRepository +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer +from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record +from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel +from airbyte_cdk.sources.utils.slice_logger import SliceLogger +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + +_STREAM_NAME = "stream" +_ANOTHER_STREAM_NAME = "stream2" +_ANY_AIRBYTE_MESSAGE = Mock(spec=AirbyteMessage) +_IS_SUCCESSFUL = True + + +class TestConcurrentReadProcessor(unittest.TestCase): + def setUp(self): + self._partition_enqueuer = Mock(spec=PartitionEnqueuer) + self._thread_pool_manager = Mock(spec=ThreadPoolManager) + + self._an_open_partition = Mock(spec=Partition) + self._log_message = Mock(spec=LogMessage) + self._an_open_partition.to_slice.return_value = self._log_message + self._an_open_partition.stream_name.return_value = _STREAM_NAME + + self._a_closed_partition = Mock(spec=Partition) + self._a_closed_partition.stream_name.return_value = _ANOTHER_STREAM_NAME + + self._logger = Mock(spec=logging.Logger) + self._slice_logger = Mock(spec=SliceLogger) + self._slice_logger.create_slice_log_message.return_value = self._log_message + self._message_repository = Mock(spec=MessageRepository) + self._message_repository.consume_queue.return_value = [] + 
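+ # consume_queue is empty by default; tests that need repository messages interleaved with the + # handler output override this return value locally.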
self._partition_reader = Mock(spec=PartitionReader) + + self._stream = Mock(spec=AbstractStream) + self._stream.name = _STREAM_NAME + self._stream.as_airbyte_stream.return_value = AirbyteStream( + name=_STREAM_NAME, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + self._another_stream = Mock(spec=AbstractStream) + self._another_stream.name = _ANOTHER_STREAM_NAME + self._another_stream.as_airbyte_stream.return_value = AirbyteStream( + name=_ANOTHER_STREAM_NAME, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + + self._record_data = {"id": 1, "value": "A"} + self._partition = Mock(spec=Partition) + self._partition.stream_name = lambda: _STREAM_NAME + self._record = Mock(spec=Record) + self._record.partition = self._partition + self._record.data = self._record_data + self._record.is_file_transfer_message = False + + def test_stream_is_not_done_initially(self): + stream_instances_to_read_from = [self._stream] + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + assert not handler._is_stream_done(self._stream.name) + + def test_handle_partition_done_no_other_streams_to_generate_partitions_for(self): + stream_instances_to_read_from = [self._stream] + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + handler.start_next_partition_generator() + handler.on_partition(self._an_open_partition) + + sentinel = PartitionGenerationCompletedSentinel(self._stream) + messages = list(handler.on_partition_generation_completed(sentinel)) + + expected_messages = [] + assert messages == expected_messages + + @freezegun.freeze_time("2020-01-01T00:00:00") + def test_handle_last_stream_partition_done(self): + in_order_validation_mock = Mock() + in_order_validation_mock.attach_mock(self._another_stream, "_another_stream") + in_order_validation_mock.attach_mock(self._message_repository, "_message_repository") + self._message_repository.consume_queue.return_value = iter([_ANY_AIRBYTE_MESSAGE]) + stream_instances_to_read_from = [self._another_stream] + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + handler.start_next_partition_generator() + + sentinel = PartitionGenerationCompletedSentinel(self._another_stream) + messages = list(handler.on_partition_generation_completed(sentinel)) + + expected_messages = [ + _ANY_AIRBYTE_MESSAGE, + AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + emitted_at=1577836800000.0, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name=_ANOTHER_STREAM_NAME), + status=AirbyteStreamStatus(AirbyteStreamStatus.COMPLETE), + ), + ), + ), + ] + assert messages == expected_messages + assert in_order_validation_mock.mock_calls.index( + call._another_stream.cursor.ensure_at_least_one_state_emitted + ) < in_order_validation_mock.mock_calls.index(call._message_repository.consume_queue) + + def test_handle_partition(self): + stream_instances_to_read_from = [self._another_stream] + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + 
self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + handler.on_partition(self._a_closed_partition) + + self._thread_pool_manager.submit.assert_called_with(self._partition_reader.process_partition, self._a_closed_partition) + assert self._a_closed_partition in handler._streams_to_running_partitions[_ANOTHER_STREAM_NAME] + + def test_handle_partition_emits_log_message_if_it_should_be_logged(self): + stream_instances_to_read_from = [self._stream] + self._slice_logger = Mock(spec=SliceLogger) + self._slice_logger.should_log_slice_message.return_value = True + self._slice_logger.create_slice_log_message.return_value = self._log_message + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + handler.on_partition(self._an_open_partition) + + self._thread_pool_manager.submit.assert_called_with(self._partition_reader.process_partition, self._an_open_partition) + self._message_repository.emit_message.assert_called_with(self._log_message) + + assert self._an_open_partition in handler._streams_to_running_partitions[_STREAM_NAME] + + @freezegun.freeze_time("2020-01-01T00:00:00") + def test_handle_on_partition_complete_sentinel_with_messages_from_repository(self): + stream_instances_to_read_from = [self._stream] + partition = Mock(spec=Partition) + log_message = Mock(spec=LogMessage) + partition.to_slice.return_value = log_message + partition.stream_name.return_value = _STREAM_NAME + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + handler.start_next_partition_generator() + handler.on_partition(partition) + + sentinel = PartitionCompleteSentinel(partition) + + self._message_repository.consume_queue.return_value = [ + AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=LogLevel.INFO, message="message emitted from the repository")) + ] + + messages = list(handler.on_partition_complete_sentinel(sentinel)) + + expected_messages = [ + AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=LogLevel.INFO, message="message emitted from the repository")) + ] + assert messages == expected_messages + + partition.close.assert_called_once() + + @freezegun.freeze_time("2020-01-01T00:00:00") + def test_handle_on_partition_complete_sentinel_yields_status_message_if_the_stream_is_done(self): + self._streams_currently_generating_partitions = [self._another_stream] + stream_instances_to_read_from = [self._another_stream] + log_message = Mock(spec=LogMessage) + self._a_closed_partition.to_slice.return_value = log_message + self._message_repository.consume_queue.return_value = [] + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + handler.start_next_partition_generator() + handler.on_partition(self._a_closed_partition) + list(handler.on_partition_generation_completed(PartitionGenerationCompletedSentinel(self._another_stream))) + + sentinel = PartitionCompleteSentinel(self._a_closed_partition) + + messages = list(handler.on_partition_complete_sentinel(sentinel)) + + expected_messages = [ 
+ AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor( + name=_ANOTHER_STREAM_NAME, + ), + status=AirbyteStreamStatus.COMPLETE, + ), + emitted_at=1577836800000.0, + ), + ) + ] + assert messages == expected_messages + self._a_closed_partition.close.assert_called_once() + + @freezegun.freeze_time("2020-01-01T00:00:00") + def test_given_exception_on_partition_complete_sentinel_then_yield_error_trace_message_and_stream_is_incomplete(self) -> None: + self._a_closed_partition.stream_name.return_value = self._stream.name + self._a_closed_partition.close.side_effect = ValueError + + handler = ConcurrentReadProcessor( + [self._stream], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + handler.start_next_partition_generator() + handler.on_partition(self._a_closed_partition) + list(handler.on_partition_generation_completed(PartitionGenerationCompletedSentinel(self._stream))) + messages = list(handler.on_partition_complete_sentinel(PartitionCompleteSentinel(self._a_closed_partition))) + + expected_status_message = AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor( + name=self._stream.name, + ), + status=AirbyteStreamStatus.INCOMPLETE, + ), + emitted_at=1577836800000.0, + ), + ) + assert list(map(lambda message: message.trace.type, messages)) == [TraceType.ERROR, TraceType.STREAM_STATUS] + assert messages[1] == expected_status_message + + @freezegun.freeze_time("2020-01-01T00:00:00") + def test_handle_on_partition_complete_sentinel_yields_no_status_message_if_the_stream_is_not_done(self): + stream_instances_to_read_from = [self._stream] + partition = Mock(spec=Partition) + log_message = Mock(spec=LogMessage) + partition.to_slice.return_value = log_message + partition.stream_name.return_value = _STREAM_NAME + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + handler.start_next_partition_generator() + + sentinel = PartitionCompleteSentinel(partition) + + messages = list(handler.on_partition_complete_sentinel(sentinel)) + + expected_messages = [] + assert messages == expected_messages + partition.close.assert_called_once() + + @freezegun.freeze_time("2020-01-01T00:00:00") + def test_on_record_no_status_message_no_repository_messge(self): + stream_instances_to_read_from = [self._stream] + partition = Mock(spec=Partition) + log_message = Mock(spec=LogMessage) + partition.to_slice.return_value = log_message + partition.stream_name.return_value = _STREAM_NAME + self._message_repository.consume_queue.return_value = [] + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Simulate a first record + list(handler.on_record(self._record)) + + messages = list(handler.on_record(self._record)) + + expected_messages = [ + AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream=_STREAM_NAME, + data=self._record_data, + emitted_at=1577836800000, + ), + ) + ] + assert messages 
== expected_messages + + @freezegun.freeze_time("2020-01-01T00:00:00") + def test_on_record_with_repository_messge(self): + stream_instances_to_read_from = [self._stream] + partition = Mock(spec=Partition) + log_message = Mock(spec=LogMessage) + partition.to_slice.return_value = log_message + partition.stream_name.return_value = _STREAM_NAME + slice_logger = Mock(spec=SliceLogger) + slice_logger.should_log_slice_message.return_value = True + slice_logger.create_slice_log_message.return_value = log_message + self._message_repository.consume_queue.return_value = [ + AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=LogLevel.INFO, message="message emitted from the repository")) + ] + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + stream = Mock(spec=AbstractStream) + stream.name = _STREAM_NAME + stream.as_airbyte_stream.return_value = AirbyteStream( + name=_STREAM_NAME, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + + # Simulate a first record + list(handler.on_record(self._record)) + + messages = list(handler.on_record(self._record)) + + expected_messages = [ + AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream=_STREAM_NAME, + data=self._record_data, + emitted_at=1577836800000, + ), + ), + AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=LogLevel.INFO, message="message emitted from the repository")), + ] + assert messages == expected_messages + assert handler._record_counter[_STREAM_NAME] == 2 + + @freezegun.freeze_time("2020-01-01T00:00:00") + def test_on_record_emits_status_message_on_first_record_no_repository_message(self): + self._streams_currently_generating_partitions = [_STREAM_NAME] + stream_instances_to_read_from = [self._stream] + partition = Mock(spec=Partition) + partition.stream_name.return_value = _STREAM_NAME + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + messages = list(handler.on_record(self._record)) + + expected_messages = [ + AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + emitted_at=1577836800000.0, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name=_STREAM_NAME), status=AirbyteStreamStatus(AirbyteStreamStatus.RUNNING) + ), + ), + ), + AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream=_STREAM_NAME, + data=self._record_data, + emitted_at=1577836800000, + ), + ), + ] + assert messages == expected_messages + + @freezegun.freeze_time("2020-01-01T00:00:00") + def test_on_record_emits_status_message_on_first_record_with_repository_message(self): + stream_instances_to_read_from = [self._stream] + partition = Mock(spec=Partition) + log_message = Mock(spec=LogMessage) + partition.to_slice.return_value = log_message + partition.stream_name.return_value = _STREAM_NAME + self._message_repository.consume_queue.return_value = [ + AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=LogLevel.INFO, message="message emitted from the repository")) + ] + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + 
self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + stream = Mock(spec=AbstractStream) + stream.name = _STREAM_NAME + stream.as_airbyte_stream.return_value = AirbyteStream( + name=_STREAM_NAME, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + + messages = list(handler.on_record(self._record)) + + expected_messages = [ + AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + emitted_at=1577836800000.0, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name=_STREAM_NAME), status=AirbyteStreamStatus(AirbyteStreamStatus.RUNNING) + ), + ), + ), + AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream=_STREAM_NAME, + data=self._record_data, + emitted_at=1577836800000, + ), + ), + AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=LogLevel.INFO, message="message emitted from the repository")), + ] + assert messages == expected_messages + + @freezegun.freeze_time("2020-01-01T00:00:00") + def test_on_exception_return_trace_message_and_on_stream_complete_return_stream_status(self): + stream_instances_to_read_from = [self._stream, self._another_stream] + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + handler.start_next_partition_generator() + handler.on_partition(self._an_open_partition) + list(handler.on_partition_generation_completed(PartitionGenerationCompletedSentinel(self._stream))) + list(handler.on_partition_generation_completed(PartitionGenerationCompletedSentinel(self._another_stream))) + + another_stream = Mock(spec=AbstractStream) + another_stream.name = _STREAM_NAME + another_stream.as_airbyte_stream.return_value = AirbyteStream( + name=_ANOTHER_STREAM_NAME, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + + exception = StreamThreadException(RuntimeError("Something went wrong"), _STREAM_NAME) + + exception_messages = list(handler.on_exception(exception)) + assert len(exception_messages) == 1 + assert "StreamThreadException" in exception_messages[0].trace.error.stack_trace + + assert list(handler.on_partition_complete_sentinel(PartitionCompleteSentinel(self._an_open_partition))) == [ + AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + emitted_at=1577836800000.0, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name=_STREAM_NAME), status=AirbyteStreamStatus(AirbyteStreamStatus.INCOMPLETE) + ), + ), + ) + ] + with pytest.raises(AirbyteTracedException): + handler.is_done() + + @freezegun.freeze_time("2020-01-01T00:00:00") + def test_given_underlying_exception_is_traced_exception_on_exception_return_trace_message_and_on_stream_complete_return_stream_status( + self, + ): + stream_instances_to_read_from = [self._stream, self._another_stream] + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + handler.start_next_partition_generator() + handler.on_partition(self._an_open_partition) + list(handler.on_partition_generation_completed(PartitionGenerationCompletedSentinel(self._stream))) + 
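+ # Completing partition generation for the second stream as well leaves no stream still generating + # partitions; the exception raised below must therefore surface as a single error trace message, + # the stream must be marked INCOMPLETE when its partition completes, and is_done() must re-raise + # the failure as an AirbyteTracedException.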
list(handler.on_partition_generation_completed(PartitionGenerationCompletedSentinel(self._another_stream))) + + another_stream = Mock(spec=AbstractStream) + another_stream.name = _STREAM_NAME + another_stream.as_airbyte_stream.return_value = AirbyteStream( + name=_ANOTHER_STREAM_NAME, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + + underlying_exception = AirbyteTracedException() + exception = StreamThreadException(underlying_exception, _STREAM_NAME) + + exception_messages = list(handler.on_exception(exception)) + assert len(exception_messages) == 1 + assert "AirbyteTracedException" in exception_messages[0].trace.error.stack_trace + + assert list(handler.on_partition_complete_sentinel(PartitionCompleteSentinel(self._an_open_partition))) == [ + AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + emitted_at=1577836800000.0, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name=_STREAM_NAME), status=AirbyteStreamStatus(AirbyteStreamStatus.INCOMPLETE) + ), + ), + ) + ] + with pytest.raises(AirbyteTracedException): + handler.is_done() + + def test_given_partition_completion_is_not_success_then_do_not_close_partition(self): + stream_instances_to_read_from = [self._stream, self._another_stream] + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + handler.start_next_partition_generator() + handler.on_partition(self._an_open_partition) + list(handler.on_partition_generation_completed(PartitionGenerationCompletedSentinel(self._stream))) + + list(handler.on_partition_complete_sentinel(PartitionCompleteSentinel(self._an_open_partition, not _IS_SUCCESSFUL))) + + assert self._an_open_partition.close.call_count == 0 + + def test_is_done_is_false_if_there_are_any_instances_to_read_from(self): + stream_instances_to_read_from = [self._stream] + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + assert not handler.is_done() + + def test_is_done_is_false_if_there_are_streams_still_generating_partitions(self): + stream_instances_to_read_from = [self._stream] + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + handler.start_next_partition_generator() + + assert not handler.is_done() + + def 
test_is_done_is_false_if_all_partitions_are_not_closed(self): + stream_instances_to_read_from = [self._stream] + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + handler.start_next_partition_generator() + handler.on_partition(self._an_open_partition) + handler.on_partition_generation_completed(PartitionGenerationCompletedSentinel(self._stream)) + + assert not handler.is_done() + + def test_is_done_is_true_if_all_partitions_are_closed_and_no_streams_are_generating_partitions_and_none_are_still_to_run(self): + stream_instances_to_read_from = [] + + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + assert handler.is_done() + + @freezegun.freeze_time("2020-01-01T00:00:00") + def test_start_next_partition_generator(self): + stream_instances_to_read_from = [self._stream] + handler = ConcurrentReadProcessor( + stream_instances_to_read_from, + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + status_message = handler.start_next_partition_generator() + + assert status_message == AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + emitted_at=1577836800000.0, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name=_STREAM_NAME), status=AirbyteStreamStatus(AirbyteStreamStatus.STARTED) + ), + ), + ) + + assert _STREAM_NAME in handler._streams_currently_generating_partitions + self._thread_pool_manager.submit.assert_called_with(self._partition_enqueuer.generate_partitions, self._stream) diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_cursor.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_cursor.py new file mode 100644 index 000000000000..5da0a55b62d1 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_cursor.py @@ -0,0 +1,1057 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from datetime import datetime, timedelta, timezone +from functools import partial +from typing import Any, Mapping, Optional +from unittest import TestCase +from unittest.mock import Mock + +import freezegun +import pytest +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime +from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor +from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField, CursorValueType +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record +from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import ConcurrencyCompatibleStateType +from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( + EpochValueConcurrentStreamStateConverter, + IsoMillisConcurrentStreamStateConverter, +) +from isodate import parse_duration + +_A_STREAM_NAME = "a stream name" +_A_STREAM_NAMESPACE = "a stream namespace" +_A_CURSOR_FIELD_KEY = "a_cursor_field_key" +_NO_STATE = {} +_NO_PARTITION_IDENTIFIER = None +_NO_SLICE = None +_NO_SLICE_BOUNDARIES = None +_LOWER_SLICE_BOUNDARY_FIELD = "lower_boundary" +_UPPER_SLICE_BOUNDARY_FIELD = "upper_boundary" +_SLICE_BOUNDARY_FIELDS = (_LOWER_SLICE_BOUNDARY_FIELD, _UPPER_SLICE_BOUNDARY_FIELD) +_A_VERY_HIGH_CURSOR_VALUE = 1000000000 +_NO_LOOKBACK_WINDOW = timedelta(seconds=0) + + +def _partition(_slice: Optional[Mapping[str, Any]], _stream_name: Optional[str] = Mock()) -> Partition: + partition = Mock(spec=Partition) + partition.to_slice.return_value = _slice + partition.stream_name.return_value = _stream_name + return partition + + +def _record(cursor_value: CursorValueType, partition: Optional[Partition] = Mock(spec=Partition)) -> Record: + return Record(data={_A_CURSOR_FIELD_KEY: cursor_value}, partition=partition) + + +class ConcurrentCursorStateTest(TestCase): + def setUp(self) -> None: + self._message_repository = Mock(spec=MessageRepository) + self._state_manager = Mock(spec=ConnectorStateManager) + + def _cursor_with_slice_boundary_fields(self, is_sequential_state=True) -> ConcurrentCursor: + return ConcurrentCursor( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + {}, + self._message_repository, + self._state_manager, + EpochValueConcurrentStreamStateConverter(is_sequential_state), + CursorField(_A_CURSOR_FIELD_KEY), + _SLICE_BOUNDARY_FIELDS, + None, + EpochValueConcurrentStreamStateConverter.get_end_provider(), + _NO_LOOKBACK_WINDOW, + ) + + def _cursor_without_slice_boundary_fields(self) -> ConcurrentCursor: + return ConcurrentCursor( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + {}, + self._message_repository, + self._state_manager, + EpochValueConcurrentStreamStateConverter(is_sequential_state=True), + CursorField(_A_CURSOR_FIELD_KEY), + None, + None, + EpochValueConcurrentStreamStateConverter.get_end_provider(), + _NO_LOOKBACK_WINDOW, + ) + + def test_given_boundary_fields_when_close_partition_then_emit_state(self) -> None: + cursor = self._cursor_with_slice_boundary_fields() + cursor.close_partition( + _partition( + {_LOWER_SLICE_BOUNDARY_FIELD: 12, _UPPER_SLICE_BOUNDARY_FIELD: 30}, + ) + ) + + self._message_repository.emit_message.assert_called_once_with(self._state_manager.create_state_message.return_value) + 
self._state_manager.update_state_for_stream.assert_called_once_with( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + {_A_CURSOR_FIELD_KEY: 0}, # State message is updated to the legacy format before being emitted + ) + + def test_given_state_not_sequential_when_close_partition_then_emit_state(self) -> None: + cursor = self._cursor_with_slice_boundary_fields(is_sequential_state=False) + cursor.close_partition( + _partition( + {_LOWER_SLICE_BOUNDARY_FIELD: 12, _UPPER_SLICE_BOUNDARY_FIELD: 30}, + ) + ) + + self._message_repository.emit_message.assert_called_once_with(self._state_manager.create_state_message.return_value) + self._state_manager.update_state_for_stream.assert_called_once_with( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + {"slices": [{"end": 0, "start": 0}, {"end": 30, "start": 12}], "state_type": "date-range"}, + ) + + def test_close_partition_emits_message_to_lower_boundary_when_no_prior_state_exists(self) -> None: + self._cursor_with_slice_boundary_fields().close_partition( + _partition( + {_LOWER_SLICE_BOUNDARY_FIELD: 0, _UPPER_SLICE_BOUNDARY_FIELD: 30}, + ) + ) + + self._message_repository.emit_message.assert_called_once_with(self._state_manager.create_state_message.return_value) + self._state_manager.update_state_for_stream.assert_called_once_with( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + {_A_CURSOR_FIELD_KEY: 0}, # State message is updated to the lower slice boundary + ) + + def test_given_boundary_fields_and_record_observed_when_close_partition_then_ignore_records(self) -> None: + cursor = self._cursor_with_slice_boundary_fields() + cursor.observe(_record(_A_VERY_HIGH_CURSOR_VALUE)) + + cursor.close_partition(_partition({_LOWER_SLICE_BOUNDARY_FIELD: 12, _UPPER_SLICE_BOUNDARY_FIELD: 30})) + + assert self._state_manager.update_state_for_stream.call_args_list[0].args[2][_A_CURSOR_FIELD_KEY] != _A_VERY_HIGH_CURSOR_VALUE + + def test_given_no_boundary_fields_when_close_partition_then_emit_state(self) -> None: + cursor = self._cursor_without_slice_boundary_fields() + partition = _partition(_NO_SLICE) + cursor.observe(_record(10, partition=partition)) + cursor.close_partition(partition) + + self._state_manager.update_state_for_stream.assert_called_once_with( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + {"a_cursor_field_key": 10}, + ) + + def test_given_no_boundary_fields_when_close_multiple_partitions_then_raise_exception(self) -> None: + cursor = self._cursor_without_slice_boundary_fields() + partition = _partition(_NO_SLICE) + cursor.observe(_record(10, partition=partition)) + cursor.close_partition(partition) + + with pytest.raises(ValueError): + cursor.close_partition(partition) + + def test_given_no_records_observed_when_close_partition_then_do_not_emit_state(self) -> None: + cursor = self._cursor_without_slice_boundary_fields() + cursor.close_partition(_partition(_NO_SLICE)) + assert self._message_repository.emit_message.call_count == 0 + + def test_given_slice_boundaries_and_no_slice_when_close_partition_then_raise_error(self) -> None: + cursor = self._cursor_with_slice_boundary_fields() + with pytest.raises(KeyError): + cursor.close_partition(_partition(_NO_SLICE)) + + def test_given_slice_boundaries_not_matching_slice_when_close_partition_then_raise_error(self) -> None: + cursor = self._cursor_with_slice_boundary_fields() + with pytest.raises(KeyError): + cursor.close_partition(_partition({"not_matching_key": "value"})) + + @freezegun.freeze_time(time_to_freeze=datetime.fromtimestamp(50, timezone.utc)) + def 
test_given_no_state_when_generate_slices_then_create_slice_from_start_to_end(self): + start = datetime.fromtimestamp(10, timezone.utc) + cursor = ConcurrentCursor( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + _NO_STATE, + self._message_repository, + self._state_manager, + EpochValueConcurrentStreamStateConverter(is_sequential_state=False), + CursorField(_A_CURSOR_FIELD_KEY), + _SLICE_BOUNDARY_FIELDS, + start, + EpochValueConcurrentStreamStateConverter.get_end_provider(), + _NO_LOOKBACK_WINDOW, + ) + + slices = list(cursor.generate_slices()) + + assert slices == [ + (datetime.fromtimestamp(10, timezone.utc), datetime.fromtimestamp(50, timezone.utc)), + ] + + @freezegun.freeze_time(time_to_freeze=datetime.fromtimestamp(50, timezone.utc)) + def test_given_one_slice_when_generate_slices_then_create_slice_from_slice_upper_boundary_to_end(self): + start = datetime.fromtimestamp(0, timezone.utc) + cursor = ConcurrentCursor( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "slices": [ + {EpochValueConcurrentStreamStateConverter.START_KEY: 0, EpochValueConcurrentStreamStateConverter.END_KEY: 20}, + ], + }, + self._message_repository, + self._state_manager, + EpochValueConcurrentStreamStateConverter(is_sequential_state=False), + CursorField(_A_CURSOR_FIELD_KEY), + _SLICE_BOUNDARY_FIELDS, + start, + EpochValueConcurrentStreamStateConverter.get_end_provider(), + _NO_LOOKBACK_WINDOW, + ) + + slices = list(cursor.generate_slices()) + + assert slices == [ + (datetime.fromtimestamp(20, timezone.utc), datetime.fromtimestamp(50, timezone.utc)), + ] + + @freezegun.freeze_time(time_to_freeze=datetime.fromtimestamp(50, timezone.utc)) + def test_given_start_after_slices_when_generate_slices_then_generate_from_start(self): + start = datetime.fromtimestamp(30, timezone.utc) + cursor = ConcurrentCursor( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "slices": [ + {EpochValueConcurrentStreamStateConverter.START_KEY: 0, EpochValueConcurrentStreamStateConverter.END_KEY: 20}, + ], + }, + self._message_repository, + self._state_manager, + EpochValueConcurrentStreamStateConverter(is_sequential_state=False), + CursorField(_A_CURSOR_FIELD_KEY), + _SLICE_BOUNDARY_FIELDS, + start, + EpochValueConcurrentStreamStateConverter.get_end_provider(), + _NO_LOOKBACK_WINDOW, + ) + + slices = list(cursor.generate_slices()) + + assert slices == [ + (datetime.fromtimestamp(30, timezone.utc), datetime.fromtimestamp(50, timezone.utc)), + ] + + @freezegun.freeze_time(time_to_freeze=datetime.fromtimestamp(50, timezone.utc)) + def test_given_state_with_gap_and_start_after_slices_when_generate_slices_then_generate_from_start(self): + start = datetime.fromtimestamp(30, timezone.utc) + cursor = ConcurrentCursor( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "slices": [ + {EpochValueConcurrentStreamStateConverter.START_KEY: 0, EpochValueConcurrentStreamStateConverter.END_KEY: 10}, + {EpochValueConcurrentStreamStateConverter.START_KEY: 15, EpochValueConcurrentStreamStateConverter.END_KEY: 20}, + ], + }, + self._message_repository, + self._state_manager, + EpochValueConcurrentStreamStateConverter(is_sequential_state=False), + CursorField(_A_CURSOR_FIELD_KEY), + _SLICE_BOUNDARY_FIELDS, + start, + EpochValueConcurrentStreamStateConverter.get_end_provider(), + _NO_LOOKBACK_WINDOW, + ) + + slices = list(cursor.generate_slices()) + + assert slices == [ + 
(datetime.fromtimestamp(30, timezone.utc), datetime.fromtimestamp(50, timezone.utc)), + ] + + @freezegun.freeze_time(time_to_freeze=datetime.fromtimestamp(50, timezone.utc)) + def test_given_small_slice_range_when_generate_slices_then_create_many_slices(self): + start = datetime.fromtimestamp(0, timezone.utc) + small_slice_range = timedelta(seconds=10) + cursor = ConcurrentCursor( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "slices": [ + {EpochValueConcurrentStreamStateConverter.START_KEY: 0, EpochValueConcurrentStreamStateConverter.END_KEY: 20}, + ], + }, + self._message_repository, + self._state_manager, + EpochValueConcurrentStreamStateConverter(is_sequential_state=False), + CursorField(_A_CURSOR_FIELD_KEY), + _SLICE_BOUNDARY_FIELDS, + start, + EpochValueConcurrentStreamStateConverter.get_end_provider(), + _NO_LOOKBACK_WINDOW, + small_slice_range, + ) + + slices = list(cursor.generate_slices()) + + assert slices == [ + (datetime.fromtimestamp(20, timezone.utc), datetime.fromtimestamp(30, timezone.utc)), + (datetime.fromtimestamp(30, timezone.utc), datetime.fromtimestamp(40, timezone.utc)), + (datetime.fromtimestamp(40, timezone.utc), datetime.fromtimestamp(50, timezone.utc)), + ] + + @freezegun.freeze_time(time_to_freeze=datetime.fromtimestamp(50, timezone.utc)) + def test_given_difference_between_slices_match_slice_range_when_generate_slices_then_create_one_slice(self): + start = datetime.fromtimestamp(0, timezone.utc) + small_slice_range = timedelta(seconds=10) + cursor = ConcurrentCursor( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "slices": [ + {EpochValueConcurrentStreamStateConverter.START_KEY: 0, EpochValueConcurrentStreamStateConverter.END_KEY: 30}, + {EpochValueConcurrentStreamStateConverter.START_KEY: 40, EpochValueConcurrentStreamStateConverter.END_KEY: 50}, + ], + }, + self._message_repository, + self._state_manager, + EpochValueConcurrentStreamStateConverter(is_sequential_state=False), + CursorField(_A_CURSOR_FIELD_KEY), + _SLICE_BOUNDARY_FIELDS, + start, + EpochValueConcurrentStreamStateConverter.get_end_provider(), + _NO_LOOKBACK_WINDOW, + small_slice_range, + ) + + slices = list(cursor.generate_slices()) + + assert slices == [ + (datetime.fromtimestamp(30, timezone.utc), datetime.fromtimestamp(40, timezone.utc)), + ] + + @freezegun.freeze_time(time_to_freeze=datetime.fromtimestamp(50, timezone.utc)) + def test_given_small_slice_range_with_granularity_when_generate_slices_then_create_many_slices(self): + start = datetime.fromtimestamp(1, timezone.utc) + small_slice_range = timedelta(seconds=10) + granularity = timedelta(seconds=1) + cursor = ConcurrentCursor( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "slices": [ + {EpochValueConcurrentStreamStateConverter.START_KEY: 1, EpochValueConcurrentStreamStateConverter.END_KEY: 20}, + ], + }, + self._message_repository, + self._state_manager, + EpochValueConcurrentStreamStateConverter(is_sequential_state=False), + CursorField(_A_CURSOR_FIELD_KEY), + _SLICE_BOUNDARY_FIELDS, + start, + EpochValueConcurrentStreamStateConverter.get_end_provider(), + _NO_LOOKBACK_WINDOW, + small_slice_range, + granularity, + ) + + slices = list(cursor.generate_slices()) + + assert slices == [ + (datetime.fromtimestamp(20, timezone.utc), datetime.fromtimestamp(29, timezone.utc)), + (datetime.fromtimestamp(30, timezone.utc), datetime.fromtimestamp(39, 
timezone.utc)), + (datetime.fromtimestamp(40, timezone.utc), datetime.fromtimestamp(50, timezone.utc)), + ] + + @freezegun.freeze_time(time_to_freeze=datetime.fromtimestamp(50, timezone.utc)) + def test_given_difference_between_slices_match_slice_range_and_cursor_granularity_when_generate_slices_then_create_one_slice(self): + start = datetime.fromtimestamp(1, timezone.utc) + small_slice_range = timedelta(seconds=10) + granularity = timedelta(seconds=1) + cursor = ConcurrentCursor( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "slices": [ + {EpochValueConcurrentStreamStateConverter.START_KEY: 1, EpochValueConcurrentStreamStateConverter.END_KEY: 30}, + {EpochValueConcurrentStreamStateConverter.START_KEY: 41, EpochValueConcurrentStreamStateConverter.END_KEY: 50}, + ], + }, + self._message_repository, + self._state_manager, + EpochValueConcurrentStreamStateConverter(is_sequential_state=False), + CursorField(_A_CURSOR_FIELD_KEY), + _SLICE_BOUNDARY_FIELDS, + start, + EpochValueConcurrentStreamStateConverter.get_end_provider(), + _NO_LOOKBACK_WINDOW, + small_slice_range, + granularity, + ) + + slices = list(cursor.generate_slices()) + + assert slices == [ + (datetime.fromtimestamp(31, timezone.utc), datetime.fromtimestamp(40, timezone.utc)), # FIXME there should probably be the granularity at the beginning too + ] + + @freezegun.freeze_time(time_to_freeze=datetime.fromtimestamp(50, timezone.utc)) + def test_given_non_continuous_state_when_generate_slices_then_create_slices_between_gaps_and_after(self): + cursor = ConcurrentCursor( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "slices": [ + {EpochValueConcurrentStreamStateConverter.START_KEY: 0, EpochValueConcurrentStreamStateConverter.END_KEY: 10}, + {EpochValueConcurrentStreamStateConverter.START_KEY: 20, EpochValueConcurrentStreamStateConverter.END_KEY: 25}, + {EpochValueConcurrentStreamStateConverter.START_KEY: 30, EpochValueConcurrentStreamStateConverter.END_KEY: 40}, + ], + }, + self._message_repository, + self._state_manager, + EpochValueConcurrentStreamStateConverter(is_sequential_state=False), + CursorField(_A_CURSOR_FIELD_KEY), + _SLICE_BOUNDARY_FIELDS, + None, + EpochValueConcurrentStreamStateConverter.get_end_provider(), + _NO_LOOKBACK_WINDOW, + ) + + slices = list(cursor.generate_slices()) + + assert slices == [ + (datetime.fromtimestamp(10, timezone.utc), datetime.fromtimestamp(20, timezone.utc)), + (datetime.fromtimestamp(25, timezone.utc), datetime.fromtimestamp(30, timezone.utc)), + (datetime.fromtimestamp(40, timezone.utc), datetime.fromtimestamp(50, timezone.utc)), + ] + + @freezegun.freeze_time(time_to_freeze=datetime.fromtimestamp(50, timezone.utc)) + def test_given_lookback_window_when_generate_slices_then_apply_lookback_on_most_recent_slice(self): + start = datetime.fromtimestamp(0, timezone.utc) + lookback_window = timedelta(seconds=10) + cursor = ConcurrentCursor( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "slices": [ + {EpochValueConcurrentStreamStateConverter.START_KEY: 0, EpochValueConcurrentStreamStateConverter.END_KEY: 20}, + {EpochValueConcurrentStreamStateConverter.START_KEY: 30, EpochValueConcurrentStreamStateConverter.END_KEY: 40}, + ], + }, + self._message_repository, + self._state_manager, + EpochValueConcurrentStreamStateConverter(is_sequential_state=False), + CursorField(_A_CURSOR_FIELD_KEY), + 
_SLICE_BOUNDARY_FIELDS, + start, + EpochValueConcurrentStreamStateConverter.get_end_provider(), + lookback_window, + ) + + slices = list(cursor.generate_slices()) + + assert slices == [ + (datetime.fromtimestamp(20, timezone.utc), datetime.fromtimestamp(30, timezone.utc)), + (datetime.fromtimestamp(30, timezone.utc), datetime.fromtimestamp(50, timezone.utc)), + ] + + @freezegun.freeze_time(time_to_freeze=datetime.fromtimestamp(50, timezone.utc)) + def test_given_start_is_before_first_slice_lower_boundary_when_generate_slices_then_generate_slice_before(self): + start = datetime.fromtimestamp(0, timezone.utc) + cursor = ConcurrentCursor( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "slices": [ + {EpochValueConcurrentStreamStateConverter.START_KEY: 10, EpochValueConcurrentStreamStateConverter.END_KEY: 20}, + ], + }, + self._message_repository, + self._state_manager, + EpochValueConcurrentStreamStateConverter(is_sequential_state=False), + CursorField(_A_CURSOR_FIELD_KEY), + _SLICE_BOUNDARY_FIELDS, + start, + EpochValueConcurrentStreamStateConverter.get_end_provider(), + _NO_LOOKBACK_WINDOW, + ) + + slices = list(cursor.generate_slices()) + + assert slices == [ + (datetime.fromtimestamp(0, timezone.utc), datetime.fromtimestamp(10, timezone.utc)), + (datetime.fromtimestamp(20, timezone.utc), datetime.fromtimestamp(50, timezone.utc)), + ] + + def test_slices_with_records_when_close_then_most_recent_cursor_value_from_most_recent_slice(self) -> None: + cursor = self._cursor_with_slice_boundary_fields(is_sequential_state=False) + first_partition = _partition({_LOWER_SLICE_BOUNDARY_FIELD: 0, _UPPER_SLICE_BOUNDARY_FIELD: 10}) + second_partition = _partition({_LOWER_SLICE_BOUNDARY_FIELD: 10, _UPPER_SLICE_BOUNDARY_FIELD: 20}) + cursor.observe(_record(5, partition=first_partition)) + cursor.close_partition(first_partition) + + cursor.observe(_record(15, partition=second_partition)) + cursor.close_partition(second_partition) + + assert self._state_manager.update_state_for_stream.call_args_list[-1].args[2] == { + "slices": [ + {"end": 20, "start": 0, "most_recent_cursor_value": 15} + ], + "state_type": "date-range", + } + + def test_last_slice_without_records_when_close_then_most_recent_cursor_value_is_from_previous_slice(self) -> None: + cursor = self._cursor_with_slice_boundary_fields(is_sequential_state=False) + first_partition = _partition({_LOWER_SLICE_BOUNDARY_FIELD: 0, _UPPER_SLICE_BOUNDARY_FIELD: 10}) + second_partition = _partition({_LOWER_SLICE_BOUNDARY_FIELD: 10, _UPPER_SLICE_BOUNDARY_FIELD: 20}) + cursor.observe(_record(5, partition=first_partition)) + cursor.close_partition(first_partition) + + cursor.close_partition(second_partition) + + assert self._state_manager.update_state_for_stream.call_args_list[-1].args[2] == { + "slices": [ + {"end": 20, "start": 0, "most_recent_cursor_value": 5} + ], + "state_type": "date-range", + } + + def test_most_recent_cursor_value_outside_of_boundaries_when_close_then_most_recent_cursor_value_still_considered(self) -> None: + """ + Not sure what is the value of this behavior but I'm simply documenting how it is today + """ + cursor = self._cursor_with_slice_boundary_fields(is_sequential_state=False) + partition = _partition({_LOWER_SLICE_BOUNDARY_FIELD: 0, _UPPER_SLICE_BOUNDARY_FIELD: 10}) + cursor.observe(_record(15, partition=partition)) + cursor.close_partition(partition) + + assert self._state_manager.update_state_for_stream.call_args_list[-1].args[2] == { + "slices": [ + {"end": 10, 
"start": 0, "most_recent_cursor_value": 15} + ], + "state_type": "date-range", + } + + def test_most_recent_cursor_value_on_sequential_state_when_close_then_cursor_value_is_most_recent_cursor_value(self) -> None: + cursor = self._cursor_with_slice_boundary_fields(is_sequential_state=True) + partition = _partition({_LOWER_SLICE_BOUNDARY_FIELD: 0, _UPPER_SLICE_BOUNDARY_FIELD: 10}) + cursor.observe(_record(7, partition=partition)) + cursor.close_partition(partition) + + assert self._state_manager.update_state_for_stream.call_args_list[-1].args[2] == { + _A_CURSOR_FIELD_KEY: 7 + } + + def test_non_continuous_slices_on_sequential_state_when_close_then_cursor_value_is_most_recent_cursor_value_of_first_slice(self) -> None: + cursor = self._cursor_with_slice_boundary_fields(is_sequential_state=True) + first_partition = _partition({_LOWER_SLICE_BOUNDARY_FIELD: 0, _UPPER_SLICE_BOUNDARY_FIELD: 10}) + third_partition = _partition({_LOWER_SLICE_BOUNDARY_FIELD: 20, _UPPER_SLICE_BOUNDARY_FIELD: 30}) # second partition has failed + cursor.observe(_record(7, partition=first_partition)) + cursor.close_partition(first_partition) + + cursor.close_partition(third_partition) + + assert self._state_manager.update_state_for_stream.call_args_list[-1].args[2] == { + _A_CURSOR_FIELD_KEY: 7 + } + + @freezegun.freeze_time(time_to_freeze=datetime.fromtimestamp(10, timezone.utc)) + def test_given_overflowing_slice_gap_when_generate_slices_then_cap_upper_bound_to_end_provider(self) -> None: + a_very_big_slice_range = timedelta.max + cursor = ConcurrentCursor( + _A_STREAM_NAME, + _A_STREAM_NAMESPACE, + {_A_CURSOR_FIELD_KEY: 0}, + self._message_repository, + self._state_manager, + EpochValueConcurrentStreamStateConverter(False), + CursorField(_A_CURSOR_FIELD_KEY), + _SLICE_BOUNDARY_FIELDS, + None, + EpochValueConcurrentStreamStateConverter.get_end_provider(), + _NO_LOOKBACK_WINDOW, + slice_range=a_very_big_slice_range, + ) + + slices = list(cursor.generate_slices()) + + assert slices == [(datetime.fromtimestamp(0, timezone.utc), datetime.fromtimestamp(10, timezone.utc))] + + +@freezegun.freeze_time(time_to_freeze=datetime(2024, 4, 1, 0, 0, 0, 0, tzinfo=timezone.utc)) +@pytest.mark.parametrize( + "start_datetime,end_datetime,step,cursor_field,lookback_window,state,expected_slices", + [ + pytest.param( + "{{ config.start_time }}", + "{{ config.end_time or now_utc() }}", + "P10D", + "updated_at", + "P5D", + {}, + [ + (datetime(2024, 1, 1, 0, 0, tzinfo=timezone.utc), datetime(2024, 1, 10, 23, 59, 59, tzinfo=timezone.utc)), + (datetime(2024, 1, 11, 0, 0, tzinfo=timezone.utc), datetime(2024, 1, 20, 23, 59, 59, tzinfo=timezone.utc)), + (datetime(2024, 1, 21, 0, 0, tzinfo=timezone.utc), datetime(2024, 1, 30, 23, 59, 59, tzinfo=timezone.utc)), + (datetime(2024, 1, 31, 0, 0, tzinfo=timezone.utc), datetime(2024, 2, 9, 23, 59, 59, tzinfo=timezone.utc)), + (datetime(2024, 2, 10, 0, 0, tzinfo=timezone.utc), datetime(2024, 2, 19, 23, 59, 59, tzinfo=timezone.utc)), + (datetime(2024, 2, 20, 0, 0, tzinfo=timezone.utc), datetime(2024, 3, 1, 0, 0, 0, tzinfo=timezone.utc)) + ], + id="test_datetime_based_cursor_all_fields", + ), + pytest.param( + "{{ config.start_time }}", + "{{ config.end_time or '2024-01-01T00:00:00.000000+0000' }}", + "P10D", + "updated_at", + "P5D", + { + "slices": [ + { + "start": "2024-01-01T00:00:00.000000+0000", + "end": "2024-02-10T00:00:00.000000+0000", + } + ], + "state_type": "date-range" + }, + [ + (datetime(2024, 2, 5, 0, 0, 0, tzinfo=timezone.utc), datetime(2024, 2, 14, 23, 59, 59, tzinfo=timezone.utc)), + 
(datetime(2024, 2, 15, 0, 0, 0, tzinfo=timezone.utc), datetime(2024, 2, 24, 23, 59, 59, tzinfo=timezone.utc)), + (datetime(2024, 2, 25, 0, 0, 0, tzinfo=timezone.utc), datetime(2024, 3, 1, 0, 0, 0, tzinfo=timezone.utc)) + ], + id="test_datetime_based_cursor_with_state", + ), + pytest.param( + "{{ config.start_time }}", + "{{ config.missing or now_utc().strftime('%Y-%m-%dT%H:%M:%S.%fZ') }}", + "P20D", + "updated_at", + "P1D", + { + "slices": [ + { + "start": "2024-01-01T00:00:00.000000+0000", + "end": "2024-01-21T00:00:00.000000+0000", + } + ], + "state_type": "date-range" + }, + [ + (datetime(2024, 1, 20, 0, 0, tzinfo=timezone.utc), datetime(2024, 2, 8, 23, 59, 59, tzinfo=timezone.utc)), + (datetime(2024, 2, 9, 0, 0, tzinfo=timezone.utc), datetime(2024, 2, 28, 23, 59, 59, tzinfo=timezone.utc)), + (datetime(2024, 2, 29, 0, 0, tzinfo=timezone.utc), datetime(2024, 3, 19, 23, 59, 59, tzinfo=timezone.utc)), + (datetime(2024, 3, 20, 0, 0, tzinfo=timezone.utc), datetime(2024, 4, 1, 0, 0, 0, tzinfo=timezone.utc)), + ], + id="test_datetime_based_cursor_with_state_and_end_date", + ), + pytest.param( + "{{ config.start_time }}", + "{{ config.end_time }}", + "P1M", + "updated_at", + "P5D", + {}, + [ + (datetime(2024, 1, 1, 0, 0, 0, tzinfo=timezone.utc), datetime(2024, 1, 31, 23, 59, 59, tzinfo=timezone.utc)), + (datetime(2024, 2, 1, 0, 0, 0, tzinfo=timezone.utc), datetime(2024, 3, 1, 0, 0, 0, tzinfo=timezone.utc)), + ], + id="test_datetime_based_cursor_using_large_step_duration", + ), + ] +) +def test_generate_slices_concurrent_cursor_from_datetime_based_cursor( + start_datetime, + end_datetime, + step, + cursor_field, + lookback_window, + state, + expected_slices, +): + message_repository = Mock(spec=MessageRepository) + state_manager = Mock(spec=ConnectorStateManager) + + config = { + "start_time": "2024-01-01T00:00:00.000000+0000", + "end_time": "2024-03-01T00:00:00.000000+0000", + } + + datetime_based_cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime=start_datetime, parameters={}), + end_datetime=MinMaxDatetime(datetime=end_datetime, parameters={}), + step=step, + cursor_field=cursor_field, + partition_field_start="start", + partition_field_end="end", + datetime_format="%Y-%m-%dT%H:%M:%S.%f%z", + cursor_granularity="PT1S", + lookback_window=lookback_window, + is_compare_strictly=True, + config=config, + parameters={}, + ) + + # I don't love that we're back to this inching close to interpolation at parse time instead of runtime + # We also might need to add a wrapped class that exposes these fields publicly or live with ugly private access + interpolated_state_date = datetime_based_cursor._start_datetime + start_date = interpolated_state_date.get_datetime(config=config) + + interpolated_end_date = datetime_based_cursor._end_datetime + interpolated_end_date_provider = partial(interpolated_end_date.get_datetime, config) + + interpolated_cursor_field = datetime_based_cursor.cursor_field + cursor_field = CursorField(cursor_field_key=interpolated_cursor_field.eval(config=config)) + + lower_slice_boundary = datetime_based_cursor._partition_field_start.eval(config=config) + upper_slice_boundary = datetime_based_cursor._partition_field_end.eval(config=config) + slice_boundary_fields = (lower_slice_boundary, upper_slice_boundary) + + # DatetimeBasedCursor returns an isodate.Duration if step uses month or year precision. This still works in our + # code, but mypy may complain when we actually implement this in the concurrent low-code source. 
To fix this, we + # may need to convert a Duration to timedelta by multiplying month by 30 (but could lose precision). + step_length = datetime_based_cursor._step + + lookback_window = parse_duration(datetime_based_cursor.lookback_window) if datetime_based_cursor.lookback_window else None + + cursor_granularity = parse_duration(datetime_based_cursor.cursor_granularity) if datetime_based_cursor.cursor_granularity else None + + cursor = ConcurrentCursor( + stream_name=_A_STREAM_NAME, + stream_namespace=_A_STREAM_NAMESPACE, + stream_state=state, + message_repository=message_repository, + connector_state_manager=state_manager, + connector_state_converter=IsoMillisConcurrentStreamStateConverter(is_sequential_state=True), + cursor_field=cursor_field, + slice_boundary_fields=slice_boundary_fields, + start=start_date, + end_provider=interpolated_end_date_provider, + lookback_window=lookback_window, + slice_range=step_length, + cursor_granularity=cursor_granularity, + ) + + actual_slices = list(cursor.generate_slices()) + assert actual_slices == expected_slices + + +@freezegun.freeze_time(time_to_freeze=datetime(2024, 9, 1, 0, 0, 0, 0, tzinfo=timezone.utc)) +def test_observe_concurrent_cursor_from_datetime_based_cursor(): + message_repository = Mock(spec=MessageRepository) + state_manager = Mock(spec=ConnectorStateManager) + + config = { + "start_time": "2024-08-01T00:00:00.000000+0000", + "dynamic_cursor_key": "updated_at" + } + + datetime_based_cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="{{ config.start_time }}", parameters={}), + cursor_field="{{ config.dynamic_cursor_key }}", + datetime_format="%Y-%m-%dT%H:%M:%S.%f%z", + config=config, + parameters={}, + ) + + interpolated_state_date = datetime_based_cursor._start_datetime + start_date = interpolated_state_date.get_datetime(config=config) + + interpolated_cursor_field = datetime_based_cursor.cursor_field + cursor_field = CursorField(cursor_field_key=interpolated_cursor_field.eval(config=config)) + + step_length = datetime_based_cursor._step + + concurrent_cursor = ConcurrentCursor( + stream_name="gods", + stream_namespace=_A_STREAM_NAMESPACE, + stream_state={}, + message_repository=message_repository, + connector_state_manager=state_manager, + connector_state_converter=IsoMillisConcurrentStreamStateConverter(is_sequential_state=True), + cursor_field=cursor_field, + slice_boundary_fields=None, + start=start_date, + end_provider=IsoMillisConcurrentStreamStateConverter.get_end_provider(), + slice_range=step_length, + ) + + partition = _partition( + {_LOWER_SLICE_BOUNDARY_FIELD: "2024-08-01T00:00:00.000000+0000", _UPPER_SLICE_BOUNDARY_FIELD: "2024-09-01T00:00:00.000000+0000"}, + _stream_name="gods", + ) + + record_1 = Record( + partition=partition, data={"id": "999", "updated_at": "2024-08-23T00:00:00.000000+0000", "name": "kratos", "mythology": "greek"}, + ) + record_2 = Record( + partition=partition, data={"id": "1000", "updated_at": "2024-08-22T00:00:00.000000+0000", "name": "odin", "mythology": "norse"}, + ) + record_3 = Record( + partition=partition, data={"id": "500", "updated_at": "2024-08-24T00:00:00.000000+0000", "name": "freya", "mythology": "norse"}, + ) + + concurrent_cursor.observe(record_1) + actual_most_recent_record = concurrent_cursor._most_recent_cursor_value_per_partition[partition] + assert actual_most_recent_record == concurrent_cursor._extract_cursor_value(record_1) + + concurrent_cursor.observe(record_2) + actual_most_recent_record = 
concurrent_cursor._most_recent_cursor_value_per_partition[partition] + assert actual_most_recent_record == concurrent_cursor._extract_cursor_value(record_1) + + concurrent_cursor.observe(record_3) + actual_most_recent_record = concurrent_cursor._most_recent_cursor_value_per_partition[partition] + assert actual_most_recent_record == concurrent_cursor._extract_cursor_value(record_3) + + +@freezegun.freeze_time(time_to_freeze=datetime(2024, 9, 1, 0, 0, 0, 0, tzinfo=timezone.utc)) +def test_close_partition_concurrent_cursor_from_datetime_based_cursor(): + message_repository = Mock(spec=MessageRepository) + state_manager = Mock(spec=ConnectorStateManager) + + config = { + "start_time": "2024-08-01T00:00:00.000000+0000", + "dynamic_cursor_key": "updated_at" + } + + datetime_based_cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="{{ config.start_time }}", parameters={}), + cursor_field="{{ config.dynamic_cursor_key }}", + datetime_format="%Y-%m-%dT%H:%M:%S.%f%z", + config=config, + parameters={}, + ) + + interpolated_state_date = datetime_based_cursor._start_datetime + start_date = interpolated_state_date.get_datetime(config=config) + + interpolated_cursor_field = datetime_based_cursor.cursor_field + cursor_field = CursorField(cursor_field_key=interpolated_cursor_field.eval(config=config)) + + step_length = datetime_based_cursor._step + + concurrent_cursor = ConcurrentCursor( + stream_name="gods", + stream_namespace=_A_STREAM_NAMESPACE, + stream_state={}, + message_repository=message_repository, + connector_state_manager=state_manager, + connector_state_converter=IsoMillisConcurrentStreamStateConverter(is_sequential_state=False), + cursor_field=cursor_field, + slice_boundary_fields=None, + start=start_date, + end_provider=IsoMillisConcurrentStreamStateConverter.get_end_provider(), + slice_range=step_length, + ) + + partition = _partition( + {_LOWER_SLICE_BOUNDARY_FIELD: "2024-08-01T00:00:00.000000+0000", _UPPER_SLICE_BOUNDARY_FIELD: "2024-09-01T00:00:00.000000+0000"}, + _stream_name="gods", + ) + + record_1 = Record( + partition=partition, data={"id": "999", "updated_at": "2024-08-23T00:00:00.000000+0000", "name": "kratos", "mythology": "greek"}, + ) + concurrent_cursor.observe(record_1) + + concurrent_cursor.close_partition(partition) + + message_repository.emit_message.assert_called_once_with(state_manager.create_state_message.return_value) + state_manager.update_state_for_stream.assert_called_once_with( + "gods", + _A_STREAM_NAMESPACE, + { + "slices": [{"end": "2024-08-23T00:00:00.000Z", "start": "2024-08-01T00:00:00.000Z", "most_recent_cursor_value": "2024-08-23T00:00:00.000Z"}], + "state_type": "date-range" + }, + ) + + +@freezegun.freeze_time(time_to_freeze=datetime(2024, 9, 1, 0, 0, 0, 0, tzinfo=timezone.utc)) +def test_close_partition_with_slice_range_concurrent_cursor_from_datetime_based_cursor(): + message_repository = Mock(spec=MessageRepository) + state_manager = Mock(spec=ConnectorStateManager) + + config = { + "start_time": "2024-07-01T00:00:00.000000+0000", + "dynamic_cursor_key": "updated_at" + } + + datetime_based_cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="{{ config.start_time }}", parameters={}), + cursor_field="{{ config.dynamic_cursor_key }}", + datetime_format="%Y-%m-%dT%H:%M:%S.%f%z", + step="P15D", + cursor_granularity="P1D", + config=config, + parameters={}, + ) + + interpolated_state_date = datetime_based_cursor._start_datetime + start_date = interpolated_state_date.get_datetime(config=config) + + 
interpolated_cursor_field = datetime_based_cursor.cursor_field + cursor_field = CursorField(cursor_field_key=interpolated_cursor_field.eval(config=config)) + + lower_slice_boundary = datetime_based_cursor._partition_field_start.eval(config=config) + upper_slice_boundary = datetime_based_cursor._partition_field_end.eval(config=config) + slice_boundary_fields = (lower_slice_boundary, upper_slice_boundary) + + step_length = datetime_based_cursor._step + + concurrent_cursor = ConcurrentCursor( + stream_name="gods", + stream_namespace=_A_STREAM_NAMESPACE, + stream_state={}, + message_repository=message_repository, + connector_state_manager=state_manager, + connector_state_converter=IsoMillisConcurrentStreamStateConverter(is_sequential_state=False, cursor_granularity=None), + cursor_field=cursor_field, + slice_boundary_fields=slice_boundary_fields, + start=start_date, + slice_range=step_length, + cursor_granularity=None, + end_provider=IsoMillisConcurrentStreamStateConverter.get_end_provider(), + ) + + partition_0 = _partition( + {"start_time": "2024-07-01T00:00:00.000000+0000", "end_time": "2024-07-16T00:00:00.000000+0000"}, _stream_name="gods", + ) + partition_3 = _partition( + {"start_time": "2024-08-15T00:00:00.000000+0000", "end_time": "2024-08-30T00:00:00.000000+0000"}, _stream_name="gods", + ) + record_1 = Record( + partition=partition_0, data={"id": "1000", "updated_at": "2024-07-05T00:00:00.000000+0000", "name": "loki", "mythology": "norse"}, + ) + record_2 = Record( + partition=partition_3, data={"id": "999", "updated_at": "2024-08-20T00:00:00.000000+0000", "name": "kratos", "mythology": "greek"}, + ) + + concurrent_cursor.observe(record_1) + concurrent_cursor.close_partition(partition_0) + concurrent_cursor.observe(record_2) + concurrent_cursor.close_partition(partition_3) + + message_repository.emit_message.assert_called_with(state_manager.create_state_message.return_value) + assert message_repository.emit_message.call_count == 2 + state_manager.update_state_for_stream.assert_called_with( + "gods", + _A_STREAM_NAMESPACE, + { + "slices": [ + {"start": "2024-07-01T00:00:00.000Z", "end": "2024-07-16T00:00:00.000Z", "most_recent_cursor_value": "2024-07-05T00:00:00.000Z"}, + {"start": "2024-08-15T00:00:00.000Z", "end": "2024-08-30T00:00:00.000Z", "most_recent_cursor_value": "2024-08-20T00:00:00.000Z"}, + ], + "state_type": "date-range" + }, + ) + assert state_manager.update_state_for_stream.call_count == 2 + + +@freezegun.freeze_time(time_to_freeze=datetime(2024, 9, 1, 0, 0, 0, 0, tzinfo=timezone.utc)) +def test_close_partition_with_slice_range_granularity_concurrent_cursor_from_datetime_based_cursor(): + message_repository = Mock(spec=MessageRepository) + state_manager = Mock(spec=ConnectorStateManager) + + config = { + "start_time": "2024-07-01T00:00:00.000000+0000", + "dynamic_cursor_key": "updated_at" + } + + datetime_based_cursor = DatetimeBasedCursor( + start_datetime=MinMaxDatetime(datetime="{{ config.start_time }}", parameters={}), + cursor_field="{{ config.dynamic_cursor_key }}", + datetime_format="%Y-%m-%dT%H:%M:%S.%f%z", + step="P15D", + cursor_granularity="P1D", + config=config, + parameters={}, + ) + + interpolated_state_date = datetime_based_cursor._start_datetime + start_date = interpolated_state_date.get_datetime(config=config) + + interpolated_cursor_field = datetime_based_cursor.cursor_field + cursor_field = CursorField(cursor_field_key=interpolated_cursor_field.eval(config=config)) + + lower_slice_boundary = 
datetime_based_cursor._partition_field_start.eval(config=config) + upper_slice_boundary = datetime_based_cursor._partition_field_end.eval(config=config) + slice_boundary_fields = (lower_slice_boundary, upper_slice_boundary) + + step_length = datetime_based_cursor._step + + cursor_granularity = parse_duration(datetime_based_cursor.cursor_granularity) if datetime_based_cursor.cursor_granularity else None + + concurrent_cursor = ConcurrentCursor( + stream_name="gods", + stream_namespace=_A_STREAM_NAMESPACE, + stream_state={}, + message_repository=message_repository, + connector_state_manager=state_manager, + connector_state_converter=IsoMillisConcurrentStreamStateConverter(is_sequential_state=False, cursor_granularity=cursor_granularity), + cursor_field=cursor_field, + slice_boundary_fields=slice_boundary_fields, + start=start_date, + slice_range=step_length, + cursor_granularity=cursor_granularity, + end_provider=IsoMillisConcurrentStreamStateConverter.get_end_provider(), + ) + + partition_0 = _partition( + {"start_time": "2024-07-01T00:00:00.000000+0000", "end_time": "2024-07-15T00:00:00.000000+0000"}, _stream_name="gods", + ) + partition_1 = _partition( + {"start_time": "2024-07-16T00:00:00.000000+0000", "end_time": "2024-07-31T00:00:00.000000+0000"}, _stream_name="gods", + ) + partition_3 = _partition( + {"start_time": "2024-08-15T00:00:00.000000+0000", "end_time": "2024-08-29T00:00:00.000000+0000"}, _stream_name="gods", + ) + record_1 = Record( + partition=partition_0, data={"id": "1000", "updated_at": "2024-07-05T00:00:00.000000+0000", "name": "loki", "mythology": "norse"}, + ) + record_2 = Record( + partition=partition_1, data={"id": "2000", "updated_at": "2024-07-25T00:00:00.000000+0000", "name": "freya", "mythology": "norse"}, + ) + record_3 = Record( + partition=partition_3, data={"id": "999", "updated_at": "2024-08-20T00:00:00.000000+0000", "name": "kratos", "mythology": "greek"}, + ) + + concurrent_cursor.observe(record_1) + concurrent_cursor.close_partition(partition_0) + concurrent_cursor.observe(record_2) + concurrent_cursor.close_partition(partition_1) + concurrent_cursor.observe(record_3) + concurrent_cursor.close_partition(partition_3) + + message_repository.emit_message.assert_called_with(state_manager.create_state_message.return_value) + assert message_repository.emit_message.call_count == 3 + state_manager.update_state_for_stream.assert_called_with( + "gods", + _A_STREAM_NAMESPACE, + { + "slices": [ + {"start": "2024-07-01T00:00:00.000Z", "end": "2024-07-31T00:00:00.000Z", "most_recent_cursor_value": "2024-07-25T00:00:00.000Z"}, + {"start": "2024-08-15T00:00:00.000Z", "end": "2024-08-29T00:00:00.000Z", "most_recent_cursor_value": "2024-08-20T00:00:00.000Z"} + + ], + "state_type": "date-range" + }, + ) + assert state_manager.update_state_for_stream.call_count == 3 diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_datetime_state_converter.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_datetime_state_converter.py new file mode 100644 index 000000000000..d139656f8212 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_datetime_state_converter.py @@ -0,0 +1,390 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from datetime import datetime, timezone + +import pytest +from airbyte_cdk.sources.streams.concurrent.cursor import CursorField +from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import ConcurrencyCompatibleStateType +from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( + CustomFormatConcurrentStreamStateConverter, + EpochValueConcurrentStreamStateConverter, + IsoMillisConcurrentStreamStateConverter, +) + + +@pytest.mark.parametrize( + "converter, input_state, is_compatible", + [ + pytest.param( + EpochValueConcurrentStreamStateConverter(), + {"state_type": "date-range"}, + True, + id="no-input-state-is-compatible-epoch", + ), + pytest.param( + EpochValueConcurrentStreamStateConverter(), + { + "created_at": "2022_05_22", + "state_type": ConcurrencyCompatibleStateType.date_range.value, + }, + True, + id="input-state-with-date_range-is-compatible-epoch", + ), + pytest.param( + EpochValueConcurrentStreamStateConverter(), + { + "created_at": "2022_05_22", + "state_type": "fake", + }, + False, + id="input-state-with-fake-state-type-is-not-compatible-epoch", + ), + pytest.param( + EpochValueConcurrentStreamStateConverter(), + { + "created_at": "2022_05_22", + }, + False, + id="input-state-without-state_type-is-not-compatible-epoch", + ), + pytest.param( + IsoMillisConcurrentStreamStateConverter(), + {"state_type": "date-range"}, + True, + id="no-input-state-is-compatible-isomillis", + ), + pytest.param( + IsoMillisConcurrentStreamStateConverter(), + { + "created_at": "2022_05_22", + "state_type": ConcurrencyCompatibleStateType.date_range.value, + }, + True, + id="input-state-with-date_range-is-compatible-isomillis", + ), + pytest.param( + IsoMillisConcurrentStreamStateConverter(), + { + "created_at": "2022_05_22", + "state_type": "fake", + }, + False, + id="input-state-with-fake-state-type-is-not-compatible-isomillis", + ), + pytest.param( + IsoMillisConcurrentStreamStateConverter(), + { + "created_at": "2022_05_22", + }, + False, + id="input-state-without-state_type-is-not-compatible-isomillis", + ), + ], +) +def test_concurrent_stream_state_converter_is_state_message_compatible(converter, input_state, is_compatible): + assert converter.is_state_message_compatible(input_state) == is_compatible + + +@pytest.mark.parametrize( + "converter,start,state,expected_start", + [ + pytest.param( + EpochValueConcurrentStreamStateConverter(), + None, + {}, + EpochValueConcurrentStreamStateConverter().zero_value, + id="epoch-converter-no-state-no-start-start-is-zero-value", + ), + pytest.param( + EpochValueConcurrentStreamStateConverter(), + datetime.fromtimestamp(1617030403, timezone.utc), + {}, + datetime(2021, 3, 29, 15, 6, 43, tzinfo=timezone.utc), + id="epoch-converter-no-state-with-start-start-is-start", + ), + pytest.param( + EpochValueConcurrentStreamStateConverter(), + None, + {"created_at": 1617030404}, + datetime(2021, 3, 29, 15, 6, 44, tzinfo=timezone.utc), + id="epoch-converter-state-without-start-start-is-from-state", + ), + pytest.param( + EpochValueConcurrentStreamStateConverter(), + datetime.fromtimestamp(1617030404, timezone.utc), + {"created_at": 1617030403}, + datetime(2021, 3, 29, 15, 6, 44, tzinfo=timezone.utc), + id="epoch-converter-state-before-start-start-is-start", + ), + pytest.param( + EpochValueConcurrentStreamStateConverter(), + datetime.fromtimestamp(1617030403, timezone.utc), + {"created_at": 1617030404}, + datetime(2021, 3, 29, 15, 6, 44, tzinfo=timezone.utc), + 
id="epoch-converter-state-after-start-start-is-from-state", + ), + pytest.param( + IsoMillisConcurrentStreamStateConverter(), + None, + {}, + IsoMillisConcurrentStreamStateConverter().zero_value, + id="isomillis-converter-no-state-no-start-start-is-zero-value", + ), + pytest.param( + IsoMillisConcurrentStreamStateConverter(), + datetime(2021, 8, 22, 5, 3, 27, tzinfo=timezone.utc), + {}, + datetime(2021, 8, 22, 5, 3, 27, tzinfo=timezone.utc), + id="isomillis-converter-no-state-with-start-start-is-start", + ), + pytest.param( + IsoMillisConcurrentStreamStateConverter(), + None, + {"created_at": "2021-08-22T05:03:27.000Z"}, + datetime(2021, 8, 22, 5, 3, 27, tzinfo=timezone.utc), + id="isomillis-converter-state-without-start-start-is-from-state", + ), + pytest.param( + IsoMillisConcurrentStreamStateConverter(), + datetime(2022, 8, 22, 5, 3, 27, tzinfo=timezone.utc), + {"created_at": "2021-08-22T05:03:27.000Z"}, + datetime(2022, 8, 22, 5, 3, 27, tzinfo=timezone.utc), + id="isomillis-converter-state-before-start-start-is-start", + ), + pytest.param( + IsoMillisConcurrentStreamStateConverter(), + datetime(2022, 8, 22, 5, 3, 27, tzinfo=timezone.utc), + {"created_at": "2023-08-22T05:03:27.000Z"}, + datetime(2023, 8, 22, 5, 3, 27, tzinfo=timezone.utc), + id="isomillis-converter-state-after-start-start-is-from-state", + ), + ], +) +def test_get_sync_start(converter, start, state, expected_start): + assert converter._get_sync_start(CursorField("created_at"), state, start) == expected_start + + +@pytest.mark.parametrize( + "converter, start, sequential_state, expected_output_state", + [ + pytest.param( + EpochValueConcurrentStreamStateConverter(), + datetime.fromtimestamp(0, timezone.utc), + {}, + { + "legacy": {}, + "slices": [ + { + "start": EpochValueConcurrentStreamStateConverter().zero_value, + "end": EpochValueConcurrentStreamStateConverter().zero_value, + } + ], + "state_type": "date-range", + }, + id="empty-input-state-epoch", + ), + pytest.param( + EpochValueConcurrentStreamStateConverter(), + datetime.fromtimestamp(1577836800, timezone.utc), + {"created": 1617030403}, + { + "state_type": "date-range", + "slices": [ + { + "start": datetime(2020, 1, 1, tzinfo=timezone.utc), + "end": datetime(2021, 3, 29, 15, 6, 43, tzinfo=timezone.utc), + } + ], + "legacy": {"created": 1617030403}, + }, + id="with-input-state-epoch", + ), + pytest.param( + IsoMillisConcurrentStreamStateConverter(), + datetime(2020, 1, 1, tzinfo=timezone.utc), + {"created": "2021-08-22T05:03:27.000Z"}, + { + "state_type": "date-range", + "slices": [ + { + "start": datetime(2020, 1, 1, tzinfo=timezone.utc), + "end": datetime(2021, 8, 22, 5, 3, 27, tzinfo=timezone.utc), + } + ], + "legacy": {"created": "2021-08-22T05:03:27.000Z"}, + }, + id="with-input-state-isomillis", + ), + ], +) +def test_convert_from_sequential_state(converter, start, sequential_state, expected_output_state): + comparison_format = "%Y-%m-%dT%H:%M:%S.%f" + if expected_output_state["slices"]: + _, conversion = converter.convert_from_sequential_state(CursorField("created"), sequential_state, start) + assert conversion["state_type"] == expected_output_state["state_type"] + assert conversion["legacy"] == expected_output_state["legacy"] + for actual, expected in zip(conversion["slices"], expected_output_state["slices"]): + assert actual["start"].strftime(comparison_format) == expected["start"].strftime(comparison_format) + assert actual["end"].strftime(comparison_format) == expected["end"].strftime(comparison_format) + else: + _, conversion = 
converter.convert_from_sequential_state(CursorField("created"), sequential_state, start) + assert conversion == expected_output_state + + +@pytest.mark.parametrize( + "converter, concurrent_state, expected_output_state", + [ + pytest.param( + EpochValueConcurrentStreamStateConverter(), + { + "state_type": "date-range", + "slices": [ + { + "start": datetime(1970, 1, 3, 0, 0, 0, tzinfo=timezone.utc), + "end": datetime(2021, 3, 29, 15, 6, 43, tzinfo=timezone.utc), + } + ], + }, + {"created": 172800}, + id="epoch-single-slice", + ), + pytest.param( + EpochValueConcurrentStreamStateConverter(), + { + "state_type": "date-range", + "slices": [ + { + "start": datetime(1970, 1, 3, 0, 0, 0, tzinfo=timezone.utc), + "end": datetime(2021, 3, 29, 15, 6, 43, tzinfo=timezone.utc), + }, + { + "start": datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + "end": datetime(2022, 3, 29, 15, 6, 43, tzinfo=timezone.utc), + }, + ], + }, + {"created": 172800}, + id="epoch-overlapping-slices", + ), + pytest.param( + EpochValueConcurrentStreamStateConverter(), + { + "state_type": "date-range", + "slices": [ + { + "start": datetime(1970, 1, 3, 0, 0, 0, tzinfo=timezone.utc), + "end": datetime(2021, 3, 29, 15, 6, 43, tzinfo=timezone.utc), + }, + { + "start": datetime(2022, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + "end": datetime(2023, 3, 29, 15, 6, 43, tzinfo=timezone.utc), + }, + ], + }, + {"created": 172800}, + id="epoch-multiple-slices", + ), + pytest.param( + IsoMillisConcurrentStreamStateConverter(), + { + "state_type": "date-range", + "slices": [ + { + "start": datetime(1970, 1, 3, 0, 0, 0, tzinfo=timezone.utc), + "end": datetime(2021, 3, 29, 15, 6, 43, tzinfo=timezone.utc), + } + ], + }, + {"created": "1970-01-03T00:00:00.000Z"}, + id="isomillis-single-slice", + ), + pytest.param( + IsoMillisConcurrentStreamStateConverter(), + { + "state_type": "date-range", + "slices": [ + { + "start": datetime(1970, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + "end": datetime(2021, 3, 29, 15, 6, 43, tzinfo=timezone.utc), + }, + { + "start": datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + "end": datetime(2022, 3, 29, 15, 6, 43, tzinfo=timezone.utc), + }, + ], + }, + {"created": "1970-01-01T00:00:00.000Z"}, + id="isomillis-overlapping-slices", + ), + pytest.param( + IsoMillisConcurrentStreamStateConverter(), + { + "state_type": "date-range", + "slices": [ + { + "start": datetime(1970, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + "end": datetime(2021, 3, 29, 15, 6, 43, tzinfo=timezone.utc), + }, + { + "start": datetime(2022, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + "end": datetime(2023, 3, 29, 15, 6, 43, tzinfo=timezone.utc), + }, + ], + }, + {"created": "1970-01-01T00:00:00.000Z"}, + id="isomillis-multiple-slices", + ), + ], +) +def test_convert_to_sequential_state(converter, concurrent_state, expected_output_state): + assert converter.convert_to_state_message(CursorField("created"), concurrent_state) == expected_output_state + + +@pytest.mark.parametrize( + "converter, concurrent_state, expected_output_state", + [ + pytest.param( + EpochValueConcurrentStreamStateConverter(), + { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "start": EpochValueConcurrentStreamStateConverter().zero_value, + }, + {"created": 0}, + id="empty-slices-epoch", + ), + pytest.param( + IsoMillisConcurrentStreamStateConverter(), + { + "state_type": ConcurrencyCompatibleStateType.date_range.value, + "start": datetime(2021, 8, 22, 5, 3, 27, tzinfo=timezone.utc), + }, + {"created": "2021-08-22T05:03:27.000Z"}, + id="empty-slices-isomillis", + ), + 
], +) +def test_convert_to_sequential_state_no_slices_returns_legacy_state(converter, concurrent_state, expected_output_state): + with pytest.raises(RuntimeError): + converter.convert_to_state_message(CursorField("created"), concurrent_state) + + +def test_given_multiple_input_datetime_format_when_parse_timestamp_then_iterate_until_successful_parsing(): + output_format = "%Y-%m-%dT%H:%M:%S" + input_formats = ["%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%d"] + converter = CustomFormatConcurrentStreamStateConverter(output_format, input_formats) + + parsed_datetime = converter.parse_timestamp("2024-01-01") + + assert parsed_datetime == datetime(2024, 1, 1, tzinfo=timezone.utc) + + +def test_given_when_parse_timestamp_then_eventually_fallback_on_output_format(): + output_format = "%Y-%m-%dT%H:%M:%S" + input_formats = ["%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%d"] + converter = CustomFormatConcurrentStreamStateConverter(output_format, input_formats) + + parsed_datetime = converter.parse_timestamp("2024-01-01T02:00:00") + + assert parsed_datetime == datetime(2024, 1, 1, 2, 0, 0, tzinfo=timezone.utc) diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_default_stream.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_default_stream.py new file mode 100644 index 000000000000..bb06a7b75e65 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_default_stream.py @@ -0,0 +1,198 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import unittest +from unittest.mock import Mock + +from airbyte_cdk.models import AirbyteStream, SyncMode +from airbyte_cdk.sources.message import InMemoryMessageRepository +from airbyte_cdk.sources.streams.concurrent.availability_strategy import STREAM_AVAILABLE +from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, FinalStateCursor +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream + + +class ThreadBasedConcurrentStreamTest(unittest.TestCase): + def setUp(self): + self._partition_generator = Mock() + self._name = "name" + self._json_schema = {} + self._availability_strategy = Mock() + self._primary_key = [] + self._cursor_field = None + self._logger = Mock() + self._cursor = Mock(spec=Cursor) + self._message_repository = InMemoryMessageRepository() + self._stream = DefaultStream( + self._partition_generator, + self._name, + self._json_schema, + self._availability_strategy, + self._primary_key, + self._cursor_field, + self._logger, + FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository), + ) + + def test_get_json_schema(self): + json_schema = self._stream.get_json_schema() + assert json_schema == self._json_schema + + def test_check_availability(self): + self._availability_strategy.check_availability.return_value = STREAM_AVAILABLE + availability = self._stream.check_availability() + assert availability == STREAM_AVAILABLE + self._availability_strategy.check_availability.assert_called_once_with(self._logger) + + def test_check_for_error_raises_an_exception_if_any_of_the_futures_are_not_done(self): + futures = [Mock() for _ in range(3)] + for f in futures: + f.exception.return_value = None + futures[0].done.return_value = False + + with self.assertRaises(Exception): + self._stream._check_for_errors(futures) + + def test_check_for_error_raises_an_exception_if_any_of_the_futures_raised_an_exception(self): + futures = [Mock() for _ in range(3)] + for f in futures: + f.exception.return_value = None + 
futures[0].exception.return_value = Exception("error") + + with self.assertRaises(Exception): + self._stream._check_for_errors(futures) + + def test_as_airbyte_stream(self): + expected_airbyte_stream = AirbyteStream( + name=self._name, + json_schema=self._json_schema, + supported_sync_modes=[SyncMode.full_refresh], + source_defined_cursor=None, + default_cursor_field=None, + source_defined_primary_key=None, + namespace=None, + ) + actual_airbyte_stream = self._stream.as_airbyte_stream() + + assert actual_airbyte_stream == expected_airbyte_stream + + def test_as_airbyte_stream_with_primary_key(self): + json_schema = { + "type": "object", + "properties": { + "id_a": {"type": ["null", "string"]}, + "id_b": {"type": ["null", "string"]}, + }, + } + stream = DefaultStream( + self._partition_generator, + self._name, + json_schema, + self._availability_strategy, + ["composite_key_1", "composite_key_2"], + self._cursor_field, + self._logger, + FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository), + ) + + expected_airbyte_stream = AirbyteStream( + name=self._name, + json_schema=json_schema, + supported_sync_modes=[SyncMode.full_refresh], + source_defined_cursor=None, + default_cursor_field=None, + source_defined_primary_key=[["composite_key_1"], ["composite_key_2"]], + namespace=None, + ) + + airbyte_stream = stream.as_airbyte_stream() + assert airbyte_stream == expected_airbyte_stream + + def test_as_airbyte_stream_with_composite_primary_key(self): + json_schema = { + "type": "object", + "properties": { + "id_a": {"type": ["null", "string"]}, + "id_b": {"type": ["null", "string"]}, + }, + } + stream = DefaultStream( + self._partition_generator, + self._name, + json_schema, + self._availability_strategy, + ["id_a", "id_b"], + self._cursor_field, + self._logger, + FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository), + ) + + expected_airbyte_stream = AirbyteStream( + name=self._name, + json_schema=json_schema, + supported_sync_modes=[SyncMode.full_refresh], + source_defined_cursor=None, + default_cursor_field=None, + source_defined_primary_key=[["id_a"], ["id_b"]], + namespace=None, + ) + + airbyte_stream = stream.as_airbyte_stream() + assert airbyte_stream == expected_airbyte_stream + + def test_as_airbyte_stream_with_a_cursor(self): + json_schema = { + "type": "object", + "properties": { + "id": {"type": ["null", "string"]}, + "date": {"type": ["null", "string"]}, + }, + } + stream = DefaultStream( + self._partition_generator, + self._name, + json_schema, + self._availability_strategy, + self._primary_key, + "date", + self._logger, + FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository), + ) + + expected_airbyte_stream = AirbyteStream( + name=self._name, + json_schema=json_schema, + supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], + source_defined_cursor=True, + default_cursor_field=["date"], + source_defined_primary_key=None, + namespace=None, + is_resumable=True, + ) + + airbyte_stream = stream.as_airbyte_stream() + assert airbyte_stream == expected_airbyte_stream + + def test_as_airbyte_stream_with_namespace(self): + stream = DefaultStream( + self._partition_generator, + self._name, + self._json_schema, + self._availability_strategy, + self._primary_key, + self._cursor_field, + self._logger, + FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository), + 
namespace="test", + ) + expected_airbyte_stream = AirbyteStream( + name=self._name, + json_schema=self._json_schema, + supported_sync_modes=[SyncMode.full_refresh], + source_defined_cursor=None, + default_cursor_field=None, + source_defined_primary_key=None, + namespace="test", + ) + actual_airbyte_stream = stream.as_airbyte_stream() + + assert actual_airbyte_stream == expected_airbyte_stream diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_partition_enqueuer.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_partition_enqueuer.py new file mode 100644 index 000000000000..da67ff82588d --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_partition_enqueuer.py @@ -0,0 +1,97 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import unittest +from queue import Queue +from typing import Callable, Iterable, List +from unittest.mock import Mock, patch + +from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel +from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException +from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem + +_SOME_PARTITIONS: List[Partition] = [Mock(spec=Partition), Mock(spec=Partition)] +_A_STREAM_NAME = "a_stream_name" + + +class PartitionEnqueuerTest(unittest.TestCase): + def setUp(self) -> None: + self._queue: Queue[QueueItem] = Queue() + self._thread_pool_manager = Mock(spec=ThreadPoolManager) + self._thread_pool_manager.prune_to_validate_has_reached_futures_limit.return_value = False + self._partition_generator = PartitionEnqueuer(self._queue, self._thread_pool_manager) + + @patch("airbyte_cdk.sources.streams.concurrent.partition_enqueuer.time.sleep") + def test_given_no_partitions_when_generate_partitions_then_do_not_wait(self, mocked_sleep): + self._thread_pool_manager.prune_to_validate_has_reached_futures_limit.return_value = True # shouldn't be called but just in case + stream = self._a_stream([]) + + self._partition_generator.generate_partitions(stream) + + assert mocked_sleep.call_count == 0 + + def test_given_no_partitions_when_generate_partitions_then_only_push_sentinel(self): + self._thread_pool_manager.prune_to_validate_has_reached_futures_limit.return_value = True + stream = self._a_stream([]) + + self._partition_generator.generate_partitions(stream) + + assert self._consume_queue() == [PartitionGenerationCompletedSentinel(stream)] + + def test_given_partitions_when_generate_partitions_then_return_partitions_before_sentinel(self): + self._thread_pool_manager.prune_to_validate_has_reached_futures_limit.return_value = False + stream = self._a_stream(_SOME_PARTITIONS) + + self._partition_generator.generate_partitions(stream) + + assert self._consume_queue() == _SOME_PARTITIONS + [PartitionGenerationCompletedSentinel(stream)] + + @patch("airbyte_cdk.sources.streams.concurrent.partition_enqueuer.time.sleep") + def test_given_partition_but_limit_reached_when_generate_partitions_then_wait_until_not_hitting_limit(self, mocked_sleep): + self._thread_pool_manager.prune_to_validate_has_reached_futures_limit.side_effect = 
[True, True, False] + stream = self._a_stream([Mock(spec=Partition)]) + + self._partition_generator.generate_partitions(stream) + + assert mocked_sleep.call_count == 2 + + def test_given_exception_when_generate_partitions_then_return_exception_and_sentinel(self): + stream = Mock(spec=AbstractStream) + stream.name = _A_STREAM_NAME + exception = ValueError() + stream.generate_partitions.side_effect = self._partitions_before_raising(_SOME_PARTITIONS, exception) + + self._partition_generator.generate_partitions(stream) + + queue_content = self._consume_queue() + assert queue_content == _SOME_PARTITIONS + [ + StreamThreadException(exception, _A_STREAM_NAME), + PartitionGenerationCompletedSentinel(stream), + ] + + def _partitions_before_raising(self, partitions: List[Partition], exception: Exception) -> Callable[[], Iterable[Partition]]: + def inner_function() -> Iterable[Partition]: + for partition in partitions: + yield partition + raise exception + + return inner_function + + @staticmethod + def _a_stream(partitions: List[Partition]) -> AbstractStream: + stream = Mock(spec=AbstractStream) + stream.generate_partitions.return_value = iter(partitions) + return stream + + def _consume_queue(self) -> List[QueueItem]: + queue_content: List[QueueItem] = [] + while queue_item := self._queue.get(): + if isinstance(queue_item, PartitionGenerationCompletedSentinel): + queue_content.append(queue_item) + break + queue_content.append(queue_item) + return queue_content diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_partition_reader.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_partition_reader.py new file mode 100644 index 000000000000..226652be82a1 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_partition_reader.py @@ -0,0 +1,72 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
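+# These tests describe the PartitionReader contract: every record read from a partition is pushed onto the shared queue, any exception raised while reading is enqueued as a StreamThreadException, and a PartitionCompleteSentinel is always emitted last.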
+# +import unittest +from queue import Queue +from typing import Callable, Iterable, List +from unittest.mock import Mock + +import pytest +from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException +from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record +from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel, QueueItem + +_RECORDS = [ + Record({"id": 1, "name": "Jack"}, "stream"), + Record({"id": 2, "name": "John"}, "stream"), +] + + +class PartitionReaderTest(unittest.TestCase): + def setUp(self) -> None: + self._queue: Queue[QueueItem] = Queue() + self._partition_reader = PartitionReader(self._queue) + + def test_given_no_records_when_process_partition_then_only_emit_sentinel(self): + self._partition_reader.process_partition(self._a_partition([])) + + while queue_item := self._queue.get(): + if not isinstance(queue_item, PartitionCompleteSentinel): + pytest.fail("Only one PartitionCompleteSentinel is expected") + break + + def test_given_read_partition_successful_when_process_partition_then_queue_records_and_sentinel(self): + partition = self._a_partition(_RECORDS) + self._partition_reader.process_partition(partition) + + queue_content = self._consume_queue() + + assert queue_content == _RECORDS + [PartitionCompleteSentinel(partition)] + + def test_given_exception_when_process_partition_then_queue_records_and_exception_and_sentinel(self): + partition = Mock() + exception = ValueError() + partition.read.side_effect = self._read_with_exception(_RECORDS, exception) + self._partition_reader.process_partition(partition) + + queue_content = self._consume_queue() + + assert queue_content == _RECORDS + [StreamThreadException(exception, partition.stream_name()), PartitionCompleteSentinel(partition)] + + def _a_partition(self, records: List[Record]) -> Partition: + partition = Mock(spec=Partition) + partition.read.return_value = iter(records) + return partition + + @staticmethod + def _read_with_exception(records: List[Record], exception: Exception) -> Callable[[], Iterable[Record]]: + def mocked_function() -> Iterable[Record]: + yield from records + raise exception + + return mocked_function + + def _consume_queue(self): + queue_content = [] + while queue_item := self._queue.get(): + queue_content.append(queue_item) + if isinstance(queue_item, PartitionCompleteSentinel): + break + return queue_content diff --git a/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_thread_pool_manager.py b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_thread_pool_manager.py new file mode 100644 index 000000000000..197f9b3431e8 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/concurrent/test_thread_pool_manager.py @@ -0,0 +1,81 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
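+# ThreadPoolManager is expected to delegate submit() to the wrapped executor while tracking the resulting futures; check_for_errors_and_shutdown() should raise if any future failed or is still pending, shutting the pool down in every case.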
+# +from concurrent.futures import Future, ThreadPoolExecutor +from unittest import TestCase +from unittest.mock import Mock + +from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager + + +class ThreadPoolManagerTest(TestCase): + def setUp(self): + self._threadpool = Mock(spec=ThreadPoolExecutor) + self._thread_pool_manager = ThreadPoolManager(self._threadpool, Mock(), max_concurrent_tasks=1) + self._fn = lambda x: x + self._arg = "arg" + + def test_submit_calls_underlying_thread_pool(self): + self._thread_pool_manager.submit(self._fn, self._arg) + self._threadpool.submit.assert_called_with(self._fn, self._arg) + + assert len(self._thread_pool_manager._futures) == 1 + + def test_given_exception_during_pruning_when_check_for_errors_and_shutdown_then_shutdown_and_raise(self): + future = Mock(spec=Future) + future.exception.return_value = RuntimeError + future.done.side_effect = [True, True] + + self._thread_pool_manager._futures = [future] + self._thread_pool_manager.prune_to_validate_has_reached_futures_limit() + + with self.assertRaises(RuntimeError): + self._thread_pool_manager.check_for_errors_and_shutdown() + self._threadpool.shutdown.assert_called_with(wait=False, cancel_futures=True) + + def test_is_done_is_false_if_not_all_futures_are_done(self): + future = Mock(spec=Future) + future.done.return_value = False + + self._thread_pool_manager._futures = [future] + + assert not self._thread_pool_manager.is_done() + + def test_is_done_is_true_if_all_futures_are_done(self): + future = Mock(spec=Future) + future.done.return_value = True + + self._thread_pool_manager._futures = [future] + + assert self._thread_pool_manager.is_done() + + def test_threadpool_shutdown_if_errors(self): + future = Mock(spec=Future) + future.exception.return_value = RuntimeError + + self._thread_pool_manager._futures = [future] + + with self.assertRaises(RuntimeError): + self._thread_pool_manager.check_for_errors_and_shutdown() + self._threadpool.shutdown.assert_called_with(wait=False, cancel_futures=True) + + def test_check_for_errors_and_shutdown_raises_error_if_futures_are_not_done(self): + future = Mock(spec=Future) + future.exception.return_value = None + future.done.return_value = False + + self._thread_pool_manager._futures = [future] + + with self.assertRaises(RuntimeError): + self._thread_pool_manager.check_for_errors_and_shutdown() + self._threadpool.shutdown.assert_called_with(wait=False, cancel_futures=True) + + def test_check_for_errors_and_shutdown_does_not_raise_error_if_futures_are_done(self): + future = Mock(spec=Future) + future.exception.return_value = None + future.done.return_value = True + + self._thread_pool_manager._futures = [future] + + self._thread_pool_manager.check_for_errors_and_shutdown() + self._threadpool.shutdown.assert_called_with(wait=False, cancel_futures=True) diff --git a/airbyte-cdk/python/unit_tests/sources/streams/http/__init__.py b/airbyte-cdk/python/unit_tests/sources/streams/http/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/streams/http/error_handlers/__init__.py b/airbyte-cdk/python/unit_tests/sources/streams/http/error_handlers/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/streams/http/error_handlers/test_default_backoff_strategy.py b/airbyte-cdk/python/unit_tests/sources/streams/http/error_handlers/test_default_backoff_strategy.py new file mode 100644 index 000000000000..67e7e3503c6c --- 
/dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/http/error_handlers/test_default_backoff_strategy.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +from typing import Optional, Union + +import requests +from airbyte_cdk.sources.streams.http.error_handlers import BackoffStrategy, DefaultBackoffStrategy + +_ANY_ATTEMPT_COUNT = 123 + + +def test_given_no_arguments_default_backoff_strategy_returns_default_values(): + response = requests.Response() + backoff_strategy = DefaultBackoffStrategy() + assert backoff_strategy.backoff_time(response, _ANY_ATTEMPT_COUNT) is None + + +class CustomBackoffStrategy(BackoffStrategy): + def backoff_time( + self, response_or_exception: Optional[Union[requests.Response, requests.RequestException]], attempt_count: int + ) -> Optional[float]: + return response_or_exception.headers["Retry-After"] + + +def test_given_valid_arguments_default_backoff_strategy_returns_values(): + + response = requests.Response() + response.headers["Retry-After"] = 123 + backoff_strategy = CustomBackoffStrategy() + assert backoff_strategy.backoff_time(response, _ANY_ATTEMPT_COUNT) == 123 diff --git a/airbyte-cdk/python/unit_tests/sources/streams/http/error_handlers/test_http_status_error_handler.py b/airbyte-cdk/python/unit_tests/sources/streams/http/error_handlers/test_http_status_error_handler.py new file mode 100644 index 000000000000..6da3e15b2a69 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/http/error_handlers/test_http_status_error_handler.py @@ -0,0 +1,112 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +from unittest.mock import MagicMock + +import pytest +import requests +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.streams.http.error_handlers import ErrorResolution, HttpStatusErrorHandler, ResponseAction + +logger = MagicMock() + + +def test_given_ok_response_http_status_error_handler_returns_success_action(mocker): + mocked_response = MagicMock(spec=requests.Response) + mocked_response.ok = True + mocked_response.status_code = 200 + error_resolution = HttpStatusErrorHandler(logger).interpret_response(mocked_response) + assert isinstance(error_resolution, ErrorResolution) + assert error_resolution.response_action == ResponseAction.SUCCESS + assert error_resolution.failure_type is None + assert error_resolution.error_message is None + + +@pytest.mark.parametrize( + "error, expected_action, expected_failure_type, expected_error_message", + [ + (403, ResponseAction.FAIL, FailureType.config_error, "Forbidden. You don't have permission to access this resource."), + (404, ResponseAction.FAIL, FailureType.system_error, "Not found. 
The requested resource was not found on the server."), + ], +) +def test_given_error_code_in_response_http_status_error_handler_returns_expected_actions( + error, expected_action, expected_failure_type, expected_error_message +): + response = requests.Response() + response.status_code = error + error_resolution = HttpStatusErrorHandler(logger).interpret_response(response) + assert error_resolution.response_action == expected_action + assert error_resolution.failure_type == expected_failure_type + assert error_resolution.error_message == expected_error_message + + +def test_given_no_response_argument_returns_expected_action(): + + error_resolution = HttpStatusErrorHandler(logger).interpret_response() + + assert error_resolution.response_action == ResponseAction.FAIL + assert error_resolution.failure_type == FailureType.system_error + + +def test_given_unmapped_status_error_returns_retry_action_as_transient_error(): + + response = requests.Response() + response.status_code = 508 + + error_resolution = HttpStatusErrorHandler(logger).interpret_response(response) + + assert error_resolution.response_action == ResponseAction.RETRY + assert error_resolution.failure_type == FailureType.system_error + assert error_resolution.error_message == "Unexpected HTTP Status Code in error handler: 508" + + +def test_given_requests_exception_returns_retry_action_as_transient_error(): + + error_resolution = HttpStatusErrorHandler(logger).interpret_response(requests.RequestException()) + + assert error_resolution.response_action == ResponseAction.RETRY + assert error_resolution.failure_type + + +def test_given_unmapped_exception_returns_retry_action_as_system_error(): + + error_resolution = HttpStatusErrorHandler(logger).interpret_response(Exception()) + + assert error_resolution.response_action == ResponseAction.RETRY + assert error_resolution.failure_type == FailureType.system_error + + +def test_given_unexpected_response_type_returns_fail_action_as_system_error(): + + error_resolution = HttpStatusErrorHandler(logger).interpret_response("unexpected response type") + + assert error_resolution.response_action == ResponseAction.FAIL + assert error_resolution.failure_type == FailureType.system_error + assert error_resolution.error_message == "Received unexpected response type: " + + +def test_given_injected_error_mapping_returns_expected_action(): + + default_error_handler = HttpStatusErrorHandler(logger) + + mock_response = MagicMock(spec=requests.Response) + mock_response.status_code = 509 + mock_response.ok = False + + default_error_resolution = default_error_handler.interpret_response(mock_response) + + assert default_error_resolution.response_action == ResponseAction.RETRY + assert default_error_resolution.failure_type == FailureType.system_error + assert default_error_resolution.error_message == f"Unexpected HTTP Status Code in error handler: {mock_response.status_code}" + + mapped_error_resolution = ErrorResolution( + response_action=ResponseAction.IGNORE, failure_type=FailureType.transient_error, error_message="Injected mapping" + ) + + error_mapping = {509: mapped_error_resolution} + + actual_error_resolution = HttpStatusErrorHandler(logger, error_mapping).interpret_response(mock_response) + + assert actual_error_resolution.response_action == mapped_error_resolution.response_action + assert actual_error_resolution.failure_type == mapped_error_resolution.failure_type + assert actual_error_resolution.error_message == mapped_error_resolution.error_message diff --git 
a/airbyte-cdk/python/unit_tests/sources/streams/http/error_handlers/test_json_error_message_parser.py b/airbyte-cdk/python/unit_tests/sources/streams/http/error_handlers/test_json_error_message_parser.py new file mode 100644 index 000000000000..90ea36bc6622 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/http/error_handlers/test_json_error_message_parser.py @@ -0,0 +1,38 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import pytest +import requests +from airbyte_cdk.sources.streams.http.error_handlers import JsonErrorMessageParser + + +@pytest.mark.parametrize( + "response_body,expected_error_message", + [ + (b'{"message": "json error message"}', "json error message"), + (b'[{"message": "list error message"}]', "list error message"), + (b'[{"message": "list error message 1"}, {"message": "list error message 2"}]', "list error message 1, list error message 2"), + (b'{"error": "messages error message"}', "messages error message"), + (b'[{"errors": "list error message 1"}, {"errors": "list error message 2"}]', "list error message 1, list error message 2"), + (b'{"failures": "failures error message"}', "failures error message"), + (b'{"failure": "failure error message"}', "failure error message"), + (b'{"detail": "detail error message"}', "detail error message"), + (b'{"err": "err error message"}', "err error message"), + (b'{"error_message": "error_message error message"}', "error_message error message"), + (b'{"msg": "msg error message"}', "msg error message"), + (b'{"reason": "reason error message"}', "reason error message"), + (b'{"status_message": "status_message error message"}', "status_message error message"),], +) +def test_given_error_message_in_response_body_parse_response_error_message_returns_error_message(response_body, expected_error_message): + response = requests.Response() + response._content = response_body + error_message = JsonErrorMessageParser().parse_response_error_message(response) + assert error_message == expected_error_message + + +def test_given_invalid_json_body_parse_response_error_message_returns_none(): + response = requests.Response() + response._content = b"invalid json body" + error_message = JsonErrorMessageParser().parse_response_error_message(response) + assert error_message == "invalid json body" diff --git a/airbyte-cdk/python/unit_tests/sources/streams/http/error_handlers/test_response_models.py b/airbyte-cdk/python/unit_tests/sources/streams/http/error_handlers/test_response_models.py new file mode 100644 index 000000000000..a19d3c8d5fe0 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/http/error_handlers/test_response_models.py @@ -0,0 +1,65 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
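+# create_fallback_error_resolution should always fall back to a RETRY action with a system_error failure type, whether it receives no input, an exception, or a response.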
+ +from unittest import TestCase + +import requests +import requests_mock +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction, create_fallback_error_resolution +from airbyte_cdk.utils.airbyte_secrets_utils import update_secrets + +_A_SECRET = "a-secret" +_A_URL = "https://a-url.com" + + +class DefaultErrorResolutionTest(TestCase): + def setUp(self) -> None: + update_secrets([_A_SECRET]) + + def tearDown(self) -> None: + # to avoid other tests being impacted by added secrets + update_secrets([]) + + def test_given_none_when_create_fallback_error_resolution_then_return_error_resolution(self) -> None: + error_resolution = create_fallback_error_resolution(None) + + assert error_resolution.failure_type == FailureType.system_error + assert error_resolution.response_action == ResponseAction.RETRY + assert ( + error_resolution.error_message + == "Error handler did not receive a valid response or exception. This is unexpected please contact Airbyte Support" + ) + + def test_given_exception_when_create_fallback_error_resolution_then_return_error_resolution(self) -> None: + exception = ValueError("This is an exception") + + error_resolution = create_fallback_error_resolution(exception) + + assert error_resolution.failure_type == FailureType.system_error + assert error_resolution.response_action == ResponseAction.RETRY + assert error_resolution.error_message + assert "ValueError" in error_resolution.error_message + assert str(exception) in error_resolution.error_message + + def test_given_response_can_raise_for_status_when_create_fallback_error_resolution_then_error_resolution(self) -> None: + response = self._create_response(512) + + error_resolution = create_fallback_error_resolution(response) + + assert error_resolution.failure_type == FailureType.system_error + assert error_resolution.response_action == ResponseAction.RETRY + assert error_resolution.error_message and "512 Server Error: None for url: https://a-url.com/" in error_resolution.error_message + + def test_given_response_is_ok_when_create_fallback_error_resolution_then_error_resolution(self) -> None: + response = self._create_response(205) + + error_resolution = create_fallback_error_resolution(response) + + assert error_resolution.failure_type == FailureType.system_error + assert error_resolution.response_action == ResponseAction.RETRY + assert error_resolution.error_message and str(response.status_code) in error_resolution.error_message + + def _create_response(self, status_code: int) -> requests.Response: + with requests_mock.Mocker() as http_mocker: + http_mocker.get(_A_URL, status_code=status_code) + return requests.get(_A_URL) diff --git a/airbyte-cdk/python/unit_tests/sources/streams/http/requests_native_auth/__init__.py b/airbyte-cdk/python/unit_tests/sources/streams/http/requests_native_auth/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py b/airbyte-cdk/python/unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py new file mode 100644 index 000000000000..50bd3d8faf7a --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py @@ -0,0 +1,424 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
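+# Covers the requests-native authenticators: token, basic and multiple-token authenticators set the Authorization header, while the OAuth2 authenticators refresh expired tokens, retry the refresh on retryable HTTP errors, and surface updated credentials either on stdout or through a message repository.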
+# + +import json +import logging +from typing import Optional, Union +from unittest.mock import Mock + +import freezegun +import pendulum +import pytest +import requests +from airbyte_cdk.models import FailureType, OrchestratorType, Type +from airbyte_cdk.sources.streams.http.requests_native_auth import ( + BasicHttpAuthenticator, + MultipleTokenAuthenticator, + Oauth2Authenticator, + SingleUseRefreshTokenOauth2Authenticator, + TokenAuthenticator, +) +from airbyte_cdk.utils import AirbyteTracedException +from requests import Response +from requests.exceptions import RequestException + +LOGGER = logging.getLogger(__name__) + +resp = Response() + + +def test_token_authenticator(): + """ + Should match passed in token, no matter how many times token is retrieved. + """ + token_auth = TokenAuthenticator(token="test-token") + header1 = token_auth.get_auth_header() + header2 = token_auth.get_auth_header() + + prepared_request = requests.PreparedRequest() + prepared_request.headers = {} + token_auth(prepared_request) + + assert {"Authorization": "Bearer test-token"} == prepared_request.headers + assert {"Authorization": "Bearer test-token"} == header1 + assert {"Authorization": "Bearer test-token"} == header2 + + +def test_basic_http_authenticator(): + """ + Should match passed in token, no matter how many times token is retrieved. + """ + token_auth = BasicHttpAuthenticator(username="user", password="password") + header1 = token_auth.get_auth_header() + header2 = token_auth.get_auth_header() + + prepared_request = requests.PreparedRequest() + prepared_request.headers = {} + token_auth(prepared_request) + + assert {"Authorization": "Basic dXNlcjpwYXNzd29yZA=="} == prepared_request.headers + assert {"Authorization": "Basic dXNlcjpwYXNzd29yZA=="} == header1 + assert {"Authorization": "Basic dXNlcjpwYXNzd29yZA=="} == header2 + + +def test_multiple_token_authenticator(): + multiple_token_auth = MultipleTokenAuthenticator(tokens=["token1", "token2"]) + header1 = multiple_token_auth.get_auth_header() + header2 = multiple_token_auth.get_auth_header() + header3 = multiple_token_auth.get_auth_header() + + prepared_request = requests.PreparedRequest() + prepared_request.headers = {} + multiple_token_auth(prepared_request) + + assert {"Authorization": "Bearer token2"} == prepared_request.headers + assert {"Authorization": "Bearer token1"} == header1 + assert {"Authorization": "Bearer token2"} == header2 + assert {"Authorization": "Bearer token1"} == header3 + + +class TestOauth2Authenticator: + """ + Test class for OAuth2Authenticator. + """ + + refresh_endpoint = "refresh_end" + client_id = "client_id" + client_secret = "client_secret" + refresh_token = "refresh_token" + + def test_get_auth_header_fresh(self, mocker): + """ + Should not retrieve new token if current token is valid. + """ + oauth = Oauth2Authenticator( + token_refresh_endpoint=TestOauth2Authenticator.refresh_endpoint, + client_id=TestOauth2Authenticator.client_id, + client_secret=TestOauth2Authenticator.client_secret, + refresh_token=TestOauth2Authenticator.refresh_token, + ) + + mocker.patch.object(Oauth2Authenticator, "refresh_access_token", return_value=("access_token", 1000)) + header = oauth.get_auth_header() + assert {"Authorization": "Bearer access_token"} == header + + def test_get_auth_header_expired(self, mocker): + """ + Should retrieve new token if current token is expired. 
+ """ + oauth = Oauth2Authenticator( + token_refresh_endpoint=TestOauth2Authenticator.refresh_endpoint, + client_id=TestOauth2Authenticator.client_id, + client_secret=TestOauth2Authenticator.client_secret, + refresh_token=TestOauth2Authenticator.refresh_token, + ) + + expire_immediately = 0 + mocker.patch.object(Oauth2Authenticator, "refresh_access_token", return_value=("access_token_1", expire_immediately)) + oauth.get_auth_header() # Set the first expired token. + + valid_100_secs = 100 + mocker.patch.object(Oauth2Authenticator, "refresh_access_token", return_value=("access_token_2", valid_100_secs)) + header = oauth.get_auth_header() + assert {"Authorization": "Bearer access_token_2"} == header + + def test_refresh_request_body(self): + """ + Request body should match given configuration. + """ + scopes = ["scope1", "scope2"] + oauth = Oauth2Authenticator( + token_refresh_endpoint="refresh_end", + client_id="some_client_id", + client_secret="some_client_secret", + refresh_token="some_refresh_token", + scopes=["scope1", "scope2"], + token_expiry_date=pendulum.now().add(days=3), + grant_type="some_grant_type", + refresh_request_body={"custom_field": "in_outbound_request", "another_field": "exists_in_body", "scopes": ["no_override"]}, + ) + body = oauth.build_refresh_request_body() + expected = { + "grant_type": "some_grant_type", + "client_id": "some_client_id", + "client_secret": "some_client_secret", + "refresh_token": "some_refresh_token", + "scopes": scopes, + "custom_field": "in_outbound_request", + "another_field": "exists_in_body", + } + assert body == expected + + def test_refresh_access_token(self, mocker): + oauth = Oauth2Authenticator( + token_refresh_endpoint="refresh_end", + client_id="some_client_id", + client_secret="some_client_secret", + refresh_token="some_refresh_token", + scopes=["scope1", "scope2"], + token_expiry_date=pendulum.now().add(days=3), + refresh_request_body={"custom_field": "in_outbound_request", "another_field": "exists_in_body", "scopes": ["no_override"]}, + ) + + resp.status_code = 200 + mocker.patch.object(resp, "json", return_value={"access_token": "access_token", "expires_in": 1000}) + mocker.patch.object(requests, "request", side_effect=mock_request, autospec=True) + token, expires_in = oauth.refresh_access_token() + + assert isinstance(expires_in, int) + assert ("access_token", 1000) == (token, expires_in) + + # Test with expires_in as str + mocker.patch.object(resp, "json", return_value={"access_token": "access_token", "expires_in": "2000"}) + token, expires_in = oauth.refresh_access_token() + + assert isinstance(expires_in, str) + assert ("access_token", "2000") == (token, expires_in) + + # Test with expires_in as str + mocker.patch.object(resp, "json", return_value={"access_token": "access_token", "expires_in": "2022-04-24T00:00:00Z"}) + token, expires_in = oauth.refresh_access_token() + + assert isinstance(expires_in, str) + assert ("access_token", "2022-04-24T00:00:00Z") == (token, expires_in) + + @pytest.mark.parametrize( + "expires_in_response, token_expiry_date_format, expected_token_expiry_date", + [ + (3600, None, pendulum.datetime(year=2022, month=1, day=1, hour=1)), + ("90012", None, pendulum.datetime(year=2022, month=1, day=2, hour=1, second=12)), + ("2024-02-28", "YYYY-MM-DD", pendulum.datetime(year=2024, month=2, day=28)), + ("2022-02-12T00:00:00.000000+00:00", "YYYY-MM-DDTHH:mm:ss.SSSSSSZ", pendulum.datetime(year=2022, month=2, day=12)), + ], + ids=["seconds", "string_of_seconds", "simple_date", "simple_datetime"], + ) + 
@freezegun.freeze_time("2022-01-01") + def test_parse_refresh_token_lifespan( + self, + mocker, + expires_in_response: Union[str, int], + token_expiry_date_format: Optional[str], + expected_token_expiry_date: pendulum.DateTime, + ): + oauth = Oauth2Authenticator( + token_refresh_endpoint="refresh_end", + client_id="some_client_id", + client_secret="some_client_secret", + refresh_token="some_refresh_token", + scopes=["scope1", "scope2"], + token_expiry_date=pendulum.now().subtract(days=3), + token_expiry_date_format=token_expiry_date_format, + token_expiry_is_time_of_expiration=bool(token_expiry_date_format), + refresh_request_body={"custom_field": "in_outbound_request", "another_field": "exists_in_body", "scopes": ["no_override"]}, + ) + + resp.status_code = 200 + mocker.patch.object(resp, "json", return_value={"access_token": "access_token", "expires_in": expires_in_response}) + mocker.patch.object(requests, "request", side_effect=mock_request, autospec=True) + token, expire_in = oauth.refresh_access_token() + expires_datetime = oauth._parse_token_expiration_date(expire_in) + + assert isinstance(expires_datetime, pendulum.DateTime) + assert ("access_token", expected_token_expiry_date) == (token, expires_datetime) + + @pytest.mark.usefixtures("mock_sleep") + @pytest.mark.parametrize("error_code", (429, 500, 502, 504)) + def test_refresh_access_token_retry(self, error_code, requests_mock): + oauth = Oauth2Authenticator( + f"https://{TestOauth2Authenticator.refresh_endpoint}", + TestOauth2Authenticator.client_id, + TestOauth2Authenticator.client_secret, + TestOauth2Authenticator.refresh_token, + ) + requests_mock.post( + f"https://{TestOauth2Authenticator.refresh_endpoint}", + [{"status_code": error_code}, {"status_code": error_code}, {"json": {"access_token": "token", "expires_in": 10}}], + ) + token, expires_in = oauth.refresh_access_token() + assert isinstance(expires_in, int) + assert (token, expires_in) == ("token", 10) + assert requests_mock.call_count == 3 + + def test_auth_call_method(self, mocker): + oauth = Oauth2Authenticator( + token_refresh_endpoint=TestOauth2Authenticator.refresh_endpoint, + client_id=TestOauth2Authenticator.client_id, + client_secret=TestOauth2Authenticator.client_secret, + refresh_token=TestOauth2Authenticator.refresh_token, + ) + + mocker.patch.object(Oauth2Authenticator, "refresh_access_token", return_value=("access_token", 1000)) + prepared_request = requests.PreparedRequest() + prepared_request.headers = {} + oauth(prepared_request) + + assert {"Authorization": "Bearer access_token"} == prepared_request.headers + + @pytest.mark.parametrize( + ("config_codes", "response_code", "config_key", "response_key", "config_values", "response_value", "wrapped"), + ( + ((400,), 400, "error", "error", ("invalid_grant",), "invalid_grant", True), + ((401,), 400, "error", "error", ("invalid_grant",), "invalid_grant", False), + ((400,), 400, "error_key", "error", ("invalid_grant",), "invalid_grant", False), + ((400,), 400, "error", "error", ("invalid_grant",), "valid_grant", False), + ((), 400, "", "error", (), "valid_grant", False), + ), + ) + def test_refresh_access_token_wrapped( + self, requests_mock, config_codes, response_code, config_key, response_key, config_values, response_value, wrapped + ): + oauth = Oauth2Authenticator( + f"https://{TestOauth2Authenticator.refresh_endpoint}", + TestOauth2Authenticator.client_id, + TestOauth2Authenticator.client_secret, + TestOauth2Authenticator.refresh_token, + refresh_token_error_status_codes=config_codes, + 
refresh_token_error_key=config_key, + refresh_token_error_values=config_values, + ) + error_content = {response_key: response_value} + requests_mock.post(f"https://{TestOauth2Authenticator.refresh_endpoint}", status_code=response_code, json=error_content) + + exception_to_raise = AirbyteTracedException if wrapped else RequestException + with pytest.raises(exception_to_raise) as exc_info: + oauth.refresh_access_token() + + if wrapped: + error_message = "Refresh token is invalid or expired. Please re-authenticate from Sources//Settings." + assert exc_info.value.internal_message == error_message + assert exc_info.value.message == error_message + assert exc_info.value.failure_type == FailureType.config_error + + +class TestSingleUseRefreshTokenOauth2Authenticator: + @pytest.fixture + def connector_config(self): + return { + "credentials": { + "access_token": "my_access_token", + "refresh_token": "my_refresh_token", + "client_id": "my_client_id", + "client_secret": "my_client_secret", + "token_expiry_date": "2022-12-31T00:00:00+00:00", + } + } + + @pytest.fixture + def invalid_connector_config(self): + return {"no_credentials_key": "foo"} + + def test_init(self, connector_config): + authenticator = SingleUseRefreshTokenOauth2Authenticator( + connector_config, + token_refresh_endpoint="foobar", + client_id=connector_config["credentials"]["client_id"], + client_secret=connector_config["credentials"]["client_secret"], + ) + assert authenticator.access_token == connector_config["credentials"]["access_token"] + assert authenticator.get_refresh_token() == connector_config["credentials"]["refresh_token"] + assert authenticator.get_token_expiry_date() == pendulum.parse(connector_config["credentials"]["token_expiry_date"]) + + @freezegun.freeze_time("2022-12-31") + @pytest.mark.parametrize( + "test_name, expires_in_value, expiry_date_format, expected_expiry_date", + [ + ("number_of_seconds", 42, None, "2022-12-31T00:00:42+00:00"), + ("string_of_seconds", "42", None, "2022-12-31T00:00:42+00:00"), + ("date_format", "2023-04-04", "YYYY-MM-DD", "2023-04-04T00:00:00+00:00"), + ], + ) + def test_given_no_message_repository_get_access_token( + self, test_name, expires_in_value, expiry_date_format, expected_expiry_date, capsys, mocker, connector_config + ): + authenticator = SingleUseRefreshTokenOauth2Authenticator( + connector_config, + token_refresh_endpoint="foobar", + client_id=connector_config["credentials"]["client_id"], + client_secret=connector_config["credentials"]["client_secret"], + token_expiry_date_format=expiry_date_format, + ) + authenticator.refresh_access_token = mocker.Mock(return_value=("new_access_token", expires_in_value, "new_refresh_token")) + authenticator.token_has_expired = mocker.Mock(return_value=True) + access_token = authenticator.get_access_token() + captured = capsys.readouterr() + airbyte_message = json.loads(captured.out) + expected_new_config = connector_config.copy() + expected_new_config["credentials"]["access_token"] = "new_access_token" + expected_new_config["credentials"]["refresh_token"] = "new_refresh_token" + expected_new_config["credentials"]["token_expiry_date"] = expected_expiry_date + assert airbyte_message["control"]["connectorConfig"]["config"] == expected_new_config + assert authenticator.access_token == access_token == "new_access_token" + assert authenticator.get_refresh_token() == "new_refresh_token" + assert authenticator.get_token_expiry_date() > pendulum.now() + authenticator.token_has_expired = mocker.Mock(return_value=False) + access_token = 
authenticator.get_access_token() + captured = capsys.readouterr() + assert not captured.out + assert authenticator.access_token == access_token == "new_access_token" + + def test_given_message_repository_when_get_access_token_then_emit_message(self, mocker, connector_config): + message_repository = Mock() + authenticator = SingleUseRefreshTokenOauth2Authenticator( + connector_config, + token_refresh_endpoint="foobar", + client_id=connector_config["credentials"]["client_id"], + client_secret=connector_config["credentials"]["client_secret"], + token_expiry_date_format="YYYY-MM-DD", + message_repository=message_repository, + ) + authenticator.refresh_access_token = mocker.Mock(return_value=("new_access_token", "2023-04-04", "new_refresh_token")) + authenticator.token_has_expired = mocker.Mock(return_value=True) + + authenticator.get_access_token() + + emitted_message = message_repository.emit_message.call_args_list[0].args[0] + assert emitted_message.type == Type.CONTROL + assert emitted_message.control.type == OrchestratorType.CONNECTOR_CONFIG + assert emitted_message.control.connectorConfig.config["credentials"]["access_token"] == "new_access_token" + assert emitted_message.control.connectorConfig.config["credentials"]["refresh_token"] == "new_refresh_token" + assert emitted_message.control.connectorConfig.config["credentials"]["token_expiry_date"] == "2023-04-04T00:00:00+00:00" + assert emitted_message.control.connectorConfig.config["credentials"]["client_id"] == "my_client_id" + assert emitted_message.control.connectorConfig.config["credentials"]["client_secret"] == "my_client_secret" + + def test_given_message_repository_when_get_access_token_then_log_request(self, mocker, connector_config): + message_repository = Mock() + authenticator = SingleUseRefreshTokenOauth2Authenticator( + connector_config, + token_refresh_endpoint="foobar", + client_id=connector_config["credentials"]["client_id"], + client_secret=connector_config["credentials"]["client_secret"], + message_repository=message_repository, + ) + mocker.patch("airbyte_cdk.sources.streams.http.requests_native_auth.abstract_oauth.requests.request") + mocker.patch( + "airbyte_cdk.sources.streams.http.requests_native_auth.abstract_oauth.format_http_message", return_value="formatted json" + ) + authenticator.token_has_expired = mocker.Mock(return_value=True) + + authenticator.get_access_token() + + assert message_repository.log_message.call_count == 1 + + def test_refresh_access_token(self, mocker, connector_config): + authenticator = SingleUseRefreshTokenOauth2Authenticator( + connector_config, + token_refresh_endpoint="foobar", + client_id=connector_config["credentials"]["client_id"], + client_secret=connector_config["credentials"]["client_secret"], + ) + + authenticator._get_refresh_access_token_response = mocker.Mock( + return_value={ + authenticator.get_access_token_name(): "new_access_token", + authenticator.get_expires_in_name(): "42", + authenticator.get_refresh_token_name(): "new_refresh_token", + } + ) + assert authenticator.refresh_access_token() == ("new_access_token", "42", "new_refresh_token") + + +def mock_request(method, url, data): + if url == "refresh_end": + return resp + raise Exception(f"Error while refreshing access token with request: {method}, {url}, {data}") diff --git a/airbyte-cdk/python/unit_tests/sources/streams/http/test_availability_strategy.py b/airbyte-cdk/python/unit_tests/sources/streams/http/test_availability_strategy.py new file mode 100644 index 000000000000..42975d8ed5a9 --- /dev/null +++ 
b/airbyte-cdk/python/unit_tests/sources/streams/http/test_availability_strategy.py @@ -0,0 +1,153 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import io +import json +import logging +from typing import Any, Iterable, Mapping, Optional + +import pytest +import requests +from airbyte_cdk.sources.streams.http.availability_strategy import HttpAvailabilityStrategy +from airbyte_cdk.sources.streams.http.http import HttpStream + +logger = logging.getLogger("airbyte") + + +class MockHttpStream(HttpStream): + url_base = "https://test_base_url.com" + primary_key = "" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.resp_counter = 1 + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + return None + + def path(self, **kwargs) -> str: + return "" + + def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: + stub_resp = {"data": self.resp_counter} + self.resp_counter += 1 + yield stub_resp + + pass + + def retry_factor(self) -> float: + return 0.01 + + +@pytest.mark.parametrize( + ("status_code", "json_contents", "expected_is_available", "expected_messages"), + [ + ( + 403, + {"error": "Something went wrong"}, + False, + [ + "Forbidden. You don't have permission to access this resource.", + "Forbidden. You don't have permission to access this resource.", + ], + ), + (200, {}, True, []), + ], +) +@pytest.mark.parametrize( + ("include_source", "expected_docs_url_messages"), + [ + (True, ["Forbidden. You don't have permission to access this resource."]), + (False, ["Forbidden. You don't have permission to access this resource."]), + ], +) +@pytest.mark.parametrize("records_as_list", [True, False]) +def test_default_http_availability_strategy( + mocker, + status_code, + json_contents, + expected_is_available, + expected_messages, + include_source, + expected_docs_url_messages, + records_as_list, +): + class MockListHttpStream(MockHttpStream): + def read_records(self, *args, **kvargs): + if records_as_list: + return list(super().read_records(*args, **kvargs)) + else: + return super().read_records(*args, **kvargs) + + http_stream = MockListHttpStream() + response = requests.Response() + response.status_code = status_code + response.raw = io.BytesIO(json.dumps(json_contents).encode("utf-8")) + mocker.patch.object(requests.Session, "send", return_value=response) + + actual_is_available, reason = HttpAvailabilityStrategy().check_availability(http_stream, logger) + + assert actual_is_available == expected_is_available + if expected_is_available: + assert reason is None + else: + all_expected_messages = expected_messages + expected_docs_url_messages + for message in all_expected_messages: + assert message in reason + + +def test_http_availability_raises_unhandled_error(mocker): + http_stream = MockHttpStream() + + req = requests.Response() + req.status_code = 404 + mocker.patch.object(requests.Session, "send", return_value=req) + + assert (False, "Not found. 
The requested resource was not found on the server.") == HttpAvailabilityStrategy().check_availability( + http_stream, logger + ) + + +def test_send_handles_retries_when_checking_availability(mocker, caplog): + mocker.patch("time.sleep", lambda x: None) + http_stream = MockHttpStream() + + req_1 = requests.Response() + req_1.status_code = 429 + req_2 = requests.Response() + req_2.status_code = 503 + req_3 = requests.Response() + req_3.status_code = 200 + mock_send = mocker.patch.object(requests.Session, "send", side_effect=[req_1, req_2, req_3]) + + with caplog.at_level(logging.INFO): + stream_is_available, _ = HttpAvailabilityStrategy().check_availability(stream=http_stream, logger=logger) + + assert stream_is_available + assert mock_send.call_count == 3 + for message in ["Caught retryable error", "Service unavailable", "Service unavailable"]: + assert message in caplog.text + + +@pytest.mark.parametrize("records_as_list", [True, False]) +def test_http_availability_strategy_on_empty_stream(mocker, records_as_list): + class MockEmptyHttpStream(mocker.MagicMock, MockHttpStream): + def __init__(self, *args, **kvargs): + mocker.MagicMock.__init__(self) + self.read_records = mocker.MagicMock() + + empty_stream = MockEmptyHttpStream() + assert isinstance(empty_stream, HttpStream) + + # Generator should have no values to generate + if records_as_list: + empty_stream.read_records.return_value = [] + else: + empty_stream.read_records.return_value = iter([]) + + logger = logging.getLogger("airbyte.test-source") + stream_is_available, _ = HttpAvailabilityStrategy().check_availability(stream=empty_stream, logger=logger) + + assert stream_is_available + assert empty_stream.read_records.called diff --git a/airbyte-cdk/python/unit_tests/sources/streams/http/test_http.py b/airbyte-cdk/python/unit_tests/sources/streams/http/test_http.py new file mode 100644 index 000000000000..8737289a780f --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/http/test_http.py @@ -0,0 +1,1359 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
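+# Exercises HttpStream against mocked sessions: next_page_token values must be passed through to request_params/headers/body, retry and backoff behaviour must follow the configured error handler, and 4xx/5xx responses must surface as the expected exception types.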
+# + + +import json +import logging +from http import HTTPStatus +from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union +from unittest.mock import ANY, MagicMock, patch + +import pytest +import requests +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, SyncMode, Type +from airbyte_cdk.sources.streams import CheckpointMixin +from airbyte_cdk.sources.streams.checkpoint import ResumableFullRefreshCursor +from airbyte_cdk.sources.streams.checkpoint.substream_resumable_full_refresh_cursor import SubstreamResumableFullRefreshCursor +from airbyte_cdk.sources.streams.core import StreamData +from airbyte_cdk.sources.streams.http import HttpStream, HttpSubStream +from airbyte_cdk.sources.streams.http.error_handlers import ErrorHandler, HttpStatusErrorHandler +from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction +from airbyte_cdk.sources.streams.http.exceptions import DefaultBackoffException, RequestBodyException, UserDefinedBackoffException +from airbyte_cdk.sources.streams.http.http_client import MessageRepresentationAirbyteTracedErrors +from airbyte_cdk.sources.streams.http.requests_native_auth import TokenAuthenticator + + +class StubBasicReadHttpStream(HttpStream): + url_base = "https://test_base_url.com" + primary_key = "" + + def __init__(self, deduplicate_query_params: bool = False, **kwargs): + super().__init__(**kwargs) + self.resp_counter = 1 + self._deduplicate_query_params = deduplicate_query_params + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + return None + + def path(self, **kwargs) -> str: + return "" + + def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: + stubResp = {"data": self.resp_counter} + self.resp_counter += 1 + yield stubResp + + def must_deduplicate_query_params(self) -> bool: + return self._deduplicate_query_params + + @property + def cursor_field(self) -> Union[str, List[str]]: + return ["updated_at"] + + +def test_default_authenticator(): + stream = StubBasicReadHttpStream() + assert stream._http_client._session.auth is None + + +def test_requests_native_token_authenticator(): + stream = StubBasicReadHttpStream(authenticator=TokenAuthenticator("test-token")) + assert isinstance(stream._http_client._session.auth, TokenAuthenticator) + + +def test_request_kwargs_used(mocker, requests_mock): + stream = StubBasicReadHttpStream() + request_kwargs = {"cert": None, "proxies": "google.com"} + mocker.patch.object(stream, "request_kwargs", return_value=request_kwargs) + send_mock = mocker.patch.object(stream._http_client._session, "send", wraps=stream._http_client._session.send) + requests_mock.register_uri("GET", stream.url_base) + + list(stream.read_records(sync_mode=SyncMode.full_refresh)) + + stream._http_client._session.send.assert_any_call(ANY, **request_kwargs) + assert send_mock.call_count == 1 + + +def test_stub_basic_read_http_stream_read_records(mocker): + stream = StubBasicReadHttpStream() + blank_response = {} # Send a blank response is fine as we ignore the response in `parse_response anyway. 
+ mocker.patch.object(stream._http_client, "send_request", return_value=(None, blank_response)) + + records = list(stream.read_records(SyncMode.full_refresh)) + + assert [{"data": 1}] == records + + +class StubNextPageTokenHttpStream(StubBasicReadHttpStream): + current_page = 0 + + def __init__(self, pages: int = 5): + super().__init__() + self._pages = pages + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + while self.current_page < self._pages: + page_token = {"page": self.current_page} + self.current_page += 1 + return page_token + return None + + +def test_next_page_token_is_input_to_other_methods(mocker): + """Validates that the return value from next_page_token is passed into other methods that need it like request_params, headers, body, etc..""" + pages = 5 + stream = StubNextPageTokenHttpStream(pages=pages) + blank_response = {} # Send a blank response is fine as we ignore the response in `parse_response anyway. + mocker.patch.object(stream._http_client, "send_request", return_value=(None, blank_response)) + + methods = ["request_params", "request_headers", "request_body_json"] + for method in methods: + # Wrap all methods we're interested in testing with mocked objects so we can later spy on their input args and verify they were what we expect + mocker.patch.object(stream, method, wraps=getattr(stream, method)) + + records = list(stream.read_records(SyncMode.full_refresh)) + + # Since we have 5 pages, we expect 5 tokens which are {"page":1}, {"page":2}, etc... + expected_next_page_tokens = [{"page": i} for i in range(pages)] + for method in methods: + # First assert that they were called with no next_page_token. This is the first call in the pagination loop. + getattr(stream, method).assert_any_call(next_page_token=None, stream_slice=None, stream_state={}) + for token in expected_next_page_tokens: + # Then verify that each method + getattr(stream, method).assert_any_call(next_page_token=token, stream_slice=None, stream_state={}) + + expected = [{"data": 1}, {"data": 2}, {"data": 3}, {"data": 4}, {"data": 5}, {"data": 6}] + + assert records == expected + + +class StubBadUrlHttpStream(StubBasicReadHttpStream): + url_base = "bad_url" + + +def test_stub_bad_url_http_stream_read_records(mocker): + stream = StubBadUrlHttpStream() + + with pytest.raises(requests.exceptions.RequestException): + list(stream.read_records(SyncMode.full_refresh)) + + +class StubCustomBackoffHttpStream(StubBasicReadHttpStream): + def backoff_time(self, response: requests.Response) -> Optional[float]: + return 0.5 + + +def test_stub_custom_backoff_http_stream(mocker): + mocker.patch("time.sleep", lambda x: None) + stream = StubCustomBackoffHttpStream() + req = requests.Response() + req.status_code = 429 + + send_mock = mocker.patch.object(requests.Session, "send", return_value=req) + + with pytest.raises(UserDefinedBackoffException): + list(stream.read_records(SyncMode.full_refresh)) + assert send_mock.call_count == stream.max_retries + 1 + + # TODO(davin): Figure out how to assert calls. 
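+# For reference, a typical connector stream customizes backoff the same way as the stub above by overriding
+# backoff_time; a minimal sketch (illustrative only, not exercised by these tests) might honor Retry-After:
+#
+#     class MyApiStream(HttpStream):
+#         def backoff_time(self, response: requests.Response) -> Optional[float]:
+#             # prefer the server-provided Retry-After header, falling back to a fixed delay
+#             retry_after = response.headers.get("Retry-After")
+#             return float(retry_after) if retry_after else 5.0
+#
+# The parametrized cases below cover the max_retries boundary: non-positive values result in a single
+# request attempt, while positive values allow max_retries + 1 attempts in total.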
+ + +@pytest.mark.parametrize("retries", [-20, -1, 0, 1, 2, 10]) +def test_stub_custom_backoff_http_stream_retries(mocker, retries): + mocker.patch("time.sleep", lambda x: None) + + class StubCustomBackoffHttpStreamRetries(StubCustomBackoffHttpStream): + @property + def max_retries(self): + return retries + + def get_error_handler(self) -> Optional[ErrorHandler]: + return HttpStatusErrorHandler(logging.Logger, max_retries=retries) + + stream = StubCustomBackoffHttpStreamRetries() + req = requests.Response() + req.status_code = HTTPStatus.TOO_MANY_REQUESTS + send_mock = mocker.patch.object(requests.Session, "send", return_value=req) + + with pytest.raises(UserDefinedBackoffException, match="Too many requests") as excinfo: + list(stream.read_records(SyncMode.full_refresh)) + assert isinstance(excinfo.value.request, requests.PreparedRequest) + assert isinstance(excinfo.value.response, requests.Response) + if retries <= 0: + assert send_mock.call_count == 1 + else: + assert send_mock.call_count == stream.max_retries + 1 + + +def test_stub_custom_backoff_http_stream_endless_retries(mocker): + mocker.patch("time.sleep", lambda x: None) + + class StubCustomBackoffHttpStreamRetries(StubCustomBackoffHttpStream): + def get_error_handler(self) -> Optional[ErrorHandler]: + return HttpStatusErrorHandler(logging.Logger, max_retries=99999) + + infinite_number = 20 + + stream = StubCustomBackoffHttpStreamRetries() + req = requests.Response() + req.status_code = HTTPStatus.TOO_MANY_REQUESTS + send_mock = mocker.patch.object(requests.Session, "send", side_effect=[req] * infinite_number) + + # Expecting mock object to raise a RuntimeError when the end of side_effect list parameter reached. + with pytest.raises(RuntimeError): + list(stream.read_records(SyncMode.full_refresh)) + assert send_mock.call_count == infinite_number + 1 + + +@pytest.mark.parametrize("http_code", [400, 401, 403]) +def test_4xx_error_codes_http_stream(mocker, http_code): + stream = StubCustomBackoffHttpStream() + req = requests.Response() + req.status_code = http_code + mocker.patch.object(requests.Session, "send", return_value=req) + + with pytest.raises(MessageRepresentationAirbyteTracedErrors): + list(stream.read_records(SyncMode.full_refresh)) + + +class AutoFailFalseHttpStream(StubBasicReadHttpStream): + raise_on_http_errors = False + max_retries = 3 + + def get_error_handler(self) -> Optional[ErrorHandler]: + return HttpStatusErrorHandler(logging.getLogger(), max_retries=3) + + +def test_raise_on_http_errors_off_429(mocker): + mocker.patch("time.sleep", lambda x: None) + stream = AutoFailFalseHttpStream() + req = requests.Response() + req.status_code = 429 + + mocker.patch.object(requests.Session, "send", return_value=req) + with pytest.raises(DefaultBackoffException, match="Too many requests"): + stream.exit_on_rate_limit = True + list(stream.read_records(SyncMode.full_refresh)) + + +@pytest.mark.parametrize("status_code", [500, 501, 503, 504]) +def test_raise_on_http_errors_off_5xx(mocker, status_code): + mocker.patch("time.sleep", lambda x: None) + stream = AutoFailFalseHttpStream() + req = requests.Response() + req.status_code = status_code + + send_mock = mocker.patch.object(requests.Session, "send", return_value=req) + with pytest.raises(DefaultBackoffException): + list(stream.read_records(SyncMode.full_refresh)) + assert send_mock.call_count == stream.max_retries + 1 + + +@pytest.mark.parametrize("status_code", [400, 401, 402, 403, 416]) +def test_raise_on_http_errors_off_non_retryable_4xx(mocker, status_code): + stream = 
AutoFailFalseHttpStream() + req = requests.PreparedRequest() + res = requests.Response() + res.status_code = status_code + + mocker.patch.object(requests.Session, "send", return_value=res) + response = stream._http_client._session.send(req) + assert response.status_code == status_code + + +@pytest.mark.parametrize( + "error", + ( + requests.exceptions.ConnectTimeout, + requests.exceptions.ConnectionError, + requests.exceptions.ChunkedEncodingError, + requests.exceptions.ReadTimeout, + ), +) +def test_raise_on_http_errors(mocker, error): + mocker.patch("time.sleep", lambda x: None) + stream = AutoFailFalseHttpStream() + send_mock = mocker.patch.object(requests.Session, "send", side_effect=error()) + + with pytest.raises(DefaultBackoffException): + list(stream.read_records(SyncMode.full_refresh)) + assert send_mock.call_count == stream.max_retries + 1 + + +class PostHttpStream(StubBasicReadHttpStream): + http_method = "POST" + + def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: + """Returns response data as is""" + yield response.json() + + +class TestRequestBody: + """Suite of different tests for request bodies""" + + json_body = {"key": "value"} + data_body = "key:value" + form_body = {"key1": "value1", "key2": 1234} + urlencoded_form_body = "key1=value1&key2=1234" + + def request2response(self, request, context): + return json.dumps({"body": request.text, "content_type": request.headers.get("Content-Type")}) + + def test_json_body(self, mocker, requests_mock): + + stream = PostHttpStream() + mocker.patch.object(stream, "request_body_json", return_value=self.json_body) + + requests_mock.register_uri("POST", stream.url_base, text=self.request2response) + response = list(stream.read_records(sync_mode=SyncMode.full_refresh))[0] + + assert response["content_type"] == "application/json" + assert json.loads(response["body"]) == self.json_body + + def test_text_body(self, mocker, requests_mock): + + stream = PostHttpStream() + mocker.patch.object(stream, "request_body_data", return_value=self.data_body) + + requests_mock.register_uri("POST", stream.url_base, text=self.request2response) + response = list(stream.read_records(sync_mode=SyncMode.full_refresh))[0] + + assert response["content_type"] is None + assert response["body"] == self.data_body + + def test_form_body(self, mocker, requests_mock): + + stream = PostHttpStream() + mocker.patch.object(stream, "request_body_data", return_value=self.form_body) + + requests_mock.register_uri("POST", stream.url_base, text=self.request2response) + response = list(stream.read_records(sync_mode=SyncMode.full_refresh))[0] + + assert response["content_type"] == "application/x-www-form-urlencoded" + assert response["body"] == self.urlencoded_form_body + + def test_text_json_body(self, mocker, requests_mock): + """checks a exception if both functions were overridden""" + stream = PostHttpStream() + mocker.patch.object(stream, "request_body_data", return_value=self.data_body) + mocker.patch.object(stream, "request_body_json", return_value=self.json_body) + requests_mock.register_uri("POST", stream.url_base, text=self.request2response) + with pytest.raises(RequestBodyException): + list(stream.read_records(sync_mode=SyncMode.full_refresh)) + + def test_body_for_all_methods(self, mocker, requests_mock): + """Stream must send a body for GET/POST/PATCH/PUT methods only""" + stream = PostHttpStream() + methods = { + "POST": True, + "PUT": True, + "PATCH": True, + "GET": True, + "DELETE": False, + "OPTIONS": False, + } + for method, 
with_body in methods.items(): + stream.http_method = method + mocker.patch.object(stream, "request_body_data", return_value=self.data_body) + requests_mock.register_uri(method, stream.url_base, text=self.request2response) + response = list(stream.read_records(sync_mode=SyncMode.full_refresh))[0] + if with_body: + assert response["body"] == self.data_body + else: + assert response["body"] is None + + +class CacheHttpStream(StubBasicReadHttpStream): + use_cache = True + + def get_json_schema(self) -> Mapping[str, Any]: + return {} + + +class CacheHttpSubStream(HttpSubStream): + url_base = "https://example.com" + primary_key = "" + + def __init__(self, parent): + super().__init__(parent=parent) + + def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: + return [] + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + return None + + def path(self, **kwargs) -> str: + return "" + + +def test_caching_filename(): + stream = CacheHttpStream() + assert stream.cache_filename == f"{stream.name}.sqlite" + + +def test_caching_sessions_are_different(): + stream_1 = CacheHttpStream() + stream_2 = CacheHttpStream() + + assert stream_1._http_client._session != stream_2._http_client._session + assert stream_1.cache_filename == stream_2.cache_filename + + +# def test_cached_streams_wortk_when_request_path_is_not_set(mocker, requests_mock): +# This test verifies that HttpStreams with a cached session work even if the path is not set +# For instance, when running in a unit test +# stream = CacheHttpStream() +# with mocker.patch.object(stream._session, "send", wraps=stream._session.send): +# requests_mock.register_uri("GET", stream.url_base) +# records = list(stream.read_records(sync_mode=SyncMode.full_refresh)) +# assert records == [{"data": 1}] +# "" + + +def test_parent_attribute_exist(): + parent_stream = CacheHttpStream() + child_stream = CacheHttpSubStream(parent=parent_stream) + + assert child_stream.parent == parent_stream + + +def test_that_response_was_cached(mocker, requests_mock): + requests_mock.register_uri("GET", "https://google.com/", text="text") + stream = CacheHttpStream() + stream._http_client.clear_cache() + mocker.patch.object(stream, "url_base", "https://google.com/") + records = list(stream.read_records(sync_mode=SyncMode.full_refresh)) + + assert requests_mock.called + + requests_mock.reset_mock() + new_records = list(stream.read_records(sync_mode=SyncMode.full_refresh)) + + assert len(records) == len(new_records) + assert not requests_mock.called + + +class CacheHttpStreamWithSlices(CacheHttpStream): + paths = ["", "search"] + + def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: + return f'{stream_slice["path"]}' if stream_slice else "" + + def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]: + for path in self.paths: + yield {"path": path} + + def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: + yield {"value": len(response.text)} + + +@patch("airbyte_cdk.sources.streams.core.logging", MagicMock()) +def test_using_cache(mocker, requests_mock): + requests_mock.register_uri("GET", "https://google.com/", text="text") + requests_mock.register_uri("GET", "https://google.com/search", text="text") + + parent_stream = CacheHttpStreamWithSlices() + mocker.patch.object(parent_stream, "url_base", "https://google.com/") + parent_stream._http_client._session.cache.clear() + + assert requests_mock.call_count == 0 + assert 
len(parent_stream._http_client._session.cache.responses) == 0 + + for _slice in parent_stream.stream_slices(): + list(parent_stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=_slice)) + + assert requests_mock.call_count == 2 + assert len(parent_stream._http_client._session.cache.responses) == 2 + + child_stream = CacheHttpSubStream(parent=parent_stream) + + for _slice in child_stream.stream_slices(sync_mode=SyncMode.full_refresh): + pass + + assert requests_mock.call_count == 2 + assert len(parent_stream._http_client._session.cache.responses) == 2 + assert parent_stream._http_client._session.cache.contains(url="https://google.com/") + assert parent_stream._http_client._session.cache.contains(url="https://google.com/search") + + +class AutoFailTrueHttpStream(StubBasicReadHttpStream): + raise_on_http_errors = True + + def should_retry(self, *args, **kwargs): + return True + + +@pytest.mark.parametrize( + "response_status_code,should_retry, raise_on_http_errors, expected_response_action", + [ + (300, True, True, ResponseAction.RETRY), + (200, False, True, ResponseAction.SUCCESS), + (503, False, True, ResponseAction.FAIL), + (503, False, False, ResponseAction.IGNORE), + ], +) +def test_http_stream_adapter_http_status_error_handler_should_retry_false_raise_on_http_errors( + mocker, response_status_code: int, should_retry: bool, raise_on_http_errors: bool, expected_response_action: ResponseAction +): + stream = AutoFailTrueHttpStream() + mocker.patch.object(stream, "should_retry", return_value=should_retry) + mocker.patch.object(stream, "raise_on_http_errors", raise_on_http_errors) + res = requests.Response() + res.status_code = response_status_code + error_handler = stream.get_error_handler() + error_resolution = error_handler.interpret_response(res) + assert error_resolution.response_action == expected_response_action + + +@pytest.mark.parametrize("status_code", range(400, 600)) +def test_send_raise_on_http_errors_logs(mocker, status_code): + mocker.patch("time.sleep", lambda x: None) + stream = AutoFailTrueHttpStream() + res = requests.Response() + res.status_code = status_code + mocker.patch.object(requests.Session, "send", return_value=res) + mocker.patch.object(stream._http_client, "_logger") + with pytest.raises(requests.exceptions.HTTPError): + response = stream._http_client.send_request("GET", "https://g", {}, exit_on_rate_limit=True) + stream._http_client.logger.error.assert_called_with(response.text) + assert response.status_code == status_code + + +@pytest.mark.parametrize( + "api_response, expected_message", + [ + ({"error": "something broke"}, "something broke"), + ({"error": {"message": "something broke"}}, "something broke"), + ({"error": "err-001", "message": "something broke"}, "something broke"), + ({"failure": {"message": "something broke"}}, "something broke"), + ({"error": {"errors": [{"message": "one"}, {"message": "two"}, {"message": "three"}]}}, "one, two, three"), + ({"errors": ["one", "two", "three"]}, "one, two, three"), + ({"messages": ["one", "two", "three"]}, "one, two, three"), + ({"errors": [{"message": "one"}, {"message": "two"}, {"message": "three"}]}, "one, two, three"), + ({"error": [{"message": "one"}, {"message": "two"}, {"message": "three"}]}, "one, two, three"), + ({"errors": [{"error": "one"}, {"error": "two"}, {"error": "three"}]}, "one, two, three"), + ({"failures": [{"message": "one"}, {"message": "two"}, {"message": "three"}]}, "one, two, three"), + (["one", "two", "three"], "one, two, three"), + ([{"error": "one"}, {"error": "two"}, 
{"error": "three"}], "one, two, three"), + ({"error": True}, None), + ({"something_else": "hi"}, None), + ({}, None), + ], +) +def test_default_parse_response_error_message(api_response: dict, expected_message: Optional[str]): + stream = StubBasicReadHttpStream() + response = MagicMock() + response.json.return_value = api_response + + message = stream.parse_response_error_message(response) + assert message == expected_message + + +def test_default_parse_response_error_message_not_json(requests_mock): + stream = StubBasicReadHttpStream() + requests_mock.register_uri("GET", "mock://test.com/not_json", text="this is not json") + response = requests.get("mock://test.com/not_json") + + message = stream.parse_response_error_message(response) + assert message is None + + +def test_default_get_error_display_message_handles_http_error(mocker): + stream = StubBasicReadHttpStream() + mocker.patch.object(stream, "parse_response_error_message", return_value="my custom message") + + non_http_err_msg = stream.get_error_display_message(RuntimeError("not me")) + assert non_http_err_msg is None + + response = requests.Response() + http_exception = requests.HTTPError(response=response) + http_err_msg = stream.get_error_display_message(http_exception) + assert http_err_msg == "my custom message" + + +@pytest.mark.parametrize( + "test_name, base_url, path, expected_full_url", + [ + ("test_no_slashes", "https://airbyte.io", "my_endpoint", "https://airbyte.io/my_endpoint"), + ("test_trailing_slash_on_base_url", "https://airbyte.io/", "my_endpoint", "https://airbyte.io/my_endpoint"), + ( + "test_trailing_slash_on_base_url_and_leading_slash_on_path", + "https://airbyte.io/", + "/my_endpoint", + "https://airbyte.io/my_endpoint", + ), + ("test_leading_slash_on_path", "https://airbyte.io", "/my_endpoint", "https://airbyte.io/my_endpoint"), + ("test_trailing_slash_on_path", "https://airbyte.io", "/my_endpoint/", "https://airbyte.io/my_endpoint/"), + ("test_nested_path_no_leading_slash", "https://airbyte.io", "v1/my_endpoint", "https://airbyte.io/v1/my_endpoint"), + ("test_nested_path_with_leading_slash", "https://airbyte.io", "/v1/my_endpoint", "https://airbyte.io/v1/my_endpoint"), + ], +) +def test_join_url(test_name, base_url, path, expected_full_url): + actual_url = HttpStream._join_url(base_url, path) + assert actual_url == expected_full_url + + +@pytest.mark.parametrize( + "deduplicate_query_params, path, params, expected_url", + [ + pytest.param( + True, "v1/endpoint?param1=value1", {}, "https://test_base_url.com/v1/endpoint?param1=value1", id="test_params_only_in_path" + ), + pytest.param( + True, "v1/endpoint", {"param1": "value1"}, "https://test_base_url.com/v1/endpoint?param1=value1", id="test_params_only_in_path" + ), + pytest.param(True, "v1/endpoint", None, "https://test_base_url.com/v1/endpoint", id="test_params_is_none_and_no_params_in_path"), + pytest.param( + True, + "v1/endpoint?param1=value1", + None, + "https://test_base_url.com/v1/endpoint?param1=value1", + id="test_params_is_none_and_no_params_in_path", + ), + pytest.param( + True, + "v1/endpoint?param1=value1", + {"param2": "value2"}, + "https://test_base_url.com/v1/endpoint?param1=value1¶m2=value2", + id="test_no_duplicate_params", + ), + pytest.param( + True, + "v1/endpoint?param1=value1", + {"param1": "value1"}, + "https://test_base_url.com/v1/endpoint?param1=value1", + id="test_duplicate_params_same_value", + ), + pytest.param( + True, + "v1/endpoint?param1=1", + {"param1": 1}, + "https://test_base_url.com/v1/endpoint?param1=1", + 
id="test_duplicate_params_same_value_not_string", + ), + pytest.param( + True, + "v1/endpoint?param1=value1", + {"param1": "value2"}, + "https://test_base_url.com/v1/endpoint?param1=value1¶m1=value2", + id="test_duplicate_params_different_value", + ), + pytest.param( + False, + "v1/endpoint?param1=value1", + {"param1": "value2"}, + "https://test_base_url.com/v1/endpoint?param1=value1¶m1=value2", + id="test_same_params_different_value_no_deduplication", + ), + pytest.param( + False, + "v1/endpoint?param1=value1", + {"param1": "value1"}, + "https://test_base_url.com/v1/endpoint?param1=value1¶m1=value1", + id="test_same_params_same_value_no_deduplication", + ), + ], +) +def test_duplicate_request_params_are_deduped(deduplicate_query_params, path, params, expected_url): + + stream = StubBasicReadHttpStream(deduplicate_query_params) + + if expected_url is None: + with pytest.raises(ValueError): + stream._http_client._create_prepared_request( + http_method=stream.http_method, + url=stream._join_url(stream.url_base, path), + params=params, + dedupe_query_params=deduplicate_query_params, + ) + else: + prepared_request = stream._http_client._create_prepared_request( + http_method=stream.http_method, + url=stream._join_url(stream.url_base, path), + params=params, + dedupe_query_params=deduplicate_query_params, + ) + assert prepared_request.url == expected_url + + +def test_connection_pool(): + stream = StubBasicReadHttpStream(authenticator=TokenAuthenticator("test-token")) + assert stream._http_client._session.adapters["https://"]._pool_connections == 20 + + +class StubParentHttpStream(HttpStream, CheckpointMixin): + primary_key = "primary_key" + + counter = 0 + + def __init__(self, records: List[Mapping[str, Any]]): + super().__init__() + self._records = records + self._state: MutableMapping[str, Any] = {} + + @property + def url_base(self) -> str: + return "https://airbyte.io/api/v1" + + def path( + self, + *, + stream_state: Optional[Mapping[str, Any]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> str: + return "/stub" + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + return {"__ab_full_refresh_sync_complete": True} + + def _read_single_page( + self, + records_generator_fn: Callable[ + [requests.PreparedRequest, requests.Response, Mapping[str, Any], Optional[Mapping[str, Any]]], Iterable[StreamData] + ], + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + yield from self._records + + self.state = {"__ab_full_refresh_sync_complete": True} + + def parse_response( + self, + response: requests.Response, + *, + stream_state: Mapping[str, Any], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any]]: + return [] + + def get_json_schema(self) -> Mapping[str, Any]: + return {} + + +class StubParentResumableFullRefreshStream(HttpStream, CheckpointMixin): + primary_key = "primary_key" + + counter = 0 + + def __init__(self, record_pages: List[List[Mapping[str, Any]]]): + super().__init__() + self._record_pages = record_pages + self._state: MutableMapping[str, Any] = {} + + @property + def url_base(self) -> str: + return "https://airbyte.io/api/v1" + + def path( + self, + *, + stream_state: Optional[Mapping[str, Any]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = 
None, + ) -> str: + return "/stub" + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + return {"__ab_full_refresh_sync_complete": True} + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + page_number = self.state.get("page") or 1 + yield from self._record_pages[page_number - 1] + + if page_number < len(self._record_pages): + self.state = {"page": page_number + 1} + else: + self.state = {"__ab_full_refresh_sync_complete": True} + + def parse_response( + self, + response: requests.Response, + *, + stream_state: Mapping[str, Any], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any]]: + return [] + + def get_json_schema(self) -> Mapping[str, Any]: + return {} + + +class StubHttpSubstream(HttpSubStream): + primary_key = "primary_key" + + @property + def url_base(self) -> str: + return "https://airbyte.io/api/v1" + + def path( + self, + *, + stream_state: Optional[Mapping[str, Any]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> str: + return "/stub" + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + return None + + def _read_pages( + self, + records_generator_fn: Callable[ + [requests.PreparedRequest, requests.Response, Mapping[str, Any], Optional[Mapping[str, Any]]], Iterable[StreamData] + ], + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + return [ + {"id": "abc", "parent": stream_slice.get("id")}, + {"id": "def", "parent": stream_slice.get("id")}, + ] + + def parse_response( + self, + response: requests.Response, + *, + stream_state: Mapping[str, Any], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any]]: + return [] + + +def test_substream_with_incremental_parent(): + expected_slices = [ + {"parent": {"id": "abc"}}, + {"parent": {"id": "def"}}, + ] + + parent_records = [ + {"id": "abc"}, + {"id": "def"}, + ] + + parent_stream = StubParentHttpStream(records=parent_records) + substream = StubHttpSubstream(parent=parent_stream) + + actual_slices = [slice for slice in substream.stream_slices(sync_mode=SyncMode.full_refresh)] + assert actual_slices == expected_slices + + +def test_substream_with_resumable_full_refresh_parent(): + parent_pages = [ + [ + {"id": "page_1_abc"}, + {"id": "page_1_def"}, + ], + [ + {"id": "page_2_abc"}, + {"id": "page_2_def"}, + ], + [ + {"id": "page_3_abc"}, + {"id": "page_3_def"}, + ], + ] + + expected_slices = [ + {"parent": {"id": "page_1_abc"}}, + {"parent": {"id": "page_1_def"}}, + {"parent": {"id": "page_2_abc"}}, + {"parent": {"id": "page_2_def"}}, + {"parent": {"id": "page_3_abc"}}, + {"parent": {"id": "page_3_def"}}, + ] + + parent_stream = StubParentResumableFullRefreshStream(record_pages=parent_pages) + substream = StubHttpSubstream(parent=parent_stream) + + actual_slices = [slice for slice in substream.stream_slices(sync_mode=SyncMode.full_refresh)] + assert actual_slices == expected_slices + + +def test_substream_skips_non_record_messages(): + expected_slices = [ + {"parent": {"id": "abc"}}, + {"parent": {"id": "def"}}, + {"parent": {"id": "ghi"}}, + ] + +
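+ # The parent stream emits an AirbyteLogMessage in between its records below; building slices from the
+ # parent is expected to skip any non-record message, so only the three id records become slices.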
parent_records = [ + {"id": "abc"}, + AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=Level.INFO, message="should_not_be_parent_record")), + {"id": "def"}, + {"id": "ghi"}, + ] + + parent_stream = StubParentHttpStream(records=parent_records) + substream = StubHttpSubstream(parent=parent_stream) + + actual_slices = [slice for slice in substream.stream_slices(sync_mode=SyncMode.full_refresh)] + assert actual_slices == expected_slices + + +class StubFullRefreshHttpStream(HttpStream): + url_base = "https://test_base_url.com" + primary_key = "id" + + def __init__(self, deduplicate_query_params: bool = False, pages: int = 5, **kwargs): + super().__init__(**kwargs) + self._pages_request_count = 0 + self._page_counter = 0 + self.resp_counter = 0 + self._deduplicate_query_params = deduplicate_query_params + self._pages = pages + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + current_page = self.cursor.get_stream_state().get("page", 1) + if current_page < self._pages: + current_page += 1 + page_token = {"page": current_page} + return page_token + return None + + def path(self, **kwargs) -> str: + return "" + + def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]: + self.resp_counter += 1 + stubResp = {"data": self.resp_counter} + yield stubResp + + def must_deduplicate_query_params(self) -> bool: + return self._deduplicate_query_params + + +class StubFullRefreshLegacySliceHttpStream(StubFullRefreshHttpStream): + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + yield from [{}] + + +def test_resumable_full_refresh_read_from_start(mocker): + """ + Validates the default behavior of a stream that supports resumable full refresh by using read_records() which gets one + page per invocation and emits state afterward. + parses over + """ + pages = 5 + stream = StubFullRefreshHttpStream(pages=pages) + blank_response = {} # Send a blank response is fine as we ignore the response in `parse_response anyway. + mocker.patch.object(stream._http_client, "send_request", return_value=(None, blank_response)) + + # Wrap all methods we're interested in testing with mocked objects to spy on their input args and verify they were what we expect + mocker.patch.object(stream, "_read_single_page", wraps=getattr(stream, "_read_single_page")) + methods = ["request_params", "request_headers", "request_body_json"] + for method in methods: + mocker.patch.object(stream, method, wraps=getattr(stream, method)) + + checkpoint_reader = stream._get_checkpoint_reader( + cursor_field=[], logger=logging.getLogger("airbyte"), sync_mode=SyncMode.full_refresh, stream_state={} + ) + next_stream_slice = checkpoint_reader.next() + records = [] + + expected_checkpoints = [{"page": 2}, {"page": 3}, {"page": 4}, {"page": 5}, {"__ab_full_refresh_sync_complete": True}] + i = 0 + while next_stream_slice is not None: + next_records = list(stream.read_records(SyncMode.full_refresh, stream_slice=next_stream_slice)) + records.extend(next_records) + checkpoint_reader.observe(stream.state) + assert checkpoint_reader.get_checkpoint() == expected_checkpoints[i] + next_stream_slice = checkpoint_reader.next() + i += 1 + + assert getattr(stream, "_read_single_page").call_count == 5 + + # Since we have 5 pages, and we don't pass in the first page, we expect 4 tokens starting at {"page":2}, {"page":3}, etc... 
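+ # The terminal {"__ab_full_refresh_sync_complete": True} checkpoint is excluded here because it only
+ # marks completion of the sync and is never passed back to the request methods as a page token.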
+ expected_next_page_tokens = expected_checkpoints[:4] + for method in methods: + # First assert that they were called with no next_page_token. This is the first call in the pagination loop. + getattr(stream, method).assert_any_call(next_page_token=None, stream_slice={}, stream_state={}) + for token in expected_next_page_tokens: + # Then verify that each method + getattr(stream, method).assert_any_call(next_page_token=token, stream_slice=token, stream_state={}) + + expected = [{"data": 1}, {"data": 2}, {"data": 3}, {"data": 4}, {"data": 5}] + + assert records == expected + + +def test_resumable_full_refresh_read_from_state(mocker): + """ + Validates the default behavior of a stream that supports resumable full refresh with an incoming state by using + read_records() which gets one page per invocation and emits state afterward. + parses over + """ + pages = 5 + stream = StubFullRefreshHttpStream(pages=pages) + blank_response = {} # Send a blank response is fine as we ignore the response in `parse_response anyway. + mocker.patch.object(stream._http_client, "send_request", return_value=(None, blank_response)) + + # Wrap all methods we're interested in testing with mocked objects to spy on their input args and verify they were what we expect + mocker.patch.object(stream, "_read_single_page", wraps=getattr(stream, "_read_single_page")) + methods = ["request_params", "request_headers", "request_body_json"] + for method in methods: + mocker.patch.object(stream, method, wraps=getattr(stream, method)) + + checkpoint_reader = stream._get_checkpoint_reader( + cursor_field=[], logger=logging.getLogger("airbyte"), sync_mode=SyncMode.full_refresh, stream_state={"page": 3} + ) + next_stream_slice = checkpoint_reader.next() + records = [] + + expected_checkpoints = [{"page": 4}, {"page": 5}, {"__ab_full_refresh_sync_complete": True}] + i = 0 + while next_stream_slice is not None: + next_records = list(stream.read_records(SyncMode.full_refresh, stream_slice=next_stream_slice)) + records.extend(next_records) + checkpoint_reader.observe(stream.state) + assert checkpoint_reader.get_checkpoint() == expected_checkpoints[i] + next_stream_slice = checkpoint_reader.next() + i += 1 + + assert getattr(stream, "_read_single_page").call_count == 3 + + # Since we start at page 3, we expect 3 tokens starting at {"page":3}, {"page":4}, etc... + expected_next_page_tokens = [{"page": 3}, {"page": 4}, {"page": 5}] + for method in methods: + for token in expected_next_page_tokens: + # Then verify that each method + getattr(stream, method).assert_any_call(next_page_token=token, stream_slice=token, stream_state={}) + + expected = [{"data": 1}, {"data": 2}, {"data": 3}] + + assert records == expected + + +def test_resumable_full_refresh_legacy_stream_slice(mocker): + """ + Validates the default behavior of a stream that supports resumable full refresh where incoming stream slices use the + legacy Mapping format + """ + pages = 5 + stream = StubFullRefreshLegacySliceHttpStream(pages=pages) + blank_response = {} # Send a blank response is fine as we ignore the response in `parse_response anyway. 
+ mocker.patch.object(stream._http_client, "send_request", return_value=(None, blank_response)) + + # Wrap all methods we're interested in testing with mocked objects to spy on their input args and verify they were what we expect + mocker.patch.object(stream, "_read_single_page", wraps=getattr(stream, "_read_single_page")) + methods = ["request_params", "request_headers", "request_body_json"] + for method in methods: + mocker.patch.object(stream, method, wraps=getattr(stream, method)) + + checkpoint_reader = stream._get_checkpoint_reader( + cursor_field=[], logger=logging.getLogger("airbyte"), sync_mode=SyncMode.full_refresh, stream_state={"page": 2} + ) + next_stream_slice = checkpoint_reader.next() + records = [] + + expected_checkpoints = [{"page": 3}, {"page": 4}, {"page": 5}, {"__ab_full_refresh_sync_complete": True}] + i = 0 + while next_stream_slice is not None: + next_records = list(stream.read_records(SyncMode.full_refresh, stream_slice=next_stream_slice)) + records.extend(next_records) + checkpoint_reader.observe(stream.state) + assert checkpoint_reader.get_checkpoint() == expected_checkpoints[i] + next_stream_slice = checkpoint_reader.next() + i += 1 + + assert getattr(stream, "_read_single_page").call_count == 4 + + # Since the incoming state starts us at page 2, we expect 4 tokens starting at {"page":2}, {"page":3}, etc... + expected_next_page_tokens = [{"page": 2}, {"page": 3}, {"page": 4}, {"page": 5}] + for method in methods: + for token in expected_next_page_tokens: + # Then verify that each method was called with each expected token + getattr(stream, method).assert_any_call(next_page_token=token, stream_slice=token, stream_state={}) + + expected = [{"data": 1}, {"data": 2}, {"data": 3}, {"data": 4}] + + assert records == expected + + +class StubSubstreamResumableFullRefreshStream(HttpSubStream, CheckpointMixin): + primary_key = "primary_key" + + counter = 0 + + def __init__(self, parent: HttpStream, partition_id_to_child_records: Mapping[str, List[Mapping[str, Any]]]): + super().__init__(parent=parent) + self._partition_id_to_child_records = partition_id_to_child_records + # self._state: MutableMapping[str, Any] = {} + + @property + def url_base(self) -> str: + return "https://airbyte.io/api/v1" + + def path( + self, + *, + stream_state: Optional[Mapping[str, Any]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> str: + return f"/parents/{stream_slice.get('parent_id')}/children" + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + return None + + # def read_records( + # self, + # sync_mode: SyncMode, + # cursor_field: Optional[List[str]] = None, + # stream_slice: Optional[Mapping[str, Any]] = None, + # stream_state: Optional[Mapping[str, Any]] = None, + # ) -> Iterable[StreamData]: + # page_number = self.state.get("page") or 1 + # yield from self._record_pages[page_number - 1] + # + # if page_number < len(self._record_pages): + # self.state = {"page": page_number + 1} + # else: + # self.state = {"__ab_full_refresh_sync_complete": True} + + def _fetch_next_page( + self, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Tuple[requests.PreparedRequest, requests.Response]: + return requests.PreparedRequest(), requests.Response() + + def parse_response( + self, + response: requests.Response, + *, + stream_state: Mapping[str, Any], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token:
Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any]]: + partition_id = stream_slice.get("parent").get("parent_id") + if partition_id in self._partition_id_to_child_records: + yield from self._partition_id_to_child_records.get(partition_id) + else: + raise Exception(f"No mocked output supplied for parent partition_id: {partition_id}") + + def get_json_schema(self) -> Mapping[str, Any]: + return {} + + +def test_substream_resumable_full_refresh_read_from_start(mocker): + """ + Validates the default behavior of a stream that supports resumable full refresh by using read_records() which gets one + page per invocation and emits state afterward. + parses over + """ + + parent_records = [ + {"parent_id": "100", "name": "christopher_nolan"}, + {"parent_id": "101", "name": "celine_song"}, + {"parent_id": "102", "name": "david_fincher"}, + ] + parent_stream = StubParentHttpStream(records=parent_records) + + parents_to_children_records = { + "100": [ + {"id": "a200", "parent_id": "100", "film": "interstellar"}, + {"id": "a201", "parent_id": "100", "film": "oppenheimer"}, + {"id": "a202", "parent_id": "100", "film": "inception"}, + ], + "101": [{"id": "b200", "parent_id": "101", "film": "past_lives"}, {"id": "b201", "parent_id": "101", "film": "materialists"}], + "102": [ + {"id": "c200", "parent_id": "102", "film": "the_social_network"}, + {"id": "c201", "parent_id": "102", "film": "gone_girl"}, + {"id": "c202", "parent_id": "102", "film": "the_curious_case_of_benjamin_button"}, + ], + } + stream = StubSubstreamResumableFullRefreshStream(parent=parent_stream, partition_id_to_child_records=parents_to_children_records) + + blank_response = {} # Send a blank response is fine as we ignore the response in `parse_response anyway. + mocker.patch.object(stream._http_client, "send_request", return_value=(None, blank_response)) + + # Wrap all methods we're interested in testing with mocked objects to spy on their input args and verify they were what we expect + mocker.patch.object(stream, "_read_pages", wraps=getattr(stream, "_read_pages")) + + checkpoint_reader = stream._get_checkpoint_reader( + cursor_field=[], logger=logging.getLogger("airbyte"), sync_mode=SyncMode.full_refresh, stream_state={} + ) + next_stream_slice = checkpoint_reader.next() + records = [] + + expected_checkpoints = [ + { + "states": [ + { + "cursor": {"__ab_full_refresh_sync_complete": True}, + "partition": {"parent": {"name": "christopher_nolan", "parent_id": "100"}}, + } + ] + }, + { + "states": [ + { + "cursor": {"__ab_full_refresh_sync_complete": True}, + "partition": {"parent": {"name": "christopher_nolan", "parent_id": "100"}}, + }, + {"cursor": {"__ab_full_refresh_sync_complete": True}, "partition": {"parent": {"name": "celine_song", "parent_id": "101"}}}, + ] + }, + { + "states": [ + { + "cursor": {"__ab_full_refresh_sync_complete": True}, + "partition": {"parent": {"name": "christopher_nolan", "parent_id": "100"}}, + }, + {"cursor": {"__ab_full_refresh_sync_complete": True}, "partition": {"parent": {"name": "celine_song", "parent_id": "101"}}}, + { + "cursor": {"__ab_full_refresh_sync_complete": True}, + "partition": {"parent": {"name": "david_fincher", "parent_id": "102"}}, + }, + ] + }, + ] + + i = 0 + while next_stream_slice is not None: + next_records = list(stream.read_records(SyncMode.full_refresh, stream_slice=next_stream_slice)) + records.extend(next_records) + checkpoint_reader.observe(stream.state) + assert checkpoint_reader.get_checkpoint() == expected_checkpoints[i] + next_stream_slice = 
checkpoint_reader.next() + i += 1 + + assert getattr(stream, "_read_pages").call_count == 3 + + expected = [ + {"film": "interstellar", "id": "a200", "parent_id": "100"}, + {"film": "oppenheimer", "id": "a201", "parent_id": "100"}, + {"film": "inception", "id": "a202", "parent_id": "100"}, + {"film": "past_lives", "id": "b200", "parent_id": "101"}, + {"film": "materialists", "id": "b201", "parent_id": "101"}, + {"film": "the_social_network", "id": "c200", "parent_id": "102"}, + {"film": "gone_girl", "id": "c201", "parent_id": "102"}, + {"film": "the_curious_case_of_benjamin_button", "id": "c202", "parent_id": "102"}, + ] + + assert records == expected + + +def test_substream_resumable_full_refresh_read_from_state(mocker): + """ + Validates the default behavior of a stream that supports resumable full refresh by using read_records() which gets one + page per invocation and emits state afterward. + parses over + """ + + parent_records = [ + {"parent_id": "100", "name": "christopher_nolan"}, + {"parent_id": "101", "name": "celine_song"}, + ] + parent_stream = StubParentHttpStream(records=parent_records) + + parents_to_children_records = { + "100": [ + {"id": "a200", "parent_id": "100", "film": "interstellar"}, + {"id": "a201", "parent_id": "100", "film": "oppenheimer"}, + {"id": "a202", "parent_id": "100", "film": "inception"}, + ], + "101": [{"id": "b200", "parent_id": "101", "film": "past_lives"}, {"id": "b201", "parent_id": "101", "film": "materialists"}], + } + stream = StubSubstreamResumableFullRefreshStream(parent=parent_stream, partition_id_to_child_records=parents_to_children_records) + + blank_response = {} # Send a blank response is fine as we ignore the response in `parse_response anyway. + mocker.patch.object(stream._http_client, "send_request", return_value=(None, blank_response)) + + # Wrap all methods we're interested in testing with mocked objects to spy on their input args and verify they were what we expect + mocker.patch.object(stream, "_read_pages", wraps=getattr(stream, "_read_pages")) + + checkpoint_reader = stream._get_checkpoint_reader( + cursor_field=[], + logger=logging.getLogger("airbyte"), + sync_mode=SyncMode.full_refresh, + stream_state={ + "states": [ + { + "cursor": {"__ab_full_refresh_sync_complete": True}, + "partition": {"parent": {"name": "christopher_nolan", "parent_id": "100"}}, + }, + ] + }, + ) + next_stream_slice = checkpoint_reader.next() + records = [] + + expected_checkpoints = [ + { + "states": [ + { + "cursor": {"__ab_full_refresh_sync_complete": True}, + "partition": {"parent": {"name": "christopher_nolan", "parent_id": "100"}}, + }, + {"cursor": {"__ab_full_refresh_sync_complete": True}, "partition": {"parent": {"name": "celine_song", "parent_id": "101"}}}, + ] + }, + ] + + i = 0 + while next_stream_slice is not None: + next_records = list(stream.read_records(SyncMode.full_refresh, stream_slice=next_stream_slice)) + records.extend(next_records) + checkpoint_reader.observe(stream.state) + assert checkpoint_reader.get_checkpoint() == expected_checkpoints[i] + next_stream_slice = checkpoint_reader.next() + i += 1 + + assert getattr(stream, "_read_pages").call_count == 1 + + expected = [ + {"film": "past_lives", "id": "b200", "parent_id": "101"}, + {"film": "materialists", "id": "b201", "parent_id": "101"}, + ] + + assert records == expected + + +class StubWithCursorFields(StubBasicReadHttpStream): + def __init__(self, has_multiple_slices: bool, set_cursor_field: List[str], deduplicate_query_params: bool = False, **kwargs): + 
self.has_multiple_slices = has_multiple_slices + self._cursor_field = set_cursor_field + super().__init__() + + @property + def cursor_field(self) -> Union[str, List[str]]: + return self._cursor_field + + +@pytest.mark.parametrize( + "cursor_field, is_substream, expected_cursor", + [ + pytest.param([], False, ResumableFullRefreshCursor(), id="test_stream_supports_resumable_full_refresh_cursor"), + pytest.param(["updated_at"], False, None, id="test_incremental_stream_does_not_use_cursor"), + pytest.param(["updated_at"], True, None, id="test_incremental_substream_does_not_use_cursor"), + pytest.param( + [], + True, + SubstreamResumableFullRefreshCursor(), + id="test_full_refresh_substream_automatically_applies_substream_resumable_full_refresh_cursor", + ), + ], +) +def test_get_cursor(cursor_field, is_substream, expected_cursor): + stream = StubWithCursorFields(set_cursor_field=cursor_field, has_multiple_slices=is_substream) + actual_cursor = stream.get_cursor() + + assert actual_cursor == expected_cursor diff --git a/airbyte-cdk/python/unit_tests/sources/streams/http/test_http_client.py b/airbyte-cdk/python/unit_tests/sources/streams/http/test_http_client.py new file mode 100644 index 000000000000..0c0f3c62b739 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/http/test_http_client.py @@ -0,0 +1,570 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +import logging +from datetime import timedelta +from unittest.mock import MagicMock, patch + +import pytest +import requests +from airbyte_cdk.models import FailureType +from airbyte_cdk.sources.streams.call_rate import CachedLimiterSession, LimiterSession +from airbyte_cdk.sources.streams.http import HttpClient +from airbyte_cdk.sources.streams.http.error_handlers import BackoffStrategy, ErrorResolution, HttpStatusErrorHandler, ResponseAction +from airbyte_cdk.sources.streams.http.exceptions import DefaultBackoffException, RequestBodyException, UserDefinedBackoffException +from airbyte_cdk.sources.streams.http.requests_native_auth import TokenAuthenticator +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from requests_cache import CachedRequest + + +def test_http_client(): + return HttpClient(name="StubHttpClient", logger=MagicMock()) + + +def test_cache_http_client(): + return HttpClient(name="StubCacheHttpClient", logger=MagicMock(), use_cache=True) + + +def test_cache_filename(): + http_client = test_http_client() + assert http_client.cache_filename == f"{http_client._name}.sqlite" + + +@pytest.mark.parametrize( + "use_cache, expected_session", + [ + (True, CachedLimiterSession), + (False, LimiterSession), + ], +) +def test_request_session_returns_valid_session(use_cache, expected_session): + http_client = HttpClient(name="test", logger=MagicMock(), use_cache=use_cache) + assert isinstance(http_client._request_session(), expected_session) + + +@pytest.mark.parametrize( + "deduplicate_query_params, url, params, expected_url", + [ + pytest.param( + True, + "https://test_base_url.com/v1/endpoint?param1=value1", + {}, + "https://test_base_url.com/v1/endpoint?param1=value1", + id="test_params_only_in_path", + ), + pytest.param( + True, + "https://test_base_url.com/v1/endpoint", + {"param1": "value1"}, + "https://test_base_url.com/v1/endpoint?param1=value1", + id="test_params_only_in_path", + ), + pytest.param( + True, + "https://test_base_url.com/v1/endpoint", + None, + "https://test_base_url.com/v1/endpoint", + id="test_params_is_none_and_no_params_in_path", + ), + pytest.param( + True, +
"https://test_base_url.com/v1/endpoint?param1=value1", + None, + "https://test_base_url.com/v1/endpoint?param1=value1", + id="test_params_is_none_and_no_params_in_path", + ), + pytest.param( + True, + "https://test_base_url.com/v1/endpoint?param1=value1", + {"param2": "value2"}, + "https://test_base_url.com/v1/endpoint?param1=value1¶m2=value2", + id="test_no_duplicate_params", + ), + pytest.param( + True, + "https://test_base_url.com/v1/endpoint?param1=value1", + {"param1": "value1"}, + "https://test_base_url.com/v1/endpoint?param1=value1", + id="test_duplicate_params_same_value", + ), + pytest.param( + True, + "https://test_base_url.com/v1/endpoint?param1=1", + {"param1": 1}, + "https://test_base_url.com/v1/endpoint?param1=1", + id="test_duplicate_params_same_value_not_string", + ), + pytest.param( + True, + "https://test_base_url.com/v1/endpoint?param1=value1", + {"param1": "value2"}, + "https://test_base_url.com/v1/endpoint?param1=value1¶m1=value2", + id="test_duplicate_params_different_value", + ), + pytest.param( + False, + "https://test_base_url.com/v1/endpoint?param1=value1", + {"param1": "value2"}, + "https://test_base_url.com/v1/endpoint?param1=value1¶m1=value2", + id="test_same_params_different_value_no_deduplication", + ), + pytest.param( + False, + "https://test_base_url.com/v1/endpoint?param1=value1", + {"param1": "value1"}, + "https://test_base_url.com/v1/endpoint?param1=value1¶m1=value1", + id="test_same_params_same_value_no_deduplication", + ), + ], +) +def test_duplicate_request_params_are_deduped(deduplicate_query_params, url, params, expected_url): + http_client = test_http_client() + + if expected_url is None: + with pytest.raises(ValueError): + http_client._create_prepared_request(http_method="get", url=url, dedupe_query_params=deduplicate_query_params, params=params) + else: + prepared_request = http_client._create_prepared_request( + http_method="get", url=url, dedupe_query_params=deduplicate_query_params, params=params + ) + assert prepared_request.url == expected_url + + +def test_create_prepared_response_given_given_both_json_and_data_raises_request_body_exception(): + http_client = test_http_client() + + with pytest.raises(RequestBodyException): + http_client._create_prepared_request( + http_method="get", url="https://test_base_url.com/v1/endpoint", json={"test": "json"}, data={"test": "data"} + ) + + +@pytest.mark.parametrize( + "json, data", + [ + ({"test": "json"}, None), + (None, {"test": "data"}), + ], +) +def test_create_prepared_response_given_either_json_or_data_returns_valid_request(json, data): + http_client = test_http_client() + prepared_request = http_client._create_prepared_request( + http_method="get", url="https://test_base_url.com/v1/endpoint", json=json, data=data + ) + assert prepared_request + assert isinstance(prepared_request, requests.PreparedRequest) + + +def test_connection_pool(): + http_client = HttpClient(name="test", logger=MagicMock(), authenticator=TokenAuthenticator("test-token")) + assert http_client._session.adapters["https://"]._pool_connections == 20 + + +def test_valid_basic_send_request(mocker): + http_client = test_http_client() + mocked_response = MagicMock(spec=requests.Response) + mocked_response.status_code = 200 + mocked_response.headers = {} + mocker.patch.object(requests.Session, "send", return_value=mocked_response) + returned_request, returned_response = http_client.send_request( + http_method="get", url="https://test_base_url.com/v1/endpoint", request_kwargs={} + ) + + assert isinstance(returned_request, 
requests.PreparedRequest) + assert returned_response == mocked_response + + +def test_send_raises_airbyte_traced_exception_with_fail_response_action(): + mocked_session = MagicMock(spec=requests.Session) + http_client = HttpClient( + name="test", + logger=MagicMock(), + error_handler=HttpStatusErrorHandler( + logger=MagicMock(), error_mapping={400: ErrorResolution(ResponseAction.FAIL, FailureType.system_error, "test error message")} + ), + session=mocked_session, + ) + prepared_request = requests.PreparedRequest() + mocked_response = requests.Response() + mocked_response.status_code = 400 + mocked_session.send.return_value = mocked_response + + with pytest.raises(AirbyteTracedException): + http_client._send(prepared_request, {}) + + assert http_client._session.send.call_count == 1 + + +def test_send_ignores_with_ignore_reponse_action_and_returns_response(): + mocked_session = MagicMock(spec=requests.Session) + mocked_response = MagicMock(spec=requests.Response) + mocked_response.status_code = 300 + mocked_response.headers = {} + mocked_session.send.return_value = mocked_response + mocked_logger = MagicMock(spec=logging.Logger) + http_client = HttpClient( + name="test", + logger=mocked_logger, + error_handler=HttpStatusErrorHandler( + logger=MagicMock(), error_mapping={300: ErrorResolution(ResponseAction.IGNORE, FailureType.system_error, "test ignore message")} + ), + session=mocked_session, + ) + + prepared_request = http_client._create_prepared_request(http_method="get", url="https://test_base_url.com/v1/endpoint") + + returned_response = http_client._send(prepared_request, {}) + + mocked_logger.info.call_count == 1 + assert isinstance(returned_response, requests.Response) + assert returned_response == mocked_response + + +class CustomBackoffStrategy(BackoffStrategy): + def __init__(self, backoff_time_value: float) -> None: + self._backoff_time_value = backoff_time_value + + def backoff_time(self, *args, **kwargs) -> float: + return self._backoff_time_value + + +@pytest.mark.parametrize("backoff_time_value, exception_type", [(0.1, UserDefinedBackoffException), (None, DefaultBackoffException)]) +def test_raises_backoff_exception_with_retry_response_action(mocker, backoff_time_value, exception_type): + http_client = HttpClient( + name="test", + logger=MagicMock(), + error_handler=HttpStatusErrorHandler( + logger=MagicMock(), error_mapping={408: ErrorResolution(ResponseAction.FAIL, FailureType.system_error, "test retry message")} + ), + backoff_strategy=CustomBackoffStrategy(backoff_time_value=backoff_time_value), + ) + prepared_request = http_client._create_prepared_request(http_method="get", url="https://test_base_url.com/v1/endpoint") + mocked_response = MagicMock(spec=requests.Response) + mocked_response.status_code = 408 + mocked_response.headers = {} + http_client._logger.info = MagicMock() + + mocker.patch.object(requests.Session, "send", return_value=mocked_response) + mocker.patch.object( + http_client._error_handler, + "interpret_response", + return_value=ErrorResolution(ResponseAction.RETRY, FailureType.system_error, "test retry message"), + ) + + with pytest.raises(exception_type): + http_client._send(prepared_request, {}) + + +@pytest.mark.parametrize("backoff_time_value, exception_type", [(0.1, UserDefinedBackoffException), (None, DefaultBackoffException)]) +def test_raises_backoff_exception_with_response_with_unmapped_error(mocker, backoff_time_value, exception_type): + http_client = HttpClient( + name="test", + logger=MagicMock(), + error_handler=HttpStatusErrorHandler( + 
logger=MagicMock(), error_mapping={408: ErrorResolution(ResponseAction.FAIL, FailureType.system_error, "test retry message")} + ), + backoff_strategy=CustomBackoffStrategy(backoff_time_value=backoff_time_value), + ) + prepared_request = requests.PreparedRequest() + mocked_response = MagicMock(spec=requests.Response) + mocked_response.status_code = 508 + mocked_response.headers = {} + mocked_response.ok = False + + mocker.patch.object(requests.Session, "send", return_value=mocked_response) + + with pytest.raises(exception_type): + http_client._send(prepared_request, {}) + + +@pytest.mark.usefixtures("mock_sleep") +def test_send_request_given_retry_response_action_retries_and_returns_valid_response(): + mocked_session = MagicMock(spec=requests.Session) + valid_response = MagicMock(spec=requests.Response) + valid_response.status_code = 200 + valid_response.ok = True + valid_response.headers = {} + call_count = 2 + + def update_response(*args, **kwargs): + if http_client._session.send.call_count == call_count: + return valid_response + else: + retry_response = MagicMock(spec=requests.Response) + retry_response.ok = False + retry_response.status_code = 408 + retry_response.headers = {} + return retry_response + + mocked_session.send.side_effect = update_response + + http_client = HttpClient( + name="test", + logger=MagicMock(), + error_handler=HttpStatusErrorHandler( + logger=MagicMock(), error_mapping={408: ErrorResolution(ResponseAction.RETRY, FailureType.system_error, "test retry message")} + ), + session=mocked_session, + ) + + prepared_request = requests.PreparedRequest() + + returned_response = http_client._send_with_retry(prepared_request, request_kwargs={}) + + assert http_client._session.send.call_count == call_count + assert returned_response == valid_response + + +def test_session_request_exception_raises_backoff_exception(): + error_handler = HttpStatusErrorHandler( + logger=MagicMock(), + error_mapping={ + requests.exceptions.RequestException: ErrorResolution(ResponseAction.RETRY, FailureType.system_error, "test retry message") + }, + ) + mocked_session = MagicMock(spec=requests.Session) + mocked_session.send.side_effect = requests.RequestException + http_client = HttpClient(name="test", logger=MagicMock(), error_handler=error_handler, session=mocked_session) + prepared_request = requests.PreparedRequest() + + with pytest.raises(DefaultBackoffException): + http_client._send(prepared_request, {}) + + +def test_that_response_was_cached(requests_mock): + cached_http_client = test_cache_http_client() + + assert isinstance(cached_http_client._session, CachedLimiterSession) + + cached_http_client._session.cache.clear() + + prepared_request = cached_http_client._create_prepared_request(http_method="GET", url="https://google.com/") + + requests_mock.register_uri("GET", "https://google.com/", json='{"test": "response"}') + + cached_http_client._send(prepared_request, {}) + + assert requests_mock.called + requests_mock.reset_mock() + + second_response = cached_http_client._send(prepared_request, {}) + + assert isinstance(second_response.request, CachedRequest) + assert not requests_mock.called + + +def test_send_handles_response_action_given_session_send_raises_request_exception(): + error_resolution = ErrorResolution(ResponseAction.FAIL, FailureType.system_error, "test fail message") + + custom_error_handler = HttpStatusErrorHandler(logger=MagicMock(), error_mapping={requests.RequestException: error_resolution}) + + mocked_session = MagicMock(spec=requests.Session) + 
mocked_session.send.side_effect = requests.RequestException + + http_client = HttpClient(name="test", logger=MagicMock(), error_handler=custom_error_handler, session=mocked_session) + prepared_request = requests.PreparedRequest() + + with pytest.raises(AirbyteTracedException) as e: + http_client._send(prepared_request, {}) + assert e.internal_message == error_resolution.error_message + assert e.message == error_resolution.error_message + assert e.failure_type == error_resolution.failure_type + + +@pytest.mark.usefixtures("mock_sleep") +def test_send_request_given_request_exception_and_retry_response_action_retries_and_returns_valid_response(): + mocked_session = MagicMock(spec=requests.Session) + + def update_response(*args, **kwargs): + if mocked_session.send.call_count == call_count: + return valid_response + else: + raise requests.RequestException() + + mocked_session.send.side_effect = update_response + + valid_response = MagicMock(spec=requests.Response) + valid_response.status_code = 200 + valid_response.ok = True + valid_response.headers = {} + call_count = 2 + + http_client = HttpClient( + name="test", + logger=MagicMock(), + error_handler=HttpStatusErrorHandler( + logger=MagicMock(), error_mapping={408: ErrorResolution(ResponseAction.RETRY, FailureType.system_error, "test retry message")} + ), + session=mocked_session, + ) + + prepared_request = requests.PreparedRequest() + + returned_response = http_client._send_with_retry(prepared_request, request_kwargs={}) + + assert http_client._session.send.call_count == call_count + assert returned_response == valid_response + + +def test_disable_retries(): + class BackoffStrategy: + def backoff_time(self, *args, **kwargs): + return 0.001 + + http_client = HttpClient( + name="test", + logger=MagicMock(), + error_handler=HttpStatusErrorHandler(logger=MagicMock()), + backoff_strategy=BackoffStrategy(), + disable_retries=True, + ) + + mocked_response = MagicMock(spec=requests.Response) + mocked_response.status_code = 429 + mocked_response.headers = {} + mocked_response.ok = False + session_send = MagicMock(spec=requests.Session.send) + session_send.return_value = mocked_response + + with patch.object(requests.Session, "send", return_value=mocked_response) as mocked_send: + with pytest.raises(UserDefinedBackoffException): + http_client.send_request(http_method="get", url="https://test_base_url.com/v1/endpoint", request_kwargs={}) + assert mocked_send.call_count == 1 + + +@pytest.mark.usefixtures("mock_sleep") +def test_default_max_retries(): + class BackoffStrategy: + def backoff_time(self, *args, **kwargs): + return 0.001 + + http_client = HttpClient( + name="test", logger=MagicMock(), error_handler=HttpStatusErrorHandler(logger=MagicMock()), backoff_strategy=BackoffStrategy() + ) + + mocked_response = MagicMock(spec=requests.Response) + mocked_response.status_code = 429 + mocked_response.headers = {} + mocked_response.ok = False + session_send = MagicMock(spec=requests.Session.send) + session_send.return_value = mocked_response + + with patch.object(requests.Session, "send", return_value=mocked_response) as mocked_send: + with pytest.raises(UserDefinedBackoffException): + http_client.send_request(http_method="get", url="https://test_base_url.com/v1/endpoint", request_kwargs={}) + assert mocked_send.call_count == 6 + + +@pytest.mark.usefixtures("mock_sleep") +def test_backoff_strategy_max_retries(): + class BackoffStrategy: + def backoff_time(self, *args, **kwargs): + return 0.001 + + retries = 3 + + http_client = HttpClient( + name="test", + 
logger=MagicMock(), + error_handler=HttpStatusErrorHandler(logger=MagicMock(), max_retries=retries), + backoff_strategy=BackoffStrategy(), + ) + + mocked_response = MagicMock(spec=requests.Response) + mocked_response.status_code = 429 + mocked_response.headers = {} + mocked_response.ok = False + session_send = MagicMock(spec=requests.Session.send) + session_send.return_value = mocked_response + + with patch.object(requests.Session, "send", return_value=mocked_response) as mocked_send: + with pytest.raises(UserDefinedBackoffException): + http_client.send_request(http_method="get", url="https://test_base_url.com/v1/endpoint", request_kwargs={}) + assert mocked_send.call_count == retries + 1 + + +@pytest.mark.usefixtures("mock_sleep") +def test_backoff_strategy_max_time(): + error_handler = HttpStatusErrorHandler( + logger=MagicMock(), + error_mapping={requests.RequestException: ErrorResolution(ResponseAction.RETRY, FailureType.system_error, "test retry message")}, + max_retries=10, + max_time=timedelta(seconds=2), + ) + + class BackoffStrategy: + def backoff_time(self, *args, **kwargs): + return 1 + + http_client = HttpClient(name="test", logger=MagicMock(), error_handler=error_handler, backoff_strategy=BackoffStrategy()) + + mocked_response = MagicMock(spec=requests.Response) + mocked_response.status_code = 429 + mocked_response.headers = {} + mocked_response.ok = False + session_send = MagicMock(spec=requests.Session.send) + session_send.return_value = mocked_response + + with patch.object(requests.Session, "send", return_value=mocked_response) as mocked_send: + with pytest.raises(UserDefinedBackoffException): + http_client.send_request(http_method="get", url="https://test_base_url.com/v1/endpoint", request_kwargs={}) + assert mocked_send.call_count == 2 + + +@pytest.mark.usefixtures("mock_sleep") +def test_send_emit_stream_status_with_rate_limit_reason(capsys): + class BackoffStrategy: + def backoff_time(self, *args, **kwargs): + return 0.001 + + http_client = HttpClient( + name="test", logger=MagicMock(), error_handler=HttpStatusErrorHandler(logger=MagicMock()), backoff_strategy=BackoffStrategy() + ) + + mocked_response = MagicMock(spec=requests.Response) + mocked_response.status_code = 429 + mocked_response.headers = {} + mocked_response.ok = False + session_send = MagicMock(spec=requests.Session.send) + session_send.return_value = mocked_response + + with patch.object(requests.Session, "send", return_value=mocked_response) as mocked_send: + with pytest.raises(UserDefinedBackoffException): + http_client.send_request(http_method="get", url="https://test_base_url.com/v1/endpoint", request_kwargs={}) + + trace_messages = capsys.readouterr().out.split() + assert len(trace_messages) == mocked_send.call_count + + +@pytest.mark.parametrize( + "exit_on_rate_limit, expected_call_count, expected_error", [[True, 6, DefaultBackoffException], [False, 38, OverflowError]] +) +@pytest.mark.usefixtures("mock_sleep") +def test_backoff_strategy_endless(exit_on_rate_limit, expected_call_count, expected_error): + http_client = HttpClient(name="test", logger=MagicMock(), error_handler=HttpStatusErrorHandler(logger=MagicMock())) + + mocked_response = MagicMock(spec=requests.Response) + mocked_response.status_code = 429 + mocked_response.headers = {} + mocked_response.ok = False + session_send = MagicMock(spec=requests.Session.send) + session_send.return_value = mocked_response + + with patch.object(requests.Session, "send", return_value=mocked_response) as mocked_send: + with pytest.raises(expected_error): + 
http_client.send_request( + http_method="get", url="https://test_base_url.com/v1/endpoint", request_kwargs={}, exit_on_rate_limit=exit_on_rate_limit + ) + assert mocked_send.call_count == expected_call_count + + +def test_given_different_headers_then_response_is_not_cached(requests_mock): + http_client = HttpClient(name="test", logger=MagicMock(), use_cache=True) + first_request_headers = {"header_key": "first"} + second_request_headers = {"header_key": "second"} + requests_mock.register_uri("GET", "https://google.com/", request_headers=first_request_headers, json={"test": "first response"}) + requests_mock.register_uri("GET", "https://google.com/", request_headers=second_request_headers, json={"test": "second response"}) + + http_client.send_request("GET", "https://google.com/", headers=first_request_headers, request_kwargs={}) + _, second_response = http_client.send_request("GET", "https://google.com/", headers=second_request_headers, request_kwargs={}) + + assert second_response.json()["test"] == "second response" diff --git a/airbyte-cdk/python/unit_tests/sources/streams/test_call_rate.py b/airbyte-cdk/python/unit_tests/sources/streams/test_call_rate.py new file mode 100644 index 000000000000..c78d494ba413 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/test_call_rate.py @@ -0,0 +1,300 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import os +import tempfile +import time +from datetime import datetime, timedelta +from typing import Any, Iterable, Mapping, Optional + +import pytest +import requests +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.streams.call_rate import ( + APIBudget, + CallRateLimitHit, + FixedWindowCallRatePolicy, + HttpRequestMatcher, + MovingWindowCallRatePolicy, + Rate, + UnlimitedCallRatePolicy, +) +from airbyte_cdk.sources.streams.http import HttpStream +from airbyte_cdk.sources.streams.http.requests_native_auth import TokenAuthenticator +from airbyte_cdk.utils.constants import ENV_REQUEST_CACHE_PATH +from requests import Request + + +class StubDummyHttpStream(HttpStream): + url_base = "https://test_base_url.com" + primary_key = "some_key" + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + return {"next_page_token": True} # endless pages + + def path(self, **kwargs) -> str: + return "" + + def parse_response(self, *args, **kwargs) -> Iterable[Mapping]: + yield {"data": "some_data"} + + +class StubDummyCacheHttpStream(StubDummyHttpStream): + use_cache = True + + +@pytest.fixture(name="enable_cache") +def enable_cache_fixture(): + prev_cache_path = os.environ.get(ENV_REQUEST_CACHE_PATH) + with tempfile.TemporaryDirectory() as temp_dir: + os.environ[ENV_REQUEST_CACHE_PATH] = temp_dir + yield + + if prev_cache_path is not None: + os.environ[ENV_REQUEST_CACHE_PATH] = prev_cache_path + + +class TestHttpRequestMatcher: + try_all_types_of_requests = pytest.mark.parametrize( + "request_factory", + [Request, lambda *args, **kwargs: Request(*args, **kwargs).prepare()], + ) + + @try_all_types_of_requests + def test_url(self, request_factory): + matcher = HttpRequestMatcher(url="http://some_url/") + assert not matcher(request_factory(url="http://some_wrong_url")) + assert matcher(request_factory(url="http://some_url")) + + @try_all_types_of_requests + def test_method(self, request_factory): + matcher = HttpRequestMatcher(method="GET") + assert not matcher(request_factory(url="http://some_url")) + assert not matcher(request_factory(url="http://some_url", method="POST")) + assert 
matcher(request_factory(url="http://some_url", method="GET")) + + @try_all_types_of_requests + def test_params(self, request_factory): + matcher = HttpRequestMatcher(params={"param1": 10, "param2": 15}) + assert not matcher(request_factory(url="http://some_url/")) + assert not matcher(request_factory(url="http://some_url/", params={"param1": 10, "param3": 100})) + assert not matcher(request_factory(url="http://some_url/", params={"param1": 10, "param2": 10})) + assert matcher(request_factory(url="http://some_url/", params={"param1": 10, "param2": 15, "param3": 100})) + + @try_all_types_of_requests + def test_header(self, request_factory): + matcher = HttpRequestMatcher(headers={"header1": 10, "header2": 15}) + assert not matcher(request_factory(url="http://some_url")) + assert not matcher(request_factory(url="http://some_url", headers={"header1": "10", "header3": "100"})) + assert not matcher(request_factory(url="http://some_url", headers={"header1": "10", "header2": "10"})) + assert matcher(request_factory(url="http://some_url", headers={"header1": "10", "header2": "15", "header3": "100"})) + + @try_all_types_of_requests + def test_combination(self, request_factory): + matcher = HttpRequestMatcher(method="GET", url="http://some_url/", headers={"header1": 10}, params={"param2": "test"}) + assert matcher(request_factory(method="GET", url="http://some_url", headers={"header1": "10"}, params={"param2": "test"})) + assert not matcher(request_factory(method="GET", url="http://some_url", headers={"header1": "10"})) + assert not matcher(request_factory(method="GET", url="http://some_url")) + assert not matcher(request_factory(url="http://some_url")) + + +def test_http_request_matching(mocker): + """Test policy lookup based on matchers.""" + users_policy = mocker.Mock(spec=MovingWindowCallRatePolicy) + groups_policy = mocker.Mock(spec=MovingWindowCallRatePolicy) + root_policy = mocker.Mock(spec=MovingWindowCallRatePolicy) + + users_policy.matches.side_effect = HttpRequestMatcher(url="http://domain/api/users", method="GET") + groups_policy.matches.side_effect = HttpRequestMatcher(url="http://domain/api/groups", method="POST") + root_policy.matches.side_effect = HttpRequestMatcher(method="GET") + api_budget = APIBudget( + policies=[ + users_policy, + groups_policy, + root_policy, + ] + ) + + api_budget.acquire_call(Request("POST", url="http://domain/unmatched_endpoint"), block=False), "unrestricted call" + users_policy.try_acquire.assert_not_called() + groups_policy.try_acquire.assert_not_called() + root_policy.try_acquire.assert_not_called() + + users_request = Request("GET", url="http://domain/api/users") + api_budget.acquire_call(users_request, block=False), "first call, first matcher" + users_policy.try_acquire.assert_called_once_with(users_request, weight=1) + groups_policy.try_acquire.assert_not_called() + root_policy.try_acquire.assert_not_called() + + api_budget.acquire_call(Request("GET", url="http://domain/api/users"), block=False), "second call, first matcher" + assert users_policy.try_acquire.call_count == 2 + groups_policy.try_acquire.assert_not_called() + root_policy.try_acquire.assert_not_called() + + group_request = Request("POST", url="http://domain/api/groups") + api_budget.acquire_call(group_request, block=False), "first call, second matcher" + assert users_policy.try_acquire.call_count == 2 + groups_policy.try_acquire.assert_called_once_with(group_request, weight=1) + root_policy.try_acquire.assert_not_called() + + api_budget.acquire_call(Request("POST", 
url="http://domain/api/groups"), block=False), "second call, second matcher" + assert users_policy.try_acquire.call_count == 2 + assert groups_policy.try_acquire.call_count == 2 + root_policy.try_acquire.assert_not_called() + + any_get_request = Request("GET", url="http://domain/api/") + api_budget.acquire_call(any_get_request, block=False), "first call, third matcher" + assert users_policy.try_acquire.call_count == 2 + assert groups_policy.try_acquire.call_count == 2 + root_policy.try_acquire.assert_called_once_with(any_get_request, weight=1) + + +class TestUnlimitedCallRatePolicy: + def test_try_acquire(self, mocker): + policy = UnlimitedCallRatePolicy(matchers=[]) + assert policy.matches(mocker.Mock()), "should match anything" + policy.try_acquire(mocker.Mock(), weight=1) + policy.try_acquire(mocker.Mock(), weight=10) + + def test_update(self): + policy = UnlimitedCallRatePolicy(matchers=[]) + policy.update(available_calls=10, call_reset_ts=datetime.now()) + policy.update(available_calls=None, call_reset_ts=datetime.now()) + policy.update(available_calls=10, call_reset_ts=None) + + +class TestFixedWindowCallRatePolicy: + def test_limit_rate(self, mocker): + policy = FixedWindowCallRatePolicy(matchers=[], next_reset_ts=datetime.now(), period=timedelta(hours=1), call_limit=100) + policy.try_acquire(mocker.Mock(), weight=1) + policy.try_acquire(mocker.Mock(), weight=20) + with pytest.raises(ValueError, match="Weight can not exceed the call limit"): + policy.try_acquire(mocker.Mock(), weight=101) + + with pytest.raises(CallRateLimitHit) as exc: + policy.try_acquire(mocker.Mock(), weight=100 - 20 - 1 + 1) + + assert exc.value.time_to_wait + assert exc.value.weight == 100 - 20 - 1 + 1 + assert exc.value.item + + def test_update_available_calls(self, mocker): + policy = FixedWindowCallRatePolicy(matchers=[], next_reset_ts=datetime.now(), period=timedelta(hours=1), call_limit=100) + # update to decrease number of calls available + policy.update(available_calls=2, call_reset_ts=None) + # hit the limit with weight=3 + with pytest.raises(CallRateLimitHit): + policy.try_acquire(mocker.Mock(), weight=3) + # ok with less weight=1 + policy.try_acquire(mocker.Mock(), weight=1) + + # update to increase number of calls available, ignored + policy.update(available_calls=20, call_reset_ts=None) + # so we still hit the limit with weight=3 + with pytest.raises(CallRateLimitHit): + policy.try_acquire(mocker.Mock(), weight=3) + + +class TestMovingWindowCallRatePolicy: + def test_no_rates(self): + """should raise a ValueError when no rates provided""" + with pytest.raises(ValueError, match="The list of rates can not be empty"): + MovingWindowCallRatePolicy(rates=[], matchers=[]) + + def test_limit_rate(self): + """try_acquire must respect configured call rate and throw CallRateLimitHit when hit the limit.""" + policy = MovingWindowCallRatePolicy(rates=[Rate(10, timedelta(minutes=1))], matchers=[]) + + for i in range(10): + policy.try_acquire("call", weight=1), f"{i + 1} call" + + with pytest.raises(CallRateLimitHit) as excinfo1: + policy.try_acquire("call", weight=1), "call over limit" + assert excinfo1.value.time_to_wait.total_seconds() == pytest.approx(60, 0.1) + + time.sleep(0.5) + + with pytest.raises(CallRateLimitHit) as excinfo2: + policy.try_acquire("call", weight=1), "call over limit" + assert excinfo2.value.time_to_wait < excinfo1.value.time_to_wait, "time to wait must decrease over time" + + def test_limit_rate_support_custom_weight(self): + """try_acquire must take into account provided weight and 
throw CallRateLimitHit when hit the limit.""" + policy = MovingWindowCallRatePolicy(rates=[Rate(10, timedelta(minutes=1))], matchers=[]) + + policy.try_acquire("call", weight=2), "1st call with weight of 2" + with pytest.raises(CallRateLimitHit) as excinfo: + policy.try_acquire("call", weight=9), "2nd call, over limit since 2 + 9 = 11 > 10" + assert excinfo.value.time_to_wait.total_seconds() == pytest.approx(60, 0.1), "should wait 1 minute before next call" + + def test_multiple_limit_rates(self): + """try_acquire must take into all call rates and apply stricter.""" + policy = MovingWindowCallRatePolicy( + matchers=[], + rates=[ + Rate(10, timedelta(minutes=10)), + Rate(3, timedelta(seconds=10)), + Rate(2, timedelta(hours=1)), + ], + ) + + policy.try_acquire("call", weight=2), "1 call" + + with pytest.raises(CallRateLimitHit) as excinfo: + policy.try_acquire("call", weight=1), "1 call" + + assert excinfo.value.time_to_wait.total_seconds() == pytest.approx(3600, 0.1) + assert str(excinfo.value) == "Bucket for item=call with Rate limit=2/1.0h is already full" + + +class TestHttpStreamIntegration: + def test_without_cache(self, mocker, requests_mock): + """Test that HttpStream will use call budget when provided""" + requests_mock.get(f"{StubDummyHttpStream.url_base}/", json={"data": "test"}) + + mocker.patch.object(MovingWindowCallRatePolicy, "try_acquire") + + api_budget = APIBudget( + policies=[ + MovingWindowCallRatePolicy( + matchers=[HttpRequestMatcher(url=f"{StubDummyHttpStream.url_base}/", method="GET")], + rates=[ + Rate(2, timedelta(minutes=1)), + ], + ), + ] + ) + + stream = StubDummyHttpStream(api_budget=api_budget, authenticator=TokenAuthenticator(token="ABCD")) + for i in range(10): + records = stream.read_records(SyncMode.full_refresh) + assert next(records) == {"data": "some_data"} + + assert MovingWindowCallRatePolicy.try_acquire.call_count == 10 + + @pytest.mark.usefixtures("enable_cache") + def test_with_cache(self, mocker, requests_mock): + """Test that HttpStream will use call budget when provided and not cached""" + requests_mock.get(f"{StubDummyHttpStream.url_base}/", json={"data": "test"}) + + mocker.patch.object(MovingWindowCallRatePolicy, "try_acquire") + + api_budget = APIBudget( + policies=[ + MovingWindowCallRatePolicy( + matchers=[ + HttpRequestMatcher(url=f"{StubDummyHttpStream.url_base}/", method="GET"), + ], + rates=[ + Rate(2, timedelta(minutes=1)), + ], + ) + ] + ) + + stream = StubDummyCacheHttpStream(api_budget=api_budget) + for i in range(10): + records = stream.read_records(SyncMode.full_refresh) + assert next(records) == {"data": "some_data"} + + assert MovingWindowCallRatePolicy.try_acquire.call_count == 1 diff --git a/airbyte-cdk/python/unit_tests/sources/streams/test_stream_read.py b/airbyte-cdk/python/unit_tests/sources/streams/test_stream_read.py new file mode 100644 index 000000000000..5b82ab119d03 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/test_stream_read.py @@ -0,0 +1,593 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import logging +from copy import deepcopy +from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Union +from unittest.mock import Mock + +import pytest +from airbyte_cdk.models import ( + AirbyteLogMessage, + AirbyteMessage, + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStateType, + AirbyteStream, + AirbyteStreamState, + ConfiguredAirbyteStream, + DestinationSyncMode, + Level, + StreamDescriptor, + SyncMode, +) +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.concurrent_source.concurrent_read_processor import ConcurrentReadProcessor +from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager +from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade +from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, FinalStateCursor +from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer +from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader +from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition +from airbyte_cdk.sources.streams.concurrent.partitions.record import Record +from airbyte_cdk.sources.streams.core import CheckpointMixin, StreamData +from airbyte_cdk.sources.utils.schema_helpers import InternalConfig +from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger + +_A_CURSOR_FIELD = ["NESTED", "CURSOR"] +_DEFAULT_INTERNAL_CONFIG = InternalConfig() +_STREAM_NAME = "STREAM" +_NO_STATE = None + + +class _MockStream(Stream): + def __init__(self, slice_to_records: Mapping[str, List[Mapping[str, Any]]], json_schema: Dict[str, Any] = None): + self._slice_to_records = slice_to_records + self._mocked_json_schema = json_schema or {} + + @property + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + return None + + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + for partition in self._slice_to_records.keys(): + yield {"partition_key": partition} + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + yield from self._slice_to_records[stream_slice["partition_key"]] + + def get_json_schema(self) -> Mapping[str, Any]: + return self._mocked_json_schema + + +class _MockIncrementalStream(_MockStream, CheckpointMixin): + _state = {} + + @property + def state(self) -> MutableMapping[str, Any]: + return self._state + + @state.setter + def state(self, value: MutableMapping[str, Any]) -> None: + """State setter, accept state serialized by state getter.""" + self._state = value + + @property + def cursor_field(self) -> Union[str, List[str]]: + return ["created_at"] + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + cursor = self.cursor_field[0] + for record in self._slice_to_records[stream_slice["partition_key"]]: + yield record + if cursor not in self._state: + 
self._state[cursor] = record.get(cursor) + else: + self._state[cursor] = max(self._state[cursor], record.get(cursor)) + + +class MockConcurrentCursor(Cursor): + _state: MutableMapping[str, Any] + _message_repository: MessageRepository + + def __init__(self, message_repository: MessageRepository): + self._message_repository = message_repository + self._state = {} + + @property + def state(self) -> MutableMapping[str, Any]: + return self._state + + def observe(self, record: Record) -> None: + partition = str(record.data.get("partition")) + timestamp = record.data.get("created_at") + self._state[partition] = {"created_at": timestamp} + + def close_partition(self, partition: Partition) -> None: + self._message_repository.emit_message( + AirbyteMessage( + type=MessageType.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="__mock_stream", namespace=None), + stream_state=AirbyteStateBlob(**self._state), + ), + ), + ) + ) + + def ensure_at_least_one_state_emitted(self) -> None: + pass + + +def _stream(slice_to_partition_mapping, slice_logger, logger, message_repository, json_schema=None): + return _MockStream(slice_to_partition_mapping, json_schema=json_schema) + + +def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor: Optional[Cursor] = None): + stream = _stream(slice_to_partition_mapping, slice_logger, logger, message_repository) + cursor = cursor or FinalStateCursor(stream_name=stream.name, stream_namespace=stream.namespace, message_repository=message_repository) + source = Mock() + source._slice_logger = slice_logger + source.message_repository = message_repository + stream = StreamFacade.create_from_stream(stream, source, logger, _NO_STATE, cursor) + stream.logger.setLevel(logger.level) + return stream + + +def _incremental_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, timestamp): + stream = _MockIncrementalStream(slice_to_partition_mapping) + return stream + + +def _incremental_concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor): + stream = _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor) + return stream + + +def _stream_with_no_cursor_field(slice_to_partition_mapping, slice_logger, logger, message_repository): + def get_updated_state(current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]) -> MutableMapping[str, Any]: + raise Exception("I shouldn't be invoked by a full_refresh stream") + + mock_stream = _MockStream(slice_to_partition_mapping) + mock_stream.get_updated_state = get_updated_state + return mock_stream + + +@pytest.mark.parametrize( + "constructor", + [ + pytest.param(_stream, id="synchronous_reader"), + pytest.param(_concurrent_stream, id="concurrent_reader"), + ], +) +def test_full_refresh_read_a_single_slice_with_debug(constructor): + # This test verifies that a concurrent stream adapted from a Stream behaves the same as the Stream object. 
+ # It is done by running the same test cases on both streams + configured_stream = ConfiguredAirbyteStream( + stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh], json_schema={}), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.overwrite, + ) + internal_config = InternalConfig() + records = [ + {"id": 1, "partition_key": 1}, + {"id": 2, "partition_key": 1}, + ] + slice_to_partition = {1: records} + slice_logger = DebugSliceLogger() + logger = _mock_logger(True) + message_repository = InMemoryMessageRepository(Level.DEBUG) + stream = constructor(slice_to_partition, slice_logger, logger, message_repository) + state_manager = ConnectorStateManager() + + expected_records = [ + AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.INFO, + message='slice:{"partition_key": 1}', + ), + ), + *records, + ] + + # Synchronous streams emit a final state message to indicate that the stream has finished reading + # Concurrent streams don't emit their own state messages - the concurrent source observes the cursor + # and emits the state messages. Therefore, we can only check the value of the cursor's state at the end + if constructor == _stream: + expected_records.append( + AirbyteMessage( + type=MessageType.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="__mock_stream", namespace=None), + stream_state=AirbyteStateBlob(__ab_no_cursor_state_message=True), + ), + ), + ), + ) + + actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config) + + if constructor == _concurrent_stream: + assert hasattr(stream._cursor, "state") + assert str(stream._cursor.state) == "{'__ab_no_cursor_state_message': True}" + + assert actual_records == expected_records + + +@pytest.mark.parametrize( + "constructor", + [ + pytest.param(_stream, id="synchronous_reader"), + pytest.param(_concurrent_stream, id="concurrent_reader"), + ], +) +def test_full_refresh_read_a_single_slice(constructor): + # This test verifies that a concurrent stream adapted from a Stream behaves the same as the Stream object. + # It is done by running the same test cases on both streams + configured_stream = ConfiguredAirbyteStream( + stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh], json_schema={}), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.overwrite, + ) + internal_config = InternalConfig() + logger = _mock_logger() + slice_logger = DebugSliceLogger() + message_repository = InMemoryMessageRepository(Level.INFO) + state_manager = ConnectorStateManager() + + records = [ + {"id": 1, "partition": 1}, + {"id": 2, "partition": 1}, + ] + slice_to_partition = {1: records} + stream = constructor(slice_to_partition, slice_logger, logger, message_repository) + + expected_records = [*records] + + # Synchronous streams emit a final state message to indicate that the stream has finished reading + # Concurrent streams don't emit their own state messages - the concurrent source observes the cursor + # and emits the state messages. 
Therefore, we can only check the value of the cursor's state at the end + if constructor == _stream: + expected_records.append( + AirbyteMessage( + type=MessageType.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="__mock_stream", namespace=None), + stream_state=AirbyteStateBlob(__ab_no_cursor_state_message=True), + ), + ), + ), + ) + + actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config) + + if constructor == _concurrent_stream: + assert hasattr(stream._cursor, "state") + assert str(stream._cursor.state) == "{'__ab_no_cursor_state_message': True}" + + assert actual_records == expected_records + + +@pytest.mark.parametrize( + "constructor", + [ + pytest.param(_stream, id="synchronous_reader"), + pytest.param(_concurrent_stream, id="concurrent_reader"), + pytest.param(_stream_with_no_cursor_field, id="no_cursor_field"), + ], +) +def test_full_refresh_read_two_slices(constructor): + # This test verifies that a concurrent stream adapted from a Stream behaves the same as the Stream object + # It is done by running the same test cases on both streams + configured_stream = ConfiguredAirbyteStream( + stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh], json_schema={}), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.overwrite, + ) + internal_config = InternalConfig() + logger = _mock_logger() + slice_logger = DebugSliceLogger() + message_repository = InMemoryMessageRepository(Level.INFO) + state_manager = ConnectorStateManager() + + records_partition_1 = [ + {"id": 1, "partition": 1}, + {"id": 2, "partition": 1}, + ] + records_partition_2 = [ + {"id": 3, "partition": 2}, + {"id": 4, "partition": 2}, + ] + slice_to_partition = {1: records_partition_1, 2: records_partition_2} + stream = constructor(slice_to_partition, slice_logger, logger, message_repository) + + expected_records = [ + *records_partition_1, + *records_partition_2, + ] + + # Synchronous streams emit a final state message to indicate that the stream has finished reading + # Concurrent streams don't emit their own state messages - the concurrent source observes the cursor + # and emits the state messages. 
Therefore, we can only check the value of the cursor's state at the end + if constructor == _stream or constructor == _stream_with_no_cursor_field: + expected_records.append( + AirbyteMessage( + type=MessageType.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="__mock_stream", namespace=None), + stream_state=AirbyteStateBlob(__ab_no_cursor_state_message=True), + ), + ), + ), + ) + + actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config) + + if constructor == _concurrent_stream: + assert hasattr(stream._cursor, "state") + assert str(stream._cursor.state) == "{'__ab_no_cursor_state_message': True}" + + for record in expected_records: + assert record in actual_records + assert len(actual_records) == len(expected_records) + + +def test_incremental_read_two_slices(): + # This test verifies that a stream running in incremental mode emits state messages correctly + configured_stream = ConfiguredAirbyteStream( + stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], json_schema={}), + sync_mode=SyncMode.incremental, + cursor_field=["created_at"], + destination_sync_mode=DestinationSyncMode.overwrite, + ) + internal_config = InternalConfig() + logger = _mock_logger() + slice_logger = DebugSliceLogger() + message_repository = InMemoryMessageRepository(Level.INFO) + state_manager = ConnectorStateManager() + timestamp = "1708899427" + + records_partition_1 = [ + {"id": 1, "partition": 1, "created_at": "1708899000"}, + {"id": 2, "partition": 1, "created_at": "1708899000"}, + ] + records_partition_2 = [ + {"id": 3, "partition": 2, "created_at": "1708899400"}, + {"id": 4, "partition": 2, "created_at": "1708899427"}, + ] + slice_to_partition = {1: records_partition_1, 2: records_partition_2} + stream = _incremental_stream(slice_to_partition, slice_logger, logger, message_repository, timestamp) + + expected_records = [ + *records_partition_1, + _create_state_message("__mock_incremental_stream", {"created_at": timestamp}), + *records_partition_2, + _create_state_message("__mock_incremental_stream", {"created_at": timestamp}), + ] + + actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config) + + for record in expected_records: + assert record in actual_records + assert len(actual_records) == len(expected_records) + + +def test_concurrent_incremental_read_two_slices(): + # This test verifies that an incremental concurrent stream manages state correctly for multiple slices syncing concurrently + configured_stream = ConfiguredAirbyteStream( + stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], json_schema={}), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.overwrite, + ) + internal_config = InternalConfig() + logger = _mock_logger() + slice_logger = DebugSliceLogger() + message_repository = InMemoryMessageRepository(Level.INFO) + state_manager = ConnectorStateManager() + slice_timestamp_1 = "1708850000" + slice_timestamp_2 = "1708950000" + cursor = MockConcurrentCursor(message_repository) + + records_partition_1 = [ + {"id": 1, "partition": 1, "created_at": "1708800000"}, + {"id": 2, "partition": 1, "created_at": slice_timestamp_1}, + ] + records_partition_2 = [ + {"id": 3, "partition": 2, "created_at": "1708900000"}, + {"id": 4, "partition": 2, 
"created_at": slice_timestamp_2}, + ] + slice_to_partition = {1: records_partition_1, 2: records_partition_2} + stream = _incremental_concurrent_stream(slice_to_partition, slice_logger, logger, message_repository, cursor) + + expected_records = [ + *records_partition_1, + *records_partition_2, + ] + + expected_state = _create_state_message( + "__mock_stream", {"1": {"created_at": slice_timestamp_1}, "2": {"created_at": slice_timestamp_2}} + ) + + actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config) + + handler = ConcurrentReadProcessor( + [stream], + Mock(spec=PartitionEnqueuer), + Mock(spec=ThreadPoolManager), + logger, + slice_logger, + message_repository, + Mock(spec=PartitionReader), + ) + + for record in expected_records: + assert record in actual_records + + # We need run on_record to update cursor with record cursor value + for record in actual_records: + list(handler.on_record(Record(record, Mock(spec=Partition, **{"stream_name.return_value": "__mock_stream"})))) + + assert len(actual_records) == len(expected_records) + + # We don't have a real source that reads from the message_repository for state, so we read from the queue directly to verify + # the cursor observed records correctly and updated partition states + mock_partition = Mock() + cursor.close_partition(mock_partition) + actual_state = [state for state in message_repository.consume_queue()] + assert len(actual_state) == 1 + assert actual_state[0] == expected_state + + +def setup_stream_dependencies(configured_json_schema): + configured_stream = ConfiguredAirbyteStream( + stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh], json_schema=configured_json_schema), + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.overwrite, + ) + internal_config = InternalConfig() + logger = _mock_logger() + slice_logger = DebugSliceLogger() + message_repository = InMemoryMessageRepository(Level.INFO) + state_manager = ConnectorStateManager() + return configured_stream, internal_config, logger, slice_logger, message_repository, state_manager + + +def test_configured_json_schema(): + current_json_schema = { + "$schema": "https://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "id": {"type": ["null", "number"]}, + "name": {"type": ["null", "string"]}, + }, + } + + configured_stream, internal_config, logger, slice_logger, message_repository, state_manager = setup_stream_dependencies( + current_json_schema + ) + records = [ + {"id": 1, "partition": 1}, + {"id": 2, "partition": 1}, + ] + + slice_to_partition = {1: records} + stream = _stream(slice_to_partition, slice_logger, logger, message_repository, json_schema=current_json_schema) + assert not stream.configured_json_schema + _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config) + assert stream.configured_json_schema == current_json_schema + + +def test_configured_json_schema_with_invalid_properties(): + """ + Configured Schemas can have very old fields, so we need to housekeeping ourselves. + The purpose of this test in ensure that correct cleanup occurs when configured catalog schema is compared with current stream schema. 
+ """ + old_user_insights = "old_user_insights" + old_feature_info = "old_feature_info" + configured_json_schema = { + "$schema": "https://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "id": {"type": ["null", "number"]}, + "name": {"type": ["null", "string"]}, + "cost_per_conversation": {"type": ["null", "string"]}, + old_user_insights: {"type": ["null", "string"]}, + old_feature_info: {"type": ["null", "string"]}, + }, + } + # stream schema is updated e.g. some fields in new api version are deprecated + stream_schema = deepcopy(configured_json_schema) + del stream_schema["properties"][old_user_insights] + del stream_schema["properties"][old_feature_info] + + configured_stream, internal_config, logger, slice_logger, message_repository, state_manager = setup_stream_dependencies( + configured_json_schema + ) + records = [ + {"id": 1, "partition": 1}, + {"id": 2, "partition": 1}, + ] + + slice_to_partition = {1: records} + stream = _stream(slice_to_partition, slice_logger, logger, message_repository, json_schema=stream_schema) + assert not stream.configured_json_schema + _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config) + assert stream.configured_json_schema != configured_json_schema + configured_json_schema_properties = stream.configured_json_schema["properties"] + assert old_user_insights not in configured_json_schema_properties + assert old_feature_info not in configured_json_schema_properties + for stream_schema_property in stream_schema["properties"]: + assert ( + stream_schema_property in configured_json_schema_properties + ), f"Stream schema property: {stream_schema_property} missing in configured schema" + assert stream_schema["properties"][stream_schema_property] == configured_json_schema_properties[stream_schema_property] + + +def _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config): + records = [] + for record in stream.read(configured_stream, logger, slice_logger, {}, state_manager, internal_config): + for message in message_repository.consume_queue(): + records.append(message) + records.append(record) + return records + + +def _mock_partition_generator(name: str, slices, records_per_partition, *, available=True, debug_log=False): + stream = Mock() + stream.name = name + stream.get_json_schema.return_value = {} + stream.generate_partitions.return_value = iter(slices) + stream.read_records.side_effect = [iter(records) for records in records_per_partition] + stream.logger.isEnabledFor.return_value = debug_log + if available: + stream.check_availability.return_value = True, None + else: + stream.check_availability.return_value = False, "A reason why the stream is unavailable" + return stream + + +def _mock_logger(enabled_for_debug=False): + logger = Mock() + logger.isEnabledFor.return_value = enabled_for_debug + logger.level = logging.DEBUG if enabled_for_debug else logging.INFO + return logger + + +def _create_state_message(stream: str, state: Mapping[str, Any]) -> AirbyteMessage: + return AirbyteMessage( + type=MessageType.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name=stream, namespace=None), + stream_state=AirbyteStateBlob(**state), + ), + ), + ) diff --git a/airbyte-cdk/python/unit_tests/sources/streams/test_streams_core.py b/airbyte-cdk/python/unit_tests/sources/streams/test_streams_core.py new file mode 100644 index 000000000000..9f356b5c80bb --- 
/dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/test_streams_core.py @@ -0,0 +1,446 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +from typing import Any, Iterable, List, Mapping, MutableMapping, Optional +from unittest import mock + +import pytest +import requests +from airbyte_cdk.models import AirbyteStream, SyncMode +from airbyte_cdk.sources.streams import CheckpointMixin, Stream +from airbyte_cdk.sources.streams.checkpoint import ( + Cursor, + CursorBasedCheckpointReader, + FullRefreshCheckpointReader, + IncrementalCheckpointReader, + LegacyCursorBasedCheckpointReader, + ResumableFullRefreshCheckpointReader, + ResumableFullRefreshCursor, +) +from airbyte_cdk.sources.streams.http import HttpStream, HttpSubStream +from airbyte_cdk.sources.types import StreamSlice + +logger = logging.getLogger("airbyte") + + +class StreamStubFullRefresh(Stream): + """ + Stub full refresh class to assist with testing. + """ + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: List[str] = None, + stream_slice: Mapping[str, Any] = None, + stream_state: Mapping[str, Any] = None, + ) -> Iterable[Mapping[str, Any]]: + pass + + primary_key = None + + +class StreamStubIncremental(Stream, CheckpointMixin): + """ + Stub full incremental class to assist with testing. + """ + + _state = {} + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: List[str] = None, + stream_slice: Mapping[str, Any] = None, + stream_state: Mapping[str, Any] = None, + ) -> Iterable[Mapping[str, Any]]: + pass + + cursor_field = "test_cursor" + primary_key = "primary_key" + namespace = "test_namespace" + + @property + def state(self) -> MutableMapping[str, Any]: + return self._state + + @state.setter + def state(self, value: MutableMapping[str, Any]) -> None: + self._state = value + + +class StreamStubResumableFullRefresh(Stream, CheckpointMixin): + """ + Stub full incremental class to assist with testing. + """ + + _state = {} + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: List[str] = None, + stream_slice: Mapping[str, Any] = None, + stream_state: Mapping[str, Any] = None, + ) -> Iterable[Mapping[str, Any]]: + pass + + primary_key = "primary_key" + + @property + def state(self) -> MutableMapping[str, Any]: + return self._state + + @state.setter + def state(self, value: MutableMapping[str, Any]) -> None: + self._state = value + + +class StreamStubLegacyStateInterface(Stream): + """ + Stub full incremental class to assist with testing. + """ + + _state = {} + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: List[str] = None, + stream_slice: Mapping[str, Any] = None, + stream_state: Mapping[str, Any] = None, + ) -> Iterable[Mapping[str, Any]]: + pass + + cursor_field = "test_cursor" + primary_key = "primary_key" + namespace = "test_namespace" + + def get_updated_state( + self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any] + ) -> MutableMapping[str, Any]: + return {} + + +class StreamStubIncrementalEmptyNamespace(Stream): + """ + Stub full incremental class, with empty namespace, to assist with testing. 
+ """ + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: List[str] = None, + stream_slice: Mapping[str, Any] = None, + stream_state: Mapping[str, Any] = None, + ) -> Iterable[Mapping[str, Any]]: + pass + + cursor_field = "test_cursor" + primary_key = "primary_key" + namespace = "" + + +class HttpSubStreamStubFullRefreshLegacySlices(HttpSubStream): + """ + Stub substream full refresh class to assist with testing. + """ + + primary_key = "primary_key" + + @property + def url_base(self) -> str: + return "https://airbyte.io/api/v1" + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + pass + + def path( + self, + *, + stream_state: Optional[Mapping[str, Any]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> str: + return "/stub" + + def parse_response( + self, + response: requests.Response, + *, + stream_state: Mapping[str, Any], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any]]: + return [] + + +class CursorBasedStreamStubFullRefresh(StreamStubFullRefresh): + def get_cursor(self) -> Optional[Cursor]: + return ResumableFullRefreshCursor() + + +class LegacyCursorBasedStreamStubFullRefresh(CursorBasedStreamStubFullRefresh): + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + yield from [{}] + + +class MultipleSlicesStreamStub(HttpStream): + """ + Stub full refresh class that returns multiple StreamSlice instances to assist with testing. + """ + + primary_key = "primary_key" + + @property + def url_base(self) -> str: + return "https://airbyte.io/api/v1" + + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + yield from [ + StreamSlice(partition={"parent_id": "korra"}, cursor_slice={}), + StreamSlice(partition={"parent_id": "asami"}, cursor_slice={}), + ] + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + pass + + def path( + self, + *, + stream_state: Optional[Mapping[str, Any]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> str: + return "/stub" + + def parse_response( + self, + response: requests.Response, + *, + stream_state: Mapping[str, Any], + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any]]: + return [] + + +class ParentHttpStreamStub(HttpStream): + primary_key = "primary_key" + url_base = "https://airbyte.io/api/v1" + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: List[str] = None, + stream_slice: Mapping[str, Any] = None, + stream_state: Mapping[str, Any] = None, + ) -> Iterable[Mapping[str, Any]]: + return [{"id": 400, "name": "a_parent_record"}] + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + return None + + def path( + self, + *, + stream_state: Optional[Mapping[str, Any]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> str: + return "/parent" + + def parse_response( + self, + response: requests.Response, + *, + stream_state: Mapping[str, Any], + 
stream_slice: Optional[Mapping[str, Any]] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any]]: + return [] + + +def test_as_airbyte_stream_full_refresh(mocker): + """ + Should return an full refresh AirbyteStream with information matching the + provided Stream interface. + """ + test_stream = StreamStubFullRefresh() + + mocker.patch.object(StreamStubFullRefresh, "get_json_schema", return_value={}) + airbyte_stream = test_stream.as_airbyte_stream() + + exp = AirbyteStream(name="stream_stub_full_refresh", json_schema={}, supported_sync_modes=[SyncMode.full_refresh], is_resumable=False) + assert airbyte_stream == exp + + +def test_as_airbyte_stream_incremental(mocker): + """ + Should return an incremental refresh AirbyteStream with information matching + the provided Stream interface. + """ + test_stream = StreamStubIncremental() + + mocker.patch.object(StreamStubIncremental, "get_json_schema", return_value={}) + airbyte_stream = test_stream.as_airbyte_stream() + + exp = AirbyteStream( + name="stream_stub_incremental", + namespace="test_namespace", + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], + default_cursor_field=["test_cursor"], + source_defined_cursor=True, + source_defined_primary_key=[["primary_key"]], + is_resumable=True, + ) + assert airbyte_stream == exp + + +def test_supports_incremental_cursor_set(): + """ + Should return true if cursor is set. + """ + test_stream = StreamStubIncremental() + test_stream.cursor_field = "test_cursor" + + assert test_stream.supports_incremental + + +def test_supports_incremental_cursor_not_set(): + """ + Should return false if cursor is not. + """ + test_stream = StreamStubFullRefresh() + + assert not test_stream.supports_incremental + + +def test_namespace_set(): + """ + Should allow namespace property to be set. + """ + test_stream = StreamStubIncremental() + + assert test_stream.namespace == "test_namespace" + + +def test_namespace_set_to_empty_string(mocker): + """ + Should not set namespace property if equal to empty string. + """ + test_stream = StreamStubIncremental() + + mocker.patch.object(StreamStubIncremental, "get_json_schema", return_value={}) + mocker.patch.object(StreamStubIncremental, "namespace", "") + + airbyte_stream = test_stream.as_airbyte_stream() + + exp = AirbyteStream( + name="stream_stub_incremental", + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], + default_cursor_field=["test_cursor"], + source_defined_cursor=True, + source_defined_primary_key=[["primary_key"]], + namespace=None, + is_resumable=True, + ) + assert airbyte_stream == exp + + +def test_namespace_not_set(): + """ + Should be equal to unset value of None. + """ + test_stream = StreamStubFullRefresh() + + assert test_stream.namespace is None + + +@pytest.mark.parametrize( + "test_input, expected", + [("key", [["key"]]), (["key1", "key2"], [["key1"], ["key2"]]), ([["key1", "key2"], ["key3"]], [["key1", "key2"], ["key3"]])], +) +def test_wrapped_primary_key_various_argument(test_input, expected): + """ + Should always wrap primary key into list of lists. 
+ """ + + wrapped = Stream._wrapped_primary_key(test_input) + + assert wrapped == expected + + +@mock.patch("airbyte_cdk.sources.utils.schema_helpers.ResourceSchemaLoader.get_schema") +def test_get_json_schema_is_cached(mocked_method): + stream = StreamStubFullRefresh() + for i in range(5): + stream.get_json_schema() + assert mocked_method.call_count == 1 + + +@pytest.mark.parametrize( + "stream, stream_state, expected_checkpoint_reader_type", + [ + pytest.param(StreamStubIncremental(), {}, IncrementalCheckpointReader, id="test_incremental_checkpoint_reader"), + pytest.param(StreamStubFullRefresh(), {}, FullRefreshCheckpointReader, id="test_full_refresh_checkpoint_reader"), + pytest.param( + StreamStubResumableFullRefresh(), {}, ResumableFullRefreshCheckpointReader, id="test_resumable_full_refresh_checkpoint_reader" + ), + pytest.param( + StreamStubLegacyStateInterface(), {}, IncrementalCheckpointReader, id="test_incremental_checkpoint_reader_with_legacy_state" + ), + pytest.param( + CursorBasedStreamStubFullRefresh(), + {"next_page_token": 10}, + CursorBasedCheckpointReader, + id="test_checkpoint_reader_using_rfr_cursor", + ), + pytest.param( + LegacyCursorBasedStreamStubFullRefresh(), + {}, + LegacyCursorBasedCheckpointReader, + id="test_full_refresh_checkpoint_reader_for_legacy_slice_format", + ), + ], +) +def test_get_checkpoint_reader(stream: Stream, stream_state, expected_checkpoint_reader_type): + checkpoint_reader = stream._get_checkpoint_reader( + logger=logger, + cursor_field=["updated_at"], + sync_mode=SyncMode.incremental, + stream_state=stream_state, + ) + + assert isinstance(checkpoint_reader, expected_checkpoint_reader_type) + + if isinstance(checkpoint_reader, CursorBasedCheckpointReader): + cursor = checkpoint_reader._cursor + if isinstance(cursor, ResumableFullRefreshCursor): + actual_cursor_state = cursor.get_stream_state() + + assert actual_cursor_state == stream_state + + +def test_checkpoint_reader_with_no_partitions(): + """ + Tests the edge case where an incremental stream might not generate any partitions, but should still attempt at least + one iteration of calling read_records() + """ + stream = StreamStubIncremental() + checkpoint_reader = stream._get_checkpoint_reader( + logger=logger, + cursor_field=["updated_at"], + sync_mode=SyncMode.incremental, + stream_state={}, + ) + + assert checkpoint_reader.next() == {} diff --git a/airbyte-cdk/python/unit_tests/sources/streams/utils/test_stream_helper.py b/airbyte-cdk/python/unit_tests/sources/streams/utils/test_stream_helper.py new file mode 100644 index 000000000000..da76a78714d7 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/streams/utils/test_stream_helper.py @@ -0,0 +1,45 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import pytest +from airbyte_cdk.sources.streams.http.availability_strategy import HttpAvailabilityStrategy + + +class MockStream: + def __init__(self, records, exit_on_rate_limit=True): + self.records = records + self._exit_on_rate_limit = exit_on_rate_limit + type(self).exit_on_rate_limit = property( + lambda self: self._get_exit_on_rate_limit(), lambda self, value: self._set_exit_on_rate_limit(value) + ) + + def _get_exit_on_rate_limit(self): + return self._exit_on_rate_limit + + def _set_exit_on_rate_limit(self, value): + self._exit_on_rate_limit = value + + def read_records(self, sync_mode, stream_slice): + return self.records + + +@pytest.mark.parametrize( + "records, stream_slice, exit_on_rate_limit, expected_result, raises_exception", + [ + ([{"id": 1}], None, True, {"id": 1}, False), # Single record, with setter + ([{"id": 1}, {"id": 2}], None, True, {"id": 1}, False), # Multiple records, with setter + ([], None, True, None, True), # No records, with setter + ], +) +def test_get_first_record_for_slice(records, stream_slice, exit_on_rate_limit, expected_result, raises_exception): + stream = MockStream(records, exit_on_rate_limit) + + if raises_exception: + with pytest.raises(StopIteration): + HttpAvailabilityStrategy().get_first_record_for_slice(stream, stream_slice) + else: + result = HttpAvailabilityStrategy().get_first_record_for_slice(stream, stream_slice) + assert result == expected_result + + assert stream.exit_on_rate_limit == exit_on_rate_limit diff --git a/airbyte-cdk/python/unit_tests/sources/test_abstract_source.py b/airbyte-cdk/python/unit_tests/sources/test_abstract_source.py new file mode 100644 index 000000000000..9de46b9e116f --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/test_abstract_source.py @@ -0,0 +1,1713 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import copy +import datetime +import logging +from typing import Any, Callable, Dict, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union +from unittest.mock import Mock + +import pytest +from airbyte_cdk.models import ( + AirbyteCatalog, + AirbyteConnectionStatus, + AirbyteErrorTraceMessage, + AirbyteLogMessage, + AirbyteMessage, + AirbyteRecordMessage, + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStateType, + AirbyteStream, + AirbyteStreamState, + AirbyteStreamStatus, + AirbyteStreamStatusTraceMessage, + AirbyteTraceMessage, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + DestinationSyncMode, + FailureType, + Level, + Status, + StreamDescriptor, + SyncMode, + TraceType, +) +from airbyte_cdk.models import Type +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources import AbstractSource +from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.streams import IncrementalMixin, Stream +from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message +from airbyte_cdk.utils.airbyte_secrets_utils import update_secrets +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from pytest import fixture + +logger = logging.getLogger("airbyte") + + +class MockSource(AbstractSource): + def __init__( + self, + check_lambda: Callable[[], Tuple[bool, Optional[Any]]] = None, + streams: List[Stream] = None, + message_repository: MessageRepository = None, + exception_on_missing_stream: bool = True, + stop_sync_on_stream_failure: bool = False, + ): + self._streams = streams + self.check_lambda = check_lambda + self.exception_on_missing_stream = exception_on_missing_stream + self._message_repository = message_repository + self._stop_sync_on_stream_failure = stop_sync_on_stream_failure + + def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]: + if self.check_lambda: + return self.check_lambda() + return False, "Missing callable." 
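+    # Descriptive note (added for clarity): check_connection() above simply defers to the injected
+    # check_lambda, so tests can simulate success, failure, or a raised exception from the check step.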
+ + def streams(self, config: Mapping[str, Any]) -> List[Stream]: + if not self._streams: + raise Exception("Stream is not set") + return self._streams + + @property + def raise_exception_on_missing_stream(self) -> bool: + return self.exception_on_missing_stream + + @property + def per_stream_state_enabled(self) -> bool: + return self.per_stream + + @property + def message_repository(self): + return self._message_repository + + +class MockSourceWithStopSyncFalseOverride(MockSource): + @property + def stop_sync_on_stream_failure(self) -> bool: + return False + + +class StreamNoStateMethod(Stream): + name = "managers" + primary_key = None + + def read_records(self, *args, **kwargs) -> Iterable[Mapping[str, Any]]: + return {} + + +class MockStreamOverridesStateMethod(Stream, IncrementalMixin): + name = "teams" + primary_key = None + cursor_field = "updated_at" + _cursor_value = "" + start_date = "1984-12-12" + + def read_records(self, *args, **kwargs) -> Iterable[Mapping[str, Any]]: + return {} + + @property + def state(self) -> MutableMapping[str, Any]: + return {self.cursor_field: self._cursor_value} if self._cursor_value else {} + + @state.setter + def state(self, value: MutableMapping[str, Any]): + self._cursor_value = value.get(self.cursor_field, self.start_date) + + +class StreamRaisesException(Stream): + name = "lamentations" + primary_key = None + + def __init__(self, exception_to_raise): + self._exception_to_raise = exception_to_raise + + def read_records(self, *args, **kwargs) -> Iterable[Mapping[str, Any]]: + raise self._exception_to_raise + + +MESSAGE_FROM_REPOSITORY = Mock() + + +@fixture +def message_repository(): + message_repository = Mock(spec=MessageRepository) + message_repository.consume_queue.return_value = [message for message in [MESSAGE_FROM_REPOSITORY]] + return message_repository + + +def test_successful_check(): + """Tests that if a source returns TRUE for the connection check the appropriate connectionStatus success message is returned""" + expected = AirbyteConnectionStatus(status=Status.SUCCEEDED) + assert MockSource(check_lambda=lambda: (True, None)).check(logger, {}) == expected + + +def test_failed_check(): + """Tests that if a source returns FALSE for the connection check the appropriate connectionStatus failure message is returned""" + expected = AirbyteConnectionStatus(status=Status.FAILED, message="'womp womp'") + assert MockSource(check_lambda=lambda: (False, "womp womp")).check(logger, {}) == expected + + +def test_raising_check(mocker): + """Tests that if a source raises an unexpected exception the appropriate connectionStatus failure message is returned.""" + check_lambda = mocker.Mock(side_effect=BaseException("this should fail")) + with pytest.raises(BaseException): + MockSource(check_lambda=check_lambda).check(logger, {}) + + +class MockStream(Stream): + def __init__( + self, + inputs_and_mocked_outputs: List[Tuple[Mapping[str, Any], Iterable[Mapping[str, Any]]]] = None, + name: str = None, + ): + self._inputs_and_mocked_outputs = inputs_and_mocked_outputs + self._name = name + + @property + def name(self): + return self._name + + def read_records(self, **kwargs) -> Iterable[Mapping[str, Any]]: # type: ignore + # Remove None values + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if self._inputs_and_mocked_outputs: + for _input, output in self._inputs_and_mocked_outputs: + if kwargs == _input: + return output + + raise Exception(f"No mocked output supplied for input: {kwargs}. 
Mocked inputs/outputs: {self._inputs_and_mocked_outputs}") + + @property + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + return "pk" + + @property + def cursor_field(self) -> Union[str, List[str]]: + return ["updated_at"] + + +class MockStreamWithCursor(MockStream): + cursor_field = "cursor" + + def __init__(self, inputs_and_mocked_outputs: List[Tuple[Mapping[str, Any], Iterable[Mapping[str, Any]]]], name: str): + super().__init__(inputs_and_mocked_outputs, name) + + +class MockStreamWithState(MockStreamWithCursor): + def __init__(self, inputs_and_mocked_outputs: List[Tuple[Mapping[str, Any], Iterable[Mapping[str, Any]]]], name: str, state=None): + super().__init__(inputs_and_mocked_outputs, name) + self._state = state + + @property + def state(self): + return self._state + + @state.setter + def state(self, value): + pass + + +class MockStreamEmittingAirbyteMessages(MockStreamWithState): + def __init__( + self, inputs_and_mocked_outputs: List[Tuple[Mapping[str, Any], Iterable[AirbyteMessage]]] = None, name: str = None, state=None + ): + super().__init__(inputs_and_mocked_outputs, name, state) + self._inputs_and_mocked_outputs = inputs_and_mocked_outputs + self._name = name + + @property + def name(self): + return self._name + + @property + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + return "pk" + + @property + def state(self) -> MutableMapping[str, Any]: + return {self.cursor_field: self._cursor_value} if self._cursor_value else {} + + @state.setter + def state(self, value: MutableMapping[str, Any]): + self._cursor_value = value.get(self.cursor_field) + + +class MockResumableFullRefreshStream(Stream): + def __init__( + self, + inputs_and_mocked_outputs: List[Tuple[Mapping[str, Any], Mapping[str, Any]]] = None, + name: str = None, + ): + self._inputs_and_mocked_outputs = inputs_and_mocked_outputs + self._name = name + self._state = {} + + @property + def name(self): + return self._name + + def read_records(self, **kwargs) -> Iterable[Mapping[str, Any]]: # type: ignore + output = None + next_page_token = {} + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if self._inputs_and_mocked_outputs: + for _input, mocked_output in self._inputs_and_mocked_outputs: + if kwargs == _input: + if "error" in mocked_output: + raise AirbyteTracedException(message=mocked_output.get("error")) + else: + next_page_token = mocked_output.get("next_page") + output = mocked_output.get("records") + + if output is None: + raise Exception(f"No mocked output supplied for input: {kwargs}. 
Mocked inputs/outputs: {self._inputs_and_mocked_outputs}") + + self.state = next_page_token or {"__ab_full_refresh_sync_complete": True} + yield from output + + @property + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + return "id" + + @property + def state(self) -> MutableMapping[str, Any]: + return self._state + + @state.setter + def state(self, value: MutableMapping[str, Any]): + self._state = value + + +def test_discover(mocker): + """Tests that the appropriate AirbyteCatalog is returned from the discover method""" + airbyte_stream1 = AirbyteStream( + name="1", + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], + default_cursor_field=["cursor"], + source_defined_cursor=True, + source_defined_primary_key=[["pk"]], + ) + airbyte_stream2 = AirbyteStream(name="2", json_schema={}, supported_sync_modes=[SyncMode.full_refresh]) + + stream1 = MockStream() + stream2 = MockStream() + mocker.patch.object(stream1, "as_airbyte_stream", return_value=airbyte_stream1) + mocker.patch.object(stream2, "as_airbyte_stream", return_value=airbyte_stream2) + + expected = AirbyteCatalog(streams=[airbyte_stream1, airbyte_stream2]) + src = MockSource(check_lambda=lambda: (True, None), streams=[stream1, stream2]) + + assert src.discover(logger, {}) == expected + + +def test_read_nonexistent_stream_raises_exception(mocker): + """Tests that attempting to sync a stream which the source does not return from the `streams` method raises an exception""" + s1 = MockStream(name="s1") + s2 = MockStream(name="this_stream_doesnt_exist_in_the_source") + + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + + src = MockSource(streams=[s1]) + catalog = ConfiguredAirbyteCatalog(streams=[_configured_stream(s2, SyncMode.full_refresh)]) + with pytest.raises(AirbyteTracedException) as exc_info: + list(src.read(logger, {}, catalog)) + + assert exc_info.value.failure_type == FailureType.config_error + assert "not found in the source" in exc_info.value.message + + +def test_read_nonexistent_stream_without_raises_exception(mocker, as_stream_status): + """Tests that attempting to sync a stream which the source does not return from the `streams` method raises an exception""" + s1 = MockStream(name="s1") + s2 = MockStream(name="this_stream_doesnt_exist_in_the_source") + + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + + src = MockSource(streams=[s1], exception_on_missing_stream=False) + + catalog = ConfiguredAirbyteCatalog(streams=[_configured_stream(s2, SyncMode.full_refresh)]) + messages = list(src.read(logger, {}, catalog)) + messages = _fix_emitted_at(messages) + + expected = _fix_emitted_at([as_stream_status("this_stream_doesnt_exist_in_the_source", AirbyteStreamStatus.INCOMPLETE)]) + + assert messages == expected + + +def test_read_stream_emits_repository_message_before_record(mocker, message_repository): + stream = MockStream(name="my_stream") + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(MockStream, "read_records", side_effect=[[{"a record": "a value"}, {"another record": "another value"}]]) + message_repository.consume_queue.side_effect = [[message for message in [MESSAGE_FROM_REPOSITORY]], [], []] + + source = MockSource(streams=[stream], message_repository=message_repository) + + messages = list(source.read(logger, {}, ConfiguredAirbyteCatalog(streams=[_configured_stream(stream, SyncMode.full_refresh)]))) + + assert messages.count(MESSAGE_FROM_REPOSITORY) == 1 + record_messages 
= (message for message in messages if message.type == Type.RECORD) + assert all(messages.index(MESSAGE_FROM_REPOSITORY) < messages.index(record) for record in record_messages) + + +def test_read_stream_emits_repository_message_on_error(mocker, message_repository): + stream = MockStream(name="my_stream") + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(MockStream, "read_records", side_effect=RuntimeError("error")) + message_repository.consume_queue.return_value = [message for message in [MESSAGE_FROM_REPOSITORY]] + + source = MockSource(streams=[stream], message_repository=message_repository) + + with pytest.raises(AirbyteTracedException): + messages = list(source.read(logger, {}, ConfiguredAirbyteCatalog(streams=[_configured_stream(stream, SyncMode.full_refresh)]))) + assert MESSAGE_FROM_REPOSITORY in messages + + +def test_read_stream_with_error_gets_display_message(mocker): + stream = MockStream(name="my_stream") + + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(MockStream, "read_records", side_effect=RuntimeError("oh no!")) + + source = MockSource(streams=[stream]) + catalog = ConfiguredAirbyteCatalog(streams=[_configured_stream(stream, SyncMode.full_refresh)]) + + # without get_error_display_message + with pytest.raises(AirbyteTracedException): + list(source.read(logger, {}, catalog)) + + mocker.patch.object(MockStream, "get_error_display_message", return_value="my message") + + with pytest.raises(AirbyteTracedException) as exc: + list(source.read(logger, {}, catalog)) + assert "oh no!" in exc.value.message + + +GLOBAL_EMITTED_AT = 1 + + +def _as_record(stream: str, data: Dict[str, Any]) -> AirbyteMessage: + return AirbyteMessage( + type=Type.RECORD, + record=AirbyteRecordMessage(stream=stream, data=data, emitted_at=GLOBAL_EMITTED_AT), + ) + + +def _as_records(stream: str, data: List[Dict[str, Any]]) -> List[AirbyteMessage]: + return [_as_record(stream, datum) for datum in data] + + +# TODO: Replace call of this function to fixture in the tests +def _as_stream_status(stream: str, status: AirbyteStreamStatus) -> AirbyteMessage: + trace_message = AirbyteTraceMessage( + emitted_at=datetime.datetime.now().timestamp() * 1000.0, + type=TraceType.STREAM_STATUS, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name=stream), + status=status, + ), + ) + + return AirbyteMessage(type=MessageType.TRACE, trace=trace_message) + + +def _as_state(stream_name: str = "", per_stream_state: Dict[str, Any] = None): + return AirbyteMessage( + type=Type.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name=stream_name), stream_state=AirbyteStateBlob(per_stream_state) + ), + ), + ) + + +def _as_error_trace( + stream: str, error_message: str, internal_message: Optional[str], failure_type: Optional[FailureType], stack_trace: Optional[str] +) -> AirbyteMessage: + trace_message = AirbyteTraceMessage( + emitted_at=datetime.datetime.now().timestamp() * 1000.0, + type=TraceType.ERROR, + error=AirbyteErrorTraceMessage( + stream_descriptor=StreamDescriptor(name=stream), + message=error_message, + internal_message=internal_message, + failure_type=failure_type, + stack_trace=stack_trace, + ), + ) + + return AirbyteMessage(type=MessageType.TRACE, trace=trace_message) + + +def _configured_stream(stream: Stream, sync_mode: SyncMode): + return ConfiguredAirbyteStream( + stream=stream.as_airbyte_stream(), + 
sync_mode=sync_mode, + destination_sync_mode=DestinationSyncMode.overwrite, + ) + + +def _fix_emitted_at(messages: List[AirbyteMessage]) -> List[AirbyteMessage]: + for msg in messages: + if msg.type == Type.RECORD and msg.record: + msg.record.emitted_at = GLOBAL_EMITTED_AT + if msg.type == Type.TRACE and msg.trace: + msg.trace.emitted_at = GLOBAL_EMITTED_AT + return messages + + +def test_valid_full_refresh_read_no_slices(mocker): + """Tests that running a full refresh sync on streams which don't specify slices produces the expected AirbyteMessages""" + stream_output = [{"k1": "v1"}, {"k2": "v2"}] + s1 = MockStream([({"stream_slice": {}, "stream_state": {}, "sync_mode": SyncMode.full_refresh}, stream_output)], name="s1") + s2 = MockStream([({"stream_slice": {}, "stream_state": {}, "sync_mode": SyncMode.full_refresh}, stream_output)], name="s2") + + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(MockStream, "cursor_field", return_value=[]) + + src = MockSource(streams=[s1, s2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.full_refresh), + _configured_stream(s2, SyncMode.full_refresh), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + *_as_records("s1", stream_output), + _as_state("s1", {"__ab_no_cursor_state_message": True}), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + _as_stream_status("s2", AirbyteStreamStatus.STARTED), + _as_stream_status("s2", AirbyteStreamStatus.RUNNING), + *_as_records("s2", stream_output), + _as_state("s2", {"__ab_no_cursor_state_message": True}), + _as_stream_status("s2", AirbyteStreamStatus.COMPLETE), + ] + ) + messages = _fix_emitted_at(list(src.read(logger, {}, catalog))) + + assert messages == expected + + +def test_valid_full_refresh_read_with_slices(mocker): + """Tests that running a full refresh sync on streams which use slices produces the expected AirbyteMessages""" + slices = [{"1": "1"}, {"2": "2"}] + # When attempting to sync a slice, just output that slice as a record + s1 = MockStream( + [({"stream_state": {}, "sync_mode": SyncMode.full_refresh, "stream_slice": s}, [s]) for s in slices], + name="s1", + ) + s2 = MockStream( + [({"stream_state": {}, "sync_mode": SyncMode.full_refresh, "stream_slice": s}, [s]) for s in slices], + name="s2", + ) + + mocker.patch.object(MockStream, "cursor_field", return_value=None) + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(MockStream, "stream_slices", return_value=slices) + + src = MockSource(streams=[s1, s2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.full_refresh), + _configured_stream(s2, SyncMode.full_refresh), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + *_as_records("s1", slices), + _as_state("s1", {"__ab_no_cursor_state_message": True}), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + _as_stream_status("s2", AirbyteStreamStatus.STARTED), + _as_stream_status("s2", AirbyteStreamStatus.RUNNING), + *_as_records("s2", slices), + _as_state("s2", {"__ab_no_cursor_state_message": True}), + _as_stream_status("s2", AirbyteStreamStatus.COMPLETE), + ] + ) + + messages = _fix_emitted_at(list(src.read(logger, {}, catalog))) + + assert messages == expected + + +@pytest.mark.parametrize( + "slices", + [[{"1": "1"}, 
{"2": "2"}], [{"date": datetime.date(year=2023, month=1, day=1)}, {"date": datetime.date(year=2023, month=1, day=1)}]], +) +def test_read_full_refresh_with_slices_sends_slice_messages(mocker, slices): + """Given the logger is debug and a full refresh, AirbyteMessages are sent for slices""" + debug_logger = logging.getLogger("airbyte.debug") + debug_logger.setLevel(logging.DEBUG) + stream = MockStream( + [({"stream_state": {}, "sync_mode": SyncMode.full_refresh, "stream_slice": s}, [s]) for s in slices], + name="s1", + ) + + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(MockStream, "stream_slices", return_value=slices) + + src = MockSource(streams=[stream]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(stream, SyncMode.full_refresh), + ] + ) + + messages = src.read(debug_logger, {}, catalog) + + assert 2 == len(list(filter(lambda message: message.log and message.log.message.startswith("slice:"), messages))) + + +def test_read_incremental_with_slices_sends_slice_messages(mocker): + """Given the logger is debug and a incremental, AirbyteMessages are sent for slices""" + debug_logger = logging.getLogger("airbyte.debug") + debug_logger.setLevel(logging.DEBUG) + slices = [{"1": "1"}, {"2": "2"}] + stream = MockStream( + [({"sync_mode": SyncMode.incremental, "stream_slice": s, "stream_state": {}}, [s]) for s in slices], + name="s1", + ) + + MockStream.supports_incremental = mocker.PropertyMock(return_value=True) + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(MockStream, "stream_slices", return_value=slices) + + src = MockSource(streams=[stream]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(stream, SyncMode.incremental), + ] + ) + + messages = src.read(debug_logger, {}, catalog) + + assert 2 == len(list(filter(lambda message: message.log and message.log.message.startswith("slice:"), messages))) + + +class TestIncrementalRead: + def test_with_state_attribute(self, mocker): + """Test correct state passing for the streams that have a state attribute""" + stream_output = [{"k1": "v1"}, {"k2": "v2"}] + old_state = {"cursor": "old_value"} + input_state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState(stream_descriptor=StreamDescriptor(name="s1"), stream_state=AirbyteStateBlob(old_state)), + ), + ] + new_state_from_connector = {"cursor": "new_value"} + + stream_1 = MockStreamWithState( + [ + ( + {"sync_mode": SyncMode.incremental, "stream_slice": {}, "stream_state": old_state}, + stream_output, + ) + ], + name="s1", + ) + stream_2 = MockStreamWithState( + [({"sync_mode": SyncMode.incremental, "stream_slice": {}, "stream_state": {}}, stream_output)], + name="s2", + ) + + # Mock the stream's getter property for each time the stream reads self.state while syncing a stream + getter_mock = Mock(wraps=MockStreamWithState.state.fget) + getter_mock.side_effect = [ + old_state, # stream s1: Setting the checkpoint reader state to self.state if implemented + old_state, # stream s1: observe state after first record + old_state, # stream s1: observe state after second record + new_state_from_connector, # stream s2: observe state after first slice + {}, # stream s2: Setting the checkpoint reader state to self.state if implemented + {}, # stream s2: observe state after first record + {}, # stream s2: observe state after second record + new_state_from_connector, # stream s2: observe state after first slice + ] + mock_get_property = 
MockStreamWithState.state.getter(getter_mock) + state_property = mocker.patch.object( + MockStreamWithState, + "state", + mock_get_property, + ) + + mocker.patch.object(MockStreamWithState, "get_json_schema", return_value={}) + src = MockSource(streams=[stream_1, stream_2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(stream_1, SyncMode.incremental), + _configured_stream(stream_2, SyncMode.incremental), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + _as_record("s1", stream_output[0]), + _as_record("s1", stream_output[1]), + _as_state("s1", new_state_from_connector), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + _as_stream_status("s2", AirbyteStreamStatus.STARTED), + _as_stream_status("s2", AirbyteStreamStatus.RUNNING), + _as_record("s2", stream_output[0]), + _as_record("s2", stream_output[1]), + _as_state("s2", new_state_from_connector), + _as_stream_status("s2", AirbyteStreamStatus.COMPLETE), + ] + ) + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state))) + + assert messages == expected + + # The state getter is called when we call the stream's observe method. We call self.state at the start of each stream (2 times), + # once for each record (4 times), and at the end of each slice (2 times) + assert len(state_property.fget.mock_calls) == 8 + + def test_with_checkpoint_interval(self, mocker): + """Tests that an incremental read which doesn't specify a checkpoint interval outputs a STATE message + after reading N records within a stream. + """ + input_state = [] + stream_output = [{"k1": "v1"}, {"k2": "v2"}] + + stream_1 = MockStream( + [({"sync_mode": SyncMode.incremental, "stream_slice": {}, "stream_state": {}}, stream_output)], + name="s1", + ) + stream_2 = MockStream( + [({"sync_mode": SyncMode.incremental, "stream_slice": {}, "stream_state": {}}, stream_output)], + name="s2", + ) + state = {"cursor": "value"} + mocker.patch.object(MockStream, "get_updated_state", return_value=state) + mocker.patch.object(MockStream, "supports_incremental", return_value=True) + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + # Tell the source to output one state message per record + mocker.patch.object( + MockStream, + "state_checkpoint_interval", + new_callable=mocker.PropertyMock, + return_value=1, + ) + + src = MockSource(streams=[stream_1, stream_2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(stream_1, SyncMode.incremental), + _configured_stream(stream_2, SyncMode.incremental), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + _as_record("s1", stream_output[0]), + _as_state("s1", state), + _as_record("s1", stream_output[1]), + _as_state("s1", state), + _as_state("s1", state), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + _as_stream_status("s2", AirbyteStreamStatus.STARTED), + _as_stream_status("s2", AirbyteStreamStatus.RUNNING), + _as_record("s2", stream_output[0]), + _as_state("s2", state), + _as_record("s2", stream_output[1]), + _as_state("s2", state), + _as_state("s2", state), + _as_stream_status("s2", AirbyteStreamStatus.COMPLETE), + ] + ) + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state))) + + assert messages == expected + + def test_with_no_interval(self, mocker): + """Tests that an incremental read which doesn't 
specify a checkpoint interval outputs + a STATE message only after fully reading the stream and does not output any STATE messages during syncing the stream. + """ + input_state = [] + stream_output = [{"k1": "v1"}, {"k2": "v2"}] + + stream_1 = MockStream( + [({"sync_mode": SyncMode.incremental, "stream_slice": {}, "stream_state": {}}, stream_output)], + name="s1", + ) + stream_2 = MockStream( + [({"sync_mode": SyncMode.incremental, "stream_slice": {}, "stream_state": {}}, stream_output)], + name="s2", + ) + state = {"cursor": "value"} + mocker.patch.object(MockStream, "get_updated_state", return_value=state) + mocker.patch.object(MockStream, "supports_incremental", return_value=True) + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + + src = MockSource(streams=[stream_1, stream_2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(stream_1, SyncMode.incremental), + _configured_stream(stream_2, SyncMode.incremental), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + *_as_records("s1", stream_output), + _as_state("s1", state), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + _as_stream_status("s2", AirbyteStreamStatus.STARTED), + _as_stream_status("s2", AirbyteStreamStatus.RUNNING), + *_as_records("s2", stream_output), + _as_state("s2", state), + _as_stream_status("s2", AirbyteStreamStatus.COMPLETE), + ] + ) + + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state))) + + assert messages == expected + + def test_with_slices(self, mocker): + """Tests that an incremental read which uses slices outputs each record in the slice followed by a STATE message, for each slice""" + input_state = [] + slices = [{"1": "1"}, {"2": "2"}] + stream_output = [{"k1": "v1"}, {"k2": "v2"}, {"k3": "v3"}] + + stream_1 = MockStream( + [ + ( + { + "sync_mode": SyncMode.incremental, + "stream_slice": s, + "stream_state": mocker.ANY, + }, + stream_output, + ) + for s in slices + ], + name="s1", + ) + stream_2 = MockStream( + [ + ( + { + "sync_mode": SyncMode.incremental, + "stream_slice": s, + "stream_state": mocker.ANY, + }, + stream_output, + ) + for s in slices + ], + name="s2", + ) + state = {"cursor": "value"} + mocker.patch.object(MockStream, "get_updated_state", return_value=state) + mocker.patch.object(MockStream, "supports_incremental", return_value=True) + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(MockStream, "stream_slices", return_value=slices) + + src = MockSource(streams=[stream_1, stream_2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(stream_1, SyncMode.incremental), + _configured_stream(stream_2, SyncMode.incremental), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + # stream 1 slice 1 + *_as_records("s1", stream_output), + _as_state("s1", state), + # stream 1 slice 2 + *_as_records("s1", stream_output), + _as_state("s1", state), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + _as_stream_status("s2", AirbyteStreamStatus.STARTED), + _as_stream_status("s2", AirbyteStreamStatus.RUNNING), + # stream 2 slice 1 + *_as_records("s2", stream_output), + _as_state("s2", state), + # stream 2 slice 2 + *_as_records("s2", stream_output), + _as_state("s2", state), + _as_stream_status("s2", AirbyteStreamStatus.COMPLETE), + ] + ) + + messages 
= _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state))) + + assert messages == expected + + @pytest.mark.parametrize("slices", [pytest.param([], id="test_slices_as_list"), pytest.param(iter([]), id="test_slices_as_iterator")]) + def test_no_slices(self, mocker, slices): + """ + Tests that an incremental read returns at least one state messages even if no records were read: + 1. outputs a state message after reading the entire stream + """ + state = {"cursor": "value"} + input_state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState(stream_descriptor=StreamDescriptor(name="s1"), stream_state=AirbyteStateBlob(state)), + ), + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState(stream_descriptor=StreamDescriptor(name="s2"), stream_state=AirbyteStateBlob(state)), + ), + ] + + stream_output = [{"k1": "v1"}, {"k2": "v2"}, {"k3": "v3"}] + stream_1 = MockStreamWithState( + [ + ( + { + "sync_mode": SyncMode.incremental, + "stream_slice": s, + "stream_state": mocker.ANY, + }, + stream_output, + ) + for s in slices + ], + name="s1", + state=state, + ) + stream_2 = MockStreamWithState( + [ + ( + { + "sync_mode": SyncMode.incremental, + "stream_slice": s, + "stream_state": mocker.ANY, + }, + stream_output, + ) + for s in slices + ], + name="s2", + state=state, + ) + + mocker.patch.object(MockStreamWithState, "supports_incremental", return_value=True) + mocker.patch.object(MockStreamWithState, "get_json_schema", return_value={}) + mocker.patch.object(MockStreamWithState, "stream_slices", return_value=slices) + mocker.patch.object( + MockStreamWithState, + "state_checkpoint_interval", + new_callable=mocker.PropertyMock, + return_value=2, + ) + + src = MockSource(streams=[stream_1, stream_2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(stream_1, SyncMode.incremental), + _configured_stream(stream_2, SyncMode.incremental), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_state("s1", state), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + _as_stream_status("s2", AirbyteStreamStatus.STARTED), + _as_state("s2", state), + _as_stream_status("s2", AirbyteStreamStatus.COMPLETE), + ] + ) + + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state))) + + assert messages == expected + + def test_with_slices_and_interval(self, mocker): + """ + Tests that an incremental read which uses slices and a checkpoint interval: + 1. outputs all records + 2. outputs a state message every N records (N=checkpoint_interval) + 3. 
outputs a state message after reading the entire slice + """ + input_state = [] + slices = [{"1": "1"}, {"2": "2"}] + stream_output = [{"k1": "v1"}, {"k2": "v2"}, {"k3": "v3"}] + stream_1 = MockStream( + [ + ( + { + "sync_mode": SyncMode.incremental, + "stream_slice": s, + "stream_state": mocker.ANY, + }, + stream_output, + ) + for s in slices + ], + name="s1", + ) + stream_2 = MockStream( + [ + ( + { + "sync_mode": SyncMode.incremental, + "stream_slice": s, + "stream_state": mocker.ANY, + }, + stream_output, + ) + for s in slices + ], + name="s2", + ) + state = {"cursor": "value"} + mocker.patch.object(MockStream, "get_updated_state", return_value=state) + mocker.patch.object(MockStream, "supports_incremental", return_value=True) + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(MockStream, "stream_slices", return_value=slices) + mocker.patch.object( + MockStream, + "state_checkpoint_interval", + new_callable=mocker.PropertyMock, + return_value=2, + ) + + src = MockSource(streams=[stream_1, stream_2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(stream_1, SyncMode.incremental), + _configured_stream(stream_2, SyncMode.incremental), + ] + ) + + expected = _fix_emitted_at( + [ + # stream 1 slice 1 + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + _as_record("s1", stream_output[0]), + _as_record("s1", stream_output[1]), + _as_state("s1", state), + _as_record("s1", stream_output[2]), + _as_state("s1", state), + # stream 1 slice 2 + _as_record("s1", stream_output[0]), + _as_state("s1", state), + _as_record("s1", stream_output[1]), + _as_record("s1", stream_output[2]), + _as_state("s1", state), + _as_state("s1", state), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + # stream 2 slice 1 + _as_stream_status("s2", AirbyteStreamStatus.STARTED), + _as_stream_status("s2", AirbyteStreamStatus.RUNNING), + _as_record("s2", stream_output[0]), + _as_record("s2", stream_output[1]), + _as_state("s2", state), + _as_record("s2", stream_output[2]), + _as_state("s2", state), + # stream 2 slice 2 + _as_record("s2", stream_output[0]), + _as_state("s2", state), + _as_record("s2", stream_output[1]), + _as_record("s2", stream_output[2]), + _as_state("s2", state), + _as_state("s2", state), + _as_stream_status("s2", AirbyteStreamStatus.COMPLETE), + ] + ) + + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state))) + + assert messages == expected + + def test_emit_non_records(self, mocker): + """ + Tests that an incremental read which uses slices and a checkpoint interval: + 1. outputs all records + 2. outputs a state message every N records (N=checkpoint_interval) + 3. 
outputs a state message after reading the entire slice + """ + + input_state = [] + slices = [{"1": "1"}, {"2": "2"}] + stream_output = [ + {"k1": "v1"}, + AirbyteLogMessage(level=Level.INFO, message="HELLO"), + {"k2": "v2"}, + {"k3": "v3"}, + ] + stream_1 = MockStreamEmittingAirbyteMessages( + [ + ( + { + "sync_mode": SyncMode.incremental, + "stream_slice": s, + "stream_state": mocker.ANY, + }, + stream_output, + ) + for s in slices + ], + name="s1", + state=copy.deepcopy(input_state), + ) + stream_2 = MockStreamEmittingAirbyteMessages( + [ + ( + { + "sync_mode": SyncMode.incremental, + "stream_slice": s, + "stream_state": mocker.ANY, + }, + stream_output, + ) + for s in slices + ], + name="s2", + state=copy.deepcopy(input_state), + ) + + state = {"cursor": "value"} + getter_mock = Mock(wraps=MockStreamEmittingAirbyteMessages.state.fget) + getter_mock.return_value = state + mock_get_property = MockStreamEmittingAirbyteMessages.state.getter(getter_mock) + mocker.patch.object( + MockStreamEmittingAirbyteMessages, + "state", + mock_get_property, + ) + + mocker.patch.object(MockStreamWithState, "supports_incremental", return_value=True) + mocker.patch.object(MockStreamWithState, "get_json_schema", return_value={}) + mocker.patch.object(MockStreamWithState, "stream_slices", return_value=slices) + mocker.patch.object( + MockStreamWithState, + "state_checkpoint_interval", + new_callable=mocker.PropertyMock, + return_value=2, + ) + + src = MockSource(streams=[stream_1, stream_2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(stream_1, SyncMode.incremental), + _configured_stream(stream_2, SyncMode.incremental), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + # stream 1 slice 1 + stream_data_to_airbyte_message("s1", stream_output[0]), + stream_data_to_airbyte_message("s1", stream_output[1]), + stream_data_to_airbyte_message("s1", stream_output[2]), + _as_state("s1", state), + stream_data_to_airbyte_message("s1", stream_output[3]), + _as_state("s1", state), + # stream 1 slice 2 + stream_data_to_airbyte_message("s1", stream_output[0]), + _as_state("s1", state), + stream_data_to_airbyte_message("s1", stream_output[1]), + stream_data_to_airbyte_message("s1", stream_output[2]), + stream_data_to_airbyte_message("s1", stream_output[3]), + _as_state("s1", state), + _as_state("s1", state), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + # stream 2 slice 1 + _as_stream_status("s2", AirbyteStreamStatus.STARTED), + _as_stream_status("s2", AirbyteStreamStatus.RUNNING), + stream_data_to_airbyte_message("s2", stream_output[0]), + stream_data_to_airbyte_message("s2", stream_output[1]), + stream_data_to_airbyte_message("s2", stream_output[2]), + _as_state("s2", state), + stream_data_to_airbyte_message("s2", stream_output[3]), + _as_state("s2", state), + # stream 2 slice 2 + stream_data_to_airbyte_message("s2", stream_output[0]), + _as_state("s2", state), + stream_data_to_airbyte_message("s2", stream_output[1]), + stream_data_to_airbyte_message("s2", stream_output[2]), + stream_data_to_airbyte_message("s2", stream_output[3]), + _as_state("s2", state), + _as_state("s2", state), + _as_stream_status("s2", AirbyteStreamStatus.COMPLETE), + ] + ) + + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state))) + + assert messages == expected + + def test_without_state_attribute_for_stream_with_desc_records(self, mocker): + """ + This test will check that 
the state resolved by get_updated_state is used and returned in the state message. + In this scenario records are returned in descending order, but we keep the "highest" cursor in the state. + """ + stream_cursor = MockStreamWithCursor.cursor_field + stream_output = [{f"k{cursor_id}": f"v{cursor_id}", stream_cursor: cursor_id} for cursor_id in range(5, 1, -1)] + initial_state = {stream_cursor: 1} + stream_name = "stream_with_cursor" + input_state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name=stream_name), stream_state=AirbyteStateBlob(initial_state) + ), + ), + ] + stream_with_cursor = MockStreamWithCursor( + [({"sync_mode": SyncMode.incremental, "stream_slice": {}, "stream_state": initial_state}, stream_output)], + name=stream_name, + ) + + def mock_get_updated_state(current_stream, current_stream_state, latest_record): + state_cursor_value = current_stream_state.get(current_stream.cursor_field, 0) + latest_record_value = latest_record.get(current_stream.cursor_field) + return {current_stream.cursor_field: max(latest_record_value, state_cursor_value)} + + mocker.patch.object(MockStreamWithCursor, "get_updated_state", mock_get_updated_state) + mocker.patch.object(MockStreamWithCursor, "get_json_schema", return_value={}) + src = MockSource(streams=[stream_with_cursor]) + + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(stream_with_cursor, SyncMode.incremental), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status(stream_name, AirbyteStreamStatus.STARTED), + _as_stream_status(stream_name, AirbyteStreamStatus.RUNNING), + _as_record(stream_name, stream_output[0]), + _as_record(stream_name, stream_output[1]), + _as_record(stream_name, stream_output[2]), + _as_record(stream_name, stream_output[3]), + _as_state(stream_name, {stream_cursor: stream_output[0][stream_cursor]}), + _as_stream_status(stream_name, AirbyteStreamStatus.COMPLETE), + ] + ) + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state))) + assert messages + assert messages == expected + + +class TestResumableFullRefreshRead: + def test_resumable_full_refresh_multiple_pages(self, mocker): + """Tests that running a resumable full refresh sync from the first attempt with no prior state""" + responses = [ + {"records": [{"1": "1"}, {"2": "2"}], "next_page": {"page": 1}}, + {"records": [{"3": "3"}, {"4": "4"}], "next_page": {"page": 2}}, + {"records": [{"3": "3"}, {"4": "4"}]}, + ] + # When attempting to sync a slice, just output that slice as a record + + # We've actually removed this filtering logic and will rely on the platform to dicate whether to pass state to the connector + # So in reality we can probably get rid of this test entirely + s1 = MockResumableFullRefreshStream( + [ + ({"stream_state": {}, "sync_mode": SyncMode.full_refresh, "stream_slice": {}}, responses[0]), + ({"stream_state": {}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 1}}, responses[1]), + ({"stream_state": {}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 2}}, responses[2]), + ], + name="s1", + ) + + mocker.patch.object(MockResumableFullRefreshStream, "get_json_schema", return_value={}) + + src = MockSource(streams=[s1]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.full_refresh), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + 
*_as_records("s1", responses[0]["records"]), + _as_state("s1", {"page": 1}), + *_as_records("s1", responses[1]["records"]), + _as_state("s1", {"page": 2}), + *_as_records("s1", responses[2]["records"]), + _as_state("s1", {"__ab_full_refresh_sync_complete": True}), + _as_state("s1", {"__ab_full_refresh_sync_complete": True}), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + ] + ) + + messages = _fix_emitted_at(list(src.read(logger, {}, catalog))) + + assert messages == expected + + def test_resumable_full_refresh_with_incoming_state(self, mocker): + """Tests that running a resumable full refresh sync from the second attempt with partial state passed in""" + responses = [ + {"records": [{"100": "100"}, {"200": "200"}], "next_page": {"page": 11}}, + {"records": [{"300": "300"}, {"400": "400"}], "next_page": {"page": 12}}, + {"records": [{"500": "500"}, {"600": "600"}], "next_page": {"page": 13}}, + {"records": [{"700": "700"}, {"800": "800"}]}, + ] + # When attempting to sync a slice, just output that slice as a record + + # We've actually removed this filtering logic and will rely on the platform to dicate whether to pass state to the connector + # So in reality we can probably get rid of this test entirely + s1 = MockResumableFullRefreshStream( + [ + ({"stream_state": {"page": 10}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 10}}, responses[0]), + ({"stream_state": {"page": 10}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 11}}, responses[1]), + ({"stream_state": {"page": 10}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 12}}, responses[2]), + ({"stream_state": {"page": 10}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 13}}, responses[3]), + ], + name="s1", + ) + + mocker.patch.object(MockResumableFullRefreshStream, "get_json_schema", return_value={}) + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="s1"), + stream_state=AirbyteStateBlob({"page": 10}), + ), + ) + ] + + src = MockSource(streams=[s1]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.full_refresh), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + *_as_records("s1", responses[0]["records"]), + _as_state("s1", {"page": 11}), + *_as_records("s1", responses[1]["records"]), + _as_state("s1", {"page": 12}), + *_as_records("s1", responses[2]["records"]), + _as_state("s1", {"page": 13}), + *_as_records("s1", responses[3]["records"]), + _as_state("s1", {"__ab_full_refresh_sync_complete": True}), + _as_state("s1", {"__ab_full_refresh_sync_complete": True}), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + ] + ) + + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state))) + + assert messages == expected + + def test_resumable_full_refresh_partial_failure(self, mocker): + """Tests that running a resumable full refresh sync from the first attempt that fails before completing successfully""" + expected_error_message = "I have failed you Anakin." 
+ responses = [ + {"records": [{"1": "1"}, {"2": "2"}], "next_page": {"page": 1}}, + {"records": [{"3": "3"}, {"4": "4"}], "next_page": {"page": 2}}, + {"error": expected_error_message}, + ] + # When attempting to sync a slice, just output that slice as a record + + # We've actually removed this filtering logic and will rely on the platform to dicate whether to pass state to the connector + # So in reality we can probably get rid of this test entirely + s1 = MockResumableFullRefreshStream( + [ + ({"stream_state": {}, "sync_mode": SyncMode.full_refresh, "stream_slice": {}}, responses[0]), + ({"stream_state": {}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 1}}, responses[1]), + ({"stream_state": {}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 2}}, responses[2]), + ], + name="s1", + ) + + mocker.patch.object(MockResumableFullRefreshStream, "get_json_schema", return_value={}) + + src = MockSource(streams=[s1]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.full_refresh), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + *_as_records("s1", responses[0]["records"]), + _as_state("s1", {"page": 1}), + *_as_records("s1", responses[1]["records"]), + _as_state("s1", {"page": 2}), + _as_stream_status("s1", AirbyteStreamStatus.INCOMPLETE), + _as_error_trace("s1", expected_error_message, None, FailureType.system_error, None), + ] + ) + + messages = [] + with pytest.raises(AirbyteTracedException) as exc: + for message in src.read(logger, {}, catalog): + messages.append(_remove_stack_trace(message)) + + assert _fix_emitted_at(messages) == expected + assert "s1" in exc.value.message + assert exc.value.failure_type == FailureType.config_error + + def test_resumable_full_refresh_skip_prior_successful_streams(self, mocker): + """ + Tests that running a resumable full refresh sync from the second attempt where one stream was successful + and should not be synced. The other should sync beginning at the partial state passed in. 
+ """ + responses = [ + {"records": [{"100": "100"}, {"200": "200"}], "next_page": {"page": 11}}, + {"records": [{"300": "300"}, {"400": "400"}], "next_page": {"page": 12}}, + {"records": [{"500": "500"}, {"600": "600"}], "next_page": {"page": 13}}, + {"records": [{"700": "700"}, {"800": "800"}]}, + ] + # When attempting to sync a slice, just output that slice as a record + + # We've actually removed this filtering logic and will rely on the platform to dicate whether to pass state to the connector + # So in reality we can probably get rid of this test entirely + s1 = MockResumableFullRefreshStream( + [ + ({"stream_state": {"page": 10}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 10}}, responses[0]), + ({"stream_state": {"page": 10}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 11}}, responses[1]), + ({"stream_state": {"page": 10}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 12}}, responses[2]), + ({"stream_state": {"page": 10}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 13}}, responses[3]), + ], + name="s1", + ) + + s2 = MockResumableFullRefreshStream( + [ + ({"stream_state": {"page": 10}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 10}}, responses[0]), + ({"stream_state": {"page": 10}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 11}}, responses[1]), + ({"stream_state": {"page": 10}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 12}}, responses[2]), + ({"stream_state": {"page": 10}, "sync_mode": SyncMode.full_refresh, "stream_slice": {"page": 13}}, responses[3]), + ], + name="s2", + ) + + mocker.patch.object(MockResumableFullRefreshStream, "get_json_schema", return_value={}) + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="s1"), + stream_state=AirbyteStateBlob({"__ab_full_refresh_sync_complete": True}), + ), + ), + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="s2"), + stream_state=AirbyteStateBlob({"page": 10}), + ), + ), + ] + + src = MockSource(streams=[s1, s2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.full_refresh), + _configured_stream(s2, SyncMode.full_refresh), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_state("s1", {"__ab_full_refresh_sync_complete": True}), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + _as_stream_status("s2", AirbyteStreamStatus.STARTED), + _as_stream_status("s2", AirbyteStreamStatus.RUNNING), + *_as_records("s2", responses[0]["records"]), + _as_state("s2", {"page": 11}), + *_as_records("s2", responses[1]["records"]), + _as_state("s2", {"page": 12}), + *_as_records("s2", responses[2]["records"]), + _as_state("s2", {"page": 13}), + *_as_records("s2", responses[3]["records"]), + _as_state("s2", {"__ab_full_refresh_sync_complete": True}), + _as_state("s2", {"__ab_full_refresh_sync_complete": True}), + _as_stream_status("s2", AirbyteStreamStatus.COMPLETE), + ] + ) + + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state))) + + assert messages == expected + + +@pytest.mark.parametrize( + "exception_to_raise,expected_error_message,expected_internal_message", + [ + pytest.param( + AirbyteTracedException(message="I was born only to crash like Icarus"), + "I was born only to crash like Icarus", + None, + id="test_raises_traced_exception", + ), + pytest.param( 
+ Exception("Generic connector error message"), + "Something went wrong in the connector. See the logs for more details.", + "Generic connector error message", + id="test_raises_generic_exception", + ), + ], +) +def test_continue_sync_with_failed_streams(mocker, exception_to_raise, expected_error_message, expected_internal_message): + """ + Tests that running a sync for a connector with multiple streams will continue syncing when one stream fails + with an error. This source does not override the default behavior defined in the AbstractSource class. + """ + stream_output = [{"k1": "v1"}, {"k2": "v2"}] + s1 = MockStream([({"sync_mode": SyncMode.full_refresh}, stream_output)], name="s1") + s2 = StreamRaisesException(exception_to_raise=exception_to_raise) + s3 = MockStream([({"sync_mode": SyncMode.full_refresh}, stream_output)], name="s3") + + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(StreamRaisesException, "get_json_schema", return_value={}) + + src = MockSource(streams=[s1, s2, s3]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.full_refresh), + _configured_stream(s2, SyncMode.full_refresh), + _configured_stream(s3, SyncMode.full_refresh), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + *_as_records("s1", stream_output), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + _as_stream_status("lamentations", AirbyteStreamStatus.STARTED), + _as_stream_status("lamentations", AirbyteStreamStatus.INCOMPLETE), + _as_error_trace("lamentations", expected_error_message, expected_internal_message, FailureType.system_error, None), + _as_stream_status("s3", AirbyteStreamStatus.STARTED), + _as_stream_status("s3", AirbyteStreamStatus.RUNNING), + *_as_records("s3", stream_output), + _as_stream_status("s3", AirbyteStreamStatus.COMPLETE), + ] + ) + + with pytest.raises(AirbyteTracedException) as exc: + messages = [_remove_stack_trace(message) for message in src.read(logger, {}, catalog)] + messages = _fix_emitted_at(messages) + + assert messages == expected + + assert "lamentations" in exc.value.message + assert exc.value.failure_type == FailureType.config_error + + +def test_continue_sync_source_override_false(mocker): + """ + Tests that running a sync for a connector explicitly overriding the default AbstractSource.stop_sync_on_stream_failure + property to be False which will continue syncing stream even if one encountered an exception. 
+ """ + update_secrets(["API_KEY_VALUE"]) + + stream_output = [{"k1": "v1"}, {"k2": "v2"}] + s1 = MockStream([({"sync_mode": SyncMode.full_refresh}, stream_output)], name="s1") + s2 = StreamRaisesException(exception_to_raise=AirbyteTracedException(message="I was born only to crash like Icarus")) + s3 = MockStream([({"sync_mode": SyncMode.full_refresh}, stream_output)], name="s3") + + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(StreamRaisesException, "get_json_schema", return_value={}) + + src = MockSourceWithStopSyncFalseOverride(streams=[s1, s2, s3]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.full_refresh), + _configured_stream(s2, SyncMode.full_refresh), + _configured_stream(s3, SyncMode.full_refresh), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + *_as_records("s1", stream_output), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + _as_stream_status("lamentations", AirbyteStreamStatus.STARTED), + _as_stream_status("lamentations", AirbyteStreamStatus.INCOMPLETE), + _as_error_trace("lamentations", "I was born only to crash like Icarus", None, FailureType.system_error, None), + _as_stream_status("s3", AirbyteStreamStatus.STARTED), + _as_stream_status("s3", AirbyteStreamStatus.RUNNING), + *_as_records("s3", stream_output), + _as_stream_status("s3", AirbyteStreamStatus.COMPLETE), + ] + ) + + with pytest.raises(AirbyteTracedException) as exc: + messages = [_remove_stack_trace(message) for message in src.read(logger, {}, catalog)] + messages = _fix_emitted_at(messages) + + assert messages == expected + + assert "lamentations" in exc.value.message + assert exc.value.failure_type == FailureType.config_error + + +def test_sync_error_trace_messages_obfuscate_secrets(mocker): + """ + Tests that exceptions emitted as trace messages by a source have secrets properly sanitized + """ + update_secrets(["API_KEY_VALUE"]) + + stream_output = [{"k1": "v1"}, {"k2": "v2"}] + s1 = MockStream([({"sync_mode": SyncMode.full_refresh}, stream_output)], name="s1") + s2 = StreamRaisesException( + exception_to_raise=AirbyteTracedException(message="My api_key value API_KEY_VALUE flew too close to the sun.") + ) + s3 = MockStream([({"sync_mode": SyncMode.full_refresh}, stream_output)], name="s3") + + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(StreamRaisesException, "get_json_schema", return_value={}) + + src = MockSource(streams=[s1, s2, s3]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.full_refresh), + _configured_stream(s2, SyncMode.full_refresh), + _configured_stream(s3, SyncMode.full_refresh), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + *_as_records("s1", stream_output), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + _as_stream_status("lamentations", AirbyteStreamStatus.STARTED), + _as_stream_status("lamentations", AirbyteStreamStatus.INCOMPLETE), + _as_error_trace("lamentations", "My api_key value **** flew too close to the sun.", None, FailureType.system_error, None), + _as_stream_status("s3", AirbyteStreamStatus.STARTED), + _as_stream_status("s3", AirbyteStreamStatus.RUNNING), + *_as_records("s3", stream_output), + _as_stream_status("s3", AirbyteStreamStatus.COMPLETE), + ] + ) + + with 
pytest.raises(AirbyteTracedException) as exc: + messages = [_remove_stack_trace(message) for message in src.read(logger, {}, catalog)] + messages = _fix_emitted_at(messages) + + assert messages == expected + + assert "lamentations" in exc.value.message + assert exc.value.failure_type == FailureType.config_error + + +def test_continue_sync_with_failed_streams_with_override_false(mocker): + """ + Tests that running a sync for a connector with multiple streams and stop_sync_on_stream_failure enabled stops + the sync when one stream fails with an error. + """ + stream_output = [{"k1": "v1"}, {"k2": "v2"}] + s1 = MockStream([({"stream_state": {}, "stream_slice": {}, "sync_mode": SyncMode.full_refresh}, stream_output)], name="s1") + s2 = StreamRaisesException(AirbyteTracedException(message="I was born only to crash like Icarus")) + s3 = MockStream([({"stream_state": {}, "stream_slice": {}, "sync_mode": SyncMode.full_refresh}, stream_output)], name="s3") + + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(StreamRaisesException, "get_json_schema", return_value={}) + + src = MockSource(streams=[s1, s2, s3]) + mocker.patch.object(MockSource, "stop_sync_on_stream_failure", return_value=True) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.full_refresh), + _configured_stream(s2, SyncMode.full_refresh), + _configured_stream(s3, SyncMode.full_refresh), + ] + ) + + expected = _fix_emitted_at( + [ + _as_stream_status("s1", AirbyteStreamStatus.STARTED), + _as_stream_status("s1", AirbyteStreamStatus.RUNNING), + *_as_records("s1", stream_output), + _as_stream_status("s1", AirbyteStreamStatus.COMPLETE), + _as_stream_status("lamentations", AirbyteStreamStatus.STARTED), + _as_stream_status("lamentations", AirbyteStreamStatus.INCOMPLETE), + _as_error_trace("lamentations", "I was born only to crash like Icarus", None, FailureType.system_error, None), + ] + ) + + with pytest.raises(AirbyteTracedException) as exc: + messages = [_remove_stack_trace(message) for message in src.read(logger, {}, catalog)] + messages = _fix_emitted_at(messages) + + assert messages == expected + + assert "lamentations" in exc.value.message + assert exc.value.failure_type == FailureType.config_error + + +# TODO: Replace call of this function to fixture in the tests +def _remove_stack_trace(message: AirbyteMessage) -> AirbyteMessage: + """ + Helper method that removes the stack trace from Airbyte trace messages to make asserting against expected records easier + """ + if message.trace and message.trace.error and message.trace.error.stack_trace: + message.trace.error.stack_trace = None + return message + + +def test_read_nonexistent_stream_emit_incomplete_stream_status(mocker, remove_stack_trace, as_stream_status): + """ + Tests that attempting to sync a stream which the source does not return from the `streams` method emit incomplete stream status + """ + s1 = MockStream(name="s1") + s2 = MockStream(name="this_stream_doesnt_exist_in_the_source") + + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + + src = MockSource(streams=[s1]) + catalog = ConfiguredAirbyteCatalog(streams=[_configured_stream(s2, SyncMode.full_refresh)]) + + expected = _fix_emitted_at([as_stream_status("this_stream_doesnt_exist_in_the_source", AirbyteStreamStatus.INCOMPLETE)]) + + expected_error_message = ( + "The stream 'this_stream_doesnt_exist_in_the_source' in your connection configuration was not found in the " + "source. 
Refresh the schema in your replication settings and remove this stream from future sync attempts." + ) + + with pytest.raises(AirbyteTracedException) as exc_info: + messages = [remove_stack_trace(message) for message in src.read(logger, {}, catalog)] + messages = _fix_emitted_at(messages) + + assert messages == expected + + assert expected_error_message in exc_info.value.message + assert exc_info.value.failure_type == FailureType.config_error diff --git a/airbyte-cdk/python/unit_tests/sources/test_config.py b/airbyte-cdk/python/unit_tests/sources/test_config.py new file mode 100644 index 000000000000..8cac7992a671 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/test_config.py @@ -0,0 +1,92 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import List, Union + +from airbyte_cdk.sources.config import BaseConfig +from pydantic.v1 import BaseModel, Field + + +class InnerClass(BaseModel): + field1: str + field2: int + + +class Choice1(BaseModel): + selected_strategy = Field("option1", const=True) + + name: str + count: int + + +class Choice2(BaseModel): + selected_strategy = Field("option2", const=True) + + sequence: List[str] + + +class SomeSourceConfig(BaseConfig): + class Config: + title = "Some Source" + + items: List[InnerClass] + choice: Union[Choice1, Choice2] + + +class TestBaseConfig: + EXPECTED_SCHEMA = { + "properties": { + "choice": { + "oneOf": [ + { + "properties": { + "count": {"title": "Count", "type": "integer"}, + "name": {"title": "Name", "type": "string"}, + "selected_strategy": { + "const": "option1", + "title": "Selected " "Strategy", + "type": "string", + "default": "option1", + }, + }, + "required": ["name", "count"], + "title": "Choice1", + "type": "object", + }, + { + "properties": { + "selected_strategy": { + "const": "option2", + "title": "Selected " "Strategy", + "type": "string", + "default": "option2", + }, + "sequence": {"items": {"type": "string"}, "title": "Sequence", "type": "array"}, + }, + "required": ["sequence"], + "title": "Choice2", + "type": "object", + }, + ], + "title": "Choice", + }, + "items": { + "items": { + "properties": {"field1": {"title": "Field1", "type": "string"}, "field2": {"title": "Field2", "type": "integer"}}, + "required": ["field1", "field2"], + "title": "InnerClass", + "type": "object", + }, + "title": "Items", + "type": "array", + }, + }, + "required": ["items", "choice"], + "title": "Some Source", + "type": "object", + } + + def test_schema_postprocessing(self): + schema = SomeSourceConfig.schema() + assert schema == self.EXPECTED_SCHEMA diff --git a/airbyte-cdk/python/unit_tests/sources/test_connector_state_manager.py b/airbyte-cdk/python/unit_tests/sources/test_connector_state_manager.py new file mode 100644 index 000000000000..1a5526b105d5 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/test_connector_state_manager.py @@ -0,0 +1,430 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from contextlib import nullcontext as does_not_raise +from typing import List + +import pytest +from airbyte_cdk.models import ( + AirbyteMessage, + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStateMessageSerializer, + AirbyteStateType, + AirbyteStreamState, + StreamDescriptor, +) +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager, HashableStreamDescriptor + + +@pytest.mark.parametrize( + "input_stream_state, expected_stream_state, expected_error", + ( + pytest.param( + [ + { + "type": "STREAM", + "stream": {"stream_descriptor": {"name": "actors", "namespace": "public"}, "stream_state": {"id": "mando_michael"}}, + }, + { + "type": "STREAM", + "stream": {"stream_descriptor": {"name": "actresses", "namespace": "public"}, "stream_state": {"id": "seehorn_rhea"}}, + }, + ], + { + HashableStreamDescriptor(name="actors", namespace="public"): AirbyteStateBlob({"id": "mando_michael"}), + HashableStreamDescriptor(name="actresses", namespace="public"): AirbyteStateBlob({"id": "seehorn_rhea"}), + }, + does_not_raise(), + id="test_incoming_per_stream_state", + ), + pytest.param([], {}, does_not_raise(), id="test_incoming_empty_stream_state"), + pytest.param( + [{"type": "STREAM", "stream": {"stream_descriptor": {"name": "actresses", "namespace": "public"}}}], + {HashableStreamDescriptor(name="actresses", namespace="public"): None}, + does_not_raise(), + id="test_stream_states_that_have_none_state_blob", + ), + pytest.param( + [ + { + "type": "GLOBAL", + "global": { + "shared_state": {"television": "better_call_saul"}, + "stream_states": [ + { + "stream_descriptor": {"name": "actors", "namespace": "public"}, + "stream_state": {"id": "mando_michael"}, + }, + { + "stream_descriptor": {"name": "actresses", "namespace": "public"}, + "stream_state": {"id": "seehorn_rhea"}, + }, + ], + }, + }, + ], + { + HashableStreamDescriptor(name="actors", namespace="public"): AirbyteStateBlob({"id": "mando_michael"}), + HashableStreamDescriptor(name="actresses", namespace="public"): AirbyteStateBlob({"id": "seehorn_rhea"}), + }, + pytest.raises(ValueError), + id="test_incoming_global_state_with_shared_state_throws_error", + ), + pytest.param( + [ + { + "type": "GLOBAL", + "global": { + "stream_states": [ + {"stream_descriptor": {"name": "actors", "namespace": "public"}, "stream_state": {"id": "mando_michael"}}, + ], + }, + }, + ], + { + HashableStreamDescriptor(name="actors", namespace="public"): AirbyteStateBlob({"id": "mando_michael"}), + }, + does_not_raise(), + id="test_incoming_global_state_without_shared", + ), + pytest.param( + [ + { + "type": "GLOBAL", + "global": { + "shared_state": None, + "stream_states": [ + { + "stream_descriptor": {"name": "actors", "namespace": "public"}, + "stream_state": {"id": "mando_michael"}, + }, + ], + }, + }, + ], + { + HashableStreamDescriptor(name="actors", namespace="public"): AirbyteStateBlob({"id": "mando_michael"}), + }, + does_not_raise(), + id="test_incoming_global_state_with_none_shared", + ), + pytest.param( + [ + { + "type": "GLOBAL", + "global": { + "stream_states": [ + {"stream_descriptor": {"name": "actresses", "namespace": "public"}}, + ], + }, + }, + ], + {HashableStreamDescriptor(name="actresses", namespace="public"): None}, + does_not_raise(), + id="test_incoming_global_state_without_stream_state", + ), + ), +) +def test_initialize_state_manager(input_stream_state, expected_stream_state, expected_error): + if isinstance(input_stream_state, List): + input_stream_state = 
[AirbyteStateMessageSerializer.load(state_obj) for state_obj in list(input_stream_state)] + + with expected_error: + state_manager = ConnectorStateManager(input_stream_state) + + assert state_manager.per_stream_states == expected_stream_state + + +@pytest.mark.parametrize( + "input_state, stream_name, namespace, expected_state", + [ + pytest.param( + [ + { + "type": "STREAM", + "stream": {"stream_descriptor": {"name": "users", "namespace": "public"}, "stream_state": {"created_at": 12345}}, + }, + { + "type": "STREAM", + "stream": {"stream_descriptor": {"name": "accounts", "namespace": "public"}, "stream_state": {"id": "abc"}}, + }, + ], + "users", + "public", + {"created_at": 12345}, + id="test_get_stream_only", + ), + pytest.param( + [ + { + "type": "STREAM", + "stream": {"stream_descriptor": {"name": "users"}, "stream_state": {"created_at": 12345}}, + }, + {"type": "STREAM", "stream": {"stream_descriptor": {"name": "accounts"}, "stream_state": {"id": "abc"}}}, + ], + "users", + None, + {"created_at": 12345}, + id="test_get_stream_without_namespace", + ), + pytest.param( + [ + {"type": "STREAM", "stream": {"stream_descriptor": {"name": "users"}}}, + {"type": "STREAM", "stream": {"stream_descriptor": {"name": "accounts"}, "stream_state": {"id": "abc"}}}, + ], + "users", + None, + {}, + id="test_get_stream_without_stream_state", + ), + pytest.param( + [ + { + "type": "STREAM", + "stream": {"stream_descriptor": {"name": "users", "namespace": "public"}, "stream_state": {"created_at": 12345}}, + }, + { + "type": "STREAM", + "stream": {"stream_descriptor": {"name": "accounts", "namespace": "public"}, "stream_state": {"id": "abc"}}, + }, + ], + "missing", + "public", + {}, + id="test_get_missing_stream", + ), + pytest.param( + [ + { + "type": "STREAM", + "stream": {"stream_descriptor": {"name": "users", "namespace": "public"}, "stream_state": {"created_at": 12345}}, + }, + { + "type": "STREAM", + "stream": {"stream_descriptor": {"name": "accounts", "namespace": "public"}, "stream_state": {"id": "abc"}}, + }, + ], + "users", + "wrong_namespace", + {}, + id="test_get_stream_wrong_namespace", + ), + pytest.param([], "users", "public", {}, id="test_get_empty_stream_state_defaults_to_empty_dictionary"), + pytest.param( + [ + { + "type": "STREAM", + "stream": {"stream_descriptor": {"name": "users", "namespace": "public"}, "stream_state": None}, + }, + ], + "users", + "public", + {}, + id="test_get_stream_with_stream_state_none_returns_empty_map", + ), + ], +) +def test_get_stream_state(input_state, stream_name, namespace, expected_state): + state_messages = [AirbyteStateMessageSerializer.load(state_obj) for state_obj in list(input_state)] + state_manager = ConnectorStateManager(state_messages) + + actual_state = state_manager.get_stream_state(stream_name, namespace) + + assert actual_state == expected_state + + +def test_get_state_returns_deep_copy(): + input_state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="episodes", namespace="public"), + stream_state=AirbyteStateBlob({"id": [109]}), + ), + ) + ] + state_manager = ConnectorStateManager(input_state) + + per_stream_state = state_manager.get_stream_state("episodes", "public") + per_stream_state["id"].append(309) + + assert state_manager.get_stream_state("episodes", "public") == {"id": [109]} + + +@pytest.mark.parametrize( + "start_state, update_name, update_namespace, update_value", + [ + pytest.param( + [ + { + "type": "STREAM", + "stream": {"stream_descriptor": 
{"name": "actors", "namespace": "public"}, "stream_state": {"id": "mckean_michael"}}, + }, + { + "type": "STREAM", + "stream": {"stream_descriptor": {"name": "actresses", "namespace": "public"}, "stream_state": {"id": "seehorn_rhea"}}, + }, + ], + "actors", + "public", + {"id": "fabian_patrick"}, + id="test_update_existing_stream_state", + ), + pytest.param( + [], + "actresses", + None, + {"id": "seehorn_rhea"}, + id="test_update_first_time_sync_without_namespace", + ), + pytest.param( + [ + { + "type": "STREAM", + "stream": {"stream_descriptor": {"name": "actresses", "namespace": "public"}, "stream_state": {"id": "seehorn_rhea"}}, + } + ], + "actors", + "public", + {"id": "banks_jonathan"}, + id="test_update_missing_state", + ), + pytest.param( + [ + { + "type": "STREAM", + "stream": {"stream_descriptor": {"name": "actresses", "namespace": "public"}, "stream_state": {"id": "seehorn_rhea"}}, + } + ], + "actors", + "public", + {"id": "banks_jonathan"}, + id="test_ignore_when_per_stream_state_value_is_none", + ), + ], +) +def test_update_state_for_stream(start_state, update_name, update_namespace, update_value): + state_messages = [AirbyteStateMessage(state_obj) for state_obj in list(start_state)] + state_manager = ConnectorStateManager(state_messages) + + state_manager.update_state_for_stream(update_name, update_namespace, update_value) + + assert state_manager.per_stream_states[HashableStreamDescriptor(name=update_name, namespace=update_namespace)] == AirbyteStateBlob( + update_value + ) + + +@pytest.mark.parametrize( + "start_state, update_name, update_namespace, expected_state_message", + [ + pytest.param( + [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="episodes", namespace="public"), + stream_state=AirbyteStateBlob({"created_at": "2022_05_22"}), + ), + ), + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="seasons", namespace="public"), + stream_state=AirbyteStateBlob({"id": 1}), + ), + ), + ], + "episodes", + "public", + AirbyteMessage( + type=MessageType.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="episodes", namespace="public"), + stream_state=AirbyteStateBlob({"created_at": "2022_05_22"}), + ), + ), + ), + id="test_emit_state_message", + ), + pytest.param( + [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="episodes", namespace="public"), + stream_state=None, + ), + ), + ], + "episodes", + "public", + AirbyteMessage( + type=MessageType.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="episodes", namespace="public"), + stream_state=AirbyteStateBlob(), + ), + ), + ), + id="test_always_emit_message_with_stream_state_blob", + ), + pytest.param( + [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="episodes", namespace="public"), + stream_state=AirbyteStateBlob({"id": 507}), + ), + ) + ], + "missing", + "public", + AirbyteMessage( + type=MessageType.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="missing", namespace="public"), stream_state=AirbyteStateBlob() + ), + ), + ), + 
id="test_emit_state_nonexistent_stream_name", + ), + pytest.param( + [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="episodes", namespace="public"), + stream_state=AirbyteStateBlob({"id": 507}), + ), + ) + ], + "episodes", + "nonexistent", + AirbyteMessage( + type=MessageType.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="episodes", namespace="nonexistent"), stream_state=AirbyteStateBlob() + ), + ), + ), + id="test_emit_state_wrong_namespace", + ), + ], +) +def test_create_state_message(start_state, update_name, update_namespace, expected_state_message): + state_manager = ConnectorStateManager(start_state) + + actual_state_message = state_manager.create_state_message(stream_name=update_name, namespace=update_namespace) + assert actual_state_message == expected_state_message diff --git a/airbyte-cdk/python/unit_tests/sources/test_http_logger.py b/airbyte-cdk/python/unit_tests/sources/test_http_logger.py new file mode 100644 index 000000000000..5711d3529211 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/test_http_logger.py @@ -0,0 +1,252 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import pytest +import requests +from airbyte_cdk.sources.http_logger import format_http_message + +A_TITLE = "a title" +A_DESCRIPTION = "a description" +A_STREAM_NAME = "a stream name" +ANY_REQUEST = requests.Request(method="POST", url="http://a-url.com", headers={}, params={}).prepare() + + +class ResponseBuilder: + def __init__(self): + self._body_content = "" + self._headers = {} + self._request = ANY_REQUEST + self._status_code = 100 + + def body_content(self, body_content: bytes) -> "ResponseBuilder": + self._body_content = body_content + return self + + def headers(self, headers: dict) -> "ResponseBuilder": + self._headers = headers + return self + + def request(self, request: requests.PreparedRequest) -> "ResponseBuilder": + self._request = request + return self + + def status_code(self, status_code: int) -> "ResponseBuilder": + self._status_code = status_code + return self + + def build(self): + response = requests.Response() + response._content = self._body_content + response.headers = self._headers + response.request = self._request + response.status_code = self._status_code + return response + + +EMPTY_RESPONSE = {"body": {"content": ""}, "headers": {}, "status_code": 100} + + +@pytest.mark.parametrize( + "test_name, http_method, url, headers, params, body_json, body_data, expected_airbyte_message", + [ + ( + "test_basic_get_request", + "GET", + "https://airbyte.io", + {}, + {}, + {}, + {}, + { + "airbyte_cdk": {"stream": {"name": A_STREAM_NAME}}, + "http": { + "title": A_TITLE, + "description": A_DESCRIPTION, + "request": {"method": "GET", "body": {"content": None}, "headers": {}}, + "response": EMPTY_RESPONSE, + }, + "log": {"level": "debug"}, + "url": {"full": "https://airbyte.io/"}, + }, + ), + ( + "test_get_request_with_headers", + "GET", + "https://airbyte.io", + {"h1": "v1", "h2": "v2"}, + {}, + {}, + {}, + { + "airbyte_cdk": {"stream": {"name": A_STREAM_NAME}}, + "http": { + "title": A_TITLE, + "description": A_DESCRIPTION, + "request": {"method": "GET", "body": {"content": None}, "headers": {"h1": "v1", "h2": "v2"}}, + "response": EMPTY_RESPONSE, + }, + "log": {"level": "debug"}, + "url": {"full": "https://airbyte.io/"}, + }, + ), + ( + "test_get_request_with_request_params", + "GET", + 
"https://airbyte.io", + {}, + {"p1": "v1", "p2": "v2"}, + {}, + {}, + { + "airbyte_cdk": {"stream": {"name": A_STREAM_NAME}}, + "http": { + "title": A_TITLE, + "description": A_DESCRIPTION, + "request": {"method": "GET", "body": {"content": None}, "headers": {}}, + "response": EMPTY_RESPONSE, + }, + "log": {"level": "debug"}, + "url": {"full": "https://airbyte.io/?p1=v1&p2=v2"}, + }, + ), + ( + "test_get_request_with_request_body_json", + "GET", + "https://airbyte.io", + {"Content-Type": "application/json"}, + {}, + {"b1": "v1", "b2": "v2"}, + {}, + { + "airbyte_cdk": {"stream": {"name": A_STREAM_NAME}}, + "http": { + "title": A_TITLE, + "description": A_DESCRIPTION, + "request": { + "method": "GET", + "body": {"content": '{"b1": "v1", "b2": "v2"}'}, + "headers": {"Content-Type": "application/json", "Content-Length": "24"}, + }, + "response": EMPTY_RESPONSE, + }, + "log": {"level": "debug"}, + "url": {"full": "https://airbyte.io/"}, + }, + ), + ( + "test_get_request_with_headers_params_and_body", + "GET", + "https://airbyte.io", + {"Content-Type": "application/json", "h1": "v1"}, + {"p1": "v1", "p2": "v2"}, + {"b1": "v1", "b2": "v2"}, + {}, + { + "airbyte_cdk": {"stream": {"name": A_STREAM_NAME}}, + "http": { + "title": A_TITLE, + "description": A_DESCRIPTION, + "request": { + "method": "GET", + "body": {"content": '{"b1": "v1", "b2": "v2"}'}, + "headers": {"Content-Type": "application/json", "Content-Length": "24", "h1": "v1"}, + }, + "response": EMPTY_RESPONSE, + }, + "log": {"level": "debug"}, + "url": {"full": "https://airbyte.io/?p1=v1&p2=v2"}, + }, + ), + ( + "test_get_request_with_request_body_data", + "GET", + "https://airbyte.io", + {"Content-Type": "application/x-www-form-urlencoded"}, + {}, + {}, + {"b1": "v1", "b2": "v2"}, + { + "airbyte_cdk": {"stream": {"name": A_STREAM_NAME}}, + "http": { + "title": A_TITLE, + "description": A_DESCRIPTION, + "request": { + "method": "GET", + "body": {"content": "b1=v1&b2=v2"}, + "headers": {"Content-Type": "application/x-www-form-urlencoded", "Content-Length": "11"}, + }, + "response": EMPTY_RESPONSE, + }, + "log": {"level": "debug"}, + "url": {"full": "https://airbyte.io/"}, + }, + ), + ( + "test_basic_post_request", + "POST", + "https://airbyte.io", + {}, + {}, + {}, + {}, + { + "airbyte_cdk": {"stream": {"name": A_STREAM_NAME}}, + "http": { + "title": A_TITLE, + "description": A_DESCRIPTION, + "request": {"method": "POST", "body": {"content": None}, "headers": {"Content-Length": "0"}}, + "response": EMPTY_RESPONSE, + }, + "log": {"level": "debug"}, + "url": {"full": "https://airbyte.io/"}, + }, + ), + ], +) +def test_prepared_request_to_airbyte_message(test_name, http_method, url, headers, params, body_json, body_data, expected_airbyte_message): + request = requests.Request(method=http_method, url=url, headers=headers, params=params) + if body_json: + request.json = body_json + if body_data: + request.data = body_data + prepared_request = request.prepare() + + actual_airbyte_message = format_http_message(ResponseBuilder().request(prepared_request).build(), A_TITLE, A_DESCRIPTION, A_STREAM_NAME) + + assert actual_airbyte_message == expected_airbyte_message + + +@pytest.mark.parametrize( + "test_name, response_body, response_headers, status_code, expected_airbyte_message", + [ + ("test_response_no_body_no_headers", b"", {}, 200, {"body": {"content": ""}, "headers": {}, "status_code": 200}), + ( + "test_response_no_body_with_headers", + b"", + {"h1": "v1", "h2": "v2"}, + 200, + {"body": {"content": ""}, "headers": {"h1": "v1", "h2": "v2"}, 
"status_code": 200}, + ), + ( + "test_response_with_body_no_headers", + b'{"b1": "v1", "b2": "v2"}', + {}, + 200, + {"body": {"content": '{"b1": "v1", "b2": "v2"}'}, "headers": {}, "status_code": 200}, + ), + ( + "test_response_with_body_and_headers", + b'{"b1": "v1", "b2": "v2"}', + {"h1": "v1", "h2": "v2"}, + 200, + {"body": {"content": '{"b1": "v1", "b2": "v2"}'}, "headers": {"h1": "v1", "h2": "v2"}, "status_code": 200}, + ), + ], +) +def test_response_to_airbyte_message(test_name, response_body, response_headers, status_code, expected_airbyte_message): + response = ResponseBuilder().body_content(response_body).headers(response_headers).status_code(status_code).build() + + actual_airbyte_message = format_http_message(response, A_TITLE, A_DESCRIPTION, A_STREAM_NAME) + + assert actual_airbyte_message["http"]["response"] == expected_airbyte_message diff --git a/airbyte-cdk/python/unit_tests/sources/test_integration_source.py b/airbyte-cdk/python/unit_tests/sources/test_integration_source.py new file mode 100644 index 000000000000..17628a0263cd --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/test_integration_source.py @@ -0,0 +1,86 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json +import os +from typing import Any, List, Mapping +from unittest import mock +from unittest.mock import patch + +import pytest +import requests +from airbyte_cdk.entrypoint import launch +from airbyte_cdk.utils import AirbyteTracedException +from unit_tests.sources.fixtures.source_test_fixture import ( + HttpTestStream, + SourceFixtureOauthAuthenticator, + SourceTestFixture, + fixture_mock_send, +) + + +@pytest.mark.parametrize( + "deployment_mode, url_base, expected_records, expected_error", + [ + pytest.param("CLOUD", "https://airbyte.com/api/v1/", [], None, id="test_cloud_read_with_public_endpoint"), + pytest.param("CLOUD", "http://unsecured.com/api/v1/", [], "system_error", id="test_cloud_read_with_unsecured_url"), + pytest.param("CLOUD", "https://172.20.105.99/api/v1/", [], "config_error", id="test_cloud_read_with_private_endpoint"), + pytest.param("CLOUD", "https://localhost:80/api/v1/", [], "config_error", id="test_cloud_read_with_localhost"), + pytest.param("OSS", "https://airbyte.com/api/v1/", [], None, id="test_oss_read_with_public_endpoint"), + pytest.param("OSS", "https://172.20.105.99/api/v1/", [], None, id="test_oss_read_with_private_endpoint"), + ], +) +@patch.object(requests.Session, "send", fixture_mock_send) +def test_external_request_source(capsys, deployment_mode, url_base, expected_records, expected_error): + source = SourceTestFixture() + + with mock.patch.dict(os.environ, {"DEPLOYMENT_MODE": deployment_mode}, clear=False): # clear=True clears the existing os.environ dict + with mock.patch.object(HttpTestStream, "url_base", url_base): + args = ["read", "--config", "config.json", "--catalog", "configured_catalog.json"] + if expected_error: + with pytest.raises(AirbyteTracedException): + launch(source, args) + messages = [json.loads(line) for line in capsys.readouterr().out.splitlines()] + assert contains_error_trace_message(messages, expected_error) + else: + launch(source, args) + + +@pytest.mark.parametrize( + "deployment_mode, token_refresh_url, expected_records, expected_error", + [ + pytest.param("CLOUD", "https://airbyte.com/api/v1/", [], None, id="test_cloud_read_with_public_endpoint"), + pytest.param("CLOUD", "http://unsecured.com/api/v1/", [], "system_error", id="test_cloud_read_with_unsecured_url"), + pytest.param("CLOUD", 
"https://172.20.105.99/api/v1/", [], "config_error", id="test_cloud_read_with_private_endpoint"), + pytest.param("OSS", "https://airbyte.com/api/v1/", [], None, id="test_oss_read_with_public_endpoint"), + pytest.param("OSS", "https://172.20.105.99/api/v1/", [], None, id="test_oss_read_with_private_endpoint"), + ], +) +@patch.object(requests.Session, "send", fixture_mock_send) +def test_external_oauth_request_source(capsys, deployment_mode, token_refresh_url, expected_records, expected_error): + oauth_authenticator = SourceFixtureOauthAuthenticator( + client_id="nora", client_secret="hae_sung", refresh_token="arthur", token_refresh_endpoint=token_refresh_url + ) + source = SourceTestFixture(authenticator=oauth_authenticator) + + with mock.patch.dict(os.environ, {"DEPLOYMENT_MODE": deployment_mode}, clear=False): # clear=True clears the existing os.environ dict + args = ["read", "--config", "config.json", "--catalog", "configured_catalog.json"] + if expected_error: + with pytest.raises(AirbyteTracedException): + launch(source, args) + messages = [json.loads(line) for line in capsys.readouterr().out.splitlines()] + assert contains_error_trace_message(messages, expected_error) + else: + launch(source, args) + + +def contains_error_trace_message(messages: List[Mapping[str, Any]], expected_error: str) -> bool: + for message in messages: + if message.get("type") != "TRACE": + continue + elif message.get("trace").get("type") != "ERROR": + continue + elif message.get("trace").get("error").get("failure_type") == expected_error: + return True + return False diff --git a/airbyte-cdk/python/unit_tests/sources/test_source.py b/airbyte-cdk/python/unit_tests/sources/test_source.py new file mode 100644 index 000000000000..d548a51b1ebb --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/test_source.py @@ -0,0 +1,477 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import json +import logging +import tempfile +from contextlib import nullcontext as does_not_raise +from typing import Any, List, Mapping, MutableMapping, Optional, Tuple, Union + +import pytest +from airbyte_cdk.models import ( + AirbyteGlobalState, + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStateMessageSerializer, + AirbyteStateType, + AirbyteStreamState, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteCatalogSerializer, + StreamDescriptor, + SyncMode, + Type, +) +from airbyte_cdk.sources import AbstractSource, Source +from airbyte_cdk.sources.streams.core import Stream +from airbyte_cdk.sources.streams.http.http import HttpStream +from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer +from orjson import orjson +from serpyco_rs import SchemaValidationError + + +class MockSource(Source): + def read( + self, logger: logging.Logger, config: Mapping[str, Any], catalog: ConfiguredAirbyteCatalog, state: MutableMapping[str, Any] = None + ): + pass + + def check(self, logger: logging.Logger, config: Mapping[str, Any]): + pass + + def discover(self, logger: logging.Logger, config: Mapping[str, Any]): + pass + + +class MockAbstractSource(AbstractSource): + def __init__(self, streams: Optional[List[Stream]] = None): + self._streams = streams + + def check_connection(self, *args, **kwargs) -> Tuple[bool, Optional[Any]]: + return True, "" + + def streams(self, *args, **kwargs) -> List[Stream]: + if self._streams: + return self._streams + return [] + + +@pytest.fixture +def source(): + return MockSource() + + +@pytest.fixture +def catalog(): + configured_catalog = { + "streams": [ + { + "stream": {"name": "mock_http_stream", "json_schema": {}, "supported_sync_modes": ["full_refresh"]}, + "destination_sync_mode": "overwrite", + "sync_mode": "full_refresh", + }, + { + "stream": {"name": "mock_stream", "json_schema": {}, "supported_sync_modes": ["full_refresh"]}, + "destination_sync_mode": "overwrite", + "sync_mode": "full_refresh", + }, + ] + } + return ConfiguredAirbyteCatalogSerializer.load(configured_catalog) + + +@pytest.fixture +def abstract_source(mocker): + mocker.patch.multiple(HttpStream, __abstractmethods__=set()) + mocker.patch.multiple(Stream, __abstractmethods__=set()) + + class MockHttpStream(mocker.MagicMock, HttpStream): + url_base = "http://example.com" + path = "/dummy/path" + get_json_schema = mocker.MagicMock() + _state = {} + + @property + def cursor_field(self) -> Union[str, List[str]]: + return ["updated_at"] + + def get_backoff_strategy(self): + return None + + def get_error_handler(self): + return None + + def __init__(self, *args, **kvargs): + mocker.MagicMock.__init__(self) + HttpStream.__init__(self, *args, kvargs) + self.read_records = mocker.MagicMock() + + @property + def availability_strategy(self): + return None + + @property + def state(self) -> MutableMapping[str, Any]: + return self._state + + @state.setter + def state(self, value: MutableMapping[str, Any]) -> None: + self._state = value + + class MockStream(mocker.MagicMock, Stream): + page_size = None + get_json_schema = mocker.MagicMock() + + def __init__(self, **kwargs): + mocker.MagicMock.__init__(self) + self.read_records = mocker.MagicMock() + + streams = [MockHttpStream(), MockStream()] + + class MockAbstractSource(AbstractSource): + def check_connection(self): + return True, None + + def streams(self, config): + self.streams_config = config + return streams + + return MockAbstractSource() + + +@pytest.mark.parametrize( + "incoming_state, expected_state, 
expected_error", + [ + pytest.param( + [ + { + "type": "STREAM", + "stream": { + "stream_state": {"created_at": "2009-07-19"}, + "stream_descriptor": {"name": "movies", "namespace": "public"}, + }, + } + ], + [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="movies", namespace="public"), + stream_state=AirbyteStateBlob({"created_at": "2009-07-19"}), + ), + ) + ], + does_not_raise(), + id="test_incoming_stream_state", + ), + pytest.param( + [ + { + "type": "STREAM", + "stream": { + "stream_state": {"created_at": "2009-07-19"}, + "stream_descriptor": {"name": "movies", "namespace": "public"}, + }, + }, + { + "type": "STREAM", + "stream": { + "stream_state": {"id": "villeneuve_denis"}, + "stream_descriptor": {"name": "directors", "namespace": "public"}, + }, + }, + { + "type": "STREAM", + "stream": { + "stream_state": {"created_at": "1995-12-27"}, + "stream_descriptor": {"name": "actors", "namespace": "public"}, + }, + }, + ], + [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="movies", namespace="public"), + stream_state=AirbyteStateBlob({"created_at": "2009-07-19"}), + ), + ), + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="directors", namespace="public"), + stream_state=AirbyteStateBlob({"id": "villeneuve_denis"}), + ), + ), + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="actors", namespace="public"), + stream_state=AirbyteStateBlob({"created_at": "1995-12-27"}), + ), + ), + ], + does_not_raise(), + id="test_incoming_multiple_stream_states", + ), + pytest.param( + [ + { + "type": "GLOBAL", + "global": { + "shared_state": {"shared_key": "shared_val"}, + "stream_states": [ + {"stream_state": {"created_at": "2009-07-19"}, "stream_descriptor": {"name": "movies", "namespace": "public"}} + ], + }, + } + ], + [ + AirbyteStateMessage( + type=AirbyteStateType.GLOBAL, + global_=AirbyteGlobalState( + shared_state=AirbyteStateBlob({"shared_key": "shared_val"}), + stream_states=[ + AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="movies", namespace="public"), + stream_state=AirbyteStateBlob({"created_at": "2009-07-19"}), + ) + ], + ), + ), + ], + does_not_raise(), + id="test_incoming_global_state", + ), + pytest.param([], [], does_not_raise(), id="test_empty_incoming_stream_state"), + pytest.param(None, [], does_not_raise(), id="test_none_incoming_state"), + pytest.param( + [ + { + "type": "NOT_REAL", + "stream": { + "stream_state": {"created_at": "2009-07-19"}, + "stream_descriptor": {"name": "movies", "namespace": "public"}, + }, + } + ], + None, + pytest.raises(SchemaValidationError), + id="test_invalid_stream_state_invalid_type", + ), + pytest.param( + [{"type": "STREAM", "stream": {"stream_state": {"created_at": "2009-07-19"}}}], + None, + pytest.raises(SchemaValidationError), + id="test_invalid_stream_state_missing_descriptor", + ), + pytest.param( + [{"type": "GLOBAL", "global": {"shared_state": {"shared_key": "shared_val"}}}], + None, + pytest.raises(SchemaValidationError), + id="test_invalid_global_state_missing_streams", + ), + pytest.param( + [ + { + "type": "GLOBAL", + "global": { + "shared_state": {"shared_key": "shared_val"}, + "stream_states": { + "stream_state": {"created_at": "2009-07-19"}, + "stream_descriptor": {"name": "movies", "namespace": "public"}, + }, + 
}, + } + ], + None, + pytest.raises(SchemaValidationError), + id="test_invalid_global_state_streams_not_list", + ), + ], +) +def test_read_state(source, incoming_state, expected_state, expected_error): + with tempfile.NamedTemporaryFile("w") as state_file: + state_file.write(json.dumps(incoming_state)) + state_file.flush() + with expected_error: + actual = source.read_state(state_file.name) + if expected_state and actual: + assert AirbyteStateMessageSerializer.dump(actual[0]) == AirbyteStateMessageSerializer.dump(expected_state[0]) + + +def test_read_invalid_state(source): + with tempfile.NamedTemporaryFile("w") as state_file: + state_file.write("invalid json content") + state_file.flush() + with pytest.raises(ValueError, match="Could not read json file"): + source.read_state(state_file.name) + + +@pytest.mark.parametrize( + "source, expected_state", + [ + pytest.param(MockAbstractSource(), [], id="test_source_not_implementing_read_returns_per_stream_format"), + ], +) +def test_read_state_nonexistent(source, expected_state): + assert source.read_state("") == expected_state + + +def test_read_catalog(source): + configured_catalog = { + "streams": [ + { + "stream": { + "name": "mystream", + "json_schema": {"type": "object", "properties": {"k": "v"}}, + "supported_sync_modes": ["full_refresh"], + }, + "destination_sync_mode": "overwrite", + "sync_mode": "full_refresh", + } + ] + } + expected = ConfiguredAirbyteCatalogSerializer.load(configured_catalog) + with tempfile.NamedTemporaryFile("w") as catalog_file: + catalog_file.write(orjson.dumps(ConfiguredAirbyteCatalogSerializer.dump(expected)).decode()) + catalog_file.flush() + actual = source.read_catalog(catalog_file.name) + assert actual == expected + + +def test_internal_config(abstract_source, catalog): + streams = abstract_source.streams(None) + assert len(streams) == 2 + http_stream, non_http_stream = streams + assert isinstance(http_stream, HttpStream) + assert not isinstance(non_http_stream, HttpStream) + http_stream.read_records.return_value = [{}] * 3 + non_http_stream.read_records.return_value = [{}] * 3 + + # Test with empty config + logger = logging.getLogger(f"airbyte.{getattr(abstract_source, 'name', '')}") + records = [r for r in abstract_source.read(logger=logger, config={}, catalog=catalog, state={})] + # 3 for http stream, 3 for non http stream, 1 for state message for each stream (2x) and 3 for stream status messages for each stream (2x) + assert len(records) == 3 + 3 + 1 + 1 + 3 + 3 + assert http_stream.read_records.called + assert non_http_stream.read_records.called + # Make sure page_size hasn't been set + assert not http_stream.page_size + assert not non_http_stream.page_size + # Test with records limit set to 1 + internal_config = {"some_config": 100, "_limit": 1} + records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})] + # 1 from http stream + 1 from non http stream, 1 for state message for each stream (2x) and 3 for stream status messages for each stream (2x) + assert len(records) == 1 + 1 + 1 + 1 + 3 + 3 + assert "_limit" not in abstract_source.streams_config + assert "some_config" in abstract_source.streams_config + # Test with records limit set to a number that exceeds the expected records + internal_config = {"some_config": 100, "_limit": 20} + records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})] + assert len(records) == 3 + 3 + 1 + 1 + 3 + 3 + + # Check if the page_size parameter is set on the http instance only + 
internal_config = {"some_config": 100, "_page_size": 2} + records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})] + assert "_page_size" not in abstract_source.streams_config + assert "some_config" in abstract_source.streams_config + assert len(records) == 3 + 3 + 1 + 1 + 3 + 3 + assert http_stream.page_size == 2 + # Make sure page_size hasn't been set for non-http streams + assert not non_http_stream.page_size + + +def test_internal_config_limit(mocker, abstract_source, catalog): + logger_mock = mocker.MagicMock() + logger_mock.level = logging.DEBUG + del catalog.streams[1] + STREAM_LIMIT = 2 + SLICE_DEBUG_LOG_COUNT = 1 + FULL_RECORDS_NUMBER = 3 + TRACE_STATUS_COUNT = 3 + STATE_COUNT = 1 + streams = abstract_source.streams(None) + http_stream = streams[0] + http_stream.read_records.return_value = [{}] * FULL_RECORDS_NUMBER + internal_config = {"some_config": 100, "_limit": STREAM_LIMIT} + + catalog.streams[0].sync_mode = SyncMode.full_refresh + records = [r for r in abstract_source.read(logger=logger_mock, config=internal_config, catalog=catalog, state={})] + assert len(records) == STREAM_LIMIT + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT + logger_info_args = [call[0][0] for call in logger_mock.info.call_args_list] + # Check if the log line matches the record limit + read_log_record = [_l for _l in logger_info_args if _l.startswith("Read")] + assert read_log_record[0].startswith(f"Read {STREAM_LIMIT} ") + + # No limit, check if state record produced for incremental stream + catalog.streams[0].sync_mode = SyncMode.incremental + records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})] + assert len(records) == FULL_RECORDS_NUMBER + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + 1 + assert records[-2].type == Type.STATE + assert records[-1].type == Type.TRACE + + # Set limit and check if state is produced when limit is set for incremental stream + logger_mock.reset_mock() + records = [r for r in abstract_source.read(logger=logger_mock, config=internal_config, catalog=catalog, state={})] + assert len(records) == STREAM_LIMIT + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + 1 + assert records[-2].type == Type.STATE + assert records[-1].type == Type.TRACE + logger_info_args = [call[0][0] for call in logger_mock.info.call_args_list] + read_log_record = [_l for _l in logger_info_args if _l.startswith("Read")] + assert read_log_record[0].startswith(f"Read {STREAM_LIMIT} ") + + +SCHEMA = {"type": "object", "properties": {"value": {"type": "string"}}} + + +def test_source_config_no_transform(mocker, abstract_source, catalog): + SLICE_DEBUG_LOG_COUNT = 1 + TRACE_STATUS_COUNT = 3 + STATE_COUNT = 1 + # Read operation has an extra get_json_schema call when filtering invalid fields + GET_JSON_SCHEMA_COUNT_WHEN_FILTERING = 1 + logger_mock = mocker.MagicMock() + logger_mock.level = logging.DEBUG + streams = abstract_source.streams(None) + http_stream, non_http_stream = streams + http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA + http_stream.read_records.return_value, non_http_stream.read_records.return_value = [[{"value": 23}] * 5] * 2 + records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})] + assert len(records) == 2 * (5 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT) + assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": 23}] * 2 * 5 + assert 
http_stream.get_json_schema.call_count == 5 + GET_JSON_SCHEMA_COUNT_WHEN_FILTERING + assert non_http_stream.get_json_schema.call_count == 5 + GET_JSON_SCHEMA_COUNT_WHEN_FILTERING + + +def test_source_config_transform(mocker, abstract_source, catalog): + logger_mock = mocker.MagicMock() + logger_mock.level = logging.DEBUG + SLICE_DEBUG_LOG_COUNT = 2 + TRACE_STATUS_COUNT = 6 + STATE_COUNT = 2 + streams = abstract_source.streams(None) + http_stream, non_http_stream = streams + http_stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization) + non_http_stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization) + http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA + http_stream.read_records.return_value, non_http_stream.read_records.return_value = [{"value": 23}], [{"value": 23}] + records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})] + assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT + assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": "23"}] * 2 + + +def test_source_config_transform_and_no_transform(mocker, abstract_source, catalog): + logger_mock = mocker.MagicMock() + logger_mock.level = logging.DEBUG + SLICE_DEBUG_LOG_COUNT = 2 + TRACE_STATUS_COUNT = 6 + STATE_COUNT = 2 + streams = abstract_source.streams(None) + http_stream, non_http_stream = streams + http_stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization) + http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA + http_stream.read_records.return_value, non_http_stream.read_records.return_value = [{"value": 23}], [{"value": 23}] + records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})] + assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT + assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": "23"}, {"value": 23}] diff --git a/airbyte-cdk/python/unit_tests/sources/test_source_read.py b/airbyte-cdk/python/unit_tests/sources/test_source_read.py new file mode 100644 index 000000000000..05c71d1eae39 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/test_source_read.py @@ -0,0 +1,422 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# +import logging +from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union +from unittest.mock import Mock + +import freezegun +from airbyte_cdk.models import ( + AirbyteMessage, + AirbyteRecordMessage, + AirbyteStream, + AirbyteStreamStatus, + AirbyteStreamStatusTraceMessage, + AirbyteTraceMessage, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + DestinationSyncMode, + StreamDescriptor, + SyncMode, + TraceType, +) +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources import AbstractSource +from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource +from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter +from airbyte_cdk.sources.message import InMemoryMessageRepository +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade +from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor +from airbyte_cdk.sources.streams.core import StreamData +from airbyte_cdk.utils import AirbyteTracedException +from unit_tests.sources.streams.concurrent.scenarios.thread_based_concurrent_stream_source_builder import NeverLogSliceLogger + + +class _MockStream(Stream): + def __init__(self, slice_to_records: Mapping[str, List[Mapping[str, Any]]], name: str): + self._slice_to_records = slice_to_records + self._name = name + + @property + def name(self) -> str: + return self._name + + @property + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + return None + + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + for partition in self._slice_to_records.keys(): + yield {"partition": partition} + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[StreamData]: + for record_or_exception in self._slice_to_records[stream_slice["partition"]]: + if isinstance(record_or_exception, Exception): + raise record_or_exception + else: + yield record_or_exception + + def get_json_schema(self) -> Mapping[str, Any]: + return {} + + +class _MockSource(AbstractSource): + message_repository = InMemoryMessageRepository() + + def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]: + pass + + def set_streams(self, streams): + self._streams = streams + + def streams(self, config: Mapping[str, Any]) -> List[Stream]: + return self._streams + + +class _MockConcurrentSource(ConcurrentSourceAdapter): + message_repository = InMemoryMessageRepository() + + def __init__(self, logger): + concurrent_source = ConcurrentSource.create(1, 1, logger, NeverLogSliceLogger(), self.message_repository) + super().__init__(concurrent_source) + + def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]: + pass + + def set_streams(self, streams): + self._streams = streams + + def streams(self, config: Mapping[str, Any]) -> List[Stream]: + return self._streams + + +@freezegun.freeze_time("2020-01-01T00:00:00") +def test_concurrent_source_yields_the_same_messages_as_abstract_source_when_no_exceptions_are_raised(): + records_stream_1_partition_1 = [ + {"id": 1, "partition": "1"}, + {"id": 2, "partition": "1"}, + ] + records_stream_1_partition_2 
= [ + {"id": 3, "partition": "2"}, + {"id": 4, "partition": "2"}, + ] + records_stream_2_partition_1 = [ + {"id": 100, "partition": "A"}, + {"id": 200, "partition": "A"}, + ] + records_stream_2_partition_2 = [ + {"id": 300, "partition": "B"}, + {"id": 400, "partition": "B"}, + ] + stream_1_slice_to_partition = {"1": records_stream_1_partition_1, "2": records_stream_1_partition_2} + stream_2_slice_to_partition = {"A": records_stream_2_partition_1, "B": records_stream_2_partition_2} + state = None + logger = _init_logger() + + source, concurrent_source = _init_sources([stream_1_slice_to_partition, stream_2_slice_to_partition], state, logger) + + config = {} + catalog = _create_configured_catalog(source._streams) + # FIXME this is currently unused in this test + # messages_from_abstract_source = _read_from_source(source, logger, config, catalog, state, None) + messages_from_concurrent_source = _read_from_source(concurrent_source, logger, config, catalog, state, None) + + expected_messages = [ + AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + emitted_at=1577836800000.0, + error=None, + estimate=None, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name="stream0"), status=AirbyteStreamStatus(AirbyteStreamStatus.STARTED) + ), + ), + ), + AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + emitted_at=1577836800000.0, + error=None, + estimate=None, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name="stream0"), status=AirbyteStreamStatus(AirbyteStreamStatus.RUNNING) + ), + ), + ), + AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream="stream0", + data=records_stream_1_partition_1[0], + emitted_at=1577836800000, + ), + ), + AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream="stream0", + data=records_stream_1_partition_1[1], + emitted_at=1577836800000, + ), + ), + AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream="stream0", + data=records_stream_1_partition_2[0], + emitted_at=1577836800000, + ), + ), + AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream="stream0", + data=records_stream_1_partition_2[1], + emitted_at=1577836800000, + ), + ), + AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + emitted_at=1577836800000.0, + error=None, + estimate=None, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name="stream0"), status=AirbyteStreamStatus(AirbyteStreamStatus.COMPLETE) + ), + ), + ), + AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + emitted_at=1577836800000.0, + error=None, + estimate=None, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name="stream1"), status=AirbyteStreamStatus(AirbyteStreamStatus.STARTED) + ), + ), + ), + AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + emitted_at=1577836800000.0, + error=None, + estimate=None, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name="stream1"), status=AirbyteStreamStatus(AirbyteStreamStatus.RUNNING) + ), + ), + ), + AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream="stream1", + data=records_stream_2_partition_1[0], + 
emitted_at=1577836800000, + ), + ), + AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream="stream1", + data=records_stream_2_partition_1[1], + emitted_at=1577836800000, + ), + ), + AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream="stream1", + data=records_stream_2_partition_2[0], + emitted_at=1577836800000, + ), + ), + AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage( + stream="stream1", + data=records_stream_2_partition_2[1], + emitted_at=1577836800000, + ), + ), + AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + emitted_at=1577836800000.0, + error=None, + estimate=None, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name="stream1"), status=AirbyteStreamStatus(AirbyteStreamStatus.COMPLETE) + ), + ), + ), + ] + _verify_messages(expected_messages, messages_from_concurrent_source) + + +@freezegun.freeze_time("2020-01-01T00:00:00") +def test_concurrent_source_yields_the_same_messages_as_abstract_source_when_a_traced_exception_is_raised(): + records = [{"id": 1, "partition": "1"}, AirbyteTracedException()] + stream_slice_to_partition = {"1": records} + + logger = _init_logger() + state = None + source, concurrent_source = _init_sources([stream_slice_to_partition], state, logger) + config = {} + catalog = _create_configured_catalog(source._streams) + messages_from_abstract_source = _read_from_source(source, logger, config, catalog, state, AirbyteTracedException) + messages_from_concurrent_source = _read_from_source(concurrent_source, logger, config, catalog, state, AirbyteTracedException) + + _assert_status_messages(messages_from_abstract_source, messages_from_concurrent_source) + _assert_record_messages(messages_from_abstract_source, messages_from_concurrent_source) + _assert_errors(messages_from_abstract_source, messages_from_concurrent_source) + + +@freezegun.freeze_time("2020-01-01T00:00:00") +def test_concurrent_source_yields_the_same_messages_as_abstract_source_when_an_exception_is_raised(): + records = [{"id": 1, "partition": "1"}, RuntimeError()] + stream_slice_to_partition = {"1": records} + logger = _init_logger() + + state = None + + source, concurrent_source = _init_sources([stream_slice_to_partition], state, logger) + config = {} + catalog = _create_configured_catalog(source._streams) + messages_from_abstract_source = _read_from_source(source, logger, config, catalog, state, AirbyteTracedException) + messages_from_concurrent_source = _read_from_source(concurrent_source, logger, config, catalog, state, AirbyteTracedException) + + _assert_status_messages(messages_from_abstract_source, messages_from_concurrent_source) + _assert_record_messages(messages_from_abstract_source, messages_from_concurrent_source) + _assert_errors(messages_from_abstract_source, messages_from_concurrent_source) + + +def _assert_status_messages(messages_from_abstract_source, messages_from_concurrent_source): + status_from_concurrent_source = [ + message + for message in messages_from_concurrent_source + if message.type == MessageType.TRACE and message.trace.type == TraceType.STREAM_STATUS + ] + + assert status_from_concurrent_source + _verify_messages( + [ + message + for message in messages_from_abstract_source + if message.type == MessageType.TRACE and message.trace.type == TraceType.STREAM_STATUS + ], + status_from_concurrent_source, + ) + + +def _assert_record_messages(messages_from_abstract_source, 
messages_from_concurrent_source): + records_from_concurrent_source = [message for message in messages_from_concurrent_source if message.type == MessageType.RECORD] + + assert records_from_concurrent_source + _verify_messages( + [message for message in messages_from_abstract_source if message.type == MessageType.RECORD], + records_from_concurrent_source, + ) + + +def _assert_errors(messages_from_abstract_source, messages_from_concurrent_source): + errors_from_concurrent_source = [ + message + for message in messages_from_concurrent_source + if message.type == MessageType.TRACE and message.trace.type == TraceType.ERROR + ] + errors_from_abstract_source = [ + message for message in messages_from_abstract_source if message.type == MessageType.TRACE and message.trace.type == TraceType.ERROR + ] + + assert errors_from_concurrent_source + # exceptions might differ between the two frameworks, hence we only assert the count + assert len(errors_from_concurrent_source) == len(errors_from_abstract_source) + + +def _init_logger(): + logger = Mock() + logger.level = logging.INFO + logger.isEnabledFor.return_value = False + return logger + + +def _init_sources(stream_slice_to_partitions, state, logger): + source = _init_source(stream_slice_to_partitions, state, logger, _MockSource()) + concurrent_source = _init_source(stream_slice_to_partitions, state, logger, _MockConcurrentSource(logger)) + return source, concurrent_source + + +def _init_source(stream_slice_to_partitions, state, logger, source): + streams = [ + StreamFacade.create_from_stream( + _MockStream(stream_slices, f"stream{i}"), + source, + logger, + state, + FinalStateCursor(stream_name=f"stream{i}", stream_namespace=None, message_repository=InMemoryMessageRepository()), + ) + for i, stream_slices in enumerate(stream_slice_to_partitions) + ] + source.set_streams(streams) + return source + + +def _create_configured_catalog(streams): + return ConfiguredAirbyteCatalog( + streams=[ + ConfiguredAirbyteStream( + stream=AirbyteStream(name=s.name, json_schema={}, supported_sync_modes=[SyncMode.full_refresh]), + sync_mode=SyncMode.full_refresh, + cursor_field=None, + destination_sync_mode=DestinationSyncMode.overwrite, + ) + for s in streams + ] + ) + + +def _read_from_source(source, logger, config, catalog, state, expected_exception): + messages = [] + try: + for m in source.read(logger, config, catalog, state): + messages.append(m) + except Exception as e: + if expected_exception: + assert isinstance(e, expected_exception) + return messages + + +def _verify_messages(expected_messages, messages_from_concurrent_source): + assert _compare(expected_messages, messages_from_concurrent_source) + + +def _compare(s, t): + # Use a compare method that does not require ordering or hashing the elements + # We can't rely on the ordering because of the multithreading + # AirbyteMessage does not implement __eq__ and __hash__ + t = list(t) + try: + for elem in s: + t.remove(elem) + except ValueError: + print(f"ValueError: {elem}") + return False + return not t diff --git a/airbyte-cdk/python/unit_tests/sources/utils/test_record_helper.py b/airbyte-cdk/python/unit_tests/sources/utils/test_record_helper.py new file mode 100644 index 000000000000..b5476180309b --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/utils/test_record_helper.py @@ -0,0 +1,85 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.models import ( + AirbyteLogMessage, + AirbyteMessage, + AirbyteRecordMessage, + AirbyteStateMessage, + AirbyteStateType, + AirbyteTraceMessage, + Level, + TraceType, +) +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message + +NOW = 1234567 +STREAM_NAME = "my_stream" + + +@pytest.mark.parametrize( + "test_name, data, expected_message", + [ + ( + "test_data_to_airbyte_record", + {"id": 0, "field_A": 1.0, "field_B": "airbyte"}, + AirbyteMessage( + type=MessageType.RECORD, + record=AirbyteRecordMessage(stream="my_stream", data={"id": 0, "field_A": 1.0, "field_B": "airbyte"}, emitted_at=NOW), + ), + ), + ], +) +def test_data_or_record_to_airbyte_record(test_name, data, expected_message): + transformer = MagicMock() + schema = {} + message = stream_data_to_airbyte_message(STREAM_NAME, data, transformer, schema) + message.record.emitted_at = NOW + + if isinstance(data, dict): + transformer.transform.assert_called_with(data, schema) + else: + assert not transformer.transform.called + assert message == expected_message + + +@pytest.mark.parametrize( + "test_name, data, expected_message", + [ + ( + "test_log_message_to_airbyte_record", + AirbyteLogMessage(level=Level.INFO, message="Hello, this is a log message"), + AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=Level.INFO, message="Hello, this is a log message")), + ), + ( + "test_trace_message_to_airbyte_record", + AirbyteTraceMessage(type=TraceType.ERROR, emitted_at=101), + AirbyteMessage(type=MessageType.TRACE, trace=AirbyteTraceMessage(type=TraceType.ERROR, emitted_at=101)), + ), + ], +) +def test_log_or_trace_to_message(test_name, data, expected_message): + transformer = MagicMock() + schema = {} + message = stream_data_to_airbyte_message(STREAM_NAME, data, transformer, schema) + + assert not transformer.transform.called + assert message == expected_message + + +@pytest.mark.parametrize( + "test_name, data", + [ + ("test_log_message_to_airbyte_record", AirbyteStateMessage(type=AirbyteStateType.STREAM)), + ], +) +def test_state_message_to_message(test_name, data): + transformer = MagicMock() + schema = {} + with pytest.raises(ValueError): + stream_data_to_airbyte_message(STREAM_NAME, data, transformer, schema) diff --git a/airbyte-cdk/python/unit_tests/sources/utils/test_schema_helpers.py b/airbyte-cdk/python/unit_tests/sources/utils/test_schema_helpers.py new file mode 100644 index 000000000000..76b7a9b1c772 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/utils/test_schema_helpers.py @@ -0,0 +1,206 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + + +import json +import logging +import os +import shutil +import sys +import traceback +from collections.abc import Mapping +from pathlib import Path + +import jsonref +import pytest +from airbyte_cdk.models import ConnectorSpecification, ConnectorSpecificationSerializer, FailureType +from airbyte_cdk.sources.utils.schema_helpers import InternalConfig, ResourceSchemaLoader, check_config_against_spec_or_exit +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from pytest import fixture +from pytest import raises as pytest_raises + +logger = logging.getLogger("airbyte") + + +MODULE = sys.modules[__name__] +MODULE_NAME = MODULE.__name__.split(".")[0] +SCHEMAS_ROOT = "/".join(os.path.abspath(MODULE.__file__).split("/")[:-1]) / Path("schemas") + + +@fixture(autouse=True, scope="session") +def create_and_teardown_schemas_dir(): + os.mkdir(SCHEMAS_ROOT) + os.mkdir(SCHEMAS_ROOT / "shared") + yield + shutil.rmtree(SCHEMAS_ROOT) + + +def create_schema(name: str, content: Mapping): + with open(SCHEMAS_ROOT / f"{name}.json", "w") as f: + f.write(json.dumps(content)) + + +@fixture +def spec_object() -> ConnectorSpecification: + spec = { + "connectionSpecification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": ["api_token"], + "additionalProperties": False, + "properties": { + "api_token": {"title": "API Token", "type": "string"}, + }, + }, + } + yield ConnectorSpecificationSerializer.load(spec) + + +def test_check_config_against_spec_or_exit_does_not_print_schema(capsys, spec_object): + config = {"super_secret_token": "really_a_secret"} + + with pytest_raises(AirbyteTracedException) as ex_info: + check_config_against_spec_or_exit(config, spec_object) + + exc = ex_info.value + traceback.print_exception(type(exc), exc, exc.__traceback__) + out, err = capsys.readouterr() + assert "really_a_secret" not in out + err + assert exc.failure_type == FailureType.config_error, "failure_type should be config_error" + + +def test_should_not_fail_validation_for_valid_config(spec_object): + config = {"api_token": "something"} + check_config_against_spec_or_exit(config, spec_object) + assert True, "should pass validation with valid config" + + +class TestResourceSchemaLoader: + # Test that a simple schema is loaded correctly + @staticmethod + def test_inline_schema_resolves(): + expected_schema = { + "type": ["null", "object"], + "properties": { + "str": {"type": "string"}, + "int": {"type": "integer"}, + "obj": { + "type": ["null", "object"], + "properties": {"k1": {"type": "string"}}, + }, + }, + } + + create_schema("simple_schema", expected_schema) + resolver = ResourceSchemaLoader(MODULE_NAME) + actual_schema = resolver.get_schema("simple_schema") + assert actual_schema == expected_schema + + @staticmethod + def test_shared_schemas_resolves(): + expected_schema = { + "type": ["null", "object"], + "properties": { + "str": {"type": "string"}, + "int": {"type": "integer"}, + "obj": { + "type": ["null", "object"], + "properties": {"k1": {"type": "string"}}, + }, + }, + } + + partial_schema = { + "type": ["null", "object"], + "properties": { + "str": {"type": "string"}, + "int": {"type": "integer"}, + "obj": {"$ref": "shared_schema.json"}, + }, + } + + referenced_schema = { + "type": ["null", "object"], + "properties": {"k1": {"type": "string"}}, + } + + create_schema("complex_schema", partial_schema) + create_schema("shared/shared_schema", referenced_schema) + + resolver = ResourceSchemaLoader(MODULE_NAME) + + actual_schema = 
resolver.get_schema("complex_schema") + assert actual_schema == expected_schema + + @staticmethod + def test_shared_schemas_resolves_nested(): + expected_schema = { + "type": ["null", "object"], + "properties": { + "str": {"type": "string"}, + "int": {"type": "integer"}, + "one_of": { + "oneOf": [ + {"type": "string"}, + { + "type": ["null", "object"], + "properties": {"k1": {"type": "string"}}, + }, + ] + }, + "obj": { + "type": ["null", "object"], + "properties": {"k1": {"type": "string"}}, + }, + }, + } + partial_schema = { + "type": ["null", "object"], + "properties": { + "str": {"type": "string"}, + "int": {"type": "integer"}, + "one_of": { + "oneOf": [ + {"type": "string"}, + {"$ref": "shared_schema.json#/definitions/type_one"}, + ] + }, + "obj": {"$ref": "shared_schema.json#/definitions/type_one"}, + }, + } + + referenced_schema = { + "definitions": { + "type_one": {"$ref": "shared_schema.json#/definitions/type_nested"}, + "type_nested": { + "type": ["null", "object"], + "properties": {"k1": {"type": "string"}}, + }, + } + } + + create_schema("complex_schema", partial_schema) + create_schema("shared/shared_schema", referenced_schema) + + resolver = ResourceSchemaLoader(MODULE_NAME) + + actual_schema = resolver.get_schema("complex_schema") + assert actual_schema == expected_schema + # Make sure generated schema is JSON serializable + assert json.dumps(actual_schema) + assert jsonref.JsonRef.replace_refs(actual_schema) + + +@pytest.mark.parametrize( + "limit, record_count, expected", + [ + pytest.param(None, sys.maxsize, False, id="test_no_limit"), + pytest.param(1, 1, True, id="test_record_count_is_exactly_the_limit"), + pytest.param(1, 2, True, id="test_record_count_is_more_than_the_limit"), + pytest.param(1, 0, False, id="test_record_count_is_less_than_the_limit"), + ], +) +def test_internal_config(limit, record_count, expected): + config = InternalConfig(_limit=limit) + assert config.is_limit_reached(record_count) == expected diff --git a/airbyte-cdk/python/unit_tests/sources/utils/test_slice_logger.py b/airbyte-cdk/python/unit_tests/sources/utils/test_slice_logger.py new file mode 100644 index 000000000000..0796e8769179 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/utils/test_slice_logger.py @@ -0,0 +1,46 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import logging + +import pytest +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.utils.slice_logger import AlwaysLogSliceLogger, DebugSliceLogger + + +@pytest.mark.parametrize( + "slice_logger, level, should_log", + [ + pytest.param(DebugSliceLogger(), logging.DEBUG, True, id="debug_logger_should_log_if_level_is_debug"), + pytest.param(DebugSliceLogger(), logging.INFO, False, id="debug_logger_should_not_log_if_level_is_info"), + pytest.param(DebugSliceLogger(), logging.WARN, False, id="debug_logger_should_not_log_if_level_is_warn"), + pytest.param(DebugSliceLogger(), logging.WARNING, False, id="debug_logger_should_not_log_if_level_is_warning"), + pytest.param(DebugSliceLogger(), logging.ERROR, False, id="debug_logger_should_not_log_if_level_is_error"), + pytest.param(DebugSliceLogger(), logging.CRITICAL, False, id="always_log_logger_should_not_log_if_level_is_critical"), + pytest.param(AlwaysLogSliceLogger(), logging.DEBUG, True, id="always_log_logger_should_log_if_level_is_debug"), + pytest.param(AlwaysLogSliceLogger(), logging.INFO, True, id="always_log_logger_should_log_if_level_is_info"), + pytest.param(AlwaysLogSliceLogger(), logging.WARN, True, id="always_log_logger_should_log_if_level_is_warn"), + pytest.param(AlwaysLogSliceLogger(), logging.WARNING, True, id="always_log_logger_should_log_if_level_is_warning"), + pytest.param(AlwaysLogSliceLogger(), logging.ERROR, True, id="always_log_logger_should_log_if_level_is_error"), + pytest.param(AlwaysLogSliceLogger(), logging.CRITICAL, True, id="always_log_logger_should_log_if_level_is_critical"), + ], +) +def test_should_log_slice_message(slice_logger, level, should_log): + logger = logging.Logger(name="name", level=level) + assert slice_logger.should_log_slice_message(logger) == should_log + + +@pytest.mark.parametrize( + "_slice, expected_message", + [ + pytest.param(None, "slice:null", id="test_none_slice"), + pytest.param({}, "slice:{}", id="test_empty_slice"), + pytest.param({"key": "value"}, 'slice:{"key": "value"}', id="test_dict"), + ], +) +def test_create_slice_log_message(_slice, expected_message): + expected_log_message = AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=Level.INFO, message=expected_message)) + log_message = DebugSliceLogger().create_slice_log_message(_slice) + assert log_message == expected_log_message diff --git a/airbyte-cdk/python/unit_tests/sources/utils/test_transform.py b/airbyte-cdk/python/unit_tests/sources/utils/test_transform.py new file mode 100644 index 000000000000..9b3f73989926 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/utils/test_transform.py @@ -0,0 +1,266 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
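The transform test cases that follow are dense, so here is the core idea in isolation: `TypeTransformer` with `DefaultSchemaNormalization` coerces record values in place toward the declared JSON-schema types, and logs a warning instead of failing when a value cannot be coerced. The schema and record below are illustrative:

```python
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer

schema = {"type": "object", "properties": {"value": {"type": "string"}}}
record = {"value": 12}

# transform() mutates the record in place so that it matches the schema where possible.
TypeTransformer(TransformConfig.DefaultSchemaNormalization).transform(record, schema)
assert record == {"value": "12"}
```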
+# + + import json + + import pytest + from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer + + SIMPLE_SCHEMA = {"type": "object", "properties": {"value": {"type": "string"}}} + COMPLEX_SCHEMA = { + "type": "object", + "properties": { + "value": {"type": "boolean", "format": "even", "is_positive": True}, + "prop": {"type": "string"}, + "prop_with_null": {"type": ["string", "null"]}, + "number_prop": {"type": "number"}, + "int_prop": {"type": ["integer", "null"]}, + "too_many_types": {"type": ["boolean", "null", "string"]}, + "def": { + "type": "object", + "properties": {"dd": {"$ref": "#/definitions/my_type"}}, + }, + "array": {"type": "array", "items": {"$ref": "#/definitions/str_type"}}, + "nested": {"$ref": "#/definitions/nested_type"}, + "list_of_lists": { + "type": "array", + "items": {"type": "array", "items": {"type": "string"}}, + }, + }, + "definitions": { + "str_type": {"type": "string"}, + "nested_type": {"type": "object", "properties": {"a": {"type": "string"}}}, + }, +} +VERY_NESTED_SCHEMA = { + "type": ["null", "object"], + "properties": { + "very_nested_value": { + "type": ["null", "object"], + "properties": { + "very_nested_value": { + "type": ["null", "object"], + "properties": { + "very_nested_value": { + "type": ["null", "object"], + "properties": { + "very_nested_value": { + "type": ["null", "object"], + "properties": {"very_nested_value": {"type": ["null", "number"]}}, + } + }, + } + }, + } + }, + } + }, +} + + +@pytest.mark.parametrize( + "schema, actual, expected, expected_warns", + [ + (SIMPLE_SCHEMA, {"value": 12}, {"value": "12"}, None), + (SIMPLE_SCHEMA, {"value": 12}, {"value": "12"}, None), + (SIMPLE_SCHEMA, {"value": 12, "unexpected_value": "unexpected"}, {"value": "12", "unexpected_value": "unexpected"}, None), + (COMPLEX_SCHEMA, {"value": 1, "array": ["111", 111, {1: 111}]}, {"value": True, "array": ["111", "111", "{1: 111}"]}, None), + ( + COMPLEX_SCHEMA, + {"value": 1, "list_of_lists": [["111"], [111], [11], [{1: 1}]]}, + {"value": True, "list_of_lists": [["111"], ["111"], ["11"], ["{1: 1}"]]}, + None, + ), + (COMPLEX_SCHEMA, {"value": 1, "nested": {"a": [1, 2, 3]}}, {"value": True, "nested": {"a": "[1, 2, 3]"}}, None), + (COMPLEX_SCHEMA, {"value": "false", "nested": {"a": [1, 2, 3]}}, {"value": False, "nested": {"a": "[1, 2, 3]"}}, None), + (COMPLEX_SCHEMA, {}, {}, None), + (COMPLEX_SCHEMA, {"int_prop": "12"}, {"int_prop": 12}, None), + # Skip the invalidly formatted field and process other fields.
+ ( + COMPLEX_SCHEMA, + {"prop": 12, "number_prop": "aa12", "array": [12]}, + {"prop": "12", "number_prop": "aa12", "array": ["12"]}, + "Failed to transform value 'aa12' of type 'string' to 'number', key path: '.number_prop'", + ), + # Field too_many_types has an ambiguous type, skip formatting + ( + COMPLEX_SCHEMA, + {"prop": 12, "too_many_types": 1212, "array": [12]}, + {"prop": "12", "too_many_types": 1212, "array": ["12"]}, + "Failed to transform value 1212 of type 'integer' to '['boolean', 'null', 'string']', key path: '.too_many_types'", + ), + # Test null field + (COMPLEX_SCHEMA, {"prop": None, "array": [12]}, {"prop": "None", "array": ["12"]}, None), + # If field can be null do not convert + (COMPLEX_SCHEMA, {"prop_with_null": None, "array": [12]}, {"prop_with_null": None, "array": ["12"]}, None), + ( + VERY_NESTED_SCHEMA, + {"very_nested_value": {"very_nested_value": {"very_nested_value": {"very_nested_value": {"very_nested_value": "2"}}}}}, + {"very_nested_value": {"very_nested_value": {"very_nested_value": {"very_nested_value": {"very_nested_value": 2.0}}}}}, + None, + ), + (VERY_NESTED_SCHEMA, {"very_nested_value": {"very_nested_value": None}}, {"very_nested_value": {"very_nested_value": None}}, None), + # Object without properties + ({"type": "object"}, {"value": 12}, {"value": 12}, None), + ( + # Array without items + {"type": "object", "properties": {"value": {"type": "array"}}}, + {"value": [12]}, + {"value": [12]}, + None, + ), + ( + # Array without items and value is not an array + {"type": "object", "properties": {"value": {"type": "array"}}}, + {"value": "12"}, + {"value": ["12"]}, + None, + ), + ( + {"type": "object", "properties": {"value": {"type": "array"}}}, + {"value": 12}, + {"value": [12]}, + None, + ), + ( + {"type": "object", "properties": {"value": {"type": "array"}}}, + {"value": None}, + {"value": [None]}, + None, + ), + ( + {"type": "object", "properties": {"value": {"type": ["null", "array"]}}}, + {"value": None}, + {"value": None}, + None, + ), + ( + {"type": "object", "properties": {"value": {"type": "array", "items": {"type": ["string"]}}}}, + {"value": 10}, + {"value": ["10"]}, + None, + ), + ( + {"type": "object", "properties": {"value": {"type": "array", "items": {"type": ["object"]}}}}, + {"value": "string"}, + {"value": "string"}, + "Failed to transform value 'string' of type 'string' to 'array', key path: '.value'", + ), + ( + {"type": "object", "properties": {"value": {"type": "array", "items": {"type": ["string"]}}}}, + {"value": {"key": "value"}}, + {"value": {"key": "value"}}, + "Failed to transform value {'key': 'value'} of type 'object' to 'array', key path: '.value'", + ), + ( + # Schema root object is not an object, no conversion should happen + {"type": "integer"}, + {"value": "12"}, + {"value": "12"}, + "Failed to transform value {'value': '12'} of type 'object' to 'integer', key path: '.'", + ), + ( + # More than one type except null, no conversion should happen + {"type": "object", "properties": {"value": {"type": ["string", "boolean", "null"]}}}, + {"value": 12}, + {"value": 12}, + "Failed to transform value 12 of type 'integer' to '['string', 'boolean', 'null']', key path: '.value'", + ), + ( + # oneOf is not supported, no conversion for one_of_value should happen + {"type": "object", "properties": {"one_of_value": {"oneOf": ["string", "boolean", "null"]}, "value_2": {"type": "string"}}}, + {"one_of_value": 12, "value_2": 12}, + {"one_of_value": 12, "value_2": "12"}, + None, + ), + ( + # Case for #7076 issue (Facebook marketing: print
tons of WARN message) + { + "properties": { + "cpc": {"type": ["null", "number"]}, + }, + }, + {"cpc": "6.6666"}, + {"cpc": 6.6666}, + None, + ), + ( + {"type": "object", "properties": {"value": {"type": "array", "items": {"type": "string"}}}}, + {"value": {"key": "value"}}, + {"value": {"key": "value"}}, + "Failed to transform value {'key': 'value'} of type 'object' to 'array', key path: '.value'", + ), + ( + {"type": "object", "properties": {"value1": {"type": "object", "properties": {"value2": {"type": "string"}}}}}, + {"value1": "value2"}, + {"value1": "value2"}, + "Failed to transform value 'value2' of type 'string' to 'object', key path: '.value1'", + ), + ( + {"type": "object", "properties": {"value": {"type": "array", "items": {"type": "object"}}}}, + {"value": ["one", "two"]}, + {"value": ["one", "two"]}, + "Failed to transform value 'one' of type 'string' to 'object', key path: '.value.0'", + ), + ], +) +def test_transform(schema, actual, expected, expected_warns, caplog): + t = TypeTransformer(TransformConfig.DefaultSchemaNormalization) + t.transform(actual, schema) + assert json.dumps(actual) == json.dumps(expected) + if expected_warns: + record = caplog.records[0] + assert record.name == "airbyte" + assert record.levelname == "WARNING" + assert record.message == expected_warns + else: + assert len(caplog.records) == 0 + + +def test_transform_wrong_config(): + with pytest.raises(Exception, match="NoTransform option cannot be combined with other flags."): + TypeTransformer(TransformConfig.NoTransform | TransformConfig.DefaultSchemaNormalization) + + with pytest.raises(Exception, match="Please set TransformConfig.CustomSchemaNormalization config before registering custom normalizer"): + + class NotAStream: + transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization) + + @transformer.registerCustomTransform + def transform_cb(instance, schema): + pass + + +def test_custom_transform(): + class NotAStream: + transformer = TypeTransformer(TransformConfig.CustomSchemaNormalization) + + @transformer.registerCustomTransform + def transform_cb(instance, schema): + # Check no default conversion applied + assert instance == 12 + assert schema == SIMPLE_SCHEMA["properties"]["value"] + return "transformed" + + s = NotAStream() + obj = {"value": 12} + s.transformer.transform(obj, SIMPLE_SCHEMA) + assert obj == {"value": "transformed"} + + +def test_custom_transform_with_default_normalization(): + class NotAStream: + transformer = TypeTransformer(TransformConfig.CustomSchemaNormalization | TransformConfig.DefaultSchemaNormalization) + + @transformer.registerCustomTransform + def transform_cb(instance, schema): + # Check default conversion applied + assert instance == "12" + assert schema == SIMPLE_SCHEMA["properties"]["value"] + return "transformed" + + s = NotAStream() + obj = {"value": 12} + s.transformer.transform(obj, SIMPLE_SCHEMA) + assert obj == {"value": "transformed"} diff --git a/airbyte-cdk/python/unit_tests/test/__init__.py b/airbyte-cdk/python/unit_tests/test/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/test/mock_http/__init__.py b/airbyte-cdk/python/unit_tests/test/mock_http/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/test/mock_http/test_matcher.py b/airbyte-cdk/python/unit_tests/test/mock_http/test_matcher.py new file mode 100644 index 000000000000..61a9ecfec2f9 --- /dev/null +++ 
b/airbyte-cdk/python/unit_tests/test/mock_http/test_matcher.py @@ -0,0 +1,53 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + +from unittest import TestCase +from unittest.mock import Mock + +from airbyte_cdk.test.mock_http.matcher import HttpRequestMatcher +from airbyte_cdk.test.mock_http.request import HttpRequest + + +class HttpRequestMatcherTest(TestCase): + def setUp(self) -> None: + self._a_request = Mock(spec=HttpRequest) + self._another_request = Mock(spec=HttpRequest) + self._request_to_match = Mock(spec=HttpRequest) + self._matcher = HttpRequestMatcher(self._request_to_match, 1) + + def test_given_request_matches_when_matches_then_has_expected_match_count(self): + self._a_request.matches.return_value = True + self._matcher.matches(self._a_request) + assert self._matcher.has_expected_match_count() + + def test_given_request_does_not_match_when_matches_then_does_not_have_expected_match_count(self): + self._a_request.matches.return_value = False + self._matcher.matches(self._a_request) + + assert not self._matcher.has_expected_match_count() + assert self._matcher.actual_number_of_matches == 0 + + def test_given_many_requests_with_some_match_when_matches_then_has_expected_match_count(self): + self._a_request.matches.return_value = True + self._another_request.matches.return_value = False + self._matcher.matches(self._a_request) + self._matcher.matches(self._another_request) + + assert self._matcher.has_expected_match_count() + assert self._matcher.actual_number_of_matches == 1 + + def test_given_expected_number_of_requests_met_when_matches_then_has_expected_match_count(self): + _matcher = HttpRequestMatcher(self._request_to_match, 2) + self._a_request.matches.return_value = True + _matcher.matches(self._a_request) + _matcher.matches(self._a_request) + + assert _matcher.has_expected_match_count() + assert _matcher.actual_number_of_matches == 2 + + def test_given_expected_number_of_requests_not_met_when_matches_then_does_not_have_expected_match_count(self): + _matcher = HttpRequestMatcher(self._request_to_match, 2) + self._a_request.matches.side_effect = [True, False] + _matcher.matches(self._a_request) + _matcher.matches(self._a_request) + + assert not _matcher.has_expected_match_count() diff --git a/airbyte-cdk/python/unit_tests/test/mock_http/test_mocker.py b/airbyte-cdk/python/unit_tests/test/mock_http/test_mocker.py new file mode 100644 index 000000000000..2b086a1748c6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/test/mock_http/test_mocker.py @@ -0,0 +1,259 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
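The `HttpMocker` tests that follow are extensive; as a quick orientation, a minimal usage sketch (URL, query parameter and response body are illustrative) looks like this:

```python
import requests

from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest, HttpResponse


@HttpMocker()
def call_api(http_mocker: HttpMocker) -> str:
    # Register the expected request and its canned response before issuing the call;
    # registrations that are never matched make the decorator fail the test.
    http_mocker.get(
        HttpRequest("http://test.com/", {"page": "1"}),
        HttpResponse('{"records": []}', 200),
    )
    return requests.get("http://test.com/", params={"page": "1"}).text


assert call_api() == '{"records": []}'
```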
+ +from unittest import TestCase + +import pytest +import requests +from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest, HttpResponse + +# Ensure that the scheme is HTTP as requests only partially supports other schemes +# see https://github.com/psf/requests/blob/0b4d494192de489701d3a2e32acef8fb5d3f042e/src/requests/models.py#L424-L429 +_A_URL = "http://test.com/" +_ANOTHER_URL = "http://another-test.com/" +_A_RESPONSE_BODY = "a body" +_ANOTHER_RESPONSE_BODY = "another body" +_A_RESPONSE = HttpResponse("any response") +_SOME_QUERY_PARAMS = {"q1": "query value"} +_SOME_HEADERS = {"h1": "header value"} +_OTHER_HEADERS = {"h2": "another header value"} +_SOME_REQUEST_BODY_MAPPING = {"first_field": "first_value", "second_field": 2} +_SOME_REQUEST_BODY_STR = "some_request_body" + + +class HttpMockerTest(TestCase): + @HttpMocker() + def test_given_get_request_match_when_decorate_then_return_response(self, http_mocker): + http_mocker.get( + HttpRequest(_A_URL, _SOME_QUERY_PARAMS, _SOME_HEADERS), + HttpResponse(_A_RESPONSE_BODY, 474, _OTHER_HEADERS), + ) + + response = requests.get(_A_URL, params=_SOME_QUERY_PARAMS, headers=_SOME_HEADERS) + + assert response.text == _A_RESPONSE_BODY + assert response.status_code == 474 + assert response.headers == _OTHER_HEADERS + + @HttpMocker() + def test_given_delete_request_match_when_decorate_then_return_response(self, http_mocker): + http_mocker.delete( + HttpRequest(_A_URL, headers=_SOME_HEADERS), + HttpResponse(_A_RESPONSE_BODY, 204, _OTHER_HEADERS), + ) + + response = requests.delete(_A_URL, headers=_SOME_HEADERS) + + assert response.text == _A_RESPONSE_BODY + assert response.status_code == 204 + assert response.headers == _OTHER_HEADERS + + @HttpMocker() + def test_given_loose_headers_matching_when_decorate_then_match(self, http_mocker): + http_mocker.get( + HttpRequest(_A_URL, _SOME_QUERY_PARAMS, _SOME_HEADERS), + HttpResponse(_A_RESPONSE_BODY, 474), + ) + + requests.get(_A_URL, params=_SOME_QUERY_PARAMS, headers=_SOME_HEADERS | {"more strict query param key": "any value"}) + + @HttpMocker() + def test_given_post_request_match_when_decorate_then_return_response(self, http_mocker): + http_mocker.post( + HttpRequest(_A_URL, _SOME_QUERY_PARAMS, _SOME_HEADERS, _SOME_REQUEST_BODY_STR), + HttpResponse(_A_RESPONSE_BODY, 474), + ) + + response = requests.post(_A_URL, params=_SOME_QUERY_PARAMS, headers=_SOME_HEADERS, data=_SOME_REQUEST_BODY_STR) + + assert response.text == _A_RESPONSE_BODY + assert response.status_code == 474 + + @HttpMocker() + def test_given_multiple_responses_when_decorate_get_request_then_return_response(self, http_mocker): + http_mocker.get( + HttpRequest(_A_URL, _SOME_QUERY_PARAMS, _SOME_HEADERS), + [HttpResponse(_A_RESPONSE_BODY, 1), HttpResponse(_ANOTHER_RESPONSE_BODY, 2)], + ) + + first_response = requests.get(_A_URL, params=_SOME_QUERY_PARAMS, headers=_SOME_HEADERS) + second_response = requests.get(_A_URL, params=_SOME_QUERY_PARAMS, headers=_SOME_HEADERS) + + assert first_response.text == _A_RESPONSE_BODY + assert first_response.status_code == 1 + assert second_response.text == _ANOTHER_RESPONSE_BODY + assert second_response.status_code == 2 + + @HttpMocker() + def test_given_multiple_responses_when_decorate_delete_request_then_return_response(self, http_mocker): + http_mocker.delete( + HttpRequest(_A_URL, headers=_SOME_HEADERS), + [HttpResponse(_A_RESPONSE_BODY, 1), HttpResponse(_ANOTHER_RESPONSE_BODY, 2)], + ) + + first_response = requests.delete(_A_URL, headers=_SOME_HEADERS) + second_response = requests.delete(_A_URL, 
headers=_SOME_HEADERS) + + assert first_response.text == _A_RESPONSE_BODY + assert first_response.status_code == 1 + assert second_response.text == _ANOTHER_RESPONSE_BODY + assert second_response.status_code == 2 + + @HttpMocker() + def test_given_multiple_responses_when_decorate_post_request_then_return_response(self, http_mocker): + http_mocker.post( + HttpRequest(_A_URL, _SOME_QUERY_PARAMS, _SOME_HEADERS, _SOME_REQUEST_BODY_STR), + [HttpResponse(_A_RESPONSE_BODY, 1), HttpResponse(_ANOTHER_RESPONSE_BODY, 2)], + ) + + first_response = requests.post(_A_URL, params=_SOME_QUERY_PARAMS, headers=_SOME_HEADERS, data=_SOME_REQUEST_BODY_STR) + second_response = requests.post(_A_URL, params=_SOME_QUERY_PARAMS, headers=_SOME_HEADERS, data=_SOME_REQUEST_BODY_STR) + + assert first_response.text == _A_RESPONSE_BODY + assert first_response.status_code == 1 + assert second_response.text == _ANOTHER_RESPONSE_BODY + assert second_response.status_code == 2 + + @HttpMocker() + def test_given_more_requests_than_responses_when_decorate_then_raise_error(self, http_mocker): + http_mocker.get( + HttpRequest(_A_URL, _SOME_QUERY_PARAMS, _SOME_HEADERS), + [HttpResponse(_A_RESPONSE_BODY, 1), HttpResponse(_ANOTHER_RESPONSE_BODY, 2)], + ) + + last_response = [requests.get(_A_URL, params=_SOME_QUERY_PARAMS, headers=_SOME_HEADERS) for _ in range(10)][-1] + + assert last_response.text == _ANOTHER_RESPONSE_BODY + assert last_response.status_code == 2 + + @HttpMocker() + def test_given_all_requests_match_when_decorate_then_do_not_raise(self, http_mocker): + http_mocker.get( + HttpRequest(_A_URL, _SOME_QUERY_PARAMS, _SOME_HEADERS), + _A_RESPONSE, + ) + requests.get(_A_URL, params=_SOME_QUERY_PARAMS, headers=_SOME_HEADERS) + + def test_given_missing_requests_when_decorate_then_raise(self): + @HttpMocker() + def decorated_function(http_mocker): + http_mocker.get( + HttpRequest(_A_URL), + _A_RESPONSE, + ) + + with pytest.raises(ValueError) as exc_info: + decorated_function() + assert "Invalid number of matches" in str(exc_info.value) + + def test_given_assertion_error_when_decorate_then_raise_assertion_error(self): + @HttpMocker() + def decorated_function(http_mocker): + http_mocker.get( + HttpRequest(_A_URL), + _A_RESPONSE, + ) + requests.get(_A_URL) + assert False + + with pytest.raises(AssertionError): + decorated_function() + + def test_given_assertion_error_but_missing_request_when_decorate_then_raise_missing_http_request(self): + @HttpMocker() + def decorated_function(http_mocker): + http_mocker.get( + HttpRequest(_A_URL), + _A_RESPONSE, + ) + assert False + + with pytest.raises(ValueError) as exc_info: + decorated_function() + assert "Invalid number of matches" in str(exc_info.value) + + def test_given_request_does_not_match_when_decorate_then_raise(self): + @HttpMocker() + def decorated_function(http_mocker): + http_mocker.get( + HttpRequest(_A_URL), + _A_RESPONSE, + ) + requests.get(_ANOTHER_URL, params=_SOME_QUERY_PARAMS, headers=_SOME_HEADERS) + + with pytest.raises(ValueError) as exc_info: + decorated_function() + assert "No matcher matches" in str(exc_info.value) + + def test_given_request_matches_multiple_matchers_when_decorate_then_match_first_one(self): + less_granular_headers = {"less_granular": "1"} + more_granular_headers = {"more_granular": "2"} | less_granular_headers + + @HttpMocker() + def decorated_function(http_mocker): + http_mocker.get( + HttpRequest(_A_URL, headers=more_granular_headers), + _A_RESPONSE, + ) + http_mocker.get( + HttpRequest(_A_URL, headers=less_granular_headers), + _A_RESPONSE, + ) + 
requests.get(_A_URL, headers=more_granular_headers) + + with pytest.raises(ValueError) as exc_info: + decorated_function() + assert "more_granular" in str(exc_info.value) # the matcher corresponding to the first `http_mocker.get` is not matched + + def test_given_exact_number_of_call_provided_when_assert_number_of_calls_then_do_not_raise(self): + @HttpMocker() + def decorated_function(http_mocker): + request = HttpRequest(_A_URL) + http_mocker.get(request, _A_RESPONSE) + + requests.get(_A_URL) + requests.get(_A_URL) + + http_mocker.assert_number_of_calls(request, 2) + + decorated_function() + # then do not raise + + def test_given_invalid_number_of_call_provided_when_assert_number_of_calls_then_raise(self): + @HttpMocker() + def decorated_function(http_mocker): + request = HttpRequest(_A_URL) + http_mocker.get(request, _A_RESPONSE) + + requests.get(_A_URL) + requests.get(_A_URL) + + http_mocker.assert_number_of_calls(request, 1) + + with pytest.raises(AssertionError): + decorated_function() + + def test_given_unknown_request_when_assert_number_of_calls_then_raise(self): + @HttpMocker() + def decorated_function(http_mocker): + http_mocker.get(HttpRequest(_A_URL), _A_RESPONSE) + http_mocker.assert_number_of_calls(HttpRequest(_ANOTHER_URL), 1) + + with pytest.raises(ValueError): + decorated_function() + + def test_given_request_already_mocked_when_decorate_then_raise(self): + with HttpMocker() as http_mocker: + a_request = HttpRequest(_A_URL, _SOME_QUERY_PARAMS, _SOME_HEADERS) + http_mocker.get(a_request, _A_RESPONSE) + + with pytest.raises(ValueError): + http_mocker.get(a_request, _A_RESPONSE) diff --git a/airbyte-cdk/python/unit_tests/test/mock_http/test_request.py b/airbyte-cdk/python/unit_tests/test/mock_http/test_request.py new file mode 100644 index 000000000000..a5a94ea05580 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/test/mock_http/test_request.py @@ -0,0 +1,117 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
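The tests that follow pin down `HttpRequest.matches`. In short: headers are matched as a subset, query parameters must match (or be wildcarded with `ANY_QUERY_PARAMS`), and bodies given as mapping, `str` or `bytes` are compared by their JSON content when possible. A small sketch with illustrative values:

```python
from airbyte_cdk.test.mock_http.request import ANY_QUERY_PARAMS, HttpRequest

template = HttpRequest("mock://test.com/path", {"a_query_param": "q1"}, {"first_header": "h1"})

# Extra headers on the actual request do not prevent a match.
actual = HttpRequest("mock://test.com/path", {"a_query_param": "q1"}, {"first_header": "h1", "second_header": "h2"})
assert actual.matches(template)

# ANY_QUERY_PARAMS acts as a wildcard for the query string.
assert HttpRequest("mock://test.com/path", ANY_QUERY_PARAMS).matches(template)
```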
+ +from unittest import TestCase + +import pytest +from airbyte_cdk.test.mock_http.request import ANY_QUERY_PARAMS, HttpRequest + + +class HttpRequestMatcherTest(TestCase): + def test_given_query_params_as_dict_and_string_then_query_params_are_properly_considered(self): + with_string = HttpRequest("mock://test.com/path", query_params="a_query_param=q1&a_list_param=first&a_list_param=second") + with_dict = HttpRequest("mock://test.com/path", query_params={"a_query_param": "q1", "a_list_param": ["first", "second"]}) + assert with_string.matches(with_dict) and with_dict.matches(with_string) + + def test_given_query_params_in_url_and_also_provided_then_raise_error(self): + with pytest.raises(ValueError): + HttpRequest("mock://test.com/path?a_query_param=1", query_params={"another_query_param": "2"}) + + def test_given_same_url_query_params_and_subset_headers_when_matches_then_return_true(self): + request_to_match = HttpRequest("mock://test.com/path", {"a_query_param": "q1"}, {"first_header": "h1"}) + actual_request = HttpRequest("mock://test.com/path", {"a_query_param": "q1"}, {"first_header": "h1", "second_header": "h2"}) + assert actual_request.matches(request_to_match) + + def test_given_url_differs_when_matches_then_return_false(self): + assert not HttpRequest("mock://test.com/another_path").matches(HttpRequest("mock://test.com/path")) + + def test_given_query_params_differs_when_matches_then_return_false(self): + request_to_match = HttpRequest("mock://test.com/path", {"a_query_param": "q1"}) + actual_request = HttpRequest("mock://test.com/path", {"another_query_param": "q2"}) + assert not actual_request.matches(request_to_match) + + def test_given_query_params_is_subset_differs_when_matches_then_return_false(self): + request_to_match = HttpRequest("mock://test.com/path", {"a_query_param": "q1"}) + actual_request = HttpRequest("mock://test.com/path", {"a_query_param": "q1", "another_query_param": "q2"}) + assert not actual_request.matches(request_to_match) + + def test_given_headers_is_subset_differs_when_matches_then_return_true(self): + request_to_match = HttpRequest("mock://test.com/path", headers={"first_header": "h1"}) + actual_request = HttpRequest("mock://test.com/path", headers={"first_header": "h1", "second_header": "h2"}) + assert actual_request.matches(request_to_match) + + def test_given_headers_value_does_not_match_differs_when_matches_then_return_false(self): + request_to_match = HttpRequest("mock://test.com/path", headers={"first_header": "h1"}) + actual_request = HttpRequest("mock://test.com/path", headers={"first_header": "value does not match"}) + assert not actual_request.matches(request_to_match) + + def test_given_same_body_mappings_value_when_matches_then_return_true(self): + request_to_match = HttpRequest("mock://test.com/path", body={"first_field": "first_value", "second_field": 2}) + actual_request = HttpRequest("mock://test.com/path", body={"first_field": "first_value", "second_field": 2}) + assert actual_request.matches(request_to_match) + + def test_given_bodies_are_mapping_and_differs_when_matches_then_return_false(self): + request_to_match = HttpRequest("mock://test.com/path", body={"first_field": "first_value"}) + actual_request = HttpRequest("mock://test.com/path", body={"first_field": "value does not match"}) + assert not actual_request.matches(request_to_match) + + def test_given_same_mapping_and_bytes_when_matches_then_return_true(self): + request_to_match = HttpRequest("mock://test.com/path", body={"first_field": "first_value"}) + actual_request = 
HttpRequest("mock://test.com/path", body=b'{"first_field": "first_value"}') + assert actual_request.matches(request_to_match) + + def test_given_different_mapping_and_bytes_when_matches_then_return_false(self): + request_to_match = HttpRequest("mock://test.com/path", body={"first_field": "first_value"}) + actual_request = HttpRequest("mock://test.com/path", body=b'{"first_field": "another value"}') + assert not actual_request.matches(request_to_match) + + def test_given_same_mapping_and_str_when_matches_then_return_true(self): + request_to_match = HttpRequest("mock://test.com/path", body={"first_field": "first_value"}) + actual_request = HttpRequest("mock://test.com/path", body='{"first_field": "first_value"}') + assert actual_request.matches(request_to_match) + + def test_given_different_mapping_and_str_when_matches_then_return_false(self): + request_to_match = HttpRequest("mock://test.com/path", body={"first_field": "first_value"}) + actual_request = HttpRequest("mock://test.com/path", body='{"first_field": "another value"}') + assert not actual_request.matches(request_to_match) + + def test_given_same_bytes_and_mapping_when_matches_then_return_true(self): + request_to_match = HttpRequest("mock://test.com/path", body=b'{"first_field": "first_value"}') + actual_request = HttpRequest("mock://test.com/path", body={"first_field": "first_value"}) + assert actual_request.matches(request_to_match) + + def test_given_different_bytes_and_mapping_when_matches_then_return_false(self): + request_to_match = HttpRequest("mock://test.com/path", body=b'{"first_field": "first_value"}') + actual_request = HttpRequest("mock://test.com/path", body={"first_field": "another value"}) + assert not actual_request.matches(request_to_match) + + def test_given_same_str_and_mapping_when_matches_then_return_true(self): + request_to_match = HttpRequest("mock://test.com/path", body='{"first_field": "first_value"}') + actual_request = HttpRequest("mock://test.com/path", body={"first_field": "first_value"}) + assert actual_request.matches(request_to_match) + + def test_given_different_str_and_mapping_when_matches_then_return_false(self): + request_to_match = HttpRequest("mock://test.com/path", body='{"first_field": "first_value"}') + actual_request = HttpRequest("mock://test.com/path", body={"first_field": "another value"}) + assert not actual_request.matches(request_to_match) + + def test_given_same_body_str_value_when_matches_then_return_true(self): + request_to_match = HttpRequest("mock://test.com/path", body="some_request_body") + actual_request = HttpRequest("mock://test.com/path", body="some_request_body") + assert actual_request.matches(request_to_match) + + def test_given_body_str_value_differs_when_matches_then_return_false(self): + request_to_match = HttpRequest("mock://test.com/path", body="some_request_body") + actual_request = HttpRequest("mock://test.com/path", body="another_request_body") + assert not actual_request.matches(request_to_match) + + def test_given_any_matcher_for_query_param_when_matches_then_return_true(self): + request_to_match = HttpRequest("mock://test.com/path", {"a_query_param": "q1"}) + actual_request = HttpRequest("mock://test.com/path", ANY_QUERY_PARAMS) + + assert actual_request.matches(request_to_match) + assert request_to_match.matches(actual_request) + + def test_given_any_matcher_for_both_when_matches_then_return_true(self): + request_to_match = HttpRequest("mock://test.com/path", ANY_QUERY_PARAMS) + actual_request = HttpRequest("mock://test.com/path", ANY_QUERY_PARAMS) + assert 
actual_request.matches(request_to_match) diff --git a/airbyte-cdk/python/unit_tests/test/mock_http/test_response_builder.py b/airbyte-cdk/python/unit_tests/test/mock_http/test_response_builder.py new file mode 100644 index 000000000000..c8ccdc41b9bf --- /dev/null +++ b/airbyte-cdk/python/unit_tests/test/mock_http/test_response_builder.py @@ -0,0 +1,175 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +import json +from copy import deepcopy +from pathlib import Path as FilePath +from typing import Any, Dict, Optional, Union +from unittest import TestCase +from unittest.mock import Mock + +import pytest +from airbyte_cdk.test.mock_http.response import HttpResponse +from airbyte_cdk.test.mock_http.response_builder import ( + FieldPath, + FieldUpdatePaginationStrategy, + HttpResponseBuilder, + NestedPath, + PaginationStrategy, + Path, + RecordBuilder, + create_record_builder, + create_response_builder, + find_template, +) + +_RECORDS_FIELD = "records_field" +_ID_FIELD = "record_id" +_CURSOR_FIELD = "record_cursor" +_ANY_RECORD = {"a_record_field": "a record value"} +_SOME_RECORDS = {_RECORDS_FIELD: [_ANY_RECORD]} +_A_RESPONSE_TEMPLATE = _SOME_RECORDS + +_RECORD_BUILDER = 0 +_RESPONSE_BUILDER = 1 + + +def _record_builder( + response_template: Dict[str, Any], + records_path: Union[FieldPath, NestedPath], + record_id_path: Optional[Path] = None, + record_cursor_path: Optional[Union[FieldPath, NestedPath]] = None, +) -> RecordBuilder: + return create_record_builder(deepcopy(response_template), records_path, record_id_path, record_cursor_path) + + +def _any_record_builder() -> RecordBuilder: + return create_record_builder({"record_path": [{"a_record": "record value"}]}, FieldPath("record_path")) + + +def _response_builder( + response_template: Dict[str, Any], records_path: Union[FieldPath, NestedPath], pagination_strategy: Optional[PaginationStrategy] = None +) -> HttpResponseBuilder: + return create_response_builder(deepcopy(response_template), records_path, pagination_strategy=pagination_strategy) + + +def _body(response: HttpResponse) -> Dict[str, Any]: + return json.loads(response.body) + + +class RecordBuilderTest(TestCase): + def test_given_with_id_when_build_then_set_id(self) -> None: + builder = _record_builder({_RECORDS_FIELD: [{_ID_FIELD: "an id"}]}, FieldPath(_RECORDS_FIELD), FieldPath(_ID_FIELD)) + record = builder.with_id("another id").build() + assert record[_ID_FIELD] == "another id" + + def test_given_nested_id_when_build_then_set_id(self) -> None: + builder = _record_builder( + {_RECORDS_FIELD: [{"nested": {_ID_FIELD: "id"}}]}, FieldPath(_RECORDS_FIELD), NestedPath(["nested", _ID_FIELD]) + ) + record = builder.with_id("another id").build() + assert record["nested"][_ID_FIELD] == "another id" + + def test_given_id_path_not_provided_but_with_id_when_build_then_raise_error(self) -> None: + builder = _record_builder(_A_RESPONSE_TEMPLATE, FieldPath(_RECORDS_FIELD), None) + with pytest.raises(ValueError): + builder.with_id("any_id").build() + + def test_given_no_id_in_template_for_path_when_build_then_raise_error(self) -> None: + with pytest.raises(ValueError): + _record_builder({_RECORDS_FIELD: [{"record without id": "should fail"}]}, FieldPath(_RECORDS_FIELD), FieldPath(_ID_FIELD)) + + def test_given_with_cursor_when_build_then_set_id(self) -> None: + builder = _record_builder( + {_RECORDS_FIELD: [{_CURSOR_FIELD: "a cursor"}]}, FieldPath(_RECORDS_FIELD), record_cursor_path=FieldPath(_CURSOR_FIELD) + ) + record = builder.with_cursor("another cursor").build() + assert 
record[_CURSOR_FIELD] == "another cursor" + + def test_given_nested_cursor_when_build_then_set_cursor(self) -> None: + builder = _record_builder( + {_RECORDS_FIELD: [{"nested": {_CURSOR_FIELD: "a cursor"}}]}, + FieldPath(_RECORDS_FIELD), + record_cursor_path=NestedPath(["nested", _CURSOR_FIELD]), + ) + record = builder.with_cursor("another cursor").build() + assert record["nested"][_CURSOR_FIELD] == "another cursor" + + def test_given_with_field_when_build_then_write_field(self) -> None: + builder = _any_record_builder() + record = builder.with_field(FieldPath("to_write_field"), "a field value").build() + assert record["to_write_field"] == "a field value" + + def test_given_nested_cursor_when_build_then_write_field(self) -> None: + builder = _any_record_builder() + record = builder.with_field(NestedPath(["path", "to_write_field"]), "a field value").build() + assert record["path"]["to_write_field"] == "a field value" + + def test_given_cursor_path_not_provided_but_with_id_when_build_then_raise_error(self) -> None: + builder = _record_builder(_A_RESPONSE_TEMPLATE, FieldPath(_RECORDS_FIELD)) + with pytest.raises(ValueError): + builder.with_cursor("any cursor").build() + + def test_given_no_cursor_in_template_for_path_when_build_then_raise_error(self) -> None: + with pytest.raises(ValueError): + _record_builder( + {_RECORDS_FIELD: [{"record without cursor": "should fail"}]}, + FieldPath(_RECORDS_FIELD), + record_cursor_path=FieldPath(_ID_FIELD), + ) + + +class HttpResponseBuilderTest(TestCase): + def test_given_records_in_template_but_no_with_records_when_build_then_no_records(self) -> None: + builder = _response_builder({_RECORDS_FIELD: [{"a_record_field": "a record value"}]}, FieldPath(_RECORDS_FIELD)) + response = builder.build() + assert len(_body(response)[_RECORDS_FIELD]) == 0 + + def test_given_many_records_when_build_then_response_has_records(self) -> None: + builder = _response_builder(_A_RESPONSE_TEMPLATE, FieldPath(_RECORDS_FIELD)) + a_record_builder = Mock(spec=RecordBuilder) + a_record_builder.build.return_value = {"a record": 1} + another_record_builder = Mock(spec=RecordBuilder) + another_record_builder.build.return_value = {"another record": 2} + + response = builder.with_record(a_record_builder).with_record(another_record_builder).build() + + assert len(_body(response)[_RECORDS_FIELD]) == 2 + + def test_when_build_then_default_status_code_is_200(self) -> None: + builder = _response_builder(_A_RESPONSE_TEMPLATE, FieldPath(_RECORDS_FIELD)) + response = builder.build() + assert response.status_code == 200 + + def test_given_status_code_when_build_then_status_code_is_set(self) -> None: + builder = _response_builder(_A_RESPONSE_TEMPLATE, FieldPath(_RECORDS_FIELD)) + response = builder.with_status_code(239).build() + assert response.status_code == 239 + + def test_given_pagination_with_strategy_when_build_then_apply_strategy(self) -> None: + builder = _response_builder( + {"has_more_pages": False} | _SOME_RECORDS, + FieldPath(_RECORDS_FIELD), + pagination_strategy=FieldUpdatePaginationStrategy(FieldPath("has_more_pages"), "yes more page"), + ) + + response = builder.with_pagination().build() + + assert _body(response)["has_more_pages"] == "yes more page" + + def test_given_no_pagination_strategy_but_pagination_when_build_then_raise_error(self) -> None: + builder = _response_builder(_A_RESPONSE_TEMPLATE, FieldPath(_RECORDS_FIELD)) + with pytest.raises(ValueError): + builder.with_pagination() + + +class UtilMethodsTest(TestCase): + def test_from_resource_file(self) -> None: + template 
= find_template("test-resource", __file__) + assert template == {"test-source template": "this is a template for test-resource"} + + def test_given_cwd_doesnt_have_unit_tests_as_parent_when_from_resource_file__then_raise_error(self) -> None: + with pytest.raises(ValueError): + find_template("test-resource", str(FilePath(__file__).parent.parent.parent.parent)) + + def test_given_records_path_invalid_when_create_builders_from_resource_then_raise_exception(self) -> None: + with pytest.raises(ValueError): + create_record_builder(_A_RESPONSE_TEMPLATE, NestedPath(["invalid", "record", "path"])) diff --git a/airbyte-cdk/python/unit_tests/test/test_entrypoint_wrapper.py b/airbyte-cdk/python/unit_tests/test/test_entrypoint_wrapper.py new file mode 100644 index 000000000000..11dfc5877572 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/test/test_entrypoint_wrapper.py @@ -0,0 +1,348 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + +import json +import logging +import os +from typing import Any, Iterator, List, Mapping, Optional +from unittest import TestCase +from unittest.mock import Mock, patch + +from airbyte_cdk.models import ( + AirbyteAnalyticsTraceMessage, + AirbyteCatalog, + AirbyteErrorTraceMessage, + AirbyteLogMessage, + AirbyteMessage, + AirbyteMessageSerializer, + AirbyteRecordMessage, + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStreamState, + AirbyteStreamStateSerializer, + AirbyteStreamStatus, + AirbyteStreamStatusTraceMessage, + AirbyteTraceMessage, + ConfiguredAirbyteCatalogSerializer, + Level, + StreamDescriptor, + TraceType, + Type, +) +from airbyte_cdk.sources.abstract_source import AbstractSource +from airbyte_cdk.test.entrypoint_wrapper import EntrypointOutput, discover, read +from airbyte_cdk.test.state_builder import StateBuilder +from orjson import orjson + + +def _a_state_message(stream_name: str, stream_state: Mapping[str, Any]) -> AirbyteMessage: + return AirbyteMessage( + type=Type.STATE, + state=AirbyteStateMessage( + stream=AirbyteStreamState(stream_descriptor=StreamDescriptor(name=stream_name), stream_state=AirbyteStateBlob(**stream_state)) + ), + ) + + +def _a_status_message(stream_name: str, status: AirbyteStreamStatus) -> AirbyteMessage: + return AirbyteMessage( + type=Type.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + emitted_at=0, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name=stream_name), + status=status, + ), + ), + ) + + +_A_CATALOG_MESSAGE = AirbyteMessage( + type=Type.CATALOG, + catalog=AirbyteCatalog(streams=[]), +) +_A_RECORD = AirbyteMessage( + type=Type.RECORD, record=AirbyteRecordMessage(stream="stream", data={"record key": "record value"}, emitted_at=0) +) +_A_STATE_MESSAGE = _a_state_message("stream_name", {"state key": "state value for _A_STATE_MESSAGE"}) +_A_LOG = AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=Level.INFO, message="This is an Airbyte log message")) +_AN_ERROR_MESSAGE = AirbyteMessage( + type=Type.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.ERROR, + emitted_at=0, + error=AirbyteErrorTraceMessage(message="AirbyteErrorTraceMessage message"), + ), +) +_AN_ANALYTIC_MESSAGE = AirbyteMessage( + type=Type.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.ANALYTICS, + emitted_at=0, + analytics=AirbyteAnalyticsTraceMessage(type="an analytic type", value="an analytic value"), + ), +) + +_A_STREAM_NAME = "a stream name" +_A_CONFIG = {"config_key": "config_value"} +_A_CATALOG = ConfiguredAirbyteCatalogSerializer.load( + { + "streams": 
[ + { + "stream": { + "name": "a_stream_name", + "json_schema": {}, + "supported_sync_modes": ["full_refresh"], + }, + "sync_mode": "full_refresh", + "destination_sync_mode": "append", + } + ] + } +) +_A_STATE = StateBuilder().with_stream_state(_A_STREAM_NAME, {"state_key": "state_value"}).build() +_A_LOG_MESSAGE = "a log message" + + +def _to_entrypoint_output(messages: List[AirbyteMessage]) -> Iterator[str]: + return (orjson.dumps(AirbyteMessageSerializer.dump(message)).decode() for message in messages) + + +def _a_mocked_source() -> AbstractSource: + source = Mock(spec=AbstractSource) + source.message_repository = None + return source + + +def _validate_tmp_json_file(expected, file_path) -> None: + with open(file_path) as file: + assert json.load(file) == expected + + +def _validate_tmp_catalog(expected, file_path) -> None: + assert ConfiguredAirbyteCatalogSerializer.load( + orjson.loads( + open(file_path).read() + ) + ) == expected + + +def _create_tmp_file_validation(entrypoint, expected_config, expected_catalog: Optional[Any] = None, expected_state: Optional[Any] = None): + def _validate_tmp_files(self): + _validate_tmp_json_file(expected_config, entrypoint.parse_args.call_args.args[0][2]) + if expected_catalog: + _validate_tmp_catalog(expected_catalog, entrypoint.parse_args.call_args.args[0][4]) + if expected_state: + _validate_tmp_json_file(expected_state, entrypoint.parse_args.call_args.args[0][6]) + return entrypoint.return_value.run.return_value + + return _validate_tmp_files + + +class EntrypointWrapperDiscoverTest(TestCase): + def setUp(self) -> None: + self._a_source = _a_mocked_source() + + @staticmethod + def test_init_validation_error(): + invalid_message = '{"type": "INVALID_TYPE"}' + entrypoint_output = EntrypointOutput([invalid_message]) + messages = entrypoint_output._messages + assert len(messages) == 1 + assert messages[0].type == Type.LOG + assert messages[0].log.level == Level.INFO + assert messages[0].log.message == invalid_message + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_when_discover_then_ensure_parameters(self, entrypoint): + entrypoint.return_value.run.side_effect = _create_tmp_file_validation(entrypoint, _A_CONFIG) + + discover(self._a_source, _A_CONFIG) + + entrypoint.assert_called_once_with(self._a_source) + entrypoint.return_value.run.assert_called_once_with(entrypoint.parse_args.return_value) + assert entrypoint.parse_args.call_count == 1 + assert entrypoint.parse_args.call_args.args[0][0] == "discover" + assert entrypoint.parse_args.call_args.args[0][1] == "--config" + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_when_discover_then_ensure_files_are_temporary(self, entrypoint): + discover(self._a_source, _A_CONFIG) + + assert not os.path.exists(entrypoint.parse_args.call_args.args[0][2]) + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_logging_during_discover_when_discover_then_output_has_logs(self, entrypoint): + def _do_some_logging(self): + logging.getLogger("any logger").info(_A_LOG_MESSAGE) + return entrypoint.return_value.run.return_value + + entrypoint.return_value.run.side_effect = _do_some_logging + + output = discover(self._a_source, _A_CONFIG) + + assert len(output.logs) == 1 + assert output.logs[0].log.message == _A_LOG_MESSAGE + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_record_when_discover_then_output_has_record(self, entrypoint): + entrypoint.return_value.run.return_value = 
_to_entrypoint_output([_A_CATALOG_MESSAGE]) + output = discover(self._a_source, _A_CONFIG) + assert AirbyteMessageSerializer.dump(output.catalog) == AirbyteMessageSerializer.dump(_A_CATALOG_MESSAGE) + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_log_when_discover_then_output_has_log(self, entrypoint): + entrypoint.return_value.run.return_value = _to_entrypoint_output([_A_LOG]) + output = discover(self._a_source, _A_CONFIG) + assert AirbyteMessageSerializer.dump(output.logs[0]) == AirbyteMessageSerializer.dump(_A_LOG) + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_trace_message_when_discover_then_output_has_trace_messages(self, entrypoint): + entrypoint.return_value.run.return_value = _to_entrypoint_output([_AN_ANALYTIC_MESSAGE]) + output = discover(self._a_source, _A_CONFIG) + assert AirbyteMessageSerializer.dump(output.analytics_messages[0]) == AirbyteMessageSerializer.dump(_AN_ANALYTIC_MESSAGE) + + @patch("airbyte_cdk.test.entrypoint_wrapper.print", create=True) + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_unexpected_exception_when_discover_then_print(self, entrypoint, print_mock): + entrypoint.return_value.run.side_effect = ValueError("This error should be printed") + discover(self._a_source, _A_CONFIG) + assert print_mock.call_count > 0 + + @patch("airbyte_cdk.test.entrypoint_wrapper.print", create=True) + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_expected_exception_when_discover_then_do_not_print(self, entrypoint, print_mock): + entrypoint.return_value.run.side_effect = ValueError("This error should not be printed") + discover(self._a_source, _A_CONFIG, expecting_exception=True) + assert print_mock.call_count == 0 + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_uncaught_exception_when_read_then_output_has_error(self, entrypoint): + entrypoint.return_value.run.side_effect = ValueError("An error") + output = discover(self._a_source, _A_CONFIG) + assert output.errors + + +class EntrypointWrapperReadTest(TestCase): + def setUp(self) -> None: + self._a_source = _a_mocked_source() + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_when_read_then_ensure_parameters(self, entrypoint): + entrypoint.return_value.run.side_effect = _create_tmp_file_validation(entrypoint, _A_CONFIG, _A_CATALOG, _A_STATE) + + read(self._a_source, _A_CONFIG, _A_CATALOG, _A_STATE) + + entrypoint.assert_called_once_with(self._a_source) + entrypoint.return_value.run.assert_called_once_with(entrypoint.parse_args.return_value) + assert entrypoint.parse_args.call_count == 1 + assert entrypoint.parse_args.call_args.args[0][0] == "read" + assert entrypoint.parse_args.call_args.args[0][1] == "--config" + assert entrypoint.parse_args.call_args.args[0][3] == "--catalog" + assert entrypoint.parse_args.call_args.args[0][5] == "--state" + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_when_read_then_ensure_files_are_temporary(self, entrypoint): + read(self._a_source, _A_CONFIG, _A_CATALOG, _A_STATE) + + assert not os.path.exists(entrypoint.parse_args.call_args.args[0][2]) + assert not os.path.exists(entrypoint.parse_args.call_args.args[0][4]) + assert not os.path.exists(entrypoint.parse_args.call_args.args[0][6]) + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_logging_during_run_when_read_then_output_has_logs(self, entrypoint): + def 
_do_some_logging(self): + logging.getLogger("any logger").info(_A_LOG_MESSAGE) + return entrypoint.return_value.run.return_value + + entrypoint.return_value.run.side_effect = _do_some_logging + + output = read(self._a_source, _A_CONFIG, _A_CATALOG, _A_STATE) + + assert len(output.logs) == 1 + assert output.logs[0].log.message == _A_LOG_MESSAGE + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_record_when_read_then_output_has_record(self, entrypoint): + entrypoint.return_value.run.return_value = _to_entrypoint_output([_A_RECORD]) + output = read(self._a_source, _A_CONFIG, _A_CATALOG, _A_STATE) + assert AirbyteMessageSerializer.dump(output.records[0]) == AirbyteMessageSerializer.dump(_A_RECORD) + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_state_message_when_read_then_output_has_state_message(self, entrypoint): + entrypoint.return_value.run.return_value = _to_entrypoint_output([_A_STATE_MESSAGE]) + output = read(self._a_source, _A_CONFIG, _A_CATALOG, _A_STATE) + assert AirbyteMessageSerializer.dump(output.state_messages[0]) == AirbyteMessageSerializer.dump(_A_STATE_MESSAGE) + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_state_message_and_records_when_read_then_output_has_records_and_state_message(self, entrypoint): + entrypoint.return_value.run.return_value = _to_entrypoint_output([_A_RECORD, _A_STATE_MESSAGE]) + output = read(self._a_source, _A_CONFIG, _A_CATALOG, _A_STATE) + assert [AirbyteMessageSerializer.dump(message) for message in output.records_and_state_messages] == [ + AirbyteMessageSerializer.dump(message) for message in (_A_RECORD, _A_STATE_MESSAGE) + ] + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_many_state_messages_and_records_when_read_then_output_has_records_and_state_message(self, entrypoint): + state_value = {"state_key": "last state value"} + last_emitted_state = AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="stream_name"), stream_state=AirbyteStateBlob(**state_value) + ) + entrypoint.return_value.run.return_value = _to_entrypoint_output([_A_STATE_MESSAGE, _a_state_message("stream_name", state_value)]) + + output = read(self._a_source, _A_CONFIG, _A_CATALOG, _A_STATE) + + assert AirbyteStreamStateSerializer.dump(output.most_recent_state) == AirbyteStreamStateSerializer.dump(last_emitted_state) + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_log_when_read_then_output_has_log(self, entrypoint): + entrypoint.return_value.run.return_value = _to_entrypoint_output([_A_LOG]) + output = read(self._a_source, _A_CONFIG, _A_CATALOG, _A_STATE) + assert AirbyteMessageSerializer.dump(output.logs[0]) == AirbyteMessageSerializer.dump(_A_LOG) + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_trace_message_when_read_then_output_has_trace_messages(self, entrypoint): + entrypoint.return_value.run.return_value = _to_entrypoint_output([_AN_ANALYTIC_MESSAGE]) + output = read(self._a_source, _A_CONFIG, _A_CATALOG, _A_STATE) + assert AirbyteMessageSerializer.dump(output.analytics_messages[0]) == AirbyteMessageSerializer.dump(_AN_ANALYTIC_MESSAGE) + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_stream_statuses_when_read_then_return_statuses(self, entrypoint): + status_messages = [ + _a_status_message(_A_STREAM_NAME, AirbyteStreamStatus.STARTED), + _a_status_message(_A_STREAM_NAME, AirbyteStreamStatus.COMPLETE), + ] + 
entrypoint.return_value.run.return_value = _to_entrypoint_output(status_messages) + output = read(self._a_source, _A_CONFIG, _A_CATALOG, _A_STATE) + assert output.get_stream_statuses(_A_STREAM_NAME) == [AirbyteStreamStatus.STARTED, AirbyteStreamStatus.COMPLETE] + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_stream_statuses_for_many_streams_when_read_then_filter_other_streams(self, entrypoint): + status_messages = [ + _a_status_message(_A_STREAM_NAME, AirbyteStreamStatus.STARTED), + _a_status_message("another stream name", AirbyteStreamStatus.INCOMPLETE), + _a_status_message(_A_STREAM_NAME, AirbyteStreamStatus.COMPLETE), + ] + entrypoint.return_value.run.return_value = _to_entrypoint_output(status_messages) + output = read(self._a_source, _A_CONFIG, _A_CATALOG, _A_STATE) + assert len(output.get_stream_statuses(_A_STREAM_NAME)) == 2 + + @patch("airbyte_cdk.test.entrypoint_wrapper.print", create=True) + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_unexpected_exception_when_read_then_print(self, entrypoint, print_mock): + entrypoint.return_value.run.side_effect = ValueError("This error should be printed") + read(self._a_source, _A_CONFIG, _A_CATALOG, _A_STATE) + assert print_mock.call_count > 0 + + @patch("airbyte_cdk.test.entrypoint_wrapper.print", create=True) + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_expected_exception_when_read_then_do_not_print(self, entrypoint, print_mock): + entrypoint.return_value.run.side_effect = ValueError("This error should not be printed") + read(self._a_source, _A_CONFIG, _A_CATALOG, _A_STATE, expecting_exception=True) + assert print_mock.call_count == 0 + + @patch("airbyte_cdk.test.entrypoint_wrapper.AirbyteEntrypoint") + def test_given_uncaught_exception_when_read_then_output_has_error(self, entrypoint): + entrypoint.return_value.run.side_effect = ValueError("An error") + output = read(self._a_source, _A_CONFIG, _A_CATALOG, _A_STATE) + assert output.errors diff --git a/airbyte-cdk/python/unit_tests/test_config_observation.py b/airbyte-cdk/python/unit_tests/test_config_observation.py new file mode 100644 index 000000000000..a182854493dc --- /dev/null +++ b/airbyte-cdk/python/unit_tests/test_config_observation.py @@ -0,0 +1,88 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import json +import time + +import pytest +from airbyte_cdk.config_observation import ConfigObserver, ObservedDict, create_connector_config_control_message, observe_connector_config +from airbyte_cdk.models import AirbyteControlConnectorConfigMessage, OrchestratorType, Type + + +class TestObservedDict: + def test_update_called_on_set_item(self, mocker): + mock_observer = mocker.Mock() + my_observed_dict = ObservedDict( + {"key": "value", "nested_dict": {"key": "value"}, "list_of_dict": [{"key": "value"}, {"key": "value"}]}, mock_observer + ) + assert mock_observer.update.call_count == 0 + + my_observed_dict["nested_dict"]["key"] = "new_value" + assert mock_observer.update.call_count == 1 + + # Setting the same value again should call observer's update + my_observed_dict["key"] = "new_value" + assert mock_observer.update.call_count == 2 + + my_observed_dict["nested_dict"]["new_key"] = "value" + assert mock_observer.update.call_count == 3 + + my_observed_dict["list_of_dict"][0]["key"] = "new_value" + assert mock_observer.update.call_count == 4 + + my_observed_dict["list_of_dict"][0]["new_key"] = "new_value" + assert mock_observer.update.call_count == 5 + + my_observed_dict["new_list_of_dicts"] = [{"foo": "bar"}] + assert mock_observer.update.call_count == 6 + + my_observed_dict["new_list_of_dicts"][0]["new_key"] = "new_value" + assert mock_observer.update.call_count == 7 + + +class TestConfigObserver: + def test_update(self, capsys): + config_observer = ConfigObserver() + config_observer.set_config(ObservedDict({"key": "value"}, config_observer)) + before_time = time.time() * 1000 + config_observer.update() + after_time = time.time() * 1000 + captured = capsys.readouterr() + airbyte_message = json.loads(captured.out) + assert airbyte_message["type"] == "CONTROL" + assert "control" in airbyte_message + raw_control_message = airbyte_message["control"] + assert raw_control_message["type"] == "CONNECTOR_CONFIG" + assert raw_control_message["connectorConfig"] == {"config": dict(config_observer.config)} + assert before_time < raw_control_message["emitted_at"] < after_time + + +def test_observe_connector_config(capsys): + non_observed_config = {"foo": "bar"} + observed_config = observe_connector_config(non_observed_config) + observer = observed_config.observer + assert isinstance(observed_config, ObservedDict) + assert isinstance(observer, ConfigObserver) + assert observed_config.observer.config == observed_config + observed_config["foo"] = "foo" + captured = capsys.readouterr() + airbyte_message = json.loads(captured.out) + assert airbyte_message["control"]["connectorConfig"] == {"config": {"foo": "foo"}} + + +def test_observe_already_observed_config(): + observed_config = observe_connector_config({"foo": "bar"}) + with pytest.raises(ValueError): + observe_connector_config(observed_config) + + +def test_create_connector_config_control_message(): + A_CONFIG = {"config key": "config value"} + + message = create_connector_config_control_message(A_CONFIG) + + assert message.type == Type.CONTROL + assert message.control.type == OrchestratorType.CONNECTOR_CONFIG + assert message.control.connectorConfig == AirbyteControlConnectorConfigMessage(config=A_CONFIG) + assert message.control.emitted_at is not None diff --git a/airbyte-cdk/python/unit_tests/test_connector.py b/airbyte-cdk/python/unit_tests/test_connector.py new file mode 100644 index 000000000000..ea7de2e40695 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/test_connector.py @@ -0,0 +1,133 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all 
rights reserved. +# + + +import json +import logging +import os +import sys +import tempfile +from pathlib import Path +from typing import Any, Mapping + +import pytest +import yaml +from airbyte_cdk import Connector +from airbyte_cdk.models import AirbyteConnectionStatus + +logger = logging.getLogger("airbyte") + +MODULE = sys.modules[__name__] +MODULE_PATH = os.path.abspath(MODULE.__file__) +SPEC_ROOT = os.path.dirname(MODULE_PATH) + + +class MockConnector(Connector): + def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus: + pass + + +@pytest.fixture() +def mock_config(): + return {"bogus": "file"} + + +@pytest.fixture +def nonempty_file(mock_config): + with tempfile.NamedTemporaryFile("w") as file: + file.write(json.dumps(mock_config)) + file.flush() + yield file + + +@pytest.fixture +def nonjson_file(mock_config): + with tempfile.NamedTemporaryFile("w") as file: + file.write("the content of this file is not JSON") + file.flush() + yield file + + +@pytest.fixture +def integration(): + return MockConnector() + + +def test_read_config(nonempty_file, integration: Connector, mock_config): + actual = integration.read_config(nonempty_file.name) + assert actual == mock_config + + +def test_read_non_json_config(nonjson_file, integration: Connector): + with pytest.raises(ValueError, match="Could not read json file"): + integration.read_config(nonjson_file.name) + + +def test_write_config(integration, mock_config): + config_path = Path(tempfile.gettempdir()) / "config.json" + integration.write_config(mock_config, str(config_path)) + with open(config_path, "r") as actual: + assert json.loads(actual.read()) == mock_config + + +class TestConnectorSpec: + CONNECTION_SPECIFICATION = { + "type": "object", + "required": ["api_token"], + "additionalProperties": False, + "properties": {"api_token": {"type": "string"}}, + } + + @pytest.fixture + def use_json_spec(self): + spec = { + "documentationUrl": "https://airbyte.com/#json", + "connectionSpecification": self.CONNECTION_SPECIFICATION, + } + + json_path = os.path.join(SPEC_ROOT, "spec.json") + with open(json_path, "w") as f: + f.write(json.dumps(spec)) + yield + os.remove(json_path) + + @pytest.fixture + def use_invalid_json_spec(self): + json_path = os.path.join(SPEC_ROOT, "spec.json") + with open(json_path, "w") as f: + f.write("the content of this file is not JSON") + yield + os.remove(json_path) + + @pytest.fixture + def use_yaml_spec(self): + spec = {"documentationUrl": "https://airbyte.com/#yaml", "connectionSpecification": self.CONNECTION_SPECIFICATION} + + yaml_path = os.path.join(SPEC_ROOT, "spec.yaml") + with open(yaml_path, "w") as f: + f.write(yaml.dump(spec)) + yield + os.remove(yaml_path) + + def test_spec_from_json_file(self, integration, use_json_spec): + connector_spec = integration.spec(logger) + assert connector_spec.documentationUrl == "https://airbyte.com/#json" + assert connector_spec.connectionSpecification == self.CONNECTION_SPECIFICATION + + def test_spec_from_improperly_formatted_json_file(self, integration, use_invalid_json_spec): + with pytest.raises(ValueError, match="Could not read json spec file"): + integration.spec(logger) + + def test_spec_from_yaml_file(self, integration, use_yaml_spec): + connector_spec = integration.spec(logger) + assert connector_spec.documentationUrl == "https://airbyte.com/#yaml" + assert connector_spec.connectionSpecification == self.CONNECTION_SPECIFICATION + + def test_multiple_spec_files_raises_exception(self, integration, use_yaml_spec, 
use_json_spec): + with pytest.raises(RuntimeError, match="spec.yaml or spec.json"): + integration.spec(logger) + + def test_no_spec_file_raises_exception(self, integration): + with pytest.raises(FileNotFoundError, match="Unable to find spec."): + integration.spec(logger) diff --git a/airbyte-cdk/python/unit_tests/test_counter.py b/airbyte-cdk/python/unit_tests/test_counter.py new file mode 100644 index 000000000000..7f17ff7634c2 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/test_counter.py @@ -0,0 +1,55 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +from unittest import mock + +from airbyte_cdk.utils.event_timing import create_timer + + +def test_counter_init(): + with create_timer("Counter") as timer: + assert timer.name == "Counter" + + +def test_counter_start_event(): + with create_timer("Counter") as timer: + with mock.patch("airbyte_cdk.utils.event_timing.EventTimer.start_event") as mock_start_event: + timer.start_event("test_event") + mock_start_event.assert_called_with("test_event") + + +def test_counter_finish_event(): + with create_timer("Counter") as timer: + with mock.patch("airbyte_cdk.utils.event_timing.EventTimer.finish_event") as mock_finish_event: + timer.finish_event("test_event") + mock_finish_event.assert_called_with("test_event") + + +def test_timer_multiple_events(): + with create_timer("Counter") as timer: + for i in range(10): + timer.start_event("test_event") + timer.finish_event() + assert timer.count == 10 + + +def test_report_is_ordered_by_name_by_default(): + names = ["j", "b", "g", "d", "e", "f", "c", "h", "i", "a"] + + with create_timer("Source Counter") as timer: + for name in names: + timer.start_event(name) + timer.finish_event() + report = timer.report().split("\n")[1:] # ignore the first line + report_names = [line.split(" ")[0] for line in report] + assert report_names == sorted(names) + + +def test_double_finish_is_safely_ignored(): + with create_timer("Source Counter") as timer: + timer.start_event("test_event") + timer.finish_event() + timer.finish_event() + assert timer.count == 1 diff --git a/airbyte-cdk/python/unit_tests/test_entrypoint.py b/airbyte-cdk/python/unit_tests/test_entrypoint.py new file mode 100644 index 000000000000..123a445054f2 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/test_entrypoint.py @@ -0,0 +1,554 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import os +from argparse import Namespace +from collections import defaultdict +from copy import deepcopy +from typing import Any, List, Mapping, MutableMapping, Union +from unittest import mock +from unittest.mock import MagicMock, patch + +import freezegun +import pytest +import requests +from airbyte_cdk import AirbyteEntrypoint +from airbyte_cdk import entrypoint as entrypoint_module +from airbyte_cdk.models import ( + AirbyteCatalog, + AirbyteConnectionStatus, + AirbyteControlConnectorConfigMessage, + AirbyteControlMessage, + AirbyteMessage, + AirbyteMessageSerializer, + AirbyteRecordMessage, + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStateStats, + AirbyteStateType, + AirbyteStream, + AirbyteStreamState, + AirbyteStreamStatus, + AirbyteStreamStatusTraceMessage, + AirbyteTraceMessage, + ConnectorSpecification, + FailureType, + OrchestratorType, + Status, + StreamDescriptor, + SyncMode, + TraceType, + Type, +) +from airbyte_cdk.sources import Source +from airbyte_cdk.sources.connector_state_manager import HashableStreamDescriptor +from airbyte_cdk.utils import AirbyteTracedException +from orjson import orjson + + +class MockSource(Source): + def read(self, **kwargs): + pass + + def discover(self, **kwargs): + pass + + def check(self, **kwargs): + pass + + @property + def message_repository(self): + pass + + +def _as_arglist(cmd: str, named_args: Mapping[str, Any]) -> List[str]: + out = [cmd] + for k, v in named_args.items(): + out.append(f"--{k}") + if v: + out.append(v) + return out + + +@pytest.fixture +def spec_mock(mocker): + expected = ConnectorSpecification(connectionSpecification={}) + mock = MagicMock(return_value=expected) + mocker.patch.object(MockSource, "spec", mock) + return mock + + +MESSAGE_FROM_REPOSITORY = AirbyteMessage( + type=Type.CONTROL, + control=AirbyteControlMessage( + type=OrchestratorType.CONNECTOR_CONFIG, + emitted_at=10, + connectorConfig=AirbyteControlConnectorConfigMessage(config={"any config": "a config value"}), + ), +) + + +@pytest.fixture +def entrypoint(mocker) -> AirbyteEntrypoint: + message_repository = MagicMock() + message_repository.consume_queue.side_effect = [[message for message in [MESSAGE_FROM_REPOSITORY]], [], []] + mocker.patch.object(MockSource, "message_repository", new_callable=mocker.PropertyMock, return_value=message_repository) + return AirbyteEntrypoint(MockSource()) + + +def test_airbyte_entrypoint_init(mocker): + mocker.patch.object(entrypoint_module, "init_uncaught_exception_handler") + AirbyteEntrypoint(MockSource()) + entrypoint_module.init_uncaught_exception_handler.assert_called_once_with(entrypoint_module.logger) + + +@pytest.mark.parametrize( + ["cmd", "args", "expected_args"], + [ + ("spec", {"debug": ""}, {"command": "spec", "debug": True}), + ("check", {"config": "config_path"}, {"command": "check", "config": "config_path", "debug": False}), + ("discover", {"config": "config_path", "debug": ""}, {"command": "discover", "config": "config_path", "debug": True}), + ( + "read", + {"config": "config_path", "catalog": "catalog_path", "state": "None"}, + {"command": "read", "config": "config_path", "catalog": "catalog_path", "state": "None", "debug": False}, + ), + ( + "read", + {"config": "config_path", "catalog": "catalog_path", "state": "state_path", "debug": ""}, + {"command": "read", "config": "config_path", "catalog": "catalog_path", "state": "state_path", "debug": True}, + ), + ], +) +def test_parse_valid_args(cmd: str, args: Mapping[str, Any], expected_args, entrypoint: AirbyteEntrypoint): + arglist = 
_as_arglist(cmd, args) + parsed_args = entrypoint.parse_args(arglist) + assert vars(parsed_args) == expected_args + + +@pytest.mark.parametrize( + ["cmd", "args"], + [ + ("check", {"config": "config_path"}), + ("discover", {"config": "config_path"}), + ("read", {"config": "config_path", "catalog": "catalog_path"}), + ], +) +def test_parse_missing_required_args(cmd: str, args: MutableMapping[str, Any], entrypoint: AirbyteEntrypoint): + required_args = {"check": ["config"], "discover": ["config"], "read": ["config", "catalog"]} + for required_arg in required_args[cmd]: + argcopy = deepcopy(args) + del argcopy[required_arg] + with pytest.raises(BaseException): + entrypoint.parse_args(_as_arglist(cmd, argcopy)) + + +def _wrap_message(submessage: Union[AirbyteConnectionStatus, ConnectorSpecification, AirbyteRecordMessage, AirbyteCatalog]) -> str: + if isinstance(submessage, AirbyteConnectionStatus): + message = AirbyteMessage(type=Type.CONNECTION_STATUS, connectionStatus=submessage) + elif isinstance(submessage, ConnectorSpecification): + message = AirbyteMessage(type=Type.SPEC, spec=submessage) + elif isinstance(submessage, AirbyteCatalog): + message = AirbyteMessage(type=Type.CATALOG, catalog=submessage) + elif isinstance(submessage, AirbyteRecordMessage): + message = AirbyteMessage(type=Type.RECORD, record=submessage) + elif isinstance(submessage, AirbyteTraceMessage): + message = AirbyteMessage(type=Type.TRACE, trace=submessage) + else: + raise Exception(f"Unknown message type: {submessage}") + + return orjson.dumps(AirbyteMessageSerializer.dump(message)).decode() + + +def test_run_spec(entrypoint: AirbyteEntrypoint, mocker): + parsed_args = Namespace(command="spec") + expected = ConnectorSpecification(connectionSpecification={"hi": "hi"}) + mocker.patch.object(MockSource, "spec", return_value=expected) + + messages = list(entrypoint.run(parsed_args)) + + assert [orjson.dumps(AirbyteMessageSerializer.dump(MESSAGE_FROM_REPOSITORY)).decode(), _wrap_message(expected)] == messages + + +@pytest.fixture +def config_mock(mocker, request): + config = request.param if hasattr(request, "param") else {"username": "fake"} + mocker.patch.object(MockSource, "read_config", return_value=config) + mocker.patch.object(MockSource, "configure", return_value=config) + return config + + +@pytest.mark.parametrize( + "config_mock, schema, config_valid", + [ + ({"username": "fake"}, {"type": "object", "properties": {"name": {"type": "string"}}, "additionalProperties": False}, False), + ({"username": "fake"}, {"type": "object", "properties": {"username": {"type": "string"}}, "additionalProperties": False}, True), + ({"username": "fake"}, {"type": "object", "properties": {"user": {"type": "string"}}}, True), + ({"username": "fake"}, {"type": "object", "properties": {"user": {"type": "string", "airbyte_secret": True}}}, True), + ( + {"username": "fake", "_limit": 22}, + {"type": "object", "properties": {"username": {"type": "string"}}, "additionalProperties": False}, + True, + ), + ], + indirect=["config_mock"], +) +def test_config_validate(entrypoint: AirbyteEntrypoint, mocker, config_mock, schema, config_valid): + parsed_args = Namespace(command="check", config="config_path") + check_value = AirbyteConnectionStatus(status=Status.SUCCEEDED) + mocker.patch.object(MockSource, "check", return_value=check_value) + mocker.patch.object(MockSource, "spec", return_value=ConnectorSpecification(connectionSpecification=schema)) + + messages = list(entrypoint.run(parsed_args)) + if config_valid: + assert 
[orjson.dumps(AirbyteMessageSerializer.dump(MESSAGE_FROM_REPOSITORY)).decode(), _wrap_message(check_value)] == messages + else: + assert len(messages) == 2 + assert messages[0] == orjson.dumps(AirbyteMessageSerializer.dump(MESSAGE_FROM_REPOSITORY)).decode() + connection_status_message = AirbyteMessage(**orjson.loads(messages[1])) + assert connection_status_message.type == Type.CONNECTION_STATUS.value + assert connection_status_message.connectionStatus.get("status") == Status.FAILED.value + assert connection_status_message.connectionStatus.get("message").startswith("Config validation error:") + + +def test_run_check(entrypoint: AirbyteEntrypoint, mocker, spec_mock, config_mock): + parsed_args = Namespace(command="check", config="config_path") + check_value = AirbyteConnectionStatus(status=Status.SUCCEEDED) + mocker.patch.object(MockSource, "check", return_value=check_value) + + messages = list(entrypoint.run(parsed_args)) + + assert [orjson.dumps(AirbyteMessageSerializer.dump(MESSAGE_FROM_REPOSITORY)).decode(), _wrap_message(check_value)] == messages + assert spec_mock.called + + +@freezegun.freeze_time("1970-01-01T00:00:00.001Z") +def test_run_check_with_exception(entrypoint: AirbyteEntrypoint, mocker, spec_mock, config_mock): + exception = ValueError("Any error") + parsed_args = Namespace(command="check", config="config_path") + mocker.patch.object(MockSource, "check", side_effect=exception) + + with pytest.raises(ValueError): + list(entrypoint.run(parsed_args)) + + +@freezegun.freeze_time("1970-01-01T00:00:00.001Z") +def test_run_check_with_traced_exception(entrypoint: AirbyteEntrypoint, mocker, spec_mock, config_mock): + exception = AirbyteTracedException.from_exception(ValueError("Any error")) + parsed_args = Namespace(command="check", config="config_path") + mocker.patch.object(MockSource, "check", side_effect=exception) + + with pytest.raises(AirbyteTracedException): + list(entrypoint.run(parsed_args)) + + +@freezegun.freeze_time("1970-01-01T00:00:00.001Z") +def test_run_check_with_config_error(entrypoint: AirbyteEntrypoint, mocker, spec_mock, config_mock): + exception = AirbyteTracedException.from_exception(ValueError("Any error")) + exception.failure_type = FailureType.config_error + parsed_args = Namespace(command="check", config="config_path") + mocker.patch.object(MockSource, "check", side_effect=exception) + + messages = list(entrypoint.run(parsed_args)) + expected_trace = exception.as_airbyte_message() + expected_trace.emitted_at = 1 + expected_trace.trace.emitted_at = 1 + expected_messages = [ + orjson.dumps(AirbyteMessageSerializer.dump(MESSAGE_FROM_REPOSITORY)).decode(), + orjson.dumps(AirbyteMessageSerializer.dump(expected_trace)).decode(), + _wrap_message( + AirbyteConnectionStatus( + status=Status.FAILED, + message=AirbyteTracedException.from_exception(exception).message + ) + ), + ] + assert messages == expected_messages + + +@freezegun.freeze_time("1970-01-01T00:00:00.001Z") +def test_run_check_with_transient_error(entrypoint: AirbyteEntrypoint, mocker, spec_mock, config_mock): + exception = AirbyteTracedException.from_exception(ValueError("Any error")) + exception.failure_type = FailureType.transient_error + parsed_args = Namespace(command="check", config="config_path") + mocker.patch.object(MockSource, "check", side_effect=exception) + + with pytest.raises(AirbyteTracedException): + list(entrypoint.run(parsed_args)) + + +def test_run_discover(entrypoint: AirbyteEntrypoint, mocker, spec_mock, config_mock): + parsed_args = Namespace(command="discover", 
config="config_path") + expected = AirbyteCatalog(streams=[AirbyteStream(name="stream", json_schema={"k": "v"}, supported_sync_modes=[SyncMode.full_refresh])]) + mocker.patch.object(MockSource, "discover", return_value=expected) + + messages = list(entrypoint.run(parsed_args)) + + assert [orjson.dumps(AirbyteMessageSerializer.dump(MESSAGE_FROM_REPOSITORY)).decode(), _wrap_message(expected)] == messages + assert spec_mock.called + + +def test_run_discover_with_exception(entrypoint: AirbyteEntrypoint, mocker, spec_mock, config_mock): + parsed_args = Namespace(command="discover", config="config_path") + mocker.patch.object(MockSource, "discover", side_effect=ValueError("Any error")) + + with pytest.raises(ValueError): + messages = list(entrypoint.run(parsed_args)) + assert [orjson.dumps(AirbyteMessageSerializer.dump(MESSAGE_FROM_REPOSITORY)).decode()] == messages + + +def test_run_read(entrypoint: AirbyteEntrypoint, mocker, spec_mock, config_mock): + parsed_args = Namespace(command="read", config="config_path", state="statepath", catalog="catalogpath") + expected = AirbyteRecordMessage(stream="stream", data={"data": "stuff"}, emitted_at=1) + mocker.patch.object(MockSource, "read_state", return_value={}) + mocker.patch.object(MockSource, "read_catalog", return_value={}) + mocker.patch.object(MockSource, "read", return_value=[AirbyteMessage(record=expected, type=Type.RECORD)]) + + messages = list(entrypoint.run(parsed_args)) + + assert [orjson.dumps(AirbyteMessageSerializer.dump(MESSAGE_FROM_REPOSITORY)).decode(), _wrap_message(expected)] == messages + assert spec_mock.called + + +def test_given_message_emitted_during_config_when_read_then_emit_message_before_next_steps( + entrypoint: AirbyteEntrypoint, mocker, spec_mock, config_mock +): + parsed_args = Namespace(command="read", config="config_path", state="statepath", catalog="catalogpath") + mocker.patch.object(MockSource, "read_catalog", side_effect=ValueError) + + messages = entrypoint.run(parsed_args) + assert next(messages) == orjson.dumps(AirbyteMessageSerializer.dump(MESSAGE_FROM_REPOSITORY)).decode() + with pytest.raises(ValueError): + next(messages) + + +def test_run_read_with_exception(entrypoint: AirbyteEntrypoint, mocker, spec_mock, config_mock): + parsed_args = Namespace(command="read", config="config_path", state="statepath", catalog="catalogpath") + mocker.patch.object(MockSource, "read_state", return_value={}) + mocker.patch.object(MockSource, "read_catalog", return_value={}) + mocker.patch.object(MockSource, "read", side_effect=ValueError("Any error")) + + with pytest.raises(ValueError): + messages = list(entrypoint.run(parsed_args)) + assert [orjson.dumps(AirbyteMessageSerializer.dump(MESSAGE_FROM_REPOSITORY)).decode()] == messages + + +def test_invalid_command(entrypoint: AirbyteEntrypoint, config_mock): + with pytest.raises(Exception): + list(entrypoint.run(Namespace(command="invalid", config="conf"))) + + +@pytest.mark.parametrize( + "deployment_mode, url, expected_error", + [ + pytest.param("CLOUD", "https://airbyte.com", None, id="test_cloud_public_endpoint_is_successful"), + pytest.param("CLOUD", "https://192.168.27.30", AirbyteTracedException, id="test_cloud_private_ip_address_is_rejected"), + pytest.param("CLOUD", "https://localhost:8080/api/v1/cast", AirbyteTracedException, id="test_cloud_private_endpoint_is_rejected"), + pytest.param("CLOUD", "http://past.lives.net/api/v1/inyun", ValueError, id="test_cloud_unsecured_endpoint_is_rejected"), + pytest.param("CLOUD", "https://not:very/cash:443.money", ValueError, 
id="test_cloud_invalid_url_format"), + pytest.param("CLOUD", "https://192.168.27.30 ", ValueError, id="test_cloud_incorrect_ip_format_is_rejected"), + pytest.param("cloud", "https://192.168.27.30", AirbyteTracedException, id="test_case_insensitive_cloud_environment_variable"), + pytest.param("OSS", "https://airbyte.com", None, id="test_oss_public_endpoint_is_successful"), + pytest.param("OSS", "https://192.168.27.30", None, id="test_oss_private_endpoint_is_successful"), + pytest.param("OSS", "https://localhost:8080/api/v1/cast", None, id="test_oss_private_endpoint_is_successful"), + pytest.param("OSS", "http://past.lives.net/api/v1/inyun", None, id="test_oss_unsecured_endpoint_is_successful"), + ], +) +@patch.object(requests.Session, "send", lambda self, request, **kwargs: requests.Response()) +def test_filter_internal_requests(deployment_mode, url, expected_error): + with mock.patch.dict(os.environ, {"DEPLOYMENT_MODE": deployment_mode}, clear=False): + AirbyteEntrypoint(source=MockSource()) + + session = requests.Session() + + prepared_request = requests.PreparedRequest() + prepared_request.method = "GET" + prepared_request.headers = {"header": "value"} + prepared_request.url = url + + if expected_error: + with pytest.raises(expected_error): + session.send(request=prepared_request) + else: + actual_response = session.send(request=prepared_request) + assert isinstance(actual_response, requests.Response) + + +@pytest.mark.parametrize( + "incoming_message, stream_message_count, expected_message, expected_records_by_stream", + [ + pytest.param( + AirbyteMessage(type=Type.RECORD, record=AirbyteRecordMessage(stream="customers", data={"id": "12345"}, emitted_at=1)), + {HashableStreamDescriptor(name="customers"): 100.0}, + AirbyteMessage(type=Type.RECORD, record=AirbyteRecordMessage(stream="customers", data={"id": "12345"}, emitted_at=1)), + {HashableStreamDescriptor(name="customers"): 101.0}, + id="test_handle_record_message", + ), + pytest.param( + AirbyteMessage( + type=Type.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="customers"), stream_state=AirbyteStateBlob(updated_at="2024-02-02") + ), + ), + ), + {HashableStreamDescriptor(name="customers"): 100.0}, + AirbyteMessage( + type=Type.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="customers"), stream_state=AirbyteStateBlob(updated_at="2024-02-02") + ), + sourceStats=AirbyteStateStats(recordCount=100.0), + ), + ), + {HashableStreamDescriptor(name="customers"): 0.0}, + id="test_handle_state_message", + ), + pytest.param( + AirbyteMessage(type=Type.RECORD, record=AirbyteRecordMessage(stream="customers", data={"id": "12345"}, emitted_at=1)), + defaultdict(float), + AirbyteMessage(type=Type.RECORD, record=AirbyteRecordMessage(stream="customers", data={"id": "12345"}, emitted_at=1)), + {HashableStreamDescriptor(name="customers"): 1.0}, + id="test_handle_first_record_message", + ), + pytest.param( + AirbyteMessage( + type=Type.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + stream_status=AirbyteStreamStatusTraceMessage( + stream_descriptor=StreamDescriptor(name="customers"), status=AirbyteStreamStatus.COMPLETE + ), + emitted_at=1, + ), + ), + {HashableStreamDescriptor(name="customers"): 5.0}, + AirbyteMessage( + type=Type.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.STREAM_STATUS, + stream_status=AirbyteStreamStatusTraceMessage( + 
stream_descriptor=StreamDescriptor(name="customers"), status=AirbyteStreamStatus.COMPLETE + ), + emitted_at=1, + ), + ), + {HashableStreamDescriptor(name="customers"): 5.0}, + id="test_handle_other_message_type", + ), + pytest.param( + AirbyteMessage(type=Type.RECORD, record=AirbyteRecordMessage(stream="others", data={"id": "12345"}, emitted_at=1)), + {HashableStreamDescriptor(name="customers"): 100.0, HashableStreamDescriptor(name="others"): 27.0}, + AirbyteMessage(type=Type.RECORD, record=AirbyteRecordMessage(stream="others", data={"id": "12345"}, emitted_at=1)), + {HashableStreamDescriptor(name="customers"): 100.0, HashableStreamDescriptor(name="others"): 28.0}, + id="test_handle_record_message_for_other_stream", + ), + pytest.param( + AirbyteMessage( + type=Type.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="others"), stream_state=AirbyteStateBlob(updated_at="2024-02-02") + ), + ), + ), + {HashableStreamDescriptor(name="customers"): 100.0, HashableStreamDescriptor(name="others"): 27.0}, + AirbyteMessage( + type=Type.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="others"), stream_state=AirbyteStateBlob(updated_at="2024-02-02") + ), + sourceStats=AirbyteStateStats(recordCount=27.0), + ), + ), + {HashableStreamDescriptor(name="customers"): 100.0, HashableStreamDescriptor(name="others"): 0.0}, + id="test_handle_state_message_for_other_stream", + ), + pytest.param( + AirbyteMessage( + type=Type.RECORD, record=AirbyteRecordMessage(stream="customers", namespace="public", data={"id": "12345"}, emitted_at=1) + ), + {HashableStreamDescriptor(name="customers", namespace="public"): 100.0}, + AirbyteMessage( + type=Type.RECORD, record=AirbyteRecordMessage(stream="customers", namespace="public", data={"id": "12345"}, emitted_at=1) + ), + {HashableStreamDescriptor(name="customers", namespace="public"): 101.0}, + id="test_handle_record_message_with_descriptor", + ), + pytest.param( + AirbyteMessage( + type=Type.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="customers", namespace="public"), + stream_state=AirbyteStateBlob(updated_at="2024-02-02"), + ), + ), + ), + {HashableStreamDescriptor(name="customers", namespace="public"): 100.0}, + AirbyteMessage( + type=Type.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="customers", namespace="public"), + stream_state=AirbyteStateBlob(updated_at="2024-02-02"), + ), + sourceStats=AirbyteStateStats(recordCount=100.0), + ), + ), + {HashableStreamDescriptor(name="customers", namespace="public"): 0.0}, + id="test_handle_state_message_with_descriptor", + ), + pytest.param( + AirbyteMessage( + type=Type.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="others", namespace="public"), + stream_state=AirbyteStateBlob(updated_at="2024-02-02"), + ), + ), + ), + {HashableStreamDescriptor(name="customers", namespace="public"): 100.0}, + AirbyteMessage( + type=Type.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="others", namespace="public"), + stream_state=AirbyteStateBlob(updated_at="2024-02-02"), + ), + 
sourceStats=AirbyteStateStats(recordCount=0.0), + ), + ), + { + HashableStreamDescriptor(name="customers", namespace="public"): 100.0, + HashableStreamDescriptor(name="others", namespace="public"): 0.0, + }, + id="test_handle_state_message_no_records", + ), + ], +) +def test_handle_record_counts(incoming_message, stream_message_count, expected_message, expected_records_by_stream): + entrypoint = AirbyteEntrypoint(source=MockSource()) + actual_message = entrypoint.handle_record_counts(message=incoming_message, stream_message_count=stream_message_count) + assert actual_message == expected_message + + for stream_descriptor, message_count in stream_message_count.items(): + assert isinstance(message_count, float) + # Python assertions against different number types won't fail if the value is equivalent + assert message_count == expected_records_by_stream[stream_descriptor] + + if actual_message.type == Type.STATE: + assert isinstance(actual_message.state.sourceStats.recordCount, float), "recordCount value should be expressed as a float" diff --git a/airbyte-cdk/python/unit_tests/test_exception_handler.py b/airbyte-cdk/python/unit_tests/test_exception_handler.py new file mode 100644 index 000000000000..f135c19fd5a9 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/test_exception_handler.py @@ -0,0 +1,89 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +import json +import subprocess +import sys + +import pytest +from airbyte_cdk.exception_handler import assemble_uncaught_exception +from airbyte_cdk.models import ( + AirbyteErrorTraceMessage, + AirbyteLogMessage, + AirbyteMessage, + AirbyteMessageSerializer, + AirbyteTraceMessage, + FailureType, + Level, + TraceType, +) +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + + +def test_given_exception_is_traced_exception_when_assemble_uncaught_exception_then_return_same_exception(): + exception = AirbyteTracedException() + assembled_exception = assemble_uncaught_exception(type(exception), exception) + assert exception == assembled_exception + + +def test_given_exception_not_traced_exception_when_assemble_uncaught_exception_then_return_traced_exception(): + exception = ValueError("any error") + assembled_exception = assemble_uncaught_exception(type(exception), exception) + assert isinstance(assembled_exception, AirbyteTracedException) + + +def test_given_exception_with_display_message_when_assemble_uncaught_exception_then_internal_message_contains_display_message(): + display_message = "some display message" + exception = ExceptionWithDisplayMessage(display_message) + assembled_exception = assemble_uncaught_exception(type(exception), exception) + assert display_message in assembled_exception.internal_message + + +def test_uncaught_exception_handler(): + cmd = "from airbyte_cdk.logger import init_logger; from airbyte_cdk.exception_handler import init_uncaught_exception_handler; logger = init_logger('airbyte'); init_uncaught_exception_handler(logger); raise 1" + exception_message = "exceptions must derive from BaseException" + exception_trace = ( + "Traceback (most recent call last):\n" + '  File "<string>", line 1, in <module>\n' + "TypeError: exceptions must derive from BaseException" + ) + + expected_log_message = AirbyteMessage( + type=MessageType.LOG, log=AirbyteLogMessage(level=Level.FATAL, message=f"{exception_message}\n{exception_trace}") + ) + + expected_trace_message = 
AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.ERROR, + emitted_at=0.0, + error=AirbyteErrorTraceMessage( + failure_type=FailureType.system_error, + message="Something went wrong in the connector. See the logs for more details.", + internal_message=exception_message, + stack_trace=f"{exception_trace}\n", + ), + ), + ) + + with pytest.raises(subprocess.CalledProcessError) as err: + subprocess.check_output([sys.executable, "-c", cmd], stderr=subprocess.STDOUT) + + assert not err.value.stderr, "nothing on the stderr" + + stdout_lines = err.value.output.decode("utf-8").strip().split("\n") + assert len(stdout_lines) == 2 + + log_output, trace_output = stdout_lines + + out_log_message = AirbyteMessageSerializer.load(json.loads(log_output)) + assert out_log_message == expected_log_message, "Log message should be emitted in expected form" + + out_trace_message = AirbyteMessageSerializer.load(json.loads(trace_output)) + assert out_trace_message.trace.emitted_at > 0 + out_trace_message.trace.emitted_at = 0.0 # set a specific emitted_at value for testing + assert out_trace_message == expected_trace_message, "Trace message should be emitted in expected form" diff --git a/airbyte-cdk/python/unit_tests/test_logger.py b/airbyte-cdk/python/unit_tests/test_logger.py new file mode 100644 index 000000000000..3b6db8b89232 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/test_logger.py @@ -0,0 +1,97 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import json +import logging +from typing import Dict + +import pytest +from airbyte_cdk.logger import AirbyteLogFormatter + + +@pytest.fixture(scope="session") +def logger(): + logger = logging.getLogger("airbyte.Testlogger") + return logger + + +def test_formatter(logger, caplog): + formatter = AirbyteLogFormatter() + logger.info("Test formatter") + record = caplog.records[0] + formatted_record = formatter.format(record) + formatted_record_data = json.loads(formatted_record) + assert formatted_record_data.get("type") == "LOG" + log = formatted_record_data.get("log") + assert isinstance(log, Dict) + level = log.get("level") + message = log.get("message") + assert level == "INFO" + assert message == "Test formatter" + + +def test_level_transform(logger, caplog): + formatter = AirbyteLogFormatter() + logger.warning("Test level transform warn") + logger.critical("Test level transform critical") + record_warn = caplog.records[0] + record_critical = caplog.records[1] + formatted_record_warn = formatter.format(record_warn) + formatted_record_warn_data = json.loads(formatted_record_warn) + log_warn = formatted_record_warn_data.get("log") + level_warn = log_warn.get("level") + formatted_record_critical = formatter.format(record_critical) + formatted_record_critical_data = json.loads(formatted_record_critical) + log_critical = formatted_record_critical_data.get("log") + level_critical = log_critical.get("level") + assert level_warn == "WARN" + assert level_critical == "FATAL" + + +def test_debug(logger, caplog): + # Test debug logger in isolation since the default logger is initialized to TRACE (15) instead of DEBUG (10). 
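+    # Because the default "airbyte" logger sits at TRACE (15), plain debug() calls against it are dropped (see
+    # test_default_debug_is_ignored below), so this test creates its own logger and lowers it to DEBUG explicitly.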
+ formatter = AirbyteLogFormatter() + debug_logger = logging.getLogger("airbyte.Debuglogger") + debug_logger.setLevel(logging.DEBUG) + debug_logger.debug("Test debug 1", extra={"extra_field": "extra value"}) + record = caplog.records[0] + formatted_record = json.loads(formatter.format(record)) + assert formatted_record["type"] == "DEBUG" + assert formatted_record["message"] == "Test debug 1" + assert formatted_record["data"]["extra_field"] == "extra value" + + +def test_default_debug_is_ignored(logger, caplog): + logger.debug("Test debug that is ignored since log level is TRACE") + assert len(caplog.records) == 0 + + +def test_info(logger, caplog): + logger.info("Test info 1") + logger.info("Test info 2") + assert len(caplog.records) == 2 + first_record = caplog.records[0] + assert first_record.levelname == "INFO" + assert first_record.message == "Test info 1" + + +def test_warn(logger, caplog): + logger.warn("Test warn 1") + record = caplog.records[0] + assert record.levelname == "WARNING" + assert record.message == "Test warn 1" + + +def test_error(logger, caplog): + logger.error("Test error 1") + record = caplog.records[0] + assert record.levelname == "ERROR" + assert record.message == "Test error 1" + + +def test_fatal(logger, caplog): + logger.fatal("Test fatal 1") + record = caplog.records[0] + assert record.levelname == "CRITICAL" + assert record.message == "Test fatal 1" diff --git a/airbyte-cdk/python/unit_tests/test_secure_logger.py b/airbyte-cdk/python/unit_tests/test_secure_logger.py new file mode 100644 index 000000000000..44e0a3d9006c --- /dev/null +++ b/airbyte-cdk/python/unit_tests/test_secure_logger.py @@ -0,0 +1,240 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import logging +import sys +from argparse import Namespace +from typing import Any, Iterable, Mapping, MutableMapping + +import pytest +from airbyte_cdk import AirbyteEntrypoint +from airbyte_cdk.logger import AirbyteLogFormatter +from airbyte_cdk.models import AirbyteMessage, AirbyteRecordMessage, ConfiguredAirbyteCatalog, ConnectorSpecification, Type +from airbyte_cdk.sources import Source + +SECRET_PROPERTY = "api_token" +ANOTHER_SECRET_PROPERTY = "another_api_token" +ANOTHER_NOT_SECRET_PROPERTY = "not_secret_property" + +NOT_SECRET_PROPERTY = "explicitly_not_secret_property" + +I_AM_A_SECRET_VALUE = "I am a secret" +ANOTHER_SECRET_VALUE = "Another secret" +SECRET_INTEGER_VALUE = 123456789 +NOT_A_SECRET_VALUE = "I am not a secret" +ANOTHER_NOT_SECRET_VALUE = "I am not a secret" + + +class MockSource(Source): + def read( + self, + logger: logging.Logger, + config: Mapping[str, Any], + catalog: ConfiguredAirbyteCatalog, + state: MutableMapping[str, Any] = None, + ) -> Iterable[AirbyteMessage]: + logger.info(I_AM_A_SECRET_VALUE) + logger.info(I_AM_A_SECRET_VALUE + " plus Some non secret Value in the same log record" + NOT_A_SECRET_VALUE) + logger.info(NOT_A_SECRET_VALUE) + yield AirbyteMessage( + record=AirbyteRecordMessage(stream="stream", data={"data": "stuff"}, emitted_at=1), + type=Type.RECORD, + ) + + def discover(self, **kwargs): + pass + + def check(self, **kwargs): + pass + + +spec_with_airbyte_secrets = { + "type": "object", + "required": ["api_token"], + "additionalProperties": False, + "properties": { + SECRET_PROPERTY: {"type": "string", "airbyte_secret": True}, + NOT_SECRET_PROPERTY: {"type": "string", "airbyte_secret": False}, + }, +} + +spec_with_airbyte_secrets_config = { + SECRET_PROPERTY: I_AM_A_SECRET_VALUE, + NOT_SECRET_PROPERTY: NOT_A_SECRET_VALUE, +} + 
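+# Each spec/config pair below follows the same convention: properties flagged with "airbyte_secret": True in the
+# spec are the ones whose config values the tests below expect to be masked in log output, while unflagged
+# properties are expected to appear in plain text.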
+spec_with_multiple_airbyte_secrets = { + "type": "object", + "required": ["api_token"], + "additionalProperties": True, + "properties": { + SECRET_PROPERTY: {"type": "string", "airbyte_secret": True}, + ANOTHER_SECRET_PROPERTY: {"type": "string", "airbyte_secret": True}, + NOT_SECRET_PROPERTY: {"type": "string", "airbyte_secret": False}, + ANOTHER_NOT_SECRET_PROPERTY: {"type": "string"}, + }, +} + +spec_with_multiple_airbyte_secrets_config = { + SECRET_PROPERTY: I_AM_A_SECRET_VALUE, + NOT_SECRET_PROPERTY: NOT_A_SECRET_VALUE, + ANOTHER_SECRET_PROPERTY: ANOTHER_SECRET_VALUE, + ANOTHER_NOT_SECRET_PROPERTY: ANOTHER_NOT_SECRET_VALUE, +} + +spec_with_airbyte_secrets_not_string = { + "type": "object", + "required": ["api_token"], + "additionalProperties": True, + "properties": { + SECRET_PROPERTY: {"type": "string", "airbyte_secret": True}, + ANOTHER_SECRET_PROPERTY: {"type": "integer", "airbyte_secret": True}, + }, +} + +spec_with_airbyte_secrets_not_string_config = { + SECRET_PROPERTY: I_AM_A_SECRET_VALUE, + ANOTHER_SECRET_PROPERTY: SECRET_INTEGER_VALUE, +} + + +@pytest.fixture +def simple_config(): + yield { + SECRET_PROPERTY: I_AM_A_SECRET_VALUE, + ANOTHER_SECRET_PROPERTY: ANOTHER_SECRET_VALUE, + } + + +@pytest.mark.parametrize( + "source_spec, config", + [ + [spec_with_airbyte_secrets, spec_with_airbyte_secrets_config], + [spec_with_multiple_airbyte_secrets, spec_with_multiple_airbyte_secrets_config], + [ + spec_with_airbyte_secrets_not_string, + spec_with_airbyte_secrets_not_string_config, + ], + ], + ids=[ + "spec_with_airbyte_secrets", + "spec_with_multiple_airbyte_secrets", + "spec_with_airbyte_secrets_not_string", + ], +) +def test_airbyte_secret_is_masked_on_logger_output(source_spec, mocker, config, caplog): + caplog.set_level(logging.DEBUG, logger="airbyte.test") + caplog.handler.setFormatter(AirbyteLogFormatter()) + entrypoint = AirbyteEntrypoint(MockSource()) + parsed_args = Namespace(command="read", config="", state="", catalog="") + mocker.patch.object( + MockSource, + "spec", + return_value=ConnectorSpecification(connectionSpecification=source_spec), + ) + mocker.patch.object(MockSource, "configure", return_value=config) + mocker.patch.object(MockSource, "read_config", return_value=None) + mocker.patch.object(MockSource, "read_state", return_value={}) + mocker.patch.object(MockSource, "read_catalog", return_value={}) + list(entrypoint.run(parsed_args)) + log_result = caplog.text + expected_secret_values = [config[k] for k, v in source_spec["properties"].items() if v.get("airbyte_secret")] + expected_plain_text_values = [config[k] for k, v in source_spec["properties"].items() if not v.get("airbyte_secret")] + assert all([str(v) not in log_result for v in expected_secret_values]) + assert all([str(v) in log_result for v in expected_plain_text_values]) + + +def test_airbyte_secrets_are_masked_on_uncaught_exceptions(mocker, caplog, capsys): + caplog.set_level(logging.DEBUG, logger="airbyte.test") + caplog.handler.setFormatter(AirbyteLogFormatter()) + + class BrokenSource(MockSource): + def read( + self, + logger: logging.Logger, + config: Mapping[str, Any], + catalog: ConfiguredAirbyteCatalog, + state: MutableMapping[str, Any] = None, + ): + raise Exception("Exception:" + I_AM_A_SECRET_VALUE) + + entrypoint = AirbyteEntrypoint(BrokenSource()) + parsed_args = Namespace(command="read", config="", state="", catalog="") + source_spec = { + "type": "object", + "required": ["api_token"], + "additionalProperties": False, + "properties": { + SECRET_PROPERTY: {"type": "string", 
"airbyte_secret": True}, + NOT_SECRET_PROPERTY: {"type": "string", "airbyte_secret": False}, + }, + } + simple_config = { + SECRET_PROPERTY: I_AM_A_SECRET_VALUE, + NOT_SECRET_PROPERTY: NOT_A_SECRET_VALUE, + } + mocker.patch.object( + MockSource, + "spec", + return_value=ConnectorSpecification(connectionSpecification=source_spec), + ) + mocker.patch.object(MockSource, "configure", return_value=simple_config) + mocker.patch.object(MockSource, "read_config", return_value=None) + mocker.patch.object(MockSource, "read_state", return_value={}) + mocker.patch.object(MockSource, "read_catalog", return_value={}) + + try: + list(entrypoint.run(parsed_args)) + except Exception: + sys.excepthook(*sys.exc_info()) + assert I_AM_A_SECRET_VALUE not in capsys.readouterr().out, "Should have filtered non-secret value from exception trace message" + assert I_AM_A_SECRET_VALUE not in caplog.text, "Should have filtered secret value from exception log message" + + +def test_non_airbyte_secrets_are_not_masked_on_uncaught_exceptions(mocker, caplog, capsys): + caplog.set_level(logging.DEBUG, logger="airbyte.test") + caplog.handler.setFormatter(AirbyteLogFormatter()) + + class BrokenSource(MockSource): + def read( + self, + logger: logging.Logger, + config: Mapping[str, Any], + catalog: ConfiguredAirbyteCatalog, + state: MutableMapping[str, Any] = None, + ): + raise Exception("Exception:" + NOT_A_SECRET_VALUE) + + entrypoint = AirbyteEntrypoint(BrokenSource()) + parsed_args = Namespace(command="read", config="", state="", catalog="") + source_spec = { + "type": "object", + "required": ["api_token"], + "additionalProperties": False, + "properties": { + SECRET_PROPERTY: {"type": "string", "airbyte_secret": True}, + NOT_SECRET_PROPERTY: {"type": "string", "airbyte_secret": False}, + }, + } + simple_config = { + SECRET_PROPERTY: I_AM_A_SECRET_VALUE, + NOT_SECRET_PROPERTY: NOT_A_SECRET_VALUE, + } + mocker.patch.object( + MockSource, + "spec", + return_value=ConnectorSpecification(connectionSpecification=source_spec), + ) + mocker.patch.object(MockSource, "configure", return_value=simple_config) + mocker.patch.object(MockSource, "read_config", return_value=None) + mocker.patch.object(MockSource, "read_state", return_value={}) + mocker.patch.object(MockSource, "read_catalog", return_value={}) + mocker.patch.object(MockSource, "read", side_effect=Exception("Exception:" + NOT_A_SECRET_VALUE)) + + try: + list(entrypoint.run(parsed_args)) + except Exception: + sys.excepthook(*sys.exc_info()) + assert NOT_A_SECRET_VALUE in capsys.readouterr().out, "Should not have filtered non-secret value from exception trace message" + assert NOT_A_SECRET_VALUE in caplog.text, "Should not have filtered non-secret value from exception log message" diff --git a/airbyte-cdk/python/unit_tests/utils/__init__.py b/airbyte-cdk/python/unit_tests/utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/airbyte-cdk/python/unit_tests/utils/test_datetime_format_inferrer.py b/airbyte-cdk/python/unit_tests/utils/test_datetime_format_inferrer.py new file mode 100644 index 000000000000..5e76b9cfa193 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/utils/test_datetime_format_inferrer.py @@ -0,0 +1,60 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from typing import Dict, List + +import pytest +from airbyte_cdk.models import AirbyteRecordMessage +from airbyte_cdk.utils.datetime_format_inferrer import DatetimeFormatInferrer + +NOW = 1234567 + + +@pytest.mark.parametrize( + "test_name,input_records,expected_candidate_fields", + [ + ("empty", [], {}), + ("simple_match", [{"d": "2022-02-03"}], {"d": "%Y-%m-%d"}), + ("timestamp_match_integer", [{"d": 1686058051}], {"d": "%s"}), + ("timestamp_match_string", [{"d": "1686058051"}], {"d": "%s"}), + ("timestamp_ms_match_integer", [{"d": 1686058051000}], {"d": "%ms"}), + ("timestamp_ms_match_string", [{"d": "1686058051000"}], {"d": "%ms"}), + ("timestamp_no_match_integer", [{"d": 99}], {}), + ("timestamp_no_match_string", [{"d": "99999999999999999999"}], {}), + ("timestamp_overflow", [{"d": f"{10**100}_100"}], {}), # this case was previously causing OverflowError hence this test + ("simple_no_match", [{"d": "20220203"}], {}), + ("multiple_match", [{"d": "2022-02-03", "e": "2022-02-03"}], {"d": "%Y-%m-%d", "e": "%Y-%m-%d"}), + ( + "multiple_no_match", + [{"d": "20220203", "r": "ccc", "e": {"something-else": "2023-03-03"}, "s": ["2023-03-03"], "x": False, "y": 123}], + {}, + ), + ("format_1", [{"d": "2022-02-03"}], {"d": "%Y-%m-%d"}), + ("format_2", [{"d": "2022-02-03 12:34:56"}], {"d": "%Y-%m-%d %H:%M:%S"}), + ("format_3", [{"d": "2022-02-03T12:34:56Z"}], {"d": "%Y-%m-%dT%H:%M:%SZ"}), + ("format_4 1", [{"d": "2022-02-03T12:34:56.000Z"}], {"d": "%Y-%m-%dT%H:%M:%S.%fZ"}), + ("format_4 2", [{"d": "2022-02-03T12:34:56.000000Z"}], {"d": "%Y-%m-%dT%H:%M:%S.%fZ"}), + ("format_5", [{"d": "2022-02-03 12:34:56.123456+00:00"}], {"d": "%Y-%m-%d %H:%M:%S.%f%z"}), + ("format_5 2", [{"d": "2022-02-03 12:34:56.123456+02:00"}], {"d": "%Y-%m-%d %H:%M:%S.%f%z"}), + ("format_6", [{"d": "2022-02-03T12:34:56.123456+0000"}], {"d": "%Y-%m-%dT%H:%M:%S.%f%z"}), + ("format_6 2", [{"d": "2022-02-03T12:34:56.123456+00:00"}], {"d": "%Y-%m-%dT%H:%M:%S.%f%z"}), + ("format_6 3", [{"d": "2022-02-03T12:34:56.123456-03:00"}], {"d": "%Y-%m-%dT%H:%M:%S.%f%z"}), + ("format_7", [{"d": "03/02/2022 12:34"}], {"d": "%d/%m/%Y %H:%M"}), + ("format_8", [{"d": "2022-02"}], {"d": "%Y-%m"}), + ("format_9", [{"d": "03-02-2022"}], {"d": "%d-%m-%Y"}), + ("limit_down", [{"d": "2022-02-03", "x": "2022-02-03"}, {"d": "2022-02-03", "x": "another thing"}], {"d": "%Y-%m-%d"}), + ("limit_down all", [{"d": "2022-02-03", "x": "2022-02-03"}, {"d": "also another thing", "x": "another thing"}], {}), + ("limit_down empty", [{"d": "2022-02-03", "x": "2022-02-03"}, {}], {}), + ("limit_down unsupported type", [{"d": "2022-02-03"}, {"d": False}], {}), + ("limit_down complex type", [{"d": "2022-02-03"}, {"d": {"date": "2022-03-03"}}], {}), + ("limit_down different format", [{"d": "2022-02-03"}, {"d": 1686058051}], {}), + ("limit_down different format", [{"d": "2022-02-03"}, {"d": "2022-02-03T12:34:56.000000Z"}], {}), + ("no scope expand", [{}, {"d": "2022-02-03"}], {}), + ], +) +def test_schema_inferrer(test_name, input_records: List, expected_candidate_fields: Dict[str, str]): + inferrer = DatetimeFormatInferrer() + for record in input_records: + inferrer.accumulate(AirbyteRecordMessage(stream="abc", data=record, emitted_at=NOW)) + assert inferrer.get_inferred_datetime_formats() == expected_candidate_fields diff --git a/airbyte-cdk/python/unit_tests/utils/test_mapping_helpers.py b/airbyte-cdk/python/unit_tests/utils/test_mapping_helpers.py new file mode 100644 index 000000000000..f5dc979e3477 --- /dev/null +++ 
b/airbyte-cdk/python/unit_tests/utils/test_mapping_helpers.py @@ -0,0 +1,54 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +import pytest +from airbyte_cdk.utils.mapping_helpers import combine_mappings + + +def test_basic_merge(): + mappings = [{"a": 1}, {"b": 2}, {"c": 3}, {}] + result = combine_mappings(mappings) + assert result == {"a": 1, "b": 2, "c": 3} + + +def test_combine_with_string(): + mappings = [{"a": 1}, "option"] + with pytest.raises(ValueError, match="Cannot combine multiple options if one is a string"): + combine_mappings(mappings) + + +def test_overlapping_keys(): + mappings = [{"a": 1, "b": 2}, {"b": 3}] + with pytest.raises(ValueError, match="Duplicate keys found"): + combine_mappings(mappings) + + +def test_multiple_strings(): + mappings = ["option1", "option2"] + with pytest.raises(ValueError, match="Cannot combine multiple string options"): + combine_mappings(mappings) + + +def test_handle_none_values(): + mappings = [{"a": 1}, None, {"b": 2}] + result = combine_mappings(mappings) + assert result == {"a": 1, "b": 2} + + +def test_empty_mappings(): + mappings = [] + result = combine_mappings(mappings) + assert result == {} + + +def test_single_mapping(): + mappings = [{"a": 1}] + result = combine_mappings(mappings) + assert result == {"a": 1} + + +def test_combine_with_string_and_empty_mappings(): + mappings = ["option", {}] + result = combine_mappings(mappings) + assert result == "option" diff --git a/airbyte-cdk/python/unit_tests/utils/test_message_utils.py b/airbyte-cdk/python/unit_tests/utils/test_message_utils.py new file mode 100644 index 000000000000..84fabf1a8fa8 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/utils/test_message_utils.py @@ -0,0 +1,87 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+ +import pytest +from airbyte_cdk.models import ( + AirbyteControlConnectorConfigMessage, + AirbyteControlMessage, + AirbyteMessage, + AirbyteRecordMessage, + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStateStats, + AirbyteStateType, + AirbyteStreamState, + OrchestratorType, + StreamDescriptor, + Type, +) +from airbyte_cdk.sources.connector_state_manager import HashableStreamDescriptor +from airbyte_cdk.utils.message_utils import get_stream_descriptor + + +def test_get_record_message_stream_descriptor(): + message = AirbyteMessage( + type=Type.RECORD, + record=AirbyteRecordMessage( + stream="test_stream", + namespace="test_namespace", + data={"id": "12345"}, + emitted_at=1, + ), + ) + expected_descriptor = HashableStreamDescriptor(name="test_stream", namespace="test_namespace") + assert get_stream_descriptor(message) == expected_descriptor + + +def test_get_record_message_stream_descriptor_no_namespace(): + message = AirbyteMessage( + type=Type.RECORD, + record=AirbyteRecordMessage(stream="test_stream", data={"id": "12345"}, emitted_at=1), + ) + expected_descriptor = HashableStreamDescriptor(name="test_stream", namespace=None) + assert get_stream_descriptor(message) == expected_descriptor + + +def test_get_state_message_stream_descriptor(): + message = AirbyteMessage( + type=Type.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="test_stream", namespace="test_namespace"), + stream_state=AirbyteStateBlob(updated_at="2024-02-02"), + ), + sourceStats=AirbyteStateStats(recordCount=27.0), + ), + ) + expected_descriptor = HashableStreamDescriptor(name="test_stream", namespace="test_namespace") + assert get_stream_descriptor(message) == expected_descriptor + + +def test_get_state_message_stream_descriptor_no_namespace(): + message = AirbyteMessage( + type=Type.STATE, + state=AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="test_stream"), + stream_state=AirbyteStateBlob(updated_at="2024-02-02"), + ), + sourceStats=AirbyteStateStats(recordCount=27.0), + ), + ) + expected_descriptor = HashableStreamDescriptor(name="test_stream", namespace=None) + assert get_stream_descriptor(message) == expected_descriptor + + +def test_get_other_message_stream_descriptor_fails(): + message = AirbyteMessage( + type=Type.CONTROL, + control=AirbyteControlMessage( + type=OrchestratorType.CONNECTOR_CONFIG, + emitted_at=10, + connectorConfig=AirbyteControlConnectorConfigMessage(config={"any config": "a config value"}), + ), + ) + with pytest.raises(NotImplementedError): + get_stream_descriptor(message) diff --git a/airbyte-cdk/python/unit_tests/utils/test_rate_limiting.py b/airbyte-cdk/python/unit_tests/utils/test_rate_limiting.py new file mode 100644 index 000000000000..bc9d3ece61b8 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/utils/test_rate_limiting.py @@ -0,0 +1,27 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import pytest +from airbyte_cdk.sources.streams.http.rate_limiting import default_backoff_handler +from requests import exceptions + + +def helper_with_exceptions(exception_type): + raise exception_type + + +@pytest.mark.usefixtures("mock_sleep") +@pytest.mark.parametrize( + "max_tries, max_time, factor, exception_to_raise", + [ + (1, None, 1, exceptions.ConnectTimeout), + (1, 1, 0, exceptions.ReadTimeout), + (2, 2, 1, exceptions.ConnectionError), + (3, 3, 1, exceptions.ChunkedEncodingError), + ], +) +def test_default_backoff_handler(max_tries: int, max_time: int, factor: int, exception_to_raise: Exception): + backoff_handler = default_backoff_handler(max_tries=max_tries, max_time=max_time, factor=factor)(helper_with_exceptions) + with pytest.raises(exception_to_raise): + backoff_handler(exception_to_raise) diff --git a/airbyte-cdk/python/unit_tests/utils/test_schema_inferrer.py b/airbyte-cdk/python/unit_tests/utils/test_schema_inferrer.py new file mode 100644 index 000000000000..98d227c40ec6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/utils/test_schema_inferrer.py @@ -0,0 +1,351 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from typing import List, Mapping + +import pytest +from airbyte_cdk.models import AirbyteRecordMessage +from airbyte_cdk.utils.schema_inferrer import SchemaInferrer, SchemaValidationException + +NOW = 1234567 + + +@pytest.mark.parametrize( + "input_records,expected_schemas", + [ + pytest.param( + [ + {"stream": "my_stream", "data": {"field_A": "abc"}}, + {"stream": "my_stream", "data": {"field_A": "def"}}, + ], + {"my_stream": {"field_A": {"type": ["string", "null"]}}}, + id="test_basic", + ), + pytest.param( + [ + {"stream": "my_stream", "data": {"field_A": 1.0}}, + {"stream": "my_stream", "data": {"field_A": "abc"}}, + ], + {"my_stream": {"field_A": {"type": ["number", "string", "null"]}}}, + id="test_deriving_schema_refine", + ), + pytest.param( + [ + {"stream": "my_stream", "data": {"obj": {"data": [1.0, 2.0, 3.0]}}}, + {"stream": "my_stream", "data": {"obj": {"other_key": "xyz"}}}, + ], + { + "my_stream": { + "obj": { + "type": ["object", "null"], + "properties": { + "data": {"type": ["array", "null"], "items": {"type": ["number", "null"]}}, + "other_key": {"type": ["string", "null"]}, + }, + } + } + }, + id="test_derive_schema_for_nested_structures", + ), + pytest.param( + [ + {"stream": "my_stream", "data": {"field_A": 1}}, + {"stream": "my_stream", "data": {"field_A": 2}}, + ], + {"my_stream": {"field_A": {"type": ["number", "null"]}}}, + id="test_integer_number", + ), + pytest.param( + [ + {"stream": "my_stream", "data": {"field_A": None}}, + ], + {"my_stream": {}}, + id="test_null", + ), + pytest.param( + [ + {"stream": "my_stream", "data": {"field_A": None}}, + {"stream": "my_stream", "data": {"field_A": "abc"}}, + ], + {"my_stream": {"field_A": {"type": ["string", "null"]}}}, + id="test_null_optional", + ), + pytest.param( + [ + {"stream": "my_stream", "data": {"field_A": None}}, + {"stream": "my_stream", "data": {"field_A": {"nested": "abc"}}}, + ], + {"my_stream": {"field_A": {"type": ["object", "null"], "properties": {"nested": {"type": ["string", "null"]}}}}}, + id="test_any_of", + ), + pytest.param( + [ + {"stream": "my_stream", "data": {"field_A": None}}, + {"stream": "my_stream", "data": {"field_A": {"nested": "abc", "nully": None}}}, + ], + {"my_stream": {"field_A": {"type": ["object", "null"], "properties": {"nested": {"type": ["string", "null"]}}}}}, + id="test_any_of_with_null", + ), + pytest.param( + [ + 
{"stream": "my_stream", "data": {"field_A": None}}, + {"stream": "my_stream", "data": {"field_A": {"nested": "abc", "nully": None}}}, + {"stream": "my_stream", "data": {"field_A": {"nested": "abc", "nully": "a string"}}}, + ], + { + "my_stream": { + "field_A": { + "type": ["object", "null"], + "properties": {"nested": {"type": ["string", "null"]}, "nully": {"type": ["string", "null"]}}, + } + } + }, + id="test_any_of_with_null_union", + ), + pytest.param( + [ + {"stream": "my_stream", "data": {"field_A": {"nested": "abc", "nully": "a string"}}}, + {"stream": "my_stream", "data": {"field_A": None}}, + {"stream": "my_stream", "data": {"field_A": {"nested": "abc", "nully": None}}}, + ], + { + "my_stream": { + "field_A": { + "type": ["object", "null"], + "properties": {"nested": {"type": ["string", "null"]}, "nully": {"type": ["string", "null"]}}, + } + } + }, + id="test_any_of_with_null_union_changed_order", + ), + pytest.param( + [ + {"stream": "my_stream", "data": {"field_A": "abc", "nested": {"field_B": None}}}, + ], + {"my_stream": {"field_A": {"type": ["string", "null"]}, "nested": {"type": ["object", "null"], "properties": {}}}}, + id="test_nested_null", + ), + pytest.param( + [ + {"stream": "my_stream", "data": {"field_A": "abc", "nested": [{"field_B": None, "field_C": "abc"}]}}, + ], + { + "my_stream": { + "field_A": {"type": ["string", "null"]}, + "nested": { + "type": ["array", "null"], + "items": {"type": ["object", "null"], "properties": {"field_C": {"type": ["string", "null"]}}}, + }, + } + }, + id="test_array_nested_null", + ), + pytest.param( + [ + {"stream": "my_stream", "data": {"field_A": "abc", "nested": None}}, + {"stream": "my_stream", "data": {"field_A": "abc", "nested": [{"field_B": None, "field_C": "abc"}]}}, + ], + { + "my_stream": { + "field_A": {"type": ["string", "null"]}, + "nested": { + "type": ["array", "null"], + "items": {"type": ["object", "null"], "properties": {"field_C": {"type": ["string", "null"]}}}, + }, + } + }, + id="test_array_top_level_null", + ), + pytest.param( + [ + {"stream": "my_stream", "data": {"field_A": None}}, + {"stream": "my_stream", "data": {"field_A": "abc"}}, + ], + {"my_stream": {"field_A": {"type": ["string", "null"]}}}, + id="test_null_string", + ), + pytest.param( + [ + { + "stream": "data_with_nested_arrays", + "data": { + "root_property_object": { + "property_array": [ + {"title": "Nested_1", "type": "multi-value", "value": ["XL"]}, + { + "title": "Nested_2", + "type": "location", + "value": {"nested_key_1": "GB", "nested_key_2": "United Kingdom"}, + }, + ], + } + }, + }, + ], + { + "data_with_nested_arrays": { + "root_property_object": { + "type": ["object", "null"], + "properties": { + "property_array": { + "type": ["array", "null"], + "items": { + "type": ["object", "null"], + "properties": { + "title": {"type": ["string", "null"]}, + "type": {"type": ["string", "null"]}, + "value": { + "anyOf": [ + {"type": "array", "items": {"type": "string"}}, + { + "type": "object", + "properties": {"nested_key_1": {"type": "string"}, "nested_key_2": {"type": "string"}}, + }, + ] + }, + }, + }, + } + }, + } + } + }, + id="test_data_with_nested_arrays", + ), + ], +) +def test_schema_derivation(input_records: List, expected_schemas: Mapping): + inferrer = SchemaInferrer() + for record in input_records: + inferrer.accumulate(AirbyteRecordMessage(stream=record["stream"], data=record["data"], emitted_at=NOW)) + + for stream_name, expected_schema in expected_schemas.items(): + assert inferrer.get_stream_schema(stream_name) == { + "$schema": 
"http://json-schema.org/schema#", + "type": "object", + "properties": expected_schema, + } + + +_STREAM_NAME = "a stream name" +_ANY_VALUE = "any value" +_IS_PK = True +_IS_CURSOR_FIELD = True + + +def _create_inferrer_with_required_field(is_pk: bool, field: List[List[str]]) -> SchemaInferrer: + if is_pk: + return SchemaInferrer(field) + return SchemaInferrer([[]], field) + + +@pytest.mark.parametrize( + "is_pk", + [ + pytest.param(_IS_PK, id="required_field_is_pk"), + pytest.param(_IS_CURSOR_FIELD, id="required_field_is_cursor_field"), + ], +) +def test_field_is_on_root(is_pk: bool): + inferrer = _create_inferrer_with_required_field(is_pk, [["property"]]) + + inferrer.accumulate(AirbyteRecordMessage(stream=_STREAM_NAME, data={"property": _ANY_VALUE}, emitted_at=NOW)) + + assert inferrer.get_stream_schema(_STREAM_NAME)["required"] == ["property"] + assert inferrer.get_stream_schema(_STREAM_NAME)["properties"]["property"]["type"] == "string" + + +@pytest.mark.parametrize( + "is_pk", + [ + pytest.param(_IS_PK, id="required_field_is_pk"), + pytest.param(_IS_CURSOR_FIELD, id="required_field_is_cursor_field"), + ], +) +def test_field_is_nested(is_pk: bool): + inferrer = _create_inferrer_with_required_field(is_pk, [["property", "nested_property"]]) + + inferrer.accumulate(AirbyteRecordMessage(stream=_STREAM_NAME, data={"property": {"nested_property": _ANY_VALUE}}, emitted_at=NOW)) + + assert inferrer.get_stream_schema(_STREAM_NAME)["required"] == ["property"] + assert inferrer.get_stream_schema(_STREAM_NAME)["properties"]["property"]["type"] == "object" + assert inferrer.get_stream_schema(_STREAM_NAME)["properties"]["property"]["required"] == ["nested_property"] + + +@pytest.mark.parametrize( + "is_pk", + [ + pytest.param(_IS_PK, id="required_field_is_pk"), + pytest.param(_IS_CURSOR_FIELD, id="required_field_is_cursor_field"), + ], +) +def test_field_is_composite(is_pk: bool): + inferrer = _create_inferrer_with_required_field(is_pk, [["property 1"], ["property 2"]]) + inferrer.accumulate( + AirbyteRecordMessage(stream=_STREAM_NAME, data={"property 1": _ANY_VALUE, "property 2": _ANY_VALUE}, emitted_at=NOW) + ) + assert inferrer.get_stream_schema(_STREAM_NAME)["required"] == ["property 1", "property 2"] + + +@pytest.mark.parametrize( + "is_pk", + [ + pytest.param(_IS_PK, id="required_field_is_pk"), + pytest.param(_IS_CURSOR_FIELD, id="required_field_is_cursor_field"), + ], +) +def test_field_is_composite_and_nested(is_pk: bool): + inferrer = _create_inferrer_with_required_field(is_pk, [["property 1", "nested"], ["property 2"]]) + + inferrer.accumulate( + AirbyteRecordMessage(stream=_STREAM_NAME, data={"property 1": {"nested": _ANY_VALUE}, "property 2": _ANY_VALUE}, emitted_at=NOW) + ) + + assert inferrer.get_stream_schema(_STREAM_NAME)["required"] == ["property 1", "property 2"] + assert inferrer.get_stream_schema(_STREAM_NAME)["properties"]["property 1"]["type"] == "object" + assert inferrer.get_stream_schema(_STREAM_NAME)["properties"]["property 2"]["type"] == "string" + assert inferrer.get_stream_schema(_STREAM_NAME)["properties"]["property 1"]["required"] == ["nested"] + assert inferrer.get_stream_schema(_STREAM_NAME)["properties"]["property 1"]["properties"]["nested"]["type"] == "string" + + +def test_given_pk_does_not_exist_when_get_inferred_schemas_then_raise_error(): + inferrer = SchemaInferrer([["pk does not exist"]]) + inferrer.accumulate(AirbyteRecordMessage(stream=_STREAM_NAME, data={"id": _ANY_VALUE}, emitted_at=NOW)) + + with pytest.raises(SchemaValidationException) as exception: + 
inferrer.get_stream_schema(_STREAM_NAME) + + assert len(exception.value.validation_errors) == 1 + + +def test_given_pk_path_is_partially_valid_when_get_inferred_schemas_then_validation_error_mentions_where_the_issue_is(): + inferrer = SchemaInferrer([["id", "nested pk that does not exist"]]) + inferrer.accumulate(AirbyteRecordMessage(stream=_STREAM_NAME, data={"id": _ANY_VALUE}, emitted_at=NOW)) + + with pytest.raises(SchemaValidationException) as exception: + inferrer.get_stream_schema(_STREAM_NAME) + + assert len(exception.value.validation_errors) == 1 + assert "Path ['id']" in exception.value.validation_errors[0] + + +def test_given_composite_pk_but_only_one_path_valid_when_get_inferred_schemas_then_valid_path_is_required(): + inferrer = SchemaInferrer([["id 1"], ["id 2"]]) + inferrer.accumulate(AirbyteRecordMessage(stream=_STREAM_NAME, data={"id 1": _ANY_VALUE}, emitted_at=NOW)) + + with pytest.raises(SchemaValidationException) as exception: + inferrer.get_stream_schema(_STREAM_NAME) + + assert exception.value.schema["required"] == ["id 1"] + + +def test_given_composite_pk_but_only_one_path_valid_when_get_inferred_schemas_then_validation_error_mentions_where_the_issue_is(): + inferrer = SchemaInferrer([["id 1"], ["id 2"]]) + inferrer.accumulate(AirbyteRecordMessage(stream=_STREAM_NAME, data={"id 1": _ANY_VALUE}, emitted_at=NOW)) + + with pytest.raises(SchemaValidationException) as exception: + inferrer.get_stream_schema(_STREAM_NAME) + + assert len(exception.value.validation_errors) == 1 + assert "id 2" in exception.value.validation_errors[0] diff --git a/airbyte-cdk/python/unit_tests/utils/test_secret_utils.py b/airbyte-cdk/python/unit_tests/utils/test_secret_utils.py new file mode 100644 index 000000000000..39c6ff735da0 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/utils/test_secret_utils.py @@ -0,0 +1,135 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import pytest +from airbyte_cdk.utils.airbyte_secrets_utils import add_to_secrets, filter_secrets, get_secret_paths, get_secrets, update_secrets + +SECRET_STRING_KEY = "secret_key1" +SECRET_STRING_VALUE = "secret_value" +SECRET_STRING_2_KEY = "secret_key2" +SECRET_STRING_2_VALUE = "second_secret_val" +SECRET_INT_KEY = "secret_int" +SECRET_INT_VALUE = 1337 +NOT_SECRET_KEY = "not_a_secret" +NOT_SECRET_VALUE = "unimportant value" + + +flat_spec_with_secret = {"properties": {SECRET_STRING_KEY: {"type": "string", "airbyte_secret": True}, NOT_SECRET_KEY: {"type": "string"}}} +flat_config_with_secret = {SECRET_STRING_KEY: SECRET_STRING_VALUE, NOT_SECRET_KEY: NOT_SECRET_VALUE} + +flat_spec_with_secret_int = { + "properties": {SECRET_INT_KEY: {"type": "integer", "airbyte_secret": True}, NOT_SECRET_KEY: {"type": "string"}} +} +flat_config_with_secret_int = {SECRET_INT_KEY: SECRET_INT_VALUE, NOT_SECRET_KEY: NOT_SECRET_VALUE} + +flat_spec_without_secrets = {"properties": {NOT_SECRET_KEY: {"type": "string"}}} +flat_config_without_secrets = {NOT_SECRET_KEY: NOT_SECRET_VALUE} + +spec_with_oneof_secrets = { + "properties": { + SECRET_STRING_KEY: {"type": "string", "airbyte_secret": True}, + NOT_SECRET_KEY: {"type": "string"}, + "credentials": { + "type": "object", + "oneOf": [ + { + "type": "object", + "properties": {SECRET_STRING_2_KEY: {"type": "string", "airbyte_secret": True}, NOT_SECRET_KEY: {"type": "string"}}, + }, + { + "type": "object", + "properties": {SECRET_INT_KEY: {"type": "integer", "airbyte_secret": True}, NOT_SECRET_KEY: {"type": "string"}}, + }, + ], + }, + } +} +config_with_oneof_secrets_1 = { + SECRET_STRING_KEY: SECRET_STRING_VALUE, + NOT_SECRET_KEY: NOT_SECRET_VALUE, + "credentials": {SECRET_STRING_2_KEY: SECRET_STRING_2_VALUE}, +} +config_with_oneof_secrets_2 = { + SECRET_STRING_KEY: SECRET_STRING_VALUE, + NOT_SECRET_KEY: NOT_SECRET_VALUE, + "credentials": {SECRET_INT_KEY: SECRET_INT_VALUE}, +} + +spec_with_nested_secrets = { + "properties": { + SECRET_STRING_KEY: {"type": "string", "airbyte_secret": True}, + NOT_SECRET_KEY: {"type": "string"}, + "credentials": { + "type": "object", + "properties": { + SECRET_STRING_2_KEY: {"type": "string", "airbyte_secret": True}, + NOT_SECRET_KEY: {"type": "string"}, + SECRET_INT_KEY: {"type": "integer", "airbyte_secret": True}, + }, + }, + } +} +config_with_nested_secrets = { + SECRET_STRING_KEY: SECRET_STRING_VALUE, + NOT_SECRET_KEY: NOT_SECRET_VALUE, + "credentials": {SECRET_STRING_2_KEY: SECRET_STRING_2_VALUE, SECRET_INT_KEY: SECRET_INT_VALUE}, +} + + +@pytest.mark.parametrize( + ["spec", "expected"], + [ + (flat_spec_with_secret, [[SECRET_STRING_KEY]]), + (flat_spec_without_secrets, []), + (flat_spec_with_secret_int, [[SECRET_INT_KEY]]), + (spec_with_oneof_secrets, [[SECRET_STRING_KEY], ["credentials", SECRET_STRING_2_KEY], ["credentials", SECRET_INT_KEY]]), + (spec_with_nested_secrets, [[SECRET_STRING_KEY], ["credentials", SECRET_STRING_2_KEY], ["credentials", SECRET_INT_KEY]]), + ], +) +def test_get_secret_paths(spec, expected): + assert get_secret_paths(spec) == expected, f"Expected {spec} to yield secret paths {expected}" + + +@pytest.mark.parametrize( + ["spec", "config", "expected"], + [ + (flat_spec_with_secret, flat_config_with_secret, [SECRET_STRING_VALUE]), + (flat_spec_without_secrets, flat_config_without_secrets, []), + (flat_spec_with_secret_int, flat_config_with_secret_int, [SECRET_INT_VALUE]), + (spec_with_oneof_secrets, config_with_oneof_secrets_1, [SECRET_STRING_VALUE, SECRET_STRING_2_VALUE]), + 
(spec_with_oneof_secrets, config_with_oneof_secrets_2, [SECRET_STRING_VALUE, SECRET_INT_VALUE]), + (spec_with_nested_secrets, config_with_nested_secrets, [SECRET_STRING_VALUE, SECRET_STRING_2_VALUE, SECRET_INT_VALUE]), + ], +) +def test_get_secrets(spec, config, expected): + assert get_secrets(spec, config) == expected, f"Expected the spec {spec} and config {config} to produce {expected}" + + +def test_secret_filtering(): + sensitive_str = f"{SECRET_STRING_VALUE} {NOT_SECRET_VALUE} {SECRET_STRING_VALUE} {SECRET_STRING_2_VALUE}" + + update_secrets([]) + filtered = filter_secrets(sensitive_str) + assert filtered == sensitive_str + + # the empty secret should not affect the result + update_secrets([""]) + filtered = filter_secrets(sensitive_str) + assert filtered == sensitive_str + + update_secrets([SECRET_STRING_VALUE, SECRET_STRING_2_VALUE]) + filtered = filter_secrets(sensitive_str) + assert filtered == f"**** {NOT_SECRET_VALUE} **** ****" + + +def test_secrets_added_are_filtered(): + ADDED_SECRET = "only_a_secret_if_added" + sensitive_str = f"{ADDED_SECRET} {NOT_SECRET_VALUE}" + + filtered = filter_secrets(sensitive_str) + assert filtered == sensitive_str + + add_to_secrets(ADDED_SECRET) + filtered = filter_secrets(sensitive_str) + assert filtered == f"**** {NOT_SECRET_VALUE}" diff --git a/airbyte-cdk/python/unit_tests/utils/test_stream_status_utils.py b/airbyte-cdk/python/unit_tests/utils/test_stream_status_utils.py new file mode 100644 index 000000000000..4862a1e0118e --- /dev/null +++ b/airbyte-cdk/python/unit_tests/utils/test_stream_status_utils.py @@ -0,0 +1,61 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.models import AirbyteMessage, AirbyteStream, AirbyteStreamStatus, SyncMode, TraceType +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.utils.stream_status_utils import as_airbyte_message as stream_status_as_airbyte_message + +stream = AirbyteStream(name="name", namespace="namespace", json_schema={}, supported_sync_modes=[SyncMode.full_refresh]) + + +def test_started_as_message(): + stream_status = AirbyteStreamStatus.STARTED + airbyte_message = stream_status_as_airbyte_message(stream, stream_status) + + assert isinstance(airbyte_message, AirbyteMessage) + assert airbyte_message.type == MessageType.TRACE + assert airbyte_message.trace.type == TraceType.STREAM_STATUS + assert airbyte_message.trace.emitted_at > 0 + assert airbyte_message.trace.stream_status.stream_descriptor.name == stream.name + assert airbyte_message.trace.stream_status.stream_descriptor.namespace == stream.namespace + assert airbyte_message.trace.stream_status.status == stream_status + + +def test_running_as_message(): + stream_status = AirbyteStreamStatus.RUNNING + airbyte_message = stream_status_as_airbyte_message(stream, stream_status) + + assert isinstance(airbyte_message, AirbyteMessage) + assert airbyte_message.type == MessageType.TRACE + assert airbyte_message.trace.type == TraceType.STREAM_STATUS + assert airbyte_message.trace.emitted_at > 0 + assert airbyte_message.trace.stream_status.stream_descriptor.name == stream.name + assert airbyte_message.trace.stream_status.stream_descriptor.namespace == stream.namespace + assert airbyte_message.trace.stream_status.status == stream_status + + +def test_complete_as_message(): + stream_status = AirbyteStreamStatus.COMPLETE + airbyte_message = stream_status_as_airbyte_message(stream, stream_status) + + assert isinstance(airbyte_message, AirbyteMessage) + assert airbyte_message.type == MessageType.TRACE + 
assert airbyte_message.trace.type == TraceType.STREAM_STATUS + assert airbyte_message.trace.emitted_at > 0 + assert airbyte_message.trace.stream_status.stream_descriptor.name == stream.name + assert airbyte_message.trace.stream_status.stream_descriptor.namespace == stream.namespace + assert airbyte_message.trace.stream_status.status == stream_status + + +def test_incomplete_failed_as_message(): + stream_status = AirbyteStreamStatus.INCOMPLETE + airbyte_message = stream_status_as_airbyte_message(stream, stream_status) + + assert isinstance(airbyte_message, AirbyteMessage) + assert airbyte_message.type == MessageType.TRACE + assert airbyte_message.trace.type == TraceType.STREAM_STATUS + assert airbyte_message.trace.emitted_at > 0 + assert airbyte_message.trace.stream_status.stream_descriptor.name == stream.name + assert airbyte_message.trace.stream_status.stream_descriptor.namespace == stream.namespace + assert airbyte_message.trace.stream_status.status == stream_status diff --git a/airbyte-cdk/python/unit_tests/utils/test_traced_exception.py b/airbyte-cdk/python/unit_tests/utils/test_traced_exception.py new file mode 100644 index 000000000000..ea559a319467 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/utils/test_traced_exception.py @@ -0,0 +1,136 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + + +import pytest +from airbyte_cdk.models import ( + AirbyteErrorTraceMessage, + AirbyteMessage, + AirbyteMessageSerializer, + AirbyteTraceMessage, + FailureType, + Status, + StreamDescriptor, + TraceType, +) +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from orjson import orjson + +_AN_EXCEPTION = ValueError("An exception") +_A_STREAM_DESCRIPTOR = StreamDescriptor(name="a_stream") +_ANOTHER_STREAM_DESCRIPTOR = StreamDescriptor(name="another_stream") + + +@pytest.fixture +def raised_exception(): + try: + raise RuntimeError("an error has occurred") + except RuntimeError as e: + return e + + +def test_build_from_existing_exception(raised_exception): + traced_exc = AirbyteTracedException.from_exception(raised_exception, message="my user-friendly message") + assert traced_exc.message == "my user-friendly message" + assert traced_exc.internal_message == "an error has occurred" + assert traced_exc.failure_type == FailureType.system_error + assert traced_exc._exception == raised_exception + + +def test_exception_as_airbyte_message(): + traced_exc = AirbyteTracedException("an internal message") + airbyte_message = traced_exc.as_airbyte_message() + + assert isinstance(airbyte_message, AirbyteMessage) + assert airbyte_message.type == MessageType.TRACE + assert airbyte_message.trace.type == TraceType.ERROR + assert airbyte_message.trace.emitted_at > 0 + assert airbyte_message.trace.error.failure_type == FailureType.system_error + assert airbyte_message.trace.error.message == "Something went wrong in the connector. See the logs for more details." 
+ assert airbyte_message.trace.error.internal_message == "an internal message" + assert airbyte_message.trace.error.stack_trace == "airbyte_cdk.utils.traced_exception.AirbyteTracedException: an internal message\n" + + +def test_existing_exception_as_airbyte_message(raised_exception): + traced_exc = AirbyteTracedException.from_exception(raised_exception) + airbyte_message = traced_exc.as_airbyte_message() + + assert isinstance(airbyte_message, AirbyteMessage) + assert airbyte_message.type == MessageType.TRACE + assert airbyte_message.trace.type == TraceType.ERROR + assert airbyte_message.trace.error.message == "Something went wrong in the connector. See the logs for more details." + assert airbyte_message.trace.error.internal_message == "an error has occurred" + assert airbyte_message.trace.error.stack_trace.startswith("Traceback (most recent call last):") + assert airbyte_message.trace.error.stack_trace.endswith( + 'raise RuntimeError("an error has occurred")\n' "RuntimeError: an error has occurred\n" + ) + + +def test_config_error_as_connection_status_message(): + traced_exc = AirbyteTracedException("an internal message", message="Config validation error", failure_type=FailureType.config_error) + airbyte_message = traced_exc.as_connection_status_message() + + assert isinstance(airbyte_message, AirbyteMessage) + assert airbyte_message.type == MessageType.CONNECTION_STATUS + assert airbyte_message.connectionStatus.status == Status.FAILED + assert airbyte_message.connectionStatus.message == "Config validation error" + + +def test_other_error_as_connection_status_message(): + traced_exc = AirbyteTracedException("an internal message", failure_type=FailureType.system_error) + airbyte_message = traced_exc.as_connection_status_message() + + assert airbyte_message is None + + +def test_emit_message(capsys): + traced_exc = AirbyteTracedException( + internal_message="internal message", message="user-friendly message", exception=RuntimeError("oh no") + ) + + expected_message = AirbyteMessage( + type=MessageType.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.ERROR, + emitted_at=0.0, + error=AirbyteErrorTraceMessage( + failure_type=FailureType.system_error, + message="user-friendly message", + internal_message="internal message", + stack_trace="RuntimeError: oh no\n", + ), + ), + ) + + traced_exc.emit_message() + + stdout = capsys.readouterr().out + printed_message = AirbyteMessageSerializer.load(orjson.loads(stdout)) + printed_message.trace.emitted_at = 0.0 + assert printed_message == expected_message + + +def test_given_both_init_and_as_message_with_stream_descriptor_when_as_airbyte_message_use_init_stream_descriptor() -> None: + traced_exc = AirbyteTracedException(stream_descriptor=_A_STREAM_DESCRIPTOR) + message = traced_exc.as_airbyte_message(stream_descriptor=_ANOTHER_STREAM_DESCRIPTOR) + assert message.trace.error.stream_descriptor == _A_STREAM_DESCRIPTOR + + +def test_given_both_init_and_as_sanitized_airbyte_message_with_stream_descriptor_when_as_airbyte_message_use_init_stream_descriptor() -> None: + traced_exc = AirbyteTracedException(stream_descriptor=_A_STREAM_DESCRIPTOR) + message = traced_exc.as_sanitized_airbyte_message(stream_descriptor=_ANOTHER_STREAM_DESCRIPTOR) + assert message.trace.error.stream_descriptor == _A_STREAM_DESCRIPTOR + + +def test_given_both_from_exception_and_as_message_with_stream_descriptor_when_as_airbyte_message_use_init_stream_descriptor() -> None: + traced_exc = AirbyteTracedException.from_exception(_AN_EXCEPTION, stream_descriptor=_A_STREAM_DESCRIPTOR) + 
message = traced_exc.as_airbyte_message(stream_descriptor=_ANOTHER_STREAM_DESCRIPTOR) + assert message.trace.error.stream_descriptor == _A_STREAM_DESCRIPTOR + + +def test_given_both_from_exception_and_as_sanitized_airbyte_message_with_stream_descriptor_when_as_airbyte_message_use_init_stream_descriptor() -> None: + traced_exc = AirbyteTracedException.from_exception(_AN_EXCEPTION, stream_descriptor=_A_STREAM_DESCRIPTOR) + message = traced_exc.as_sanitized_airbyte_message(stream_descriptor=_ANOTHER_STREAM_DESCRIPTOR) + assert message.trace.error.stream_descriptor == _A_STREAM_DESCRIPTOR
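The tests above exercise the full `AirbyteTracedException` surface: building from an existing exception, converting to TRACE and connection-status messages, emitting to stdout, and preferring the stream descriptor supplied at construction time over one passed at message-creation time. As a rough usage sketch only (not part of the diff above), a connector-side caller might use that same API as follows; `validate_config` and the `"api_key"` field are hypothetical names chosen for illustration, and only calls already shown in the tests (`AirbyteTracedException(...)`, `from_exception`, `emit_message`, `FailureType.config_error`) are used:

```python
# Hypothetical usage sketch based only on the API surface exercised by the tests above.
# `validate_config` and the "api_key" field are illustrative, not part of the CDK.
from airbyte_cdk.models import FailureType
from airbyte_cdk.utils.traced_exception import AirbyteTracedException


def validate_config(config: dict) -> None:
    if "api_key" not in config:
        # failure_type=config_error lets as_connection_status_message() surface a failed
        # check with the user-facing message (see test_config_error_as_connection_status_message).
        raise AirbyteTracedException(
            internal_message="api_key missing from config",
            message="Config validation error",
            failure_type=FailureType.config_error,
        )


try:
    validate_config({})
except AirbyteTracedException as traced_exc:
    # emit_message() writes the TRACE message to stdout, as asserted in test_emit_message.
    traced_exc.emit_message()
```

The same exception type can also wrap unexpected errors via `AirbyteTracedException.from_exception(exc, message=...)`, which, per `test_existing_exception_as_airbyte_message`, preserves the original stack trace in the emitted error trace message.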