diff --git a/.github/workflows/cdk-publish.yml b/.github/workflows/cdk-publish.yml new file mode 100644 index 00000000..2e5ab8e2 --- /dev/null +++ b/.github/workflows/cdk-publish.yml @@ -0,0 +1,93 @@ +name: Publish CDK and Source Declarative Manifest +on: + push: + paths: + - 'airbyte_cdk/pyproject.toml' # To only publish on CDK version change + - 'Dockerfile' + workflow_dispatch: + +jobs: + test: + name: Test Source Declarative Manifest Docker Build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up QEMU for multi-platform builds + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build test image + uses: docker/build-push-action@v5 + with: + context: . + platforms: linux/amd64 # Just build for the runner's architecture during test + load: true + tags: airbyte/source-declarative-manifest:build-test + + - name: Test image + run: | + docker run airbyte/source-declarative-manifest:build-test spec + + - name: Scan for vulnerabilities + uses: aquasecurity/trivy-action@master + continue-on-error: true # Prevent security scan from failing the build + with: + image-ref: airbyte/source-declarative-manifest:build-test + format: 'table,sarif' + output: 'trivy-results.sarif' + exit-code: 1 + severity: 'CRITICAL,HIGH' + timeout: '5m' + + publish: + name: Publish SDM Docker Image + runs-on: ubuntu-latest + needs: test + if: ${{ success() && (github.ref == 'refs/heads/main' || github.event_name == 'workflow_dispatch') }} + permissions: + id-token: write # Required for trusted publishing + contents: write # Required for artifact uploads + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up QEMU for multi-platform builds + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_PASSWORD }} + + - name: Get CDK version + run: | + cdk_version="$(poetry version --short | tr -d '[:space:]')" + echo "CDK_VERSION=$cdk_version" >> $GITHUB_ENV + + - name: Check if tag already exists + run: | + tag="airbyte/source-declarative-manifest:${{ env.CDK_VERSION}}-${{ github.run_number }}" + if DOCKER_CLI_EXPERIMENTAL=enabled docker manifest inspect "$tag" > /dev/null 2>&1; then + echo "The tag $tag already exists on Dockerhub. Skipping publish to prevent overwrite." + exit 1 + fi + - name: Build and push + uses: docker/build-push-action@v5 + with: + context: . + platforms: linux/amd64,linux/arm64 + push: true + tags: | + airbyte/source-declarative-manifest:latest + airbyte/source-declarative-manifest:${{ env.CDK_VERSION }} + airbyte/source-declarative-manifest:${{ env.CDK_VERSION }}-${{ github.run_number }} diff --git a/.github/workflows/connector-tests.yml b/.github/workflows/connector-tests.yml index 329d7be1..fb25bc72 100644 --- a/.github/workflows/connector-tests.yml +++ b/.github/workflows/connector-tests.yml @@ -83,6 +83,12 @@ jobs: cdk_extra: vector-db-based - connector: destination-motherduck cdk_extra: sql + # TODO: These are manifest connectors and won't work as expected until we + # add `--use-local-cdk` support for manifest connectors. + - connector: source-the-guardian-api + cdk_extra: n/a + - connector: source-pokeapi + cdk_extra: n/a name: "Check: '${{matrix.connector}}' (skip=${{needs.cdk_changes.outputs[matrix.cdk_extra] == 'false'}})" steps: diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..dd6a841e --- /dev/null +++ b/Dockerfile @@ -0,0 +1,18 @@ +FROM docker.io/airbyte/python-connector-base:2.0.0@sha256:c44839ba84406116e8ba68722a0f30e8f6e7056c726f447681bb9e9ece8bd916 + +WORKDIR /airbyte/integration_code + +# Copy project files needed for build +COPY pyproject.toml poetry.lock README.md ./ + +# Install dependencies - ignore keyring warnings +RUN poetry config virtualenvs.create false \ + && poetry install --only main --no-interaction --no-ansi || true + +# Copy source code +COPY airbyte_cdk ./airbyte_cdk + +# Build and install the package +RUN poetry build && pip install dist/*.whl + +ENTRYPOINT ["poetry", "run", "source-declarative-manifest"] diff --git a/airbyte_cdk/cli/__init__.py b/airbyte_cdk/cli/__init__.py new file mode 100644 index 00000000..7f66676b --- /dev/null +++ b/airbyte_cdk/cli/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. diff --git a/airbyte_cdk/cli/source_declarative_manifest/__init__.py b/airbyte_cdk/cli/source_declarative_manifest/__init__.py new file mode 100644 index 00000000..8a8d225e --- /dev/null +++ b/airbyte_cdk/cli/source_declarative_manifest/__init__.py @@ -0,0 +1,6 @@ +from airbyte_cdk.cli.source_declarative_manifest._run import run + + +__all__ = [ + "run", +] diff --git a/airbyte_cdk/cli/source_declarative_manifest/_run.py b/airbyte_cdk/cli/source_declarative_manifest/_run.py new file mode 100644 index 00000000..ece30309 --- /dev/null +++ b/airbyte_cdk/cli/source_declarative_manifest/_run.py @@ -0,0 +1,223 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Defines the `source-declarative-manifest` connector, which installs alongside CDK. + +This file was originally imported from the dedicated connector directory, under the +`airbyte` monorepo. + +Usage: + +``` +pipx install airbyte-cdk +source-declarative-manifest --help +source-declarative-manifest spec +... +``` +""" + +from __future__ import annotations + +import json +import pkgutil +import sys +import traceback +from collections.abc import Mapping +from datetime import datetime +from pathlib import Path +from typing import Any, cast + +from airbyte_cdk.entrypoint import AirbyteEntrypoint, launch +from airbyte_cdk.models import ( + AirbyteErrorTraceMessage, + AirbyteMessage, + AirbyteMessageSerializer, + AirbyteStateMessage, + AirbyteTraceMessage, + ConfiguredAirbyteCatalog, + ConnectorSpecificationSerializer, + TraceType, + Type, +) +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( + ConcurrentDeclarativeSource, +) +from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource +from airbyte_cdk.sources.source import TState +from orjson import orjson + + +class SourceLocalYaml(YamlDeclarativeSource): + """ + Declarative source defined by a yaml file in the local filesystem + """ + + def __init__( + self, + catalog: ConfiguredAirbyteCatalog | None, + config: Mapping[str, Any] | None, + state: TState, + **kwargs: Any, + ) -> None: + """ + HACK! + Problem: YamlDeclarativeSource relies on the calling module name/path to find the yaml file. + Implication: If you call YamlDeclarativeSource directly it will look for the yaml file in the wrong place. (e.g. the airbyte-cdk package) + Solution: Subclass YamlDeclarativeSource from the same location as the manifest to load. + + When can we remove this? + When the airbyte-cdk is updated to not rely on the calling module name/path to find the yaml file. + When all manifest connectors are updated to use the new airbyte-cdk. + When all manifest connectors are updated to use the source-declarative-manifest as the base image. + """ + super().__init__( + catalog=catalog, + config=config, + state=state, + path_to_yaml="manifest.yaml", + ) + + +def _is_local_manifest_command(args: list[str]) -> bool: + # Check for a local manifest.yaml file + return Path("/airbyte/integration_code/source_declarative_manifest/manifest.yaml").exists() + + +def handle_command(args: list[str]) -> None: + if _is_local_manifest_command(args): + handle_local_manifest_command(args) + else: + handle_remote_manifest_command(args) + + +def _get_local_yaml_source(args: list[str]) -> SourceLocalYaml: + try: + config, catalog, state = _parse_inputs_into_config_catalog_state(args) + return SourceLocalYaml(config=config, catalog=catalog, state=state) + except Exception as error: + print( + orjson.dumps( + AirbyteMessageSerializer.dump( + AirbyteMessage( + type=Type.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.ERROR, + emitted_at=int(datetime.now().timestamp() * 1000), + error=AirbyteErrorTraceMessage( + message=f"Error starting the sync. This could be due to an invalid configuration or catalog. Please contact Support for assistance. Error: {error}", + stack_trace=traceback.format_exc(), + ), + ), + ) + ) + ).decode() + ) + raise error + + +def handle_local_manifest_command(args: list[str]) -> None: + source = _get_local_yaml_source(args) + launch( + source=source, + args=args, + ) + + +def handle_remote_manifest_command(args: list[str]) -> None: + """Overrides the spec command to return the generalized spec for the declarative manifest source. + + This is different from a typical low-code, but built and published separately source built as a ManifestDeclarativeSource, + because that will have a spec method that returns the spec for that specific source. Other than spec, + the generalized connector behaves the same as any other, since the manifest is provided in the config. + """ + if args[0] == "spec": + json_spec = pkgutil.get_data( + "airbyte_cdk.cli.source_declarative_manifest", + "spec.json", + ) + if json_spec is None: + raise FileNotFoundError( + "Could not find `spec.json` file for source-declarative-manifest" + ) + + spec_obj = json.loads(json_spec) + spec = ConnectorSpecificationSerializer.load(spec_obj) + + message = AirbyteMessage(type=Type.SPEC, spec=spec) + print(AirbyteEntrypoint.airbyte_message_to_string(message)) + else: + source = create_declarative_source(args) + launch( + source=source, + args=args, + ) + + +def create_declarative_source(args: list[str]) -> ConcurrentDeclarativeSource: + """Creates the source with the injected config. + + This essentially does what other low-code sources do at build time, but at runtime, + with a user-provided manifest in the config. This better reflects what happens in the + connector builder. + """ + try: + config, catalog, state = _parse_inputs_into_config_catalog_state(args) + if "__injected_declarative_manifest" not in config: + raise ValueError( + f"Invalid config: `__injected_declarative_manifest` should be provided at the root of the config but config only has keys {list(config.keys())}" + ) + return ConcurrentDeclarativeSource( + config=config, + catalog=catalog, + state=state, + source_config=cast(dict[str, Any], config["__injected_declarative_manifest"]), + ) + except Exception as error: + print( + orjson.dumps( + AirbyteMessageSerializer.dump( + AirbyteMessage( + type=Type.TRACE, + trace=AirbyteTraceMessage( + type=TraceType.ERROR, + emitted_at=int(datetime.now().timestamp() * 1000), + error=AirbyteErrorTraceMessage( + message=f"Error starting the sync. This could be due to an invalid configuration or catalog. Please contact Support for assistance. Error: {error}", + stack_trace=traceback.format_exc(), + ), + ), + ) + ) + ).decode() + ) + raise error + + +def _parse_inputs_into_config_catalog_state( + args: list[str], +) -> tuple[ + Mapping[str, Any] | None, + ConfiguredAirbyteCatalog | None, + list[AirbyteStateMessage], +]: + parsed_args = AirbyteEntrypoint.parse_args(args) + config = ( + ConcurrentDeclarativeSource.read_config(parsed_args.config) + if hasattr(parsed_args, "config") + else None + ) + catalog = ( + ConcurrentDeclarativeSource.read_catalog(parsed_args.catalog) + if hasattr(parsed_args, "catalog") + else None + ) + state = ( + ConcurrentDeclarativeSource.read_state(parsed_args.state) + if hasattr(parsed_args, "state") + else [] + ) + + return config, catalog, state + + +def run() -> None: + args: list[str] = sys.argv[1:] + handle_command(args) diff --git a/airbyte_cdk/cli/source_declarative_manifest/spec.json b/airbyte_cdk/cli/source_declarative_manifest/spec.json new file mode 100644 index 00000000..73d6a81a --- /dev/null +++ b/airbyte_cdk/cli/source_declarative_manifest/spec.json @@ -0,0 +1,17 @@ +{ + "documentationUrl": "https://docs.airbyte.com/integrations/sources/low-code", + "connectionSpecification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Low-code source spec", + "type": "object", + "required": ["__injected_declarative_manifest"], + "additionalProperties": true, + "properties": { + "__injected_declarative_manifest": { + "title": "Low-code manifest", + "type": "object", + "description": "The low-code manifest that defines the components of the source." + } + } + } +} diff --git a/poetry.lock b/poetry.lock index 9dffe6a6..fc666274 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -3874,7 +3874,7 @@ files = [ name = "rapidfuzz" version = "3.10.1" description = "rapid fuzzy string matching" -optional = true +optional = false python-versions = ">=3.9" files = [ {file = "rapidfuzz-3.10.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f17d9f21bf2f2f785d74f7b0d407805468b4c173fa3e52c86ec94436b338e74a"}, @@ -4223,6 +4223,11 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, @@ -5231,4 +5236,4 @@ vector-db-based = ["cohere", "langchain", "openai", "tiktoken"] [metadata] lock-version = "2.0" python-versions = "^3.10,<3.13" -content-hash = "e7ecc6f9875e1403a581a81c13595e4fed001e649face2bc3e466aa676a71fae" +content-hash = "e7ba21b5836e45357136ba5eff70d137327fdb6d20a64bcfee9423ce447774e5" diff --git a/pyproject.toml b/pyproject.toml index 392ebd5a..f783e96c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ pyrate-limiter = "~3.1.0" python-dateutil = "*" python-ulid = "^3.0.0" PyYAML = "^6.0.1" +rapidfuzz = "^3.10.1" requests = "*" requests_cache = "*" wcmatch = "10.0" @@ -104,6 +105,10 @@ sphinx-docs = ["Sphinx", "sphinx-rtd-theme"] vector-db-based = ["langchain", "openai", "cohere", "tiktoken"] sql = ["sqlalchemy"] +[tool.poetry.scripts] + +source-declarative-manifest = "airbyte_cdk.cli.source_declarative_manifest:run" + [tool.isort] skip = ["__init__.py"] # TODO: Remove after this is fixed: https://github.com/airbytehq/airbyte-python-cdk/issues/12