From f4b51584fbe2a678a4de1eb5d8c3a7af0e25c59e Mon Sep 17 00:00:00 2001 From: Tao Sun Date: Wed, 17 Aug 2022 17:58:03 -0700 Subject: [PATCH] Add artifact example recipes --- .../3rd-party.dataplatform.json | 22 +++++++++++++++++++ .../3rd-party.dataplatform.recipe.yml | 9 ++++++++ .../affirm_artifact/3rd-party.recipe.yml | 12 ++++++++++ .../affirm_artifact/infra.dataplatform.json | 22 +++++++++++++++++++ .../infra.dataplatform.recipe.yml | 9 ++++++++ .../examples/affirm_artifact/infra.recipe.yml | 12 ++++++++++ .../unstructured-s3-recipe.yml | 12 ++++++++++ .../src/datahub/classification/classifier.py | 1 - .../ingestion/source/affirm/artifact.py | 18 +++++++++++++-- 9 files changed, 114 insertions(+), 3 deletions(-) create mode 100644 metadata-ingestion/examples/affirm_artifact/3rd-party.dataplatform.json create mode 100644 metadata-ingestion/examples/affirm_artifact/3rd-party.dataplatform.recipe.yml create mode 100644 metadata-ingestion/examples/affirm_artifact/3rd-party.recipe.yml create mode 100644 metadata-ingestion/examples/affirm_artifact/infra.dataplatform.json create mode 100644 metadata-ingestion/examples/affirm_artifact/infra.dataplatform.recipe.yml create mode 100644 metadata-ingestion/examples/affirm_artifact/infra.recipe.yml create mode 100644 metadata-ingestion/examples/affirm_artifact/unstructured-s3-recipe.yml diff --git a/metadata-ingestion/examples/affirm_artifact/3rd-party.dataplatform.json b/metadata-ingestion/examples/affirm_artifact/3rd-party.dataplatform.json new file mode 100644 index 00000000000000..dc9a2f06240495 --- /dev/null +++ b/metadata-ingestion/examples/affirm_artifact/3rd-party.dataplatform.json @@ -0,0 +1,22 @@ +[ + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:affirm3rdParty", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": "/", + "name": "affirm3rdParty", + "displayName": "3rd-party Artifact", + "type": "OTHERS", + "logoUrl": "https://cdn-assets.affirm.com/images/black_logo-white_bg.jpg" + } + } + ] + } + }, + "proposedDelta": null + } +] diff --git a/metadata-ingestion/examples/affirm_artifact/3rd-party.dataplatform.recipe.yml b/metadata-ingestion/examples/affirm_artifact/3rd-party.dataplatform.recipe.yml new file mode 100644 index 00000000000000..1509c74fa569a9 --- /dev/null +++ b/metadata-ingestion/examples/affirm_artifact/3rd-party.dataplatform.recipe.yml @@ -0,0 +1,9 @@ +source: + type: file + config: + filename: './3rd-party.dataplatform.json' + +sink: + type: 'datahub-rest' + config: + server: 'http://localhost:8080' diff --git a/metadata-ingestion/examples/affirm_artifact/3rd-party.recipe.yml b/metadata-ingestion/examples/affirm_artifact/3rd-party.recipe.yml new file mode 100644 index 00000000000000..b0289e196e9a81 --- /dev/null +++ b/metadata-ingestion/examples/affirm_artifact/3rd-party.recipe.yml @@ -0,0 +1,12 @@ +source: + type: affirm-artifact + config: + directory: '/Users/tao.sun/tao/datahub-metadata/artifacts/3rd_party' + platform: 'affirm3rdParty' + platform_instance: '' + env: 'PROD' + +sink: + type: 'datahub-rest' + config: + server: 'http://localhost:8080' diff --git a/metadata-ingestion/examples/affirm_artifact/infra.dataplatform.json b/metadata-ingestion/examples/affirm_artifact/infra.dataplatform.json new file mode 100644 index 00000000000000..730f6e81826814 --- /dev/null +++ b/metadata-ingestion/examples/affirm_artifact/infra.dataplatform.json @@ -0,0 +1,22 @@ +[ + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:affirmInfra", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": "/", + "name": "affirmInfra", + "displayName": "Infra Artifact", + "type": "OTHERS", + "logoUrl": "https://cdn-assets.affirm.com/images/black_logo-white_bg.jpg" + } + } + ] + } + }, + "proposedDelta": null + } +] diff --git a/metadata-ingestion/examples/affirm_artifact/infra.dataplatform.recipe.yml b/metadata-ingestion/examples/affirm_artifact/infra.dataplatform.recipe.yml new file mode 100644 index 00000000000000..dd8b91cbcbe438 --- /dev/null +++ b/metadata-ingestion/examples/affirm_artifact/infra.dataplatform.recipe.yml @@ -0,0 +1,9 @@ +source: + type: file + config: + filename: './infra.dataplatform.json' + +sink: + type: 'datahub-rest' + config: + server: 'http://localhost:8080' diff --git a/metadata-ingestion/examples/affirm_artifact/infra.recipe.yml b/metadata-ingestion/examples/affirm_artifact/infra.recipe.yml new file mode 100644 index 00000000000000..2d0bc8cdd81174 --- /dev/null +++ b/metadata-ingestion/examples/affirm_artifact/infra.recipe.yml @@ -0,0 +1,12 @@ +source: + type: affirm-artifact + config: + directory: '/Users/tao.sun/tao/datahub-metadata/artifacts/infra' + platform: 'affirmInfra' + platform_instance: '' + env: 'PROD' + +sink: + type: 'datahub-rest' + config: + server: 'http://localhost:8080' diff --git a/metadata-ingestion/examples/affirm_artifact/unstructured-s3-recipe.yml b/metadata-ingestion/examples/affirm_artifact/unstructured-s3-recipe.yml new file mode 100644 index 00000000000000..b19b3c6b871978 --- /dev/null +++ b/metadata-ingestion/examples/affirm_artifact/unstructured-s3-recipe.yml @@ -0,0 +1,12 @@ +source: + type: affirm-artifact + config: + directory: '/Users/tao.sun/tao/datahub-metadata/artifacts/unstructured_s3' + platform: 's3' + platform_instance: 'unstructured_s3' + env: 'PROD' + +sink: + type: 'datahub-rest' + config: + server: 'http://localhost:8080' diff --git a/metadata-ingestion/src/datahub/classification/classifier.py b/metadata-ingestion/src/datahub/classification/classifier.py index d44ab51712c740..2ccff85df92a5b 100644 --- a/metadata-ingestion/src/datahub/classification/classifier.py +++ b/metadata-ingestion/src/datahub/classification/classifier.py @@ -4,7 +4,6 @@ from typing import Dict, Set import pandas as pd -import spacy from datahub.classification.privacy.privacy.api import PIIEngine from datahub.ingestion.api.common import RecordEnvelope diff --git a/metadata-ingestion/src/datahub/ingestion/source/affirm/artifact.py b/metadata-ingestion/src/datahub/ingestion/source/affirm/artifact.py index ba5e09cc38b879..bb61c94ebd84b5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/affirm/artifact.py +++ b/metadata-ingestion/src/datahub/ingestion/source/affirm/artifact.py @@ -20,6 +20,7 @@ yaml = YAML(typ='rt') +logging.basicConfig(stream=sys.stderr, level=logging.INFO) @dataclass @@ -37,6 +38,8 @@ def __post_init__(self): self.privacy_entrypoint = '' if self.processing_purposes is None: self.processing_purposes = [] + if self.schema_name is None: + self.schema_name = '' class AffirmArtifactSourceConfig(ConfigModel): @@ -45,6 +48,7 @@ class AffirmArtifactSourceConfig(ConfigModel): ''' directory: str platform: str + platform_instance: str env: str @@ -106,11 +110,21 @@ def create(cls, config_dict, ctx): def get_workunits(self) -> Iterable[MetadataWorkUnit]: directory = self.config.directory platform = self.config.platform + platform_instance = self.config.platform_instance env = self.config.env for artifact in iterate_artifact(directory): - dataset_name = f'{artifact.schema_name}.{artifact.name}' if len(artifact.schema_name) > 0 else artifact.name + dataset_name = ( + f'{artifact.schema_name}.{artifact.name}' + if len(artifact.schema_name) > 0 + else artifact.name + ) logging.info(f'> Processing dataset {dataset_name}') - dataset_urn = builder.make_dataset_urn(platform, dataset_name, env) + dataset_urn = builder.make_dataset_urn_with_platform_instance( + platform=platform, + name=dataset_name, + platform_instance=platform_instance, + env=env + ) dataset_snapshot = DatasetSnapshot( urn=dataset_urn, aspects=[],