diff --git a/metadata-ingestion/examples/affirm_artifact/3rd-party.dataplatform.json b/metadata-ingestion/examples/affirm_artifact/3rd-party.dataplatform.json new file mode 100644 index 0000000000000..dc9a2f0624049 --- /dev/null +++ b/metadata-ingestion/examples/affirm_artifact/3rd-party.dataplatform.json @@ -0,0 +1,22 @@ +[ + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:affirm3rdParty", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": "/", + "name": "affirm3rdParty", + "displayName": "3rd-party Artifact", + "type": "OTHERS", + "logoUrl": "https://cdn-assets.affirm.com/images/black_logo-white_bg.jpg" + } + } + ] + } + }, + "proposedDelta": null + } +] diff --git a/metadata-ingestion/examples/affirm_artifact/3rd-party.dataplatform.recipe.yml b/metadata-ingestion/examples/affirm_artifact/3rd-party.dataplatform.recipe.yml new file mode 100644 index 0000000000000..1509c74fa569a --- /dev/null +++ b/metadata-ingestion/examples/affirm_artifact/3rd-party.dataplatform.recipe.yml @@ -0,0 +1,9 @@ +source: + type: file + config: + filename: './3rd-party.dataplatform.json' + +sink: + type: 'datahub-rest' + config: + server: 'http://localhost:8080' diff --git a/metadata-ingestion/examples/affirm_artifact/3rd-party.recipe.yml b/metadata-ingestion/examples/affirm_artifact/3rd-party.recipe.yml new file mode 100644 index 0000000000000..b0289e196e9a8 --- /dev/null +++ b/metadata-ingestion/examples/affirm_artifact/3rd-party.recipe.yml @@ -0,0 +1,12 @@ +source: + type: affirm-artifact + config: + directory: '/Users/tao.sun/tao/datahub-metadata/artifacts/3rd_party' + platform: 'affirm3rdParty' + platform_instance: '' + env: 'PROD' + +sink: + type: 'datahub-rest' + config: + server: 'http://localhost:8080' diff --git a/metadata-ingestion/examples/affirm_artifact/infra.dataplatform.json b/metadata-ingestion/examples/affirm_artifact/infra.dataplatform.json new file mode 100644 index 0000000000000..730f6e8182681 --- /dev/null +++ b/metadata-ingestion/examples/affirm_artifact/infra.dataplatform.json @@ -0,0 +1,22 @@ +[ + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": { + "urn": "urn:li:dataPlatform:affirmInfra", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": { + "datasetNameDelimiter": "/", + "name": "affirmInfra", + "displayName": "Infra Artifact", + "type": "OTHERS", + "logoUrl": "https://cdn-assets.affirm.com/images/black_logo-white_bg.jpg" + } + } + ] + } + }, + "proposedDelta": null + } +] diff --git a/metadata-ingestion/examples/affirm_artifact/infra.dataplatform.recipe.yml b/metadata-ingestion/examples/affirm_artifact/infra.dataplatform.recipe.yml new file mode 100644 index 0000000000000..dd8b91cbcbe43 --- /dev/null +++ b/metadata-ingestion/examples/affirm_artifact/infra.dataplatform.recipe.yml @@ -0,0 +1,9 @@ +source: + type: file + config: + filename: './infra.dataplatform.json' + +sink: + type: 'datahub-rest' + config: + server: 'http://localhost:8080' diff --git a/metadata-ingestion/examples/affirm_artifact/infra.recipe.yml b/metadata-ingestion/examples/affirm_artifact/infra.recipe.yml new file mode 100644 index 0000000000000..2d0bc8cdd8117 --- /dev/null +++ b/metadata-ingestion/examples/affirm_artifact/infra.recipe.yml @@ -0,0 +1,12 @@ +source: + type: affirm-artifact + config: + directory: '/Users/tao.sun/tao/datahub-metadata/artifacts/infra' + platform: 'affirmInfra' + platform_instance: '' + env: 'PROD' + +sink: + type: 'datahub-rest' + config: + server: 'http://localhost:8080' diff --git a/metadata-ingestion/examples/affirm_artifact/unstructured-s3-recipe.yml b/metadata-ingestion/examples/affirm_artifact/unstructured-s3-recipe.yml new file mode 100644 index 0000000000000..b19b3c6b87197 --- /dev/null +++ b/metadata-ingestion/examples/affirm_artifact/unstructured-s3-recipe.yml @@ -0,0 +1,12 @@ +source: + type: affirm-artifact + config: + directory: '/Users/tao.sun/tao/datahub-metadata/artifacts/unstructured_s3' + platform: 's3' + platform_instance: 'unstructured_s3' + env: 'PROD' + +sink: + type: 'datahub-rest' + config: + server: 'http://localhost:8080' diff --git a/metadata-ingestion/src/datahub/classification/classifier.py b/metadata-ingestion/src/datahub/classification/classifier.py index d44ab51712c74..2ccff85df92a5 100644 --- a/metadata-ingestion/src/datahub/classification/classifier.py +++ b/metadata-ingestion/src/datahub/classification/classifier.py @@ -4,7 +4,6 @@ from typing import Dict, Set import pandas as pd -import spacy from datahub.classification.privacy.privacy.api import PIIEngine from datahub.ingestion.api.common import RecordEnvelope diff --git a/metadata-ingestion/src/datahub/ingestion/source/affirm/artifact.py b/metadata-ingestion/src/datahub/ingestion/source/affirm/artifact.py index ba5e09cc38b87..a188ff82570b7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/affirm/artifact.py +++ b/metadata-ingestion/src/datahub/ingestion/source/affirm/artifact.py @@ -6,7 +6,7 @@ from ruamel.yaml import YAML import datahub.emitter.mce_builder as builder -from datahub.configuration.common import ConfigModel +from datahub.configuration.source_common import PlatformSourceConfigBase from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent @@ -20,6 +20,7 @@ yaml = YAML(typ='rt') +logging.basicConfig(stream=sys.stderr, level=logging.INFO) @dataclass @@ -37,14 +38,15 @@ def __post_init__(self): self.privacy_entrypoint = '' if self.processing_purposes is None: self.processing_purposes = [] + if self.schema_name is None: + self.schema_name = '' -class AffirmArtifactSourceConfig(ConfigModel): +class AffirmArtifactSourceConfig(PlatformSourceConfigBase): ''' TODO support git repo to automate the whole process: git clone, locate artifact and ingest ''' directory: str - platform: str env: str @@ -106,11 +108,21 @@ def create(cls, config_dict, ctx): def get_workunits(self) -> Iterable[MetadataWorkUnit]: directory = self.config.directory platform = self.config.platform + platform_instance = self.config.platform_instance env = self.config.env for artifact in iterate_artifact(directory): - dataset_name = f'{artifact.schema_name}.{artifact.name}' if len(artifact.schema_name) > 0 else artifact.name + dataset_name = ( + f'{artifact.schema_name}.{artifact.name}' + if len(artifact.schema_name) > 0 + else artifact.name + ) logging.info(f'> Processing dataset {dataset_name}') - dataset_urn = builder.make_dataset_urn(platform, dataset_name, env) + dataset_urn = builder.make_dataset_urn_with_platform_instance( + platform=platform, + name=dataset_name, + platform_instance=platform_instance, + env=env + ) dataset_snapshot = DatasetSnapshot( urn=dataset_urn, aspects=[],