Skip to content

Commit

Permalink
Add artifact example recipes
Browse files Browse the repository at this point in the history
  • Loading branch information
imtaos committed Aug 18, 2022
1 parent a96f293 commit f4b5158
Show file tree
Hide file tree
Showing 9 changed files with 114 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[
{
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": {
"urn": "urn:li:dataPlatform:affirm3rdParty",
"aspects": [
{
"com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": {
"datasetNameDelimiter": "/",
"name": "affirm3rdParty",
"displayName": "3rd-party Artifact",
"type": "OTHERS",
"logoUrl": "https://cdn-assets.affirm.com/images/black_logo-white_bg.jpg"
}
}
]
}
},
"proposedDelta": null
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
source:
  type: file
  config:
    filename: './3rd-party.dataplatform.json'

sink:
  type: 'datahub-rest'
  config:
    server: 'http://localhost:8080'
12 changes: 12 additions & 0 deletions metadata-ingestion/examples/affirm_artifact/3rd-party.recipe.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
source:
  type: affirm-artifact
  config:
    directory: '/Users/tao.sun/tao/datahub-metadata/artifacts/3rd_party'
    platform: 'affirm3rdParty'
    platform_instance: ''
    env: 'PROD'

sink:
  type: 'datahub-rest'
  config:
    server: 'http://localhost:8080'
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[
{
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DataPlatformSnapshot": {
"urn": "urn:li:dataPlatform:affirmInfra",
"aspects": [
{
"com.linkedin.pegasus2avro.dataplatform.DataPlatformInfo": {
"datasetNameDelimiter": "/",
"name": "affirmInfra",
"displayName": "Infra Artifact",
"type": "OTHERS",
"logoUrl": "https://cdn-assets.affirm.com/images/black_logo-white_bg.jpg"
}
}
]
}
},
"proposedDelta": null
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
source:
  type: file
  config:
    filename: './infra.dataplatform.json'

sink:
  type: 'datahub-rest'
  config:
    server: 'http://localhost:8080'
12 changes: 12 additions & 0 deletions metadata-ingestion/examples/affirm_artifact/infra.recipe.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
source:
  type: affirm-artifact
  config:
    directory: '/Users/tao.sun/tao/datahub-metadata/artifacts/infra'
    platform: 'affirmInfra'
    platform_instance: ''
    env: 'PROD'

sink:
  type: 'datahub-rest'
  config:
    server: 'http://localhost:8080'
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
source:
  type: affirm-artifact
  config:
    directory: '/Users/tao.sun/tao/datahub-metadata/artifacts/unstructured_s3'
    platform: 's3'
    platform_instance: 'unstructured_s3'
    env: 'PROD'

sink:
  type: 'datahub-rest'
  config:
    server: 'http://localhost:8080'
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
 from typing import Dict, Set

 import pandas as pd
-import spacy

 from datahub.classification.privacy.privacy.api import PIIEngine
 from datahub.ingestion.api.common import RecordEnvelope
Expand Down
18 changes: 16 additions & 2 deletions metadata-ingestion/src/datahub/ingestion/source/affirm/artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@


 yaml = YAML(typ='rt')
+logging.basicConfig(stream=sys.stderr, level=logging.INFO)


@dataclass
Expand All @@ -37,6 +38,8 @@ def __post_init__(self):
             self.privacy_entrypoint = ''
         if self.processing_purposes is None:
             self.processing_purposes = []
+        if self.schema_name is None:
+            self.schema_name = ''


class AffirmArtifactSourceConfig(ConfigModel):
Expand All @@ -45,6 +48,7 @@ class AffirmArtifactSourceConfig(ConfigModel):
     '''
     directory: str
     platform: str
+    platform_instance: str
     env: str


Expand Down Expand Up @@ -106,11 +110,21 @@ def create(cls, config_dict, ctx):
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
         directory = self.config.directory
         platform = self.config.platform
+        platform_instance = self.config.platform_instance
         env = self.config.env
         for artifact in iterate_artifact(directory):
-            dataset_name = f'{artifact.schema_name}.{artifact.name}' if len(artifact.schema_name) > 0 else artifact.name
+            dataset_name = (
+                f'{artifact.schema_name}.{artifact.name}'
+                if len(artifact.schema_name) > 0
+                else artifact.name
+            )
             logging.info(f'> Processing dataset {dataset_name}')
-            dataset_urn = builder.make_dataset_urn(platform, dataset_name, env)
+            dataset_urn = builder.make_dataset_urn_with_platform_instance(
+                platform=platform,
+                name=dataset_name,
+                platform_instance=platform_instance,
+                env=env
+            )
             dataset_snapshot = DatasetSnapshot(
                 urn=dataset_urn,
                 aspects=[],
Expand Down

0 comments on commit f4b5158

Please sign in to comment.