From eceb799e634aa19340dbfe9da51714311f401996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 20 Dec 2024 08:37:21 +0100 Subject: [PATCH 01/49] fix(tableau): restart server object when reauthenticating (#12182) Co-authored-by: Harshal Sheth --- .../src/datahub/ingestion/source/tableau/tableau.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 7838e5fa256b85..fadcb8ff8f3966 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -896,10 +896,9 @@ def dataset_browse_prefix(self) -> str: return f"/{self.config.env.lower()}{self.no_env_browse_prefix}" def _re_authenticate(self): - tableau_auth: Union[ - TableauAuth, PersonalAccessTokenAuth - ] = self.config.get_tableau_auth(self.site_id) - self.server.auth.sign_in(tableau_auth) + # Sign-in again may not be enough because Tableau sometimes caches invalid sessions + # so we need to recreate the Tableau Server object + self.server = self.config.make_tableau_client(self.site_id) @property def site_content_url(self) -> Optional[str]: From 66df362c0f7f10f5f0230054977410c3f1eb688a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 20 Dec 2024 09:57:53 +0100 Subject: [PATCH 02/49] fix(dagster): support dagster v1.9.6 (#12189) --- .../src/datahub_dagster_plugin/client/dagster_generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py index 2fdd0a41edf6cb..a87f490f2d947e 100644 --- a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py +++ b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py @@ -522,7 +522,7 @@ def generate_datajob( # Also, add datahub inputs/outputs if present in input/output metatdata. 
for input_def_snap in op_def_snap.input_def_snaps: job_property_bag[f"input.{input_def_snap.name}"] = str( - input_def_snap._asdict() + input_def_snap.__dict__ ) if Constant.DATAHUB_INPUTS in input_def_snap.metadata: datajob.inlets.extend( @@ -533,7 +533,7 @@ def generate_datajob( for output_def_snap in op_def_snap.output_def_snaps: job_property_bag[f"output_{output_def_snap.name}"] = str( - output_def_snap._asdict() + output_def_snap.__dict__ ) if ( Constant.DATAHUB_OUTPUTS in output_def_snap.metadata From 42d4254cdcc13b10e4955bfabff83bf09e56c0dd Mon Sep 17 00:00:00 2001 From: kevinkarchacryl Date: Fri, 20 Dec 2024 04:30:59 -0500 Subject: [PATCH 03/49] fix(graphql): add suspended to corpuserstatus (#12185) --- datahub-graphql-core/src/main/resources/entity.graphql | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index 926cd256a5c5a4..e086273068ee53 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -3838,6 +3838,11 @@ enum CorpUserStatus { A User that has been provisioned and logged in """ ACTIVE + + """ + A user that has been suspended + """ + SUSPENDED } union ResolvedActor = CorpUser | CorpGroup From f4f9bd3bca62beb15741493b11003642cd5a6889 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 20 Dec 2024 17:45:43 +0530 Subject: [PATCH 04/49] =?UTF-8?q?feat(ingest/snowflake):=20include=20exter?= =?UTF-8?q?nal=20table=20ddl=20lineage=20for=20queries=E2=80=A6=20(#12179)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../source/snowflake/snowflake_lineage_v2.py | 55 ++----------------- .../source/snowflake/snowflake_queries.py | 3 - .../source/snowflake/snowflake_schema_gen.py | 54 +++++++++++++++++- .../source/snowflake/snowflake_v2.py | 51 ++++++++--------- .../source_report/ingestion_stage.py | 1 + 5 files changed, 80 insertions(+), 84 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index c769c6705ac3f6..69f28a0e6e595a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -265,64 +265,17 @@ def _populate_external_upstreams(self, discovered_tables: List[str]) -> None: with PerfTimer() as timer: self.report.num_external_table_edges_scanned = 0 - for ( - known_lineage_mapping - ) in self._populate_external_lineage_from_copy_history(discovered_tables): - self.sql_aggregator.add(known_lineage_mapping) - logger.info( - "Done populating external lineage from copy history. " - f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far." - ) - - for ( - known_lineage_mapping - ) in self._populate_external_lineage_from_show_query(discovered_tables): - self.sql_aggregator.add(known_lineage_mapping) - - logger.info( - "Done populating external lineage from show external tables. " - f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far." - ) + for entry in self._get_copy_history_lineage(discovered_tables): + self.sql_aggregator.add(entry) + logger.info("Done populating external lineage from copy history. 
") self.report.external_lineage_queries_secs = timer.elapsed_seconds() - # Handles the case for explicitly created external tables. - # NOTE: Snowflake does not log this information to the access_history table. - def _populate_external_lineage_from_show_query( - self, discovered_tables: List[str] - ) -> Iterable[KnownLineageMapping]: - external_tables_query: str = SnowflakeQuery.show_external_tables() - try: - for db_row in self.connection.query(external_tables_query): - key = self.identifiers.get_dataset_identifier( - db_row["name"], db_row["schema_name"], db_row["database_name"] - ) - - if key not in discovered_tables: - continue - if db_row["location"].startswith("s3://"): - yield KnownLineageMapping( - upstream_urn=make_s3_urn_for_lineage( - db_row["location"], self.config.env - ), - downstream_urn=self.identifiers.gen_dataset_urn(key), - ) - self.report.num_external_table_edges_scanned += 1 - - self.report.num_external_table_edges_scanned += 1 - except Exception as e: - logger.debug(e, exc_info=e) - self.structured_reporter.warning( - "Error populating external table lineage from Snowflake", - exc=e, - ) - self.report_status(EXTERNAL_LINEAGE, False) - # Handles the case where a table is populated from an external stage/s3 location via copy. # Eg: copy into category_english from @external_s3_stage; # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv'; # NOTE: Snowflake does not log this information to the access_history table. - def _populate_external_lineage_from_copy_history( + def _get_copy_history_lineage( self, discovered_tables: List[str] ) -> Iterable[KnownLineageMapping]: query: str = SnowflakeQuery.copy_lineage_history( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 2d2bdc50467c64..174aad0bddd4a8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -247,9 +247,6 @@ def get_workunits_internal( for entry in self.fetch_copy_history(): queries.append(entry) - # TODO: Add "show external tables" lineage to the main schema extractor. - # Because it's not a time-based thing, it doesn't really make sense in the snowflake-queries extractor. 
- with self.report.query_log_fetch_timer: for entry in self.fetch_query_log(): queries.append(entry) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index bc64693b6a1084..4b72b09fafe2dd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -16,6 +16,7 @@ ClassificationHandler, classification_workunit_processor, ) +from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage from datahub.ingestion.source.common.subtypes import ( DatasetContainerSubTypes, DatasetSubTypes, @@ -35,6 +36,7 @@ ) from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report from datahub.ingestion.source.snowflake.snowflake_schema import ( SCHEMA_PARALLELISM, @@ -65,6 +67,7 @@ get_domain_wu, ) from datahub.ingestion.source_report.ingestion_stage import ( + EXTERNAL_TABLE_DDL_LINEAGE, METADATA_EXTRACTION, PROFILING, ) @@ -96,7 +99,10 @@ TimeType, ) from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties -from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator +from datahub.sql_parsing.sql_parsing_aggregator import ( + KnownLineageMapping, + SqlParsingAggregator, +) from datahub.utilities.registries.domain_registry import DomainRegistry from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor @@ -180,7 +186,8 @@ def __init__( # These are populated as side-effects of get_workunits_internal. self.databases: List[SnowflakeDatabase] = [] - self.aggregator: Optional[SqlParsingAggregator] = aggregator + + self.aggregator = aggregator def get_connection(self) -> SnowflakeConnection: return self.connection @@ -212,6 +219,19 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION) yield from self._process_database(snowflake_db) + self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE) + discovered_tables: List[str] = [ + self.identifiers.get_dataset_identifier( + table_name, schema.name, db.name + ) + for db in self.databases + for schema in db.schemas + for table_name in schema.tables + ] + if self.aggregator: + for entry in self._external_tables_ddl_lineage(discovered_tables): + self.aggregator.add(entry) + except SnowflakePermissionError as e: self.structured_reporter.failure( GENERIC_PERMISSION_ERROR_KEY, @@ -1082,3 +1102,33 @@ def get_fk_constraints_for_table( # Access to table but none of its constraints - is this possible ? return constraints.get(table_name, []) + + # Handles the case for explicitly created external tables. + # NOTE: Snowflake does not log this information to the access_history table. 
+ def _external_tables_ddl_lineage( + self, discovered_tables: List[str] + ) -> Iterable[KnownLineageMapping]: + external_tables_query: str = SnowflakeQuery.show_external_tables() + try: + for db_row in self.connection.query(external_tables_query): + key = self.identifiers.get_dataset_identifier( + db_row["name"], db_row["schema_name"], db_row["database_name"] + ) + + if key not in discovered_tables: + continue + if db_row["location"].startswith("s3://"): + yield KnownLineageMapping( + upstream_urn=make_s3_urn_for_lineage( + db_row["location"], self.config.env + ), + downstream_urn=self.identifiers.gen_dataset_urn(key), + ) + self.report.num_external_table_edges_scanned += 1 + + self.report.num_external_table_edges_scanned += 1 + except Exception as e: + self.structured_reporter.warning( + "External table ddl lineage extraction failed", + exc=e, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index e5883dd0349a3a..884e6c49f5b62a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -161,35 +161,32 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): # For database, schema, tables, views, etc self.data_dictionary = SnowflakeDataDictionary(connection=self.connection) self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None - self.aggregator: Optional[SqlParsingAggregator] = None - - if self.config.use_queries_v2 or self.config.include_table_lineage: - self.aggregator = self._exit_stack.enter_context( - SqlParsingAggregator( - platform=self.identifiers.platform, - platform_instance=self.config.platform_instance, - env=self.config.env, - graph=self.ctx.graph, - eager_graph_load=( - # If we're ingestion schema metadata for tables/views, then we will populate - # schemas into the resolver as we go. We only need to do a bulk fetch - # if we're not ingesting schema metadata as part of ingestion. - not ( - self.config.include_technical_schema - and self.config.include_tables - and self.config.include_views - ) - and not self.config.lazy_schema_resolver - ), - generate_usage_statistics=False, - generate_operations=False, - format_queries=self.config.format_sql_queries, - ) + + self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context( + SqlParsingAggregator( + platform=self.identifiers.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + graph=self.ctx.graph, + eager_graph_load=( + # If we're ingestion schema metadata for tables/views, then we will populate + # schemas into the resolver as we go. We only need to do a bulk fetch + # if we're not ingesting schema metadata as part of ingestion. 
+ not ( + self.config.include_technical_schema + and self.config.include_tables + and self.config.include_views + ) + and not self.config.lazy_schema_resolver + ), + generate_usage_statistics=False, + generate_operations=False, + format_queries=self.config.format_sql_queries, ) - self.report.sql_aggregator = self.aggregator.report + ) + self.report.sql_aggregator = self.aggregator.report if self.config.include_table_lineage: - assert self.aggregator is not None redundant_lineage_run_skip_handler: Optional[ RedundantLineageRunSkipHandler ] = None @@ -487,8 +484,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: databases = schema_extractor.databases - # TODO: The checkpoint state for stale entity detection can be committed here. - if self.config.shares: yield from SnowflakeSharesHandler( self.config, self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index 4308b405e46e37..92407eaae6e901 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -14,6 +14,7 @@ USAGE_EXTRACTION_INGESTION = "Usage Extraction Ingestion" USAGE_EXTRACTION_OPERATIONAL_STATS = "Usage Extraction Operational Stats" USAGE_EXTRACTION_USAGE_AGGREGATION = "Usage Extraction Usage Aggregation" +EXTERNAL_TABLE_DDL_LINEAGE = "External table DDL Lineage" QUERIES_EXTRACTION = "Queries Extraction" PROFILING = "Profiling" From 157013949e32dc664eb85127ca3b3c78c936e88f Mon Sep 17 00:00:00 2001 From: deepgarg-visa <149145061+deepgarg-visa@users.noreply.github.com> Date: Fri, 20 Dec 2024 21:42:10 +0530 Subject: [PATCH 05/49] fix(gms): Change names of charts in Analytics (#12192) --- .../datahub/graphql/analytics/resolver/GetChartsResolver.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java index 197ac87c1e22d8..d9b8008d46286a 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java @@ -125,7 +125,7 @@ private AnalyticsChart getTopUsersChart(OperationContext opContext) { final DateRange trailingMonthDateRange = dateUtil.getTrailingMonthDateRange(); final List columns = ImmutableList.of("Name", "Title", "Email"); - final String topUsersTitle = "Top Users"; + final String topUsersTitle = "Top Users (Last 30 Days)"; final List topUserRows = _analyticsService.getTopNTableChart( _analyticsService.getUsageIndexName(), @@ -198,7 +198,7 @@ private Row buildNewUsersRow(@Nonnull final SearchEntity entity) { private AnalyticsChart getNewUsersChart(OperationContext opContext) { try { final List columns = ImmutableList.of("Name", "Title", "Email"); - final String newUsersTitle = "New Users"; + final String newUsersTitle = "Active Users (Last 30 Days)"; final SearchResult result = searchForNewUsers(opContext); final List newUserRows = new ArrayList<>(); for (SearchEntity entity : result.getEntities()) { From e52a4deba8a6d436093257437cb3ae5d6148e4f8 Mon Sep 17 00:00:00 2001 From: skrydal Date: Fri, 20 Dec 2024 17:41:18 +0100 Subject: [PATCH 06/49] fix(ingest/databricks): Fix profiling (#12060) --- 
.../src/datahub/emitter/rest_emitter.py | 17 +- .../auto_ensure_aspect_size.py | 96 +++++ .../datahub/ingestion/source/unity/source.py | 4 + .../source_helpers/test_ensure_aspect_size.py | 346 ++++++++++++++++++ 4 files changed, 462 insertions(+), 1 deletion(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py create mode 100644 metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index e2bc14925ad383..675717b5ec4829 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -291,6 +291,7 @@ def emit_mcps( mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]], async_flag: Optional[bool] = None, ) -> int: + logger.debug("Attempting to emit batch mcps") url = f"{self._gms_server}/aspects?action=ingestProposalBatch" for mcp in mcps: ensure_has_system_metadata(mcp) @@ -303,15 +304,22 @@ def emit_mcps( current_chunk_size = INGEST_MAX_PAYLOAD_BYTES for mcp_obj in mcp_objs: mcp_obj_size = len(json.dumps(mcp_obj)) + logger.debug( + f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}" + ) if ( mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH ): + logger.debug("Decided to create new chunk") mcp_obj_chunks.append([]) current_chunk_size = 0 mcp_obj_chunks[-1].append(mcp_obj) current_chunk_size += mcp_obj_size + logger.debug( + f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks" + ) for mcp_obj_chunk in mcp_obj_chunks: # TODO: We're calling json.dumps on each MCP object twice, once to estimate @@ -338,8 +346,15 @@ def emit_usage(self, usageStats: UsageAggregation) -> None: def _emit_generic(self, url: str, payload: str) -> None: curl_command = make_curl_command(self._session, "POST", url, payload) + payload_size = len(payload) + if payload_size > INGEST_MAX_PAYLOAD_BYTES: + # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail + logger.warning( + f"Apparent payload size exceeded {INGEST_MAX_PAYLOAD_BYTES}, might fail with an exception due to the size" + ) logger.debug( - "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s", + "Attempting to emit aspect (size: %s) to DataHub GMS; using curl equivalent to:\n%s", + payload_size, curl_command, ) try: diff --git a/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py b/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py new file mode 100644 index 00000000000000..559f0b77f59dfa --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py @@ -0,0 +1,96 @@ +import json +import logging +from typing import Iterable, List + +from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES +from datahub.emitter.serialization_helper import pre_json_transform +from datahub.ingestion.api.source import SourceReport +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.metadata.schema_classes import ( + DatasetProfileClass, + SchemaFieldClass, + SchemaMetadataClass, +) + +logger = logging.getLogger(__name__) + + +class EnsureAspectSizeProcessor: + def __init__( + self, report: 
SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES + ): + self.report = report + self.payload_constraint = payload_constraint + + def ensure_dataset_profile_size( + self, dataset_urn: str, profile: DatasetProfileClass + ) -> None: + """ + This is quite arbitrary approach to ensuring dataset profile aspect does not exceed allowed size, might be adjusted + in the future + """ + sample_fields_size = 0 + if profile.fieldProfiles: + logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}") + for field in profile.fieldProfiles: + if field.sampleValues: + values_len = 0 + for value in field.sampleValues: + if value: + values_len += len(value) + logger.debug( + f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}" + ) + if sample_fields_size + values_len > self.payload_constraint: + field.sampleValues = [] + self.report.warning( + title="Dataset profile truncated due to size constraint", + message="Dataset profile contained too much data and would have caused ingestion to fail", + context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints", + ) + else: + sample_fields_size += values_len + else: + logger.debug(f"Field {field.fieldPath} has no sample values") + + def ensure_schema_metadata_size( + self, dataset_urn: str, schema: SchemaMetadataClass + ) -> None: + """ + This is quite arbitrary approach to ensuring schema metadata aspect does not exceed allowed size, might be adjusted + in the future + """ + total_fields_size = 0 + logger.debug(f"Amount of schema fields: {len(schema.fields)}") + accepted_fields: List[SchemaFieldClass] = [] + for field in schema.fields: + field_size = len(json.dumps(pre_json_transform(field.to_obj()))) + logger.debug(f"Field {field.fieldPath} takes total {field_size}") + if total_fields_size + field_size < self.payload_constraint: + accepted_fields.append(field) + total_fields_size += field_size + else: + self.report.warning( + title="Schema truncated due to size constraint", + message="Dataset schema contained too much data and would have caused ingestion to fail", + context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints", + ) + + schema.fields = accepted_fields + + def ensure_aspect_size( + self, + stream: Iterable[MetadataWorkUnit], + ) -> Iterable[MetadataWorkUnit]: + """ + We have hard limitation of aspect size being 16 MB. Some aspects can exceed that value causing an exception + on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects. 
+ """ + for wu in stream: + logger.debug(f"Ensuring size of workunit: {wu.id}") + + if schema := wu.get_aspect_of_type(SchemaMetadataClass): + self.ensure_schema_metadata_size(wu.get_urn(), schema) + elif profile := wu.get_aspect_of_type(DatasetProfileClass): + self.ensure_dataset_profile_size(wu.get_urn(), profile) + yield wu diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index 9d9a746580f939..7bfa7fdb28aaf8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -26,6 +26,9 @@ gen_containers, ) from datahub.emitter.sql_parsing_builder import SqlParsingBuilder +from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import ( + EnsureAspectSizeProcessor, +) from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SupportStatus, @@ -260,6 +263,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: StaleEntityRemovalHandler.create( self, self.config, self.ctx ).workunit_processor, + EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size, ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py b/metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py new file mode 100644 index 00000000000000..bdf1e0a2e0e860 --- /dev/null +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py @@ -0,0 +1,346 @@ +import json +import time +from unittest.mock import patch + +import pytest +from freezegun.api import freeze_time + +from datahub.emitter.aspect import JSON_CONTENT_TYPE +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES +from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import ( + EnsureAspectSizeProcessor, +) +from datahub.ingestion.api.source import SourceReport +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent +from datahub.metadata.schema_classes import ( + ChangeTypeClass, + DatasetFieldProfileClass, + DatasetProfileClass, + DatasetSnapshotClass, + GenericAspectClass, + MetadataChangeProposalClass, + NumberTypeClass, + OtherSchemaClass, + SchemaFieldClass, + SchemaFieldDataTypeClass, + SchemaMetadataClass, + StatusClass, + StringTypeClass, + SubTypesClass, +) + + +@pytest.fixture +def processor(): + return EnsureAspectSizeProcessor(SourceReport()) + + +def too_big_schema_metadata() -> SchemaMetadataClass: + fields = [ + SchemaFieldClass( + "aaaa", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ), + SchemaFieldClass( + "bbbb", + nativeDataType="string", + type=SchemaFieldDataTypeClass(type=StringTypeClass()), + ), + SchemaFieldClass( + "cccc", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ), + ] + # simple int type field takes ~160 bytes in JSON representation, below is to assure we exceed the threshold + for f_no in range(1000): + fields.append( + SchemaFieldClass( + fieldPath=f"t{f_no}", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + description=20000 * "a", + ) + ) + + # adding small field to check whether it will still be present in the output + fields.append( + SchemaFieldClass( 
+ "dddd", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ) + ) + return SchemaMetadataClass( + schemaName="abcdef", + version=1, + platform="s3", + hash="ABCDE1234567890", + platformSchema=OtherSchemaClass(rawSchema="aaa"), + fields=fields, + ) + + +def proper_schema_metadata() -> SchemaMetadataClass: + fields = [ + SchemaFieldClass( + "aaaa", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ), + SchemaFieldClass( + "bbbb", + nativeDataType="string", + type=SchemaFieldDataTypeClass(type=StringTypeClass()), + ), + SchemaFieldClass( + "cccc", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ), + ] + return SchemaMetadataClass( + schemaName="abcdef", + version=1, + platform="s3", + hash="ABCDE1234567890", + platformSchema=OtherSchemaClass(rawSchema="aaa"), + fields=fields, + ) + + +def proper_dataset_profile() -> DatasetProfileClass: + sample_values = [ + "23483295", + "234234", + "324234", + "12123", + "3150314", + "19231", + "211", + "93498", + "12837", + "73847", + "12434", + "33466", + "98785", + "4546", + "4547", + "342", + "11", + "34", + "444", + "38576", + ] + field_profiles = [ + DatasetFieldProfileClass(fieldPath="a", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="b", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="c", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="d", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="e", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="f", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="g", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="h", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="i", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="j", sampleValues=sample_values), + ] + return DatasetProfileClass( + timestampMillis=int(time.time()) * 1000, fieldProfiles=field_profiles + ) + + +@freeze_time("2023-01-02 00:00:00") +def test_ensure_size_of_proper_dataset_profile(processor): + profile = proper_dataset_profile() + orig_repr = json.dumps(profile.to_obj()) + processor.ensure_dataset_profile_size( + "urn:li:dataset:(s3, dummy_dataset, DEV)", profile + ) + assert orig_repr == json.dumps( + profile.to_obj() + ), "Aspect was modified in case where workunit processor should have been no-op" + + +@freeze_time("2023-01-02 00:00:00") +def test_ensure_size_of_too_big_schema_metadata(processor): + schema = too_big_schema_metadata() + assert len(schema.fields) == 1004 + + processor.ensure_schema_metadata_size( + "urn:li:dataset:(s3, dummy_dataset, DEV)", schema + ) + assert len(schema.fields) < 1004, "Schema has not been properly truncated" + assert schema.fields[-1].fieldPath == "dddd", "Small field was not added at the end" + # +100kb is completely arbitrary, but we are truncating the aspect based on schema fields size only, not total taken + # by other parameters of the aspect - it is reasonable approach though - schema fields is the only field in schema + # metadata which can be expected to grow out of control + assert ( + len(json.dumps(schema.to_obj())) < INGEST_MAX_PAYLOAD_BYTES + 100000 + ), "Aspect exceeded acceptable size" + + +@freeze_time("2023-01-02 00:00:00") +def test_ensure_size_of_proper_schema_metadata(processor): + schema = proper_schema_metadata() + orig_repr = json.dumps(schema.to_obj()) + processor.ensure_schema_metadata_size( + "urn:li:dataset:(s3, dummy_dataset, 
DEV)", schema + ) + assert orig_repr == json.dumps( + schema.to_obj() + ), "Aspect was modified in case where workunit processor should have been no-op" + + +@freeze_time("2023-01-02 00:00:00") +def test_ensure_size_of_too_big_dataset_profile(processor): + profile = proper_dataset_profile() + big_field = DatasetFieldProfileClass( + fieldPath="big", + sampleValues=20 * [(int(INGEST_MAX_PAYLOAD_BYTES / 20) - 10) * "a"], + ) + assert profile.fieldProfiles + profile.fieldProfiles.insert(4, big_field) + processor.ensure_dataset_profile_size( + "urn:li:dataset:(s3, dummy_dataset, DEV)", profile + ) + + expected_profile = proper_dataset_profile() + reduced_field = DatasetFieldProfileClass( + fieldPath="big", + sampleValues=[], + ) + assert expected_profile.fieldProfiles + expected_profile.fieldProfiles.insert(4, reduced_field) + assert json.dumps(profile.to_obj()) == json.dumps( + expected_profile.to_obj() + ), "Field 'big' was not properly removed from aspect due to its size" + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_triggered_by_data_profile_aspect( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + ret = [ # noqa: F841 + *processor.ensure_aspect_size( + [ + MetadataChangeProposalWrapper( + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspect=proper_dataset_profile(), + ).as_workunit() + ] + ) + ] + ensure_dataset_profile_size_mock.assert_called_once() + ensure_schema_metadata_size_mock.assert_not_called() + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_triggered_by_data_profile_aspect_mcpc( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + profile_aspect = proper_dataset_profile() + mcpc = MetadataWorkUnit( + id="test", + mcp_raw=MetadataChangeProposalClass( + entityType="dataset", + changeType=ChangeTypeClass.UPSERT, + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspectName=DatasetProfileClass.ASPECT_NAME, + aspect=GenericAspectClass( + value=json.dumps(profile_aspect.to_obj()).encode(), + contentType=JSON_CONTENT_TYPE, + ), + ), + ) + ret = [*processor.ensure_aspect_size([mcpc])] # noqa: F841 + ensure_dataset_profile_size_mock.assert_called_once() + ensure_schema_metadata_size_mock.assert_not_called() + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_triggered_by_data_profile_aspect_mce( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + snapshot = DatasetSnapshotClass( + urn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspects=[proper_schema_metadata()], + ) + mce = MetadataWorkUnit( + id="test", mce=MetadataChangeEvent(proposedSnapshot=snapshot) + ) + ret = 
[*processor.ensure_aspect_size([mce])] # noqa: F841 + ensure_schema_metadata_size_mock.assert_called_once() + ensure_dataset_profile_size_mock.assert_not_called() + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_triggered_by_schema_metadata_aspect( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + ret = [ # noqa: F841 + *processor.ensure_aspect_size( + [ + MetadataChangeProposalWrapper( + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspect=proper_schema_metadata(), + ).as_workunit() + ] + ) + ] + ensure_schema_metadata_size_mock.assert_called_once() + ensure_dataset_profile_size_mock.assert_not_called() + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_not_triggered_by_unhandled_aspects( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + ret = [ # noqa: F841 + *processor.ensure_aspect_size( + [ + MetadataChangeProposalWrapper( + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspect=StatusClass(removed=False), + ).as_workunit(), + MetadataChangeProposalWrapper( + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspect=SubTypesClass(typeNames=["table"]), + ).as_workunit(), + ] + ) + ] + ensure_schema_metadata_size_mock.assert_not_called() + ensure_dataset_profile_size_mock.assert_not_called() From 98c056d569d4e5f2fa031a5a3ac8f3009ee49567 Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Sat, 21 Dec 2024 00:36:57 +0530 Subject: [PATCH 07/49] refactor(ingest/tableau): mark the `fetch_size` configuration as deprecated (#12126) --- .../ingestion/source/tableau/tableau.py | 18 +++++++++++------- .../integration/tableau/test_tableau_ingest.py | 1 + 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index fadcb8ff8f3966..984cf9357199d6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -49,6 +49,7 @@ DatasetSourceConfigMixin, ) from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated +from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ( ContainerKey, @@ -380,11 +381,6 @@ class TableauConfig( description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.", ) - fetch_size: int = Field( - default=250, - description="Specifies the number of records to retrieve in each batch during a query execution.", - ) - # We've found that even with a small workbook page size (e.g. 
10), the Tableau API often # returns warnings like this: # { @@ -499,6 +495,10 @@ class TableauConfig( "This can only be used with ingest_tags enabled as it will overwrite tags entered from the UI.", ) + _fetch_size = pydantic_removed_field( + "fetch_size", + ) + # pre = True because we want to take some decision before pydantic initialize the configuration to default values @root_validator(pre=True) def projects_backward_compatibility(cls, values: Dict) -> Dict: @@ -1147,7 +1147,7 @@ def get_connection_object_page( connection_type: str, query_filter: str, current_cursor: Optional[str], - fetch_size: int = 250, + fetch_size: int, retry_on_auth_error: bool = True, retries_remaining: Optional[int] = None, ) -> Tuple[dict, Optional[str], int]: @@ -1344,7 +1344,11 @@ def get_connection_objects( connection_type=connection_type, query_filter=filter_, current_cursor=current_cursor, - fetch_size=self.config.fetch_size, + # `filter_page` contains metadata object IDs (e.g., Project IDs, Field IDs, Sheet IDs, etc.). + # The number of IDs is always less than or equal to page_size. + # If the IDs are primary keys, the number of metadata objects to load matches the number of records to return. + # In our case, mostly, the IDs are primary key, therefore, fetch_size is set equal to page_size. + fetch_size=page_size, ) yield from connection_objects.get(c.NODES) or [] diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 4b2ac96931b950..fa00eaef9ccabb 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -1324,6 +1324,7 @@ def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph): query_filter=mock.MagicMock(), current_cursor=None, retries_remaining=1, + fetch_size=10, ) warnings = list(reporter.warnings) From 3c3d0322fe9608ccf7cbaadfd83f6f7f0e7afeff Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Sat, 21 Dec 2024 01:27:34 +0530 Subject: [PATCH 08/49] test(ingest/tableau): add test for extract_project_hierarchy scenario (#12079) --- .../tableau/test_tableau_ingest.py | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index fa00eaef9ccabb..c3a8880bf20a09 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -27,6 +27,7 @@ from datahub.ingestion.source.tableau import tableau_constant as c from datahub.ingestion.source.tableau.tableau import ( TableauConfig, + TableauProject, TableauSiteSource, TableauSource, TableauSourceReport, @@ -1342,6 +1343,82 @@ def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) +@pytest.mark.parametrize( + "extract_project_hierarchy, allowed_projects", + [ + (True, ["project1", "project4", "project3"]), + (False, ["project1", "project4"]), + ], +) +def test_extract_project_hierarchy(extract_project_hierarchy, allowed_projects): + context = PipelineContext(run_id="0", pipeline_name="test_tableau") + + config_dict = config_source_default.copy() + + del config_dict["stateful_ingestion"] + del config_dict["projects"] + + config_dict["project_pattern"] = { + "allow": ["project1", "project4"], + "deny": ["project2"], + } + + 
config_dict["extract_project_hierarchy"] = extract_project_hierarchy + + config = TableauConfig.parse_obj(config_dict) + + site_source = TableauSiteSource( + config=config, + ctx=context, + platform="tableau", + site=SiteItem(name="Site 1", content_url="site1"), + site_id="site1", + report=TableauSourceReport(), + server=Server("https://test-tableau-server.com"), + ) + + all_project_map: Dict[str, TableauProject] = { + "p1": TableauProject( + id="1", + name="project1", + path=[], + parent_id=None, + parent_name=None, + description=None, + ), + "p2": TableauProject( + id="2", + name="project2", + path=[], + parent_id="1", + parent_name="project1", + description=None, + ), + "p3": TableauProject( + id="3", + name="project3", + path=[], + parent_id="1", + parent_name="project1", + description=None, + ), + "p4": TableauProject( + id="4", + name="project4", + path=[], + parent_id=None, + parent_name=None, + description=None, + ), + } + + site_source._init_tableau_project_registry(all_project_map) + + assert allowed_projects == [ + project.name for project in site_source.tableau_project_registry.values() + ] + + @pytest.mark.integration def test_connection_report_test(requests_mock): server_info_response = """ From 667fa8fccec40037c55ec1c99a35777dbc0e5eaf Mon Sep 17 00:00:00 2001 From: "nicholas.fwang" Date: Sat, 21 Dec 2024 04:59:44 +0900 Subject: [PATCH 09/49] docs(structured properties): fix entityTypes in creating structured property (#12187) --- docs/api/tutorials/structured-properties.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/tutorials/structured-properties.md b/docs/api/tutorials/structured-properties.md index 95c89424e9ca7a..2caa015e206595 100644 --- a/docs/api/tutorials/structured-properties.md +++ b/docs/api/tutorials/structured-properties.md @@ -73,7 +73,7 @@ mutation createStructuredProperty { {numberValue: 365, description:"Use this for non-sensitive data that can be retained for longer"} ], cardinality: SINGLE, - entityTypes: ["urn:li:entityType:dataset", "urn:li:entityType:dataFlow"], + entityTypes: ["urn:li:entityType:datahub.dataset", "urn:li:entityType:datahub.dataFlow"], } ) { urn From 327c6f911ada269d8ad9554bceed8aaf16568295 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Fri, 20 Dec 2024 15:59:07 -0600 Subject: [PATCH 10/49] chore(bump): bump alpine and dockerize (#12184) --- .../docker-custom-build-and-push/action.yml | 3 +- .github/workflows/docker-postgres-setup.yml | 2 +- .github/workflows/docker-unified.yml | 46 +++++++++---------- docker/datahub-gms/Dockerfile | 4 +- docker/datahub-mae-consumer/Dockerfile | 4 +- docker/datahub-mce-consumer/Dockerfile | 4 +- docker/datahub-upgrade/Dockerfile | 4 +- docker/elasticsearch-setup/Dockerfile | 4 +- docker/mysql-setup/Dockerfile | 4 +- docker/postgres-setup/Dockerfile | 4 +- 10 files changed, 40 insertions(+), 39 deletions(-) diff --git a/.github/actions/docker-custom-build-and-push/action.yml b/.github/actions/docker-custom-build-and-push/action.yml index ccaff510c120aa..cc2c2bd86416d7 100644 --- a/.github/actions/docker-custom-build-and-push/action.yml +++ b/.github/actions/docker-custom-build-and-push/action.yml @@ -97,10 +97,11 @@ runs: cache-to: | type=inline - name: Upload image locally for testing (if not publishing) - uses: ishworkh/docker-image-artifact-upload@v1 + uses: ishworkh/container-image-artifact-upload@v2.0.0 if: ${{ inputs.publish != 'true' }} with: image: ${{ steps.single_tag.outputs.SINGLE_TAG }} + retention_days: "2" 
# Code for building multi-platform images and pushing to Docker Hub. - name: Set up QEMU diff --git a/.github/workflows/docker-postgres-setup.yml b/.github/workflows/docker-postgres-setup.yml index 956f3f7b1c3903..c028bfb55d48d5 100644 --- a/.github/workflows/docker-postgres-setup.yml +++ b/.github/workflows/docker-postgres-setup.yml @@ -52,7 +52,7 @@ jobs: with: images: | acryldata/datahub-postgres-setup - tags: ${{ needs.setup.outputs.tag }} + image_tag: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish == 'true' }} diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 49dd26e1cd27e3..16a2d29e9fd85e 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -186,7 +186,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_GMS_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -257,7 +257,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_MAE_CONSUMER_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -328,7 +328,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_MCE_CONSUMER_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -399,7 +399,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -472,7 +472,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: actions/checkout@v4 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_FRONTEND_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -533,7 +533,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_KAFKA_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -594,7 +594,7 
@@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_MYSQL_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -655,7 +655,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -727,7 +727,7 @@ jobs: - name: Check out the repo uses: acryldata/sane-checkout-action@v3 - name: Download Base Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && needs.setup.outputs.ingestion_base_change == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }} @@ -775,7 +775,7 @@ jobs: - name: Check out the repo uses: acryldata/sane-checkout-action@v3 - name: Download Base Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && needs.setup.outputs.ingestion_base_change == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }} @@ -836,7 +836,7 @@ jobs: if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish =='true' }} run: ./gradlew :metadata-ingestion:codegen - name: Download Base Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && needs.setup.outputs.ingestion_base_change == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }} @@ -883,7 +883,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image Slim Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} @@ -937,7 +937,7 @@ jobs: if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} run: ./gradlew :metadata-ingestion:codegen - name: Download Base Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && 
needs.setup.outputs.ingestion_base_change == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }} @@ -982,7 +982,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image Full Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.datahub_ingestion_full_build.outputs.needs_artifact_download == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} @@ -1079,47 +1079,47 @@ jobs: - name: Disk Check run: df -h . && docker images - name: Download GMS image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.gms_build.result == 'success' }} with: image: ${{ env.DATAHUB_GMS_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download Frontend image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.frontend_build.result == 'success' }} with: image: ${{ env.DATAHUB_FRONTEND_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download Kafka Setup image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.kafka_setup_build.result == 'success' }} with: image: ${{ env.DATAHUB_KAFKA_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download Mysql Setup image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.mysql_setup_build.result == 'success' }} with: image: ${{ env.DATAHUB_MYSQL_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download Elastic Setup image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.elasticsearch_setup_build.result == 'success' }} with: image: ${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download MCE Consumer image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.mce_consumer_build.result == 'success' }} with: image: ${{ env.DATAHUB_MCE_CONSUMER_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download MAE Consumer image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.mae_consumer_build.result == 'success' }} with: image: ${{ env.DATAHUB_MAE_CONSUMER_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download upgrade image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( 
needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.datahub_upgrade_build.result == 'success' }} with: image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download datahub-ingestion-slim image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' && needs.datahub_ingestion_slim_build.result == 'success' }} with: image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} diff --git a/docker/datahub-gms/Dockerfile b/docker/datahub-gms/Dockerfile index b15bf3c6f9f17b..47b10535f8deea 100644 --- a/docker/datahub-gms/Dockerfile +++ b/docker/datahub-gms/Dockerfile @@ -6,12 +6,12 @@ ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com ARG MAVEN_CENTRAL_REPO_URL=https://repo1.maven.org/maven2 -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/datahub-mae-consumer/Dockerfile b/docker/datahub-mae-consumer/Dockerfile index 6edaa29ee1a8bb..74375072761d89 100644 --- a/docker/datahub-mae-consumer/Dockerfile +++ b/docker/datahub-mae-consumer/Dockerfile @@ -6,12 +6,12 @@ ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com ARG MAVEN_CENTRAL_REPO_URL=https://repo1.maven.org/maven2 -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/datahub-mce-consumer/Dockerfile b/docker/datahub-mce-consumer/Dockerfile index 1eb56633c561e6..3adef53cd06068 100644 --- a/docker/datahub-mce-consumer/Dockerfile +++ b/docker/datahub-mce-consumer/Dockerfile @@ -6,12 +6,12 @@ ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com ARG MAVEN_CENTRAL_REPO_URL=https://repo1.maven.org/maven2 -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/datahub-upgrade/Dockerfile b/docker/datahub-upgrade/Dockerfile index 3d59a903414b1a..a8ef4e8034fdd5 100644 --- a/docker/datahub-upgrade/Dockerfile +++ b/docker/datahub-upgrade/Dockerfile @@ -6,12 +6,12 @@ ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com ARG MAVEN_CENTRAL_REPO_URL=https://repo1.maven.org/maven2 -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/elasticsearch-setup/Dockerfile b/docker/elasticsearch-setup/Dockerfile index 
4e64dcbc1e452c..1a6fe5bee6c840 100644
--- a/docker/elasticsearch-setup/Dockerfile
+++ b/docker/elasticsearch-setup/Dockerfile
@@ -6,11 +6,11 @@ ARG APP_ENV=prod
 # Defining custom repo urls for use in enterprise environments. Re-used between stages below.
 ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine

-FROM golang:1-alpine3.20 AS binary
+FROM golang:1-alpine3.21 AS binary

 ARG ALPINE_REPO_URL

-ENV DOCKERIZE_VERSION=v0.6.1
+ENV DOCKERIZE_VERSION=v0.9.1
 WORKDIR /go/src/github.com/jwilder

 # Optionally set corporate mirror for apk
diff --git a/docker/mysql-setup/Dockerfile b/docker/mysql-setup/Dockerfile
index b0ca45ad8f6f24..8a2d42bc233180 100644
--- a/docker/mysql-setup/Dockerfile
+++ b/docker/mysql-setup/Dockerfile
@@ -1,11 +1,11 @@
 # Defining custom repo urls for use in enterprise environments. Re-used between stages below.
 ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine

-FROM golang:1-alpine3.20 AS binary
+FROM golang:1-alpine3.21 AS binary

 ARG ALPINE_REPO_URL

-ENV DOCKERIZE_VERSION=v0.6.1
+ENV DOCKERIZE_VERSION=v0.9.1
 WORKDIR /go/src/github.com/jwilder

 # Optionally set corporate mirror for apk
diff --git a/docker/postgres-setup/Dockerfile b/docker/postgres-setup/Dockerfile
index e145456e807d4d..31e9687cea15e8 100644
--- a/docker/postgres-setup/Dockerfile
+++ b/docker/postgres-setup/Dockerfile
@@ -1,11 +1,11 @@
 # Defining custom repo urls for use in enterprise environments. Re-used between stages below.
 ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine

-FROM golang:1-alpine3.20 AS binary
+FROM golang:1-alpine3.21 AS binary

 ARG ALPINE_REPO_URL

-ENV DOCKERIZE_VERSION=v0.6.1
+ENV DOCKERIZE_VERSION=v0.9.1
 WORKDIR /go/src/github.com/jwilder

 # Optionally set corporate mirror for apk

From f6c0cf34c075e078fe6cf3c2e18e6a8d711cc8db Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Fri, 20 Dec 2024 17:04:58 -0600
Subject: [PATCH 11/49] docs update: Update v_0_3_7.md (#12197)

Co-authored-by: Chris Collins
---
 docs/managed-datahub/release-notes/v_0_3_7.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/managed-datahub/release-notes/v_0_3_7.md b/docs/managed-datahub/release-notes/v_0_3_7.md
index 75f5ac21224c27..31302403ea9305 100644
--- a/docs/managed-datahub/release-notes/v_0_3_7.md
+++ b/docs/managed-datahub/release-notes/v_0_3_7.md
@@ -13,6 +13,12 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies

 ## Known Issues

+### v0.3.7.8
+ * Notes Feature
+   * Adding a Note to an entity makes that Note appear both in the Settings > Home Page list of announcements and on the entity's profile page.
+   * If more than 30 Notes are added to entities, announcements may not display correctly on the home page.
+   * Notes are only supported for Dataset and Column entities in this release.
+
 ### v0.3.7.7

 * Postgres regression, non-functional when using postgres

@@ -24,7 +30,9 @@ If you are using an older CLI/SDK version, then please upgrade it. 
This applies ### v0.3.7.8 +- Helm Chart Requirement: 1.4.157+ - [Postgres] Fix regression from MySQL fix in v0.3.7.7 +- [UI] Fix editing post on entity profile page becomes announcement ### v0.3.7.7 From 8e9fc20fb6ec57b547c97d433ec5f85b8a3efe9a Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Fri, 20 Dec 2024 20:00:09 -0600 Subject: [PATCH 12/49] feat(gradle): add quickstartPgDebug option (#12195) --- docker/build.gradle | 262 ++++++++++++++++++++++---------------------- 1 file changed, 131 insertions(+), 131 deletions(-) diff --git a/docker/build.gradle b/docker/build.gradle index 25e3dc12036ef9..7b36c0d9acdcf0 100644 --- a/docker/build.gradle +++ b/docker/build.gradle @@ -18,24 +18,131 @@ ext { ':datahub-upgrade', ':metadata-service:war', ] - quickstart_modules = backend_profile_modules + [ - ':metadata-jobs:mce-consumer-job', - ':metadata-jobs:mae-consumer-job', - ':datahub-frontend' + + python_services_modules = [] + + // Common configuration for all tasks + common_config = [ + captureContainersOutput: true, + captureContainersOutputToFiles: project.file('build/container-logs') ] - debug_modules = quickstart_modules - [':metadata-jobs:mce-consumer-job', - ':metadata-jobs:mae-consumer-job'] - compose_args = ['-f', compose_base] - debug_reloadable = [ - 'datahub-gms-debug', - 'system-update-debug', - 'frontend-debug' + // declarative task configuration + quickstart_configs = [ + 'quickstart': [ + profile: 'quickstart-consumers', + modules: python_services_modules + backend_profile_modules + [ + ':datahub-frontend', + ':metadata-jobs:mce-consumer-job', + ':metadata-jobs:mae-consumer-job', + ] + ], + 'quickstartDebug': [ + profile: 'debug', + modules: python_services_modules + backend_profile_modules + [':datahub-frontend'], + isDebug: true + ], + 'quickstartPg': [ + profile: 'quickstart-postgres', + modules: (backend_profile_modules - [':docker:mysql-setup']) + [ + ':docker:postgres-setup', + ':datahub-frontend' + ] + ], + 'quickstartPgDebug': [ + profile: 'debug-postgres', + modules: python_services_modules + (backend_profile_modules - [':docker:mysql-setup']) + [ + ':docker:postgres-setup', + ':datahub-frontend' + ], + isDebug: true + ], + 'quickstartSlim': [ + profile: 'quickstart-backend', + modules: backend_profile_modules + [':docker:datahub-ingestion'], + additionalEnv: [ + 'DATAHUB_ACTIONS_IMAGE': 'acryldata/datahub-ingestion', + 'ACTIONS_VERSION': "v${version}-slim", + 'ACTIONS_EXTRA_PACKAGES': 'acryl-datahub-actions[executor] acryl-datahub-actions', + 'ACTIONS_CONFIG': 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml', + 'DATAHUB_LOCAL_COMMON_ENV': "${rootProject.project(':metadata-integration:java:spark-lineage-legacy').projectDir}/spark-smoke-test/smoke-gms.env" + ] + ], + 'quickstartStorage': [ + profile: 'quickstart-storage', + preserveVolumes: true + ] ] - // Postgres - pg_quickstart_modules = quickstart_modules - [':docker:mysql-setup'] + [':docker:postgres-setup'] +} + +// Register all quickstart tasks +quickstart_configs.each { taskName, config -> + tasks.register(taskName) +} + +// Dynamically create all quickstart tasks and configurations +dockerCompose { + // Configure default settings that apply to all configurations + useComposeFiles = [compose_base] + projectName = project_name + projectNamePrefix = '' + buildBeforeUp = false + buildBeforePull = false + stopContainers = false + removeVolumes = false + + quickstart_configs.each { taskName, config -> + "${taskName}" { + 
isRequiredBy(tasks.named(taskName)) + if (config.profile) { + composeAdditionalArgs = ['--profile', config.profile] + } + + // Common environment variables + environment.put 'DATAHUB_VERSION', config.isDebug ? + System.getenv("DATAHUB_VERSION") ?: "v${version}" : + "v${version}" + environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' + environment.put "METADATA_TESTS_ENABLED", "true" + environment.put "DATAHUB_REPO", "${docker_registry}" + + // Additional environment variables if specified + if (config.additionalEnv) { + config.additionalEnv.each { key, value -> + environment.put key, value + } + } + + useComposeFiles = [compose_base] + projectName = project_name + projectNamePrefix = '' + buildBeforeUp = false + buildBeforePull = false + stopContainers = false + removeVolumes = false + + // Apply common configuration + common_config.each { key, value -> + delegate."${key}" = value + } + + // Apply additional task-specific configuration if specified + if (config.additionalConfig) { + config.additionalConfig.each { key, value -> + delegate."${key}" = value + } + } + } + } +} - revision = 1 // increment to trigger rebuild +// Configure dependencies for ComposeUp tasks +quickstart_configs.each { taskName, config -> + if (config.modules) { + tasks.getByName("${taskName}ComposeUp").dependsOn( + config.modules.collect { it + ":${config.isDebug ? 'dockerTagDebug' : 'dockerTag'}" } + ) + } } tasks.register('minDockerCompose2.20', Exec) { @@ -43,18 +150,11 @@ tasks.register('minDockerCompose2.20', Exec) { args '-c', 'echo -e "$(docker compose version --short)\n2.20"|sort --version-sort --check=quiet --reverse' } -tasks.register('quickstart') {} -tasks.register('quickstartSlim') {} -tasks.register('quickstartDebug') {} -tasks.register('quickstartPg') {} -tasks.register('quickstartStorage') {} - tasks.register('quickstartNuke') { doFirst { - dockerCompose.quickstart.removeVolumes = true - dockerCompose.quickstartPg.removeVolumes = true - dockerCompose.quickstartSlim.removeVolumes = true - dockerCompose.quickstartDebug.removeVolumes = true + quickstart_configs.each { taskName, config -> + dockerCompose."${taskName}".removeVolumes = !config.preserveVolumes + } } finalizedBy(tasks.withType(ComposeDownForced)) } @@ -63,117 +163,17 @@ tasks.register('quickstartDown') { finalizedBy(tasks.withType(ComposeDownForced)) } -dockerCompose { - quickstart { - isRequiredBy(tasks.named('quickstart')) - composeAdditionalArgs = ['--profile', 'quickstart-consumers'] - - environment.put 'DATAHUB_VERSION', "v${version}" - environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - captureContainersOutput = true - captureContainersOutputToFiles = project.file('build/container-logs') - } - - quickstartPg { - isRequiredBy(tasks.named('quickstartPg')) - composeAdditionalArgs = ['--profile', 'quickstart-postgres'] - - environment.put 'DATAHUB_VERSION', "v${version}" - environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - } - - /** - * The smallest disk footprint required for Spark integration tests - * - * No frontend, mae, mce, or other services - */ - quickstartSlim { - 
isRequiredBy(tasks.named('quickstartSlim')) - composeAdditionalArgs = ['--profile', 'quickstart-backend'] - - environment.put 'DATAHUB_VERSION', "v${version}" - environment.put "DATAHUB_ACTIONS_IMAGE", "acryldata/datahub-ingestion" - environment.put "ACTIONS_VERSION", "v${version}-slim" - environment.put "ACTIONS_EXTRA_PACKAGES", 'acryl-datahub-actions[executor] acryl-datahub-actions' - environment.put "ACTIONS_CONFIG", 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' - environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally - // disabled for spark-lineage smoke-test - environment.put 'DATAHUB_LOCAL_COMMON_ENV', "${rootProject.project(':metadata-integration:java:spark-lineage-legacy').projectDir}/spark-smoke-test/smoke-gms.env" - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - captureContainersOutput = true - captureContainersOutputToFiles = project.file('build/container-logs') - } - - quickstartDebug { - isRequiredBy(tasks.named('quickstartDebug')) - composeAdditionalArgs = ['--profile', 'debug'] - - if (System.getenv().containsKey("DATAHUB_VERSION")) { - environment.put 'DATAHUB_VERSION', System.getenv("DATAHUB_VERSION") - } - environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - } - - quickstartStorage { - isRequiredBy(tasks.named('quickstartStorage')) - composeAdditionalArgs = ['--profile', 'quickstart-storage'] - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - } -} -tasks.getByName('quickstartComposeUp').dependsOn( - quickstart_modules.collect { it + ':dockerTag' }) -tasks.getByName('quickstartPgComposeUp').dependsOn( - pg_quickstart_modules.collect { it + ':dockerTag' }) -tasks.getByName('quickstartSlimComposeUp').dependsOn( - ([':docker:datahub-ingestion'] + backend_profile_modules) - .collect { it + ':dockerTag' }) -tasks.getByName('quickstartDebugComposeUp').dependsOn( - debug_modules.collect { it + ':dockerTagDebug' } -) tasks.withType(ComposeUp).configureEach { shouldRunAfter('quickstartNuke') dependsOn tasks.named("minDockerCompose2.20") } task debugReload(type: Exec) { - def cmd = ['docker compose -p datahub --profile debug'] + compose_args + ['restart'] + debug_reloadable + def cmd = ['docker compose -p datahub --profile debug'] + ['-f', compose_base] + [ + 'restart', + 'datahub-gms-debug', + 'system-update-debug', + 'frontend-debug' + ] commandLine 'bash', '-c', cmd.join(" ") -} +} \ No newline at end of file From 0b4d96e95c50c3db1fdf8cb65954e1f423c17310 Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Sat, 21 Dec 2024 12:07:53 +0530 Subject: [PATCH 13/49] fix(ingest/powerbi): support comments in m-query grammar (#12177) --- .../powerbi/powerbi-lexical-grammar.rule | 18 ++++++++-- .../integration/powerbi/test_m_parser.py | 36 +++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule 
b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule index 51a0dff288558f..f237e2503317f2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule @@ -21,6 +21,11 @@ // | empty_string // | empty_string "," argument_list // - Added sql_string in any_literal +// - Added WS_INLINE? in field expression +// Added to ignore any comments +// %ignore WS // Ignore whitespace +// %ignore CPP_COMMENT // Ignore single-line comments +// %ignore C_COMMENT // Ignore multi-line comments lexical_unit: lexical_elements? @@ -245,6 +250,8 @@ operator_or_punctuator: "," | "=>" | ".." | "..." + | "{{" + | "}}" document: section_document | expression_document @@ -275,6 +282,7 @@ expression: logical_or_expression | if_expression | error_raising_expression | error_handling_expression + | outer_expression logical_or_expression: logical_and_expression @@ -376,6 +384,8 @@ sql_content: /(?:[^\"\\]|\\[\"]|\"\"|\#\(lf\))+/ sql_string: "\"" sql_content "\"" +outer_expression: "{{" expression "}}" + argument_list: WS_INLINE? expression | WS_INLINE? expression WS_INLINE? "," WS_INLINE? argument_list | WS_INLINE? sql_string @@ -409,7 +419,7 @@ record_expression: "[" field_list? "]" field_list: field | field "," field_list -field: field_name WS_INLINE? "=" WS_INLINE? expression +field: WS_INLINE? field_name WS_INLINE? "=" WS_INLINE? expression field_name: generalized_identifier | quoted_identifier @@ -621,4 +631,8 @@ any_literal: record_literal %import common.DIGIT %import common.LF %import common.CR -%import common.ESCAPED_STRING \ No newline at end of file +%import common.ESCAPED_STRING + +%ignore WS // Ignore whitespace +%ignore CPP_COMMENT // Ignore single-line comments +%ignore C_COMMENT // Ignore multi-line comments \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 63821f9038a88c..832d00d9c54702 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -1171,3 +1171,39 @@ def test_m_query_timeout(mock_get_lark_parser): assert ( is_entry_present ), 'Warning message "M-Query Parsing Timeout" should be present in reporter' + + +def test_comments_in_m_query(): + q: str = 'let\n Source = Snowflake.Databases("xaa48144.snowflakecomputing.com", "COMPUTE_WH", [Role="ACCOUNTADMIN"]),\n SNOWFLAKE_SAMPLE_DATA_Database = Source{[Name="SNOWFLAKE_SAMPLE_DATA", Kind="Database"]}[Data],\n TPCDS_SF100TCL_Schema = SNOWFLAKE_SAMPLE_DATA_Database{[Name="TPCDS_SF100TCL", Kind="Schema"]}[Data],\n ITEM_Table = TPCDS_SF100TCL_Schema{[Name="ITEM", Kind="Table"]}[Data],\n \n // Group by I_BRAND and calculate the count\n BrandCountsTable = Table.Group(ITEM_Table, {"I_BRAND"}, {{"BrandCount", each Table.RowCount(_), Int64.Type}})\nin\n BrandCountsTable' + + table: powerbi_data_classes.Table = powerbi_data_classes.Table( + columns=[], + measures=[], + expression=q, + name="pet_price_index", + full_name="datalake.sandbox_pet.pet_price_index", + ) + + reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + parameters={ + "hostname": 
"xyz.databricks.com", + "http_path": "/sql/1.0/warehouses/abc", + "catalog": "cat", + "schema": "public", + }, + )[0].upstreams + + assert len(data_platform_tables) == 1 + assert ( + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpcds_sf100tcl.item,PROD)" + ) From 95b9d1b4c9687c3d505485aa600b5040a2549047 Mon Sep 17 00:00:00 2001 From: Jonny Dixon <45681293+acrylJonny@users.noreply.github.com> Date: Sat, 21 Dec 2024 06:38:59 +0000 Subject: [PATCH 14/49] feat(ingest/aws-common): improved instance profile support (#12139) for ec2, ecs, eks, lambda, beanstalk, app runner and cft roles --- .../ingestion/source/aws/aws_common.py | 258 ++++++++++++-- .../tests/unit/test_aws_common.py | 328 ++++++++++++++++++ 2 files changed, 559 insertions(+), 27 deletions(-) create mode 100644 metadata-ingestion/tests/unit/test_aws_common.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py b/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py index 161aed5bb59881..b76eb95def1ede 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py @@ -1,7 +1,12 @@ +import logging +import os from datetime import datetime, timedelta, timezone -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union +from enum import Enum +from http import HTTPStatus +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union import boto3 +import requests from boto3.session import Session from botocore.config import DEFAULT_TIMEOUT, Config from botocore.utils import fix_s3_host @@ -14,6 +19,8 @@ ) from datahub.configuration.source_common import EnvConfigMixin +logger = logging.getLogger(__name__) + if TYPE_CHECKING: from mypy_boto3_dynamodb import DynamoDBClient from mypy_boto3_glue import GlueClient @@ -22,6 +29,26 @@ from mypy_boto3_sts import STSClient +class AwsEnvironment(Enum): + EC2 = "EC2" + ECS = "ECS" + EKS = "EKS" + LAMBDA = "LAMBDA" + APP_RUNNER = "APP_RUNNER" + BEANSTALK = "ELASTIC_BEANSTALK" + CLOUD_FORMATION = "CLOUD_FORMATION" + UNKNOWN = "UNKNOWN" + + +class AwsServicePrincipal(Enum): + LAMBDA = "lambda.amazonaws.com" + EKS = "eks.amazonaws.com" + APP_RUNNER = "apprunner.amazonaws.com" + ECS = "ecs.amazonaws.com" + ELASTIC_BEANSTALK = "elasticbeanstalk.amazonaws.com" + EC2 = "ec2.amazonaws.com" + + class AwsAssumeRoleConfig(PermissiveConfigModel): # Using the PermissiveConfigModel to allow the user to pass additional arguments. 
@@ -34,6 +61,163 @@ class AwsAssumeRoleConfig(PermissiveConfigModel): ) +def get_instance_metadata_token() -> Optional[str]: + """Get IMDSv2 token""" + try: + response = requests.put( + "http://169.254.169.254/latest/api/token", + headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"}, + timeout=1, + ) + if response.status_code == HTTPStatus.OK: + return response.text + except requests.exceptions.RequestException: + logger.debug("Failed to get IMDSv2 token") + return None + + +def is_running_on_ec2() -> bool: + """Check if code is running on EC2 using IMDSv2""" + token = get_instance_metadata_token() + if not token: + return False + + try: + response = requests.get( + "http://169.254.169.254/latest/meta-data/instance-id", + headers={"X-aws-ec2-metadata-token": token}, + timeout=1, + ) + return response.status_code == HTTPStatus.OK + except requests.exceptions.RequestException: + return False + + +def detect_aws_environment() -> AwsEnvironment: + """ + Detect the AWS environment we're running in. + Order matters as some environments may have multiple indicators. + """ + # Check Lambda first as it's most specific + if os.getenv("AWS_LAMBDA_FUNCTION_NAME"): + if os.getenv("AWS_EXECUTION_ENV", "").startswith("CloudFormation"): + return AwsEnvironment.CLOUD_FORMATION + return AwsEnvironment.LAMBDA + + # Check EKS (IRSA) + if os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE") and os.getenv("AWS_ROLE_ARN"): + return AwsEnvironment.EKS + + # Check App Runner + if os.getenv("AWS_APP_RUNNER_SERVICE_ID"): + return AwsEnvironment.APP_RUNNER + + # Check ECS + if os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv( + "ECS_CONTAINER_METADATA_URI" + ): + return AwsEnvironment.ECS + + # Check Elastic Beanstalk + if os.getenv("ELASTIC_BEANSTALK_ENVIRONMENT_NAME"): + return AwsEnvironment.BEANSTALK + + if is_running_on_ec2(): + return AwsEnvironment.EC2 + + return AwsEnvironment.UNKNOWN + + +def get_instance_role_arn() -> Optional[str]: + """Get role ARN from EC2 instance metadata using IMDSv2""" + token = get_instance_metadata_token() + if not token: + return None + + try: + response = requests.get( + "http://169.254.169.254/latest/meta-data/iam/security-credentials/", + headers={"X-aws-ec2-metadata-token": token}, + timeout=1, + ) + if response.status_code == 200: + role_name = response.text.strip() + if role_name: + sts = boto3.client("sts") + identity = sts.get_caller_identity() + return identity.get("Arn") + except Exception as e: + logger.debug(f"Failed to get instance role ARN: {e}") + return None + + +def get_lambda_role_arn() -> Optional[str]: + """Get the Lambda function's role ARN""" + try: + function_name = os.getenv("AWS_LAMBDA_FUNCTION_NAME") + if not function_name: + return None + + lambda_client = boto3.client("lambda") + function_config = lambda_client.get_function_configuration( + FunctionName=function_name + ) + return function_config.get("Role") + except Exception as e: + logger.debug(f"Failed to get Lambda role ARN: {e}") + return None + + +def get_current_identity() -> Tuple[Optional[str], Optional[str]]: + """ + Get the current role ARN and source type based on the runtime environment. 
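+    Detection failures are non-fatal: callers always receive a tuple, and it is
+    (None, None) when the environment or its role cannot be determined.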
+ Returns (role_arn, credential_source) + """ + env = detect_aws_environment() + + if env == AwsEnvironment.LAMBDA: + role_arn = get_lambda_role_arn() + return role_arn, AwsServicePrincipal.LAMBDA.value + + elif env == AwsEnvironment.EKS: + role_arn = os.getenv("AWS_ROLE_ARN") + return role_arn, AwsServicePrincipal.EKS.value + + elif env == AwsEnvironment.APP_RUNNER: + try: + sts = boto3.client("sts") + identity = sts.get_caller_identity() + return identity.get("Arn"), AwsServicePrincipal.APP_RUNNER.value + except Exception as e: + logger.debug(f"Failed to get App Runner role: {e}") + + elif env == AwsEnvironment.ECS: + try: + metadata_uri = os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv( + "ECS_CONTAINER_METADATA_URI" + ) + if metadata_uri: + response = requests.get(f"{metadata_uri}/task", timeout=1) + if response.status_code == HTTPStatus.OK: + task_metadata = response.json() + if "TaskARN" in task_metadata: + return ( + task_metadata.get("TaskARN"), + AwsServicePrincipal.ECS.value, + ) + except Exception as e: + logger.debug(f"Failed to get ECS task role: {e}") + + elif env == AwsEnvironment.BEANSTALK: + # Beanstalk uses EC2 instance metadata + return get_instance_role_arn(), AwsServicePrincipal.ELASTIC_BEANSTALK.value + + elif env == AwsEnvironment.EC2: + return get_instance_role_arn(), AwsServicePrincipal.EC2.value + + return None, None + + def assume_role( role: AwsAssumeRoleConfig, aws_region: Optional[str], @@ -95,7 +279,7 @@ class AwsConnectionConfig(ConfigModel): ) aws_profile: Optional[str] = Field( default=None, - description="Named AWS profile to use. Only used if access key / secret are unset. If not set the default will be used", + description="The [named profile](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html) to use from AWS credentials. Falls back to default profile if not specified and no access keys provided. Profiles are configured in ~/.aws/credentials or ~/.aws/config.", ) aws_region: Optional[str] = Field(None, description="AWS region code.") @@ -145,6 +329,7 @@ def _normalized_aws_roles(self) -> List[AwsAssumeRoleConfig]: def get_session(self) -> Session: if self.aws_access_key_id and self.aws_secret_access_key: + # Explicit credentials take precedence session = Session( aws_access_key_id=self.aws_access_key_id, aws_secret_access_key=self.aws_secret_access_key, @@ -152,38 +337,57 @@ def get_session(self) -> Session: region_name=self.aws_region, ) elif self.aws_profile: + # Named profile is second priority session = Session( region_name=self.aws_region, profile_name=self.aws_profile ) else: - # Use boto3's credential autodetection. + # Use boto3's credential autodetection session = Session(region_name=self.aws_region) - if self._normalized_aws_roles(): - # Use existing session credentials to start the chain of role assumption. - current_credentials = session.get_credentials() - credentials = { - "AccessKeyId": current_credentials.access_key, - "SecretAccessKey": current_credentials.secret_key, - "SessionToken": current_credentials.token, - } - - for role in self._normalized_aws_roles(): - if self._should_refresh_credentials(): - credentials = assume_role( - role, - self.aws_region, - credentials=credentials, + target_roles = self._normalized_aws_roles() + if target_roles: + current_role_arn, credential_source = get_current_identity() + + # Only assume role if: + # 1. We're not in a known AWS environment with a role, or + # 2. 
We need to assume a different role than our current one + should_assume_role = current_role_arn is None or any( + role.RoleArn != current_role_arn for role in target_roles + ) + + if should_assume_role: + env = detect_aws_environment() + logger.debug(f"Assuming role(s) from {env.value} environment") + + current_credentials = session.get_credentials() + if current_credentials is None: + raise ValueError("No credentials available for role assumption") + + credentials = { + "AccessKeyId": current_credentials.access_key, + "SecretAccessKey": current_credentials.secret_key, + "SessionToken": current_credentials.token, + } + + for role in target_roles: + if self._should_refresh_credentials(): + credentials = assume_role( + role=role, + aws_region=self.aws_region, + credentials=credentials, + ) + if isinstance(credentials["Expiration"], datetime): + self._credentials_expiration = credentials["Expiration"] + + session = Session( + aws_access_key_id=credentials["AccessKeyId"], + aws_secret_access_key=credentials["SecretAccessKey"], + aws_session_token=credentials["SessionToken"], + region_name=self.aws_region, ) - if isinstance(credentials["Expiration"], datetime): - self._credentials_expiration = credentials["Expiration"] - - session = Session( - aws_access_key_id=credentials["AccessKeyId"], - aws_secret_access_key=credentials["SecretAccessKey"], - aws_session_token=credentials["SessionToken"], - region_name=self.aws_region, - ) + else: + logger.debug(f"Using existing role from {credential_source}") return session diff --git a/metadata-ingestion/tests/unit/test_aws_common.py b/metadata-ingestion/tests/unit/test_aws_common.py new file mode 100644 index 00000000000000..9291fb91134b1c --- /dev/null +++ b/metadata-ingestion/tests/unit/test_aws_common.py @@ -0,0 +1,328 @@ +import json +import os +from unittest.mock import MagicMock, patch + +import boto3 +import pytest +from moto import mock_iam, mock_lambda, mock_sts + +from datahub.ingestion.source.aws.aws_common import ( + AwsConnectionConfig, + AwsEnvironment, + detect_aws_environment, + get_current_identity, + get_instance_metadata_token, + get_instance_role_arn, + is_running_on_ec2, +) + + +@pytest.fixture +def mock_aws_config(): + return AwsConnectionConfig( + aws_access_key_id="test-key", + aws_secret_access_key="test-secret", + aws_region="us-east-1", + ) + + +class TestAwsCommon: + def test_environment_detection_no_environment(self): + """Test environment detection when no AWS environment is present""" + with patch.dict(os.environ, {}, clear=True): + assert detect_aws_environment() == AwsEnvironment.UNKNOWN + + def test_environment_detection_lambda(self): + """Test Lambda environment detection""" + with patch.dict(os.environ, {"AWS_LAMBDA_FUNCTION_NAME": "test-function"}): + assert detect_aws_environment() == AwsEnvironment.LAMBDA + + def test_environment_detection_lambda_cloudformation(self): + """Test CloudFormation Lambda environment detection""" + with patch.dict( + os.environ, + { + "AWS_LAMBDA_FUNCTION_NAME": "test-function", + "AWS_EXECUTION_ENV": "CloudFormation.xxx", + }, + ): + assert detect_aws_environment() == AwsEnvironment.CLOUD_FORMATION + + def test_environment_detection_eks(self): + """Test EKS environment detection""" + with patch.dict( + os.environ, + { + "AWS_WEB_IDENTITY_TOKEN_FILE": "/var/run/secrets/token", + "AWS_ROLE_ARN": "arn:aws:iam::123456789012:role/test-role", + }, + ): + assert detect_aws_environment() == AwsEnvironment.EKS + + def test_environment_detection_app_runner(self): + """Test App Runner environment 
detection""" + with patch.dict(os.environ, {"AWS_APP_RUNNER_SERVICE_ID": "service-id"}): + assert detect_aws_environment() == AwsEnvironment.APP_RUNNER + + def test_environment_detection_ecs(self): + """Test ECS environment detection""" + with patch.dict( + os.environ, {"ECS_CONTAINER_METADATA_URI_V4": "http://169.254.170.2/v4"} + ): + assert detect_aws_environment() == AwsEnvironment.ECS + + def test_environment_detection_beanstalk(self): + """Test Elastic Beanstalk environment detection""" + with patch.dict(os.environ, {"ELASTIC_BEANSTALK_ENVIRONMENT_NAME": "my-env"}): + assert detect_aws_environment() == AwsEnvironment.BEANSTALK + + @patch("requests.put") + def test_ec2_metadata_token(self, mock_put): + """Test EC2 metadata token retrieval""" + mock_put.return_value.status_code = 200 + mock_put.return_value.text = "token123" + + token = get_instance_metadata_token() + assert token == "token123" + + mock_put.assert_called_once_with( + "http://169.254.169.254/latest/api/token", + headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"}, + timeout=1, + ) + + @patch("requests.put") + def test_ec2_metadata_token_failure(self, mock_put): + """Test EC2 metadata token failure case""" + mock_put.return_value.status_code = 404 + + token = get_instance_metadata_token() + assert token is None + + @patch("requests.get") + @patch("requests.put") + def test_is_running_on_ec2(self, mock_put, mock_get): + """Test EC2 instance detection with IMDSv2""" + mock_put.return_value.status_code = 200 + mock_put.return_value.text = "token123" + mock_get.return_value.status_code = 200 + + assert is_running_on_ec2() is True + + mock_put.assert_called_once_with( + "http://169.254.169.254/latest/api/token", + headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"}, + timeout=1, + ) + mock_get.assert_called_once_with( + "http://169.254.169.254/latest/meta-data/instance-id", + headers={"X-aws-ec2-metadata-token": "token123"}, + timeout=1, + ) + + @patch("requests.get") + @patch("requests.put") + def test_is_running_on_ec2_failure(self, mock_put, mock_get): + """Test EC2 instance detection failure""" + mock_put.return_value.status_code = 404 + assert is_running_on_ec2() is False + + mock_put.return_value.status_code = 200 + mock_put.return_value.text = "token123" + mock_get.return_value.status_code = 404 + assert is_running_on_ec2() is False + + @mock_sts + @mock_lambda + @mock_iam + def test_get_current_identity_lambda(self): + """Test getting identity in Lambda environment""" + with patch.dict( + os.environ, + { + "AWS_LAMBDA_FUNCTION_NAME": "test-function", + "AWS_DEFAULT_REGION": "us-east-1", + }, + ): + # Create IAM role first with proper trust policy + iam_client = boto3.client("iam", region_name="us-east-1") + trust_policy = { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": "lambda.amazonaws.com"}, + "Action": "sts:AssumeRole", + } + ], + } + iam_client.create_role( + RoleName="test-role", AssumeRolePolicyDocument=json.dumps(trust_policy) + ) + + lambda_client = boto3.client("lambda", region_name="us-east-1") + lambda_client.create_function( + FunctionName="test-function", + Runtime="python3.8", + Role="arn:aws:iam::123456789012:role/test-role", + Handler="index.handler", + Code={"ZipFile": b"def handler(event, context): pass"}, + ) + + role_arn, source = get_current_identity() + assert source == "lambda.amazonaws.com" + assert role_arn == "arn:aws:iam::123456789012:role/test-role" + + @patch("requests.get") + @patch("requests.put") + @mock_sts + def 
test_get_instance_role_arn_success(self, mock_put, mock_get): + """Test getting EC2 instance role ARN""" + mock_put.return_value.status_code = 200 + mock_put.return_value.text = "token123" + mock_get.return_value.status_code = 200 + mock_get.return_value.text = "test-role" + + with patch("boto3.client") as mock_boto: + mock_sts = MagicMock() + mock_sts.get_caller_identity.return_value = { + "Arn": "arn:aws:sts::123456789012:assumed-role/test-role/instance" + } + mock_boto.return_value = mock_sts + + role_arn = get_instance_role_arn() + assert ( + role_arn == "arn:aws:sts::123456789012:assumed-role/test-role/instance" + ) + + @mock_sts + def test_aws_connection_config_basic(self, mock_aws_config): + """Test basic AWS connection configuration""" + session = mock_aws_config.get_session() + creds = session.get_credentials() + assert creds.access_key == "test-key" + assert creds.secret_key == "test-secret" + + @mock_sts + def test_aws_connection_config_with_session_token(self): + """Test AWS connection with session token""" + config = AwsConnectionConfig( + aws_access_key_id="test-key", + aws_secret_access_key="test-secret", + aws_session_token="test-token", + aws_region="us-east-1", + ) + + session = config.get_session() + creds = session.get_credentials() + assert creds.token == "test-token" + + @mock_sts + def test_aws_connection_config_role_assumption(self): + """Test AWS connection with role assumption""" + config = AwsConnectionConfig( + aws_access_key_id="test-key", + aws_secret_access_key="test-secret", + aws_region="us-east-1", + aws_role="arn:aws:iam::123456789012:role/test-role", + ) + + with patch( + "datahub.ingestion.source.aws.aws_common.get_current_identity" + ) as mock_identity: + mock_identity.return_value = (None, None) + session = config.get_session() + creds = session.get_credentials() + assert creds is not None + + @mock_sts + def test_aws_connection_config_skip_role_assumption(self): + """Test AWS connection skipping role assumption when already in role""" + config = AwsConnectionConfig( + aws_region="us-east-1", + aws_role="arn:aws:iam::123456789012:role/current-role", + ) + + with patch( + "datahub.ingestion.source.aws.aws_common.get_current_identity" + ) as mock_identity: + mock_identity.return_value = ( + "arn:aws:iam::123456789012:role/current-role", + "ec2.amazonaws.com", + ) + session = config.get_session() + assert session is not None + + @mock_sts + def test_aws_connection_config_multiple_roles(self): + """Test AWS connection with multiple role assumption""" + config = AwsConnectionConfig( + aws_access_key_id="test-key", + aws_secret_access_key="test-secret", + aws_region="us-east-1", + aws_role=[ + "arn:aws:iam::123456789012:role/role1", + "arn:aws:iam::123456789012:role/role2", + ], + ) + + with patch( + "datahub.ingestion.source.aws.aws_common.get_current_identity" + ) as mock_identity: + mock_identity.return_value = (None, None) + session = config.get_session() + assert session is not None + + def test_aws_connection_config_validation_error(self): + """Test AWS connection validation""" + with patch.dict( + "os.environ", + { + "AWS_ACCESS_KEY_ID": "test-key", + # Deliberately missing AWS_SECRET_ACCESS_KEY + "AWS_DEFAULT_REGION": "us-east-1", + }, + clear=True, + ): + config = AwsConnectionConfig() # Let it pick up from environment + session = config.get_session() + with pytest.raises( + Exception, + match="Partial credentials found in env, missing: AWS_SECRET_ACCESS_KEY", + ): + session.get_credentials() + + @pytest.mark.parametrize( + 
"env_vars,expected_environment", + [ + ({}, AwsEnvironment.UNKNOWN), + ({"AWS_LAMBDA_FUNCTION_NAME": "test"}, AwsEnvironment.LAMBDA), + ( + { + "AWS_LAMBDA_FUNCTION_NAME": "test", + "AWS_EXECUTION_ENV": "CloudFormation", + }, + AwsEnvironment.CLOUD_FORMATION, + ), + ( + { + "AWS_WEB_IDENTITY_TOKEN_FILE": "/token", + "AWS_ROLE_ARN": "arn:aws:iam::123:role/test", + }, + AwsEnvironment.EKS, + ), + ({"AWS_APP_RUNNER_SERVICE_ID": "service-123"}, AwsEnvironment.APP_RUNNER), + ( + {"ECS_CONTAINER_METADATA_URI_V4": "http://169.254.170.2"}, + AwsEnvironment.ECS, + ), + ( + {"ELASTIC_BEANSTALK_ENVIRONMENT_NAME": "my-env"}, + AwsEnvironment.BEANSTALK, + ), + ], + ) + def test_environment_detection_parametrized(self, env_vars, expected_environment): + """Parametrized test for environment detection with different configurations""" + with patch.dict(os.environ, env_vars, clear=True): + assert detect_aws_environment() == expected_environment From 8350a4e03ac9a259bb21e295c173972fd74d5f6f Mon Sep 17 00:00:00 2001 From: Jonny Dixon <45681293+acrylJonny@users.noreply.github.com> Date: Sat, 21 Dec 2024 07:52:27 +0000 Subject: [PATCH 15/49] feat(ingest/hive): lineage from/to file storage (#11841) Co-authored-by: Aseem Bansal --- .../src/datahub/ingestion/source/sql/hive.py | 614 +++++++++++++++++- 1 file changed, 606 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py index 59f301baf40165..fad54fda453786 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py @@ -1,7 +1,10 @@ import json import logging import re -from typing import Any, Dict, Iterable, List, Optional, Union +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from urllib.parse import urlparse from pydantic.class_validators import validator from pydantic.fields import Field @@ -11,7 +14,12 @@ from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveDialect, HiveTimestamp from sqlalchemy.engine.reflection import Inspector -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mce_builder import ( + make_data_platform_urn, + make_dataplatform_instance_urn, + make_dataset_urn_with_platform_instance, + make_schema_field_urn, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.decorators import ( SourceCapability, @@ -29,14 +37,24 @@ TwoTierSQLAlchemyConfig, TwoTierSQLAlchemySource, ) -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( + DataPlatformInstanceClass, + DatasetLineageTypeClass, + DatasetPropertiesClass, DateTypeClass, + FineGrainedLineageClass, + FineGrainedLineageDownstreamTypeClass, + FineGrainedLineageUpstreamTypeClass, NullTypeClass, NumberTypeClass, - SchemaField, + OtherSchemaClass, + SchemaFieldClass, + SchemaMetadataClass, TimeTypeClass, + UpstreamClass, + UpstreamLineageClass, + ViewPropertiesClass, ) -from datahub.metadata.schema_classes import ViewPropertiesClass from datahub.utilities import config_clean from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column @@ -46,6 +64,511 @@ register_custom_type(HiveTimestamp, TimeTypeClass) register_custom_type(HiveDecimal, NumberTypeClass) + +class StoragePlatform(Enum): + """Enumeration of storage platforms supported for lineage""" + + S3 = 
"s3" + AZURE = "abs" + GCS = "gcs" + DBFS = "dbfs" + LOCAL = "file" + HDFS = "hdfs" + + +# Mapping of URL schemes to storage platforms +STORAGE_SCHEME_MAPPING = { + # S3 and derivatives + "s3": StoragePlatform.S3, + "s3a": StoragePlatform.S3, + "s3n": StoragePlatform.S3, + # Azure and derivatives + "abfs": StoragePlatform.AZURE, + "abfss": StoragePlatform.AZURE, + "adl": StoragePlatform.AZURE, + "adls": StoragePlatform.AZURE, + "wasb": StoragePlatform.AZURE, + "wasbs": StoragePlatform.AZURE, + # GCS and derivatives + "gs": StoragePlatform.GCS, + "gcs": StoragePlatform.GCS, + # DBFS + "dbfs": StoragePlatform.DBFS, + # Local filesystem + "file": StoragePlatform.LOCAL, + # HDFS + "hdfs": StoragePlatform.HDFS, +} + + +class StoragePathParser: + """Parser for storage paths with platform-specific logic""" + + @staticmethod + def parse_storage_location(location: str) -> Optional[Tuple[StoragePlatform, str]]: + """ + Parse a storage location into platform and normalized path. + + Args: + location: Storage location URI (e.g., s3://bucket/path, abfss://container@account.dfs.core.windows.net/path) + + Returns: + Tuple of (StoragePlatform, normalized_path) if valid, None if invalid + """ + + try: + # Handle special case for local files with no scheme + if location.startswith("/"): + return StoragePlatform.LOCAL, location + + # Parse the URI + parsed = urlparse(location) + scheme = parsed.scheme.lower() + + if not scheme: + return None + + # Look up the platform + platform = STORAGE_SCHEME_MAPPING.get(scheme) + if not platform: + return None + + # Get normalized path based on platform + if platform == StoragePlatform.S3: + # For S3, combine bucket and path + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + elif platform == StoragePlatform.AZURE: + if scheme in ("abfs", "abfss"): + # Format: abfss://container@account.dfs.core.windows.net/path + container = parsed.netloc.split("@")[0] + path = f"{container}/{parsed.path.lstrip('/')}" + else: + # Handle other Azure schemes + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + elif platform == StoragePlatform.GCS: + # For GCS, combine bucket and path + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + elif platform == StoragePlatform.DBFS: + # For DBFS, use path as-is + path = parsed.path.lstrip("/") + + elif platform == StoragePlatform.LOCAL: + # For local files, use full path + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + elif platform == StoragePlatform.HDFS: + # For HDFS, use full path + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + else: + return None + + # Clean up the path + path = path.rstrip("/") # Remove trailing slashes + path = re.sub(r"/+", "/", path) # Normalize multiple slashes + path = f"/{path}" + + return platform, path + + except Exception as exp: + logger.warning(f"Failed to parse storage location {location}: {exp}") + return None + + @staticmethod + def get_platform_name(platform: StoragePlatform) -> str: + """Get the platform name to use in URNs""" + + platform_names = { + StoragePlatform.S3: "s3", + StoragePlatform.AZURE: "adls", + StoragePlatform.GCS: "gcs", + StoragePlatform.DBFS: "dbfs", + StoragePlatform.LOCAL: "file", + StoragePlatform.HDFS: "hdfs", + } + return platform_names[platform] + + +class HiveStorageLineageConfig: + """Configuration for Hive storage lineage.""" + + def __init__( + self, + emit_storage_lineage: bool, + hive_storage_lineage_direction: str, + include_column_lineage: bool, + storage_platform_instance: Optional[str], + ): + if hive_storage_lineage_direction.lower() not 
in ["upstream", "downstream"]: + raise ValueError( + "hive_storage_lineage_direction must be either upstream or downstream" + ) + + self.emit_storage_lineage = emit_storage_lineage + self.hive_storage_lineage_direction = hive_storage_lineage_direction.lower() + self.include_column_lineage = include_column_lineage + self.storage_platform_instance = storage_platform_instance + + +@dataclass +class HiveStorageSourceReport: + """Report for tracking storage lineage statistics""" + + storage_locations_scanned: int = 0 + filtered_locations: List[str] = Field(default_factory=list) + failed_locations: List[str] = Field(default_factory=list) + + def report_location_scanned(self) -> None: + self.storage_locations_scanned += 1 + + def report_location_filtered(self, location: str) -> None: + self.filtered_locations.append(location) + + def report_location_failed(self, location: str) -> None: + self.failed_locations.append(location) + + +class HiveStorageLineage: + """Handles storage lineage for Hive tables""" + + def __init__( + self, + config: HiveStorageLineageConfig, + env: str, + convert_urns_to_lowercase: bool = False, + ): + self.config = config + self.env = env + self.convert_urns_to_lowercase = convert_urns_to_lowercase + self.report = HiveStorageSourceReport() + + def _make_dataset_platform_instance( + self, + platform: str, + instance: Optional[str], + ) -> DataPlatformInstanceClass: + """Create DataPlatformInstance aspect""" + + return DataPlatformInstanceClass( + platform=make_data_platform_urn(platform), + instance=make_dataplatform_instance_urn(platform, instance) + if instance + else None, + ) + + def _make_storage_dataset_urn( + self, + storage_location: str, + ) -> Optional[Tuple[str, str]]: + """ + Create storage dataset URN from location. + Returns tuple of (urn, platform) if successful, None otherwise. 
+ """ + + platform_instance = None + storage_info = StoragePathParser.parse_storage_location(storage_location) + if not storage_info: + logger.debug(f"Could not parse storage location: {storage_location}") + return None + + platform, path = storage_info + platform_name = StoragePathParser.get_platform_name(platform) + + if self.convert_urns_to_lowercase: + platform_name = platform_name.lower() + path = path.lower() + if self.config.storage_platform_instance: + platform_instance = self.config.storage_platform_instance.lower() + + try: + storage_urn = make_dataset_urn_with_platform_instance( + platform=platform_name, + name=path, + env=self.env, + platform_instance=platform_instance, + ) + return storage_urn, platform_name + except Exception as exp: + logger.error(f"Failed to create URN for {platform_name}:{path}: {exp}") + return None + + def _get_fine_grained_lineages( + self, + dataset_urn: str, + storage_urn: str, + dataset_schema: SchemaMetadataClass, + storage_schema: SchemaMetadataClass, + ) -> Iterable[FineGrainedLineageClass]: + """Generate column-level lineage between dataset and storage""" + + if not self.config.include_column_lineage: + return + + for dataset_field in dataset_schema.fields: + dataset_path = dataset_field.fieldPath + + # Find matching field in storage schema + matching_field = next( + (f for f in storage_schema.fields if f.fieldPath == dataset_path), + None, + ) + + if matching_field: + if self.config.hive_storage_lineage_direction == "upstream": + yield FineGrainedLineageClass( + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + upstreams=[ + make_schema_field_urn( + parent_urn=storage_urn, + field_path=matching_field.fieldPath, + ) + ], + downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, + downstreams=[ + make_schema_field_urn( + parent_urn=dataset_urn, + field_path=dataset_path, + ) + ], + ) + else: + yield FineGrainedLineageClass( + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + upstreams=[ + make_schema_field_urn( + parent_urn=dataset_urn, + field_path=dataset_path, + ) + ], + downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, + downstreams=[ + make_schema_field_urn( + parent_urn=storage_urn, + field_path=matching_field.fieldPath, + ) + ], + ) + + def _create_lineage_mcp( + self, + source_urn: str, + target_urn: str, + fine_grained_lineages: Optional[Iterable[FineGrainedLineageClass]] = None, + ) -> Iterable[MetadataWorkUnit]: + """Create lineage MCP between source and target datasets""" + + lineages_list = ( + list(fine_grained_lineages) if fine_grained_lineages is not None else None + ) + + upstream_lineage = UpstreamLineageClass( + upstreams=[ + UpstreamClass(dataset=source_urn, type=DatasetLineageTypeClass.COPY) + ], + fineGrainedLineages=lineages_list, + ) + + yield MetadataWorkUnit( + id=f"{source_urn}-{target_urn}-lineage", + mcp=MetadataChangeProposalWrapper( + entityUrn=target_urn, aspect=upstream_lineage + ), + ) + + def get_storage_dataset_mcp( + self, + storage_location: str, + platform_instance: Optional[str] = None, + schema_metadata: Optional[SchemaMetadataClass] = None, + ) -> Iterable[MetadataWorkUnit]: + """ + Generate MCPs for storage dataset if needed. + This creates the storage dataset entity in DataHub. 
+ """ + + storage_info = StoragePathParser.parse_storage_location( + storage_location, + ) + if not storage_info: + return + + platform, path = storage_info + platform_name = StoragePathParser.get_platform_name(platform) + + if self.convert_urns_to_lowercase: + platform_name = platform_name.lower() + path = path.lower() + if self.config.storage_platform_instance: + platform_instance = self.config.storage_platform_instance.lower() + + try: + storage_urn = make_dataset_urn_with_platform_instance( + platform=platform_name, + name=path, + env=self.env, + platform_instance=platform_instance, + ) + + # Dataset properties + props = DatasetPropertiesClass(name=path) + yield MetadataWorkUnit( + id=f"storage-{storage_urn}-props", + mcp=MetadataChangeProposalWrapper( + entityUrn=storage_urn, + aspect=props, + ), + ) + + # Platform instance + platform_instance_aspect = self._make_dataset_platform_instance( + platform=platform_name, + instance=platform_instance, + ) + yield MetadataWorkUnit( + id=f"storage-{storage_urn}-platform", + mcp=MetadataChangeProposalWrapper( + entityUrn=storage_urn, aspect=platform_instance_aspect + ), + ) + + # Schema if available + if schema_metadata: + storage_schema = SchemaMetadataClass( + schemaName=f"{platform.value}_schema", + platform=f"urn:li:dataPlatform:{platform.value}", + version=0, + fields=schema_metadata.fields, + hash="", + platformSchema=OtherSchemaClass(rawSchema=""), + ) + yield MetadataWorkUnit( + id=f"storage-{storage_urn}-schema", + mcp=MetadataChangeProposalWrapper( + entityUrn=storage_urn, aspect=storage_schema + ), + ) + + except Exception as e: + logger.error( + f"Failed to create storage dataset MCPs for {storage_location}: {e}" + ) + return + + def get_lineage_mcp( + self, + dataset_urn: str, + table: Dict[str, Any], + dataset_schema: Optional[SchemaMetadataClass] = None, + ) -> Iterable[MetadataWorkUnit]: + """ + Generate lineage MCP for a Hive table to its storage location. 
+ + Args: + dataset_urn: URN of the Hive dataset + table: Hive table dictionary containing metadata + dataset_schema: Optional schema metadata for the Hive dataset + + Returns: + MetadataWorkUnit containing the lineage MCP if successful + """ + + platform_instance = None + + if not self.config.emit_storage_lineage: + return + + # Get storage location from table + storage_location = table.get("StorageDescriptor", {}).get("Location") + if not storage_location: + return + + # Create storage dataset URN + storage_info = self._make_storage_dataset_urn(storage_location) + if not storage_info: + self.report.report_location_failed(storage_location) + return + + storage_urn, storage_platform = storage_info + self.report.report_location_scanned() + + if self.config.storage_platform_instance: + platform_instance = self.config.storage_platform_instance.lower() + + # Create storage dataset entity + yield from self.get_storage_dataset_mcp( + storage_location=storage_location, + platform_instance=platform_instance, + schema_metadata=dataset_schema, + ) + + # Get storage schema if available (implement based on storage system) + storage_schema = ( + self._get_storage_schema(storage_location, dataset_schema) + if dataset_schema + else None + ) + + # Generate fine-grained lineage if schemas available + fine_grained_lineages = ( + None + if not (dataset_schema and storage_schema) + else self._get_fine_grained_lineages( + dataset_urn, storage_urn, dataset_schema, storage_schema + ) + ) + + # Create lineage MCP + if self.config.hive_storage_lineage_direction == "upstream": + yield from self._create_lineage_mcp( + source_urn=storage_urn, + target_urn=dataset_urn, + fine_grained_lineages=fine_grained_lineages, + ) + else: + yield from self._create_lineage_mcp( + source_urn=dataset_urn, + target_urn=storage_urn, + fine_grained_lineages=fine_grained_lineages, + ) + + def _get_storage_schema( + self, + storage_location: str, + table_schema: Optional[SchemaMetadataClass] = None, + ) -> Optional[SchemaMetadataClass]: + """ + Get schema metadata for storage location. 
+ Currently supports: + - Delta tables + - Parquet files + - Spark tables + + Returns: + SchemaMetadataClass if schema can be inferred, None otherwise + """ + + if not table_schema: + return None + + storage_info = StoragePathParser.parse_storage_location(storage_location) + if not storage_info: + return None + + platform, _ = storage_info + + return SchemaMetadataClass( + schemaName=f"{platform.value}_schema", + platform=f"urn:li:dataPlatform:{platform.value}", + version=0, + fields=table_schema.fields, + hash="", + platformSchema=OtherSchemaClass(rawSchema=""), + ) + + try: from databricks_dbapi.sqlalchemy_dialects.hive import DatabricksPyhiveDialect from pyhive.sqlalchemy_hive import _type_map @@ -94,8 +617,8 @@ def dbapi_get_columns_patched(self, connection, table_name, schema=None, **kw): DatabricksPyhiveDialect.get_columns = dbapi_get_columns_patched except ModuleNotFoundError: pass -except Exception as e: - logger.warning(f"Failed to patch method due to {e}") +except Exception as exp: + logger.warning(f"Failed to patch method due to {exp}") @reflection.cache # type: ignore @@ -126,10 +649,48 @@ class HiveConfig(TwoTierSQLAlchemyConfig): # defaults scheme: str = Field(default="hive", hidden_from_docs=True) + # Overriding as table location lineage is richer implementation here than with include_table_location_lineage + include_table_location_lineage: bool = Field(default=False, hidden_from_docs=True) + + emit_storage_lineage: bool = Field( + default=False, + description="Whether to emit storage-to-Hive lineage", + ) + hive_storage_lineage_direction: str = Field( + default="upstream", + description="If 'upstream', storage is upstream to Hive. If 'downstream' storage is downstream to Hive", + ) + include_column_lineage: bool = Field( + default=True, + description="When enabled, column-level lineage will be extracted from storage", + ) + storage_platform_instance: Optional[str] = Field( + default=None, + description="Platform instance for the storage system", + ) + @validator("host_port") def clean_host_port(cls, v): return config_clean.remove_protocol(v) + @validator("hive_storage_lineage_direction") + def _validate_direction(cls, v: str) -> str: + """Validate the lineage direction.""" + if v.lower() not in ["upstream", "downstream"]: + raise ValueError( + "storage_lineage_direction must be either upstream or downstream" + ) + return v.lower() + + def get_storage_lineage_config(self) -> HiveStorageLineageConfig: + """Convert base config parameters to HiveStorageLineageConfig""" + return HiveStorageLineageConfig( + emit_storage_lineage=self.emit_storage_lineage, + hive_storage_lineage_direction=self.hive_storage_lineage_direction, + include_column_lineage=self.include_column_lineage, + storage_platform_instance=self.storage_platform_instance, + ) + @platform_name("Hive") @config_class(HiveConfig) @@ -151,12 +712,49 @@ class HiveSource(TwoTierSQLAlchemySource): def __init__(self, config, ctx): super().__init__(config, ctx, "hive") + self.storage_lineage = HiveStorageLineage( + config=config.get_storage_lineage_config(), + env=config.env, + convert_urns_to_lowercase=config.convert_urns_to_lowercase, + ) @classmethod def create(cls, config_dict, ctx): config = HiveConfig.parse_obj(config_dict) return cls(config, ctx) + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + """Generate workunits for tables and their storage lineage.""" + for wu in super().get_workunits_internal(): + yield wu + + if not isinstance(wu, MetadataWorkUnit): + continue + + # Get dataset URN and required 
aspects using workunit methods + try: + dataset_urn = wu.get_urn() + dataset_props = wu.get_aspect_of_type(DatasetPropertiesClass) + schema_metadata = wu.get_aspect_of_type(SchemaMetadataClass) + except Exception as exp: + logger.warning(f"Failed to process workunit {wu.id}: {exp}") + continue + + # Only proceed if we have the necessary properties + if dataset_props and dataset_props.customProperties: + table = { + "StorageDescriptor": { + "Location": dataset_props.customProperties.get("Location") + } + } + + if table.get("StorageDescriptor", {}).get("Location"): + yield from self.storage_lineage.get_lineage_mcp( + dataset_urn=dataset_urn, + table=table, + dataset_schema=schema_metadata, + ) + def get_schema_names(self, inspector): assert isinstance(self.config, HiveConfig) # This condition restricts the ingestion to the specified database. @@ -173,7 +771,7 @@ def get_schema_fields_for_column( pk_constraints: Optional[Dict[Any, Any]] = None, partition_keys: Optional[List[str]] = None, tags: Optional[List[str]] = None, - ) -> List[SchemaField]: + ) -> List[SchemaFieldClass]: fields = super().get_schema_fields_for_column( dataset_name, column, From 494c522405830aaec181bcd2d61b2cfe9a53f155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Sun, 22 Dec 2024 13:21:41 +0100 Subject: [PATCH 16/49] fix(ingest/mssql): add container dataflow/ datajob entities (#12194) --- .../ingestion/source/sql/mssql/job_models.py | 26 +++ .../ingestion/source/sql/mssql/source.py | 10 + .../golden_mces_mssql_no_db_to_file.json | 207 ++++++++++++++++- .../golden_mces_mssql_no_db_with_filter.json | 162 ++++++++++++- .../golden_mces_mssql_to_file.json | 219 +++++++++++++++++- ...golden_mces_mssql_with_lower_case_urn.json | 207 ++++++++++++++++- 6 files changed, 795 insertions(+), 36 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py index d3941e7add0fd0..0cd62611519285 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -7,7 +7,9 @@ make_data_platform_urn, make_dataplatform_instance_urn, ) +from datahub.emitter.mcp_builder import DatabaseKey from datahub.metadata.schema_classes import ( + ContainerClass, DataFlowInfoClass, DataJobInfoClass, DataJobInputOutputClass, @@ -210,6 +212,18 @@ def as_datajob_info_aspect(self) -> DataJobInfoClass: status=self.status, ) + @property + def as_container_aspect(self) -> ContainerClass: + databaseKey = DatabaseKey( + platform=self.entity.flow.orchestrator, + instance=self.entity.flow.platform_instance + if self.entity.flow.platform_instance + else None, + env=self.entity.flow.env, + database=self.entity.flow.db, + ) + return ContainerClass(container=databaseKey.as_urn()) + @property def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: if self.entity.flow.platform_instance: @@ -257,6 +271,18 @@ def as_dataflow_info_aspect(self) -> DataFlowInfoClass: externalUrl=self.external_url, ) + @property + def as_container_aspect(self) -> ContainerClass: + databaseKey = DatabaseKey( + platform=self.entity.orchestrator, + instance=self.entity.platform_instance + if self.entity.platform_instance + else None, + env=self.entity.env, + database=self.entity.db, + ) + return ContainerClass(container=databaseKey.as_urn()) + @property def as_maybe_platform_instance_aspect(self) -> 
Optional[DataPlatformInstanceClass]: if self.entity.platform_instance: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py index 9d8b67041998ce..547adcc8eccc9e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -639,6 +639,11 @@ def construct_job_workunits( aspect=data_job.as_datajob_info_aspect, ).as_workunit() + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_job.as_container_aspect, + ).as_workunit() + data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect if data_platform_instance_aspect: yield MetadataChangeProposalWrapper( @@ -662,6 +667,11 @@ def construct_flow_workunits( aspect=data_flow.as_dataflow_info_aspect, ).as_workunit() + yield MetadataChangeProposalWrapper( + entityUrn=data_flow.urn, + aspect=data_flow.as_container_aspect, + ).as_workunit() + data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect if data_platform_instance_aspect: yield MetadataChangeProposalWrapper( diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index 72dcda25c1296c..720ef0b3929453 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -105,6 +105,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -113,11 +150,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", + "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-05 16:44:43.910000", - "date_modified": "2024-12-05 16:44:44.043000", + "date_created": "2024-12-20 15:15:24.483000", + "date_modified": "2024-12-20 15:15:24.653000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -136,6 +173,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": 
"urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -154,6 +207,27 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2103,8 +2177,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2269,6 +2343,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2282,8 +2393,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-05 16:44:43.800000", - "date_modified": "2024-12-05 16:44:43.800000" + "date_created": "2024-12-20 15:15:24.290000", + "date_modified": "2024-12-20 15:15:24.290000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2298,6 +2409,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + 
"json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2310,8 +2458,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-05 16:44:43.803000", - "date_modified": "2024-12-05 16:44:43.803000" + "date_created": "2024-12-20 15:15:24.300000", + "date_modified": "2024-12-20 15:15:24.300000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2326,6 +2474,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", @@ -4427,8 +4612,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n" }, "name": "View1", "tags": [] diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index 0df89ff1eb94d7..cf3abbfc62997a 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -105,6 +105,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": 
"dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -113,11 +150,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", + "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-05 16:44:43.910000", - "date_modified": "2024-12-05 16:44:44.043000", + "date_created": "2024-12-20 15:15:24.483000", + "date_modified": "2024-12-20 15:15:24.653000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -136,6 +173,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -154,6 +207,27 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2103,8 +2177,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2269,6 +2343,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2282,8 +2393,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-05 16:44:43.800000", - "date_modified": "2024-12-05 16:44:43.800000" + "date_created": "2024-12-20 15:15:24.290000", + "date_modified": "2024-12-20 15:15:24.290000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2298,6 +2409,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index b36188405e7e11..c2289f954a36ee 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -112,6 +112,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", @@ -129,6 +145,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": 
"urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -137,11 +178,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "b8907be7-52f5-4df4-a870-f4fe0679ec45", + "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-19 12:34:45.843000", - "date_modified": "2024-12-19 12:34:46.017000", + "date_created": "2024-12-20 15:15:24.483000", + "date_modified": "2024-12-20 15:15:24.653000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -160,6 +201,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -195,6 +252,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", @@ -2502,6 +2584,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", @@ -2519,6 +2617,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": 
"urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2532,8 +2655,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-19 12:34:45.660000", - "date_modified": "2024-12-19 12:34:45.660000" + "date_created": "2024-12-20 15:15:24.290000", + "date_modified": "2024-12-20 15:15:24.290000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2548,6 +2671,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2565,6 +2704,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2577,8 +2741,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-19 12:34:45.667000", - "date_modified": "2024-12-19 12:34:45.667000" + "date_created": "2024-12-20 15:15:24.300000", + "date_modified": "2024-12-20 15:15:24.300000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2593,6 +2757,22 @@ "lastRunId": "no-run-id-provided" } 
}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2610,6 +2790,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index ebcadcc11dcbfa..4db18dae27b7e9 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ -105,6 +105,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -113,11 +150,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "4130c37d-146c-43da-a671-dd9a413a44b3", + "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-11-22 12:58:03.260000", - "date_modified": "2024-11-22 12:58:03.440000", + "date_created": "2024-12-20 15:15:24.483000", + "date_modified": "2024-12-20 15:15:24.653000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -136,6 +173,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + 
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -154,6 +207,27 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2103,8 +2177,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2269,6 +2343,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2282,8 +2393,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-11-22 12:58:03.137000", - "date_modified": "2024-11-22 12:58:03.137000" + "date_created": "2024-12-20 15:15:24.290000", + "date_modified": "2024-12-20 15:15:24.290000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2298,6 +2409,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + 
"entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2310,8 +2458,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-11-22 12:58:03.140000", - "date_modified": "2024-11-22 12:58:03.140000" + "date_created": "2024-12-20 15:15:24.300000", + "date_modified": "2024-12-20 15:15:24.300000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2326,6 +2474,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", @@ -4427,8 +4612,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n" }, "name": "View1", "tags": [] From ff262bc65e7ab3e067f51a412cfb40db6e726fea Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Sun, 22 Dec 2024 18:24:18 +0530 Subject: [PATCH 17/49] Revert "fix(mssql): adds missing containers for dataflow and datajob entities, required for browse paths v2 generation" (#12201) --- .../ingestion/source/sql/mssql/job_models.py | 26 --- 
.../ingestion/source/sql/mssql/source.py | 10 - .../golden_mces_mssql_no_db_to_file.json | 207 +---------------- .../golden_mces_mssql_no_db_with_filter.json | 162 +------------ .../golden_mces_mssql_to_file.json | 219 +----------------- ...golden_mces_mssql_with_lower_case_urn.json | 207 +---------------- 6 files changed, 36 insertions(+), 795 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py index 0cd62611519285..d3941e7add0fd0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -7,9 +7,7 @@ make_data_platform_urn, make_dataplatform_instance_urn, ) -from datahub.emitter.mcp_builder import DatabaseKey from datahub.metadata.schema_classes import ( - ContainerClass, DataFlowInfoClass, DataJobInfoClass, DataJobInputOutputClass, @@ -212,18 +210,6 @@ def as_datajob_info_aspect(self) -> DataJobInfoClass: status=self.status, ) - @property - def as_container_aspect(self) -> ContainerClass: - databaseKey = DatabaseKey( - platform=self.entity.flow.orchestrator, - instance=self.entity.flow.platform_instance - if self.entity.flow.platform_instance - else None, - env=self.entity.flow.env, - database=self.entity.flow.db, - ) - return ContainerClass(container=databaseKey.as_urn()) - @property def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: if self.entity.flow.platform_instance: @@ -271,18 +257,6 @@ def as_dataflow_info_aspect(self) -> DataFlowInfoClass: externalUrl=self.external_url, ) - @property - def as_container_aspect(self) -> ContainerClass: - databaseKey = DatabaseKey( - platform=self.entity.orchestrator, - instance=self.entity.platform_instance - if self.entity.platform_instance - else None, - env=self.entity.env, - database=self.entity.db, - ) - return ContainerClass(container=databaseKey.as_urn()) - @property def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: if self.entity.platform_instance: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py index 547adcc8eccc9e..9d8b67041998ce 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -639,11 +639,6 @@ def construct_job_workunits( aspect=data_job.as_datajob_info_aspect, ).as_workunit() - yield MetadataChangeProposalWrapper( - entityUrn=data_job.urn, - aspect=data_job.as_container_aspect, - ).as_workunit() - data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect if data_platform_instance_aspect: yield MetadataChangeProposalWrapper( @@ -667,11 +662,6 @@ def construct_flow_workunits( aspect=data_flow.as_dataflow_info_aspect, ).as_workunit() - yield MetadataChangeProposalWrapper( - entityUrn=data_flow.urn, - aspect=data_flow.as_container_aspect, - ).as_workunit() - data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect if data_platform_instance_aspect: yield MetadataChangeProposalWrapper( diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index 720ef0b3929453..72dcda25c1296c 100644 --- 
a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -105,43 +105,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -150,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", + "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-20 15:15:24.483000", - "date_modified": "2024-12-20 15:15:24.653000", + "date_created": "2024-12-05 16:44:43.910000", + "date_modified": "2024-12-05 16:44:44.043000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -173,22 +136,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -207,27 +154,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2177,8 +2103,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "is_view": "True" }, "name": "PersonsView", "tags": [] @@ -2343,43 +2269,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": 
"urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2393,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-20 15:15:24.290000", - "date_modified": "2024-12-20 15:15:24.290000" + "date_created": "2024-12-05 16:44:43.800000", + "date_modified": "2024-12-05 16:44:43.800000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2409,43 +2298,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2458,8 +2310,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-20 15:15:24.300000", - "date_modified": "2024-12-20 15:15:24.300000" + "date_created": "2024-12-05 16:44:43.803000", + "date_modified": 
"2024-12-05 16:44:43.803000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2474,43 +2326,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", @@ -4612,8 +4427,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n" + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", + "is_view": "True" }, "name": "View1", "tags": [] diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index cf3abbfc62997a..0df89ff1eb94d7 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -105,43 +105,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -150,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", + "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-20 15:15:24.483000", - "date_modified": "2024-12-20 15:15:24.653000", + "date_created": "2024-12-05 16:44:43.910000", + "date_modified": "2024-12-05 16:44:44.043000", "step_id": 
"1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -173,22 +136,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -207,27 +154,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2177,8 +2103,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "is_view": "True" }, "name": "PersonsView", "tags": [] @@ -2343,43 +2269,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2393,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-20 15:15:24.290000", - "date_modified": "2024-12-20 15:15:24.290000" + "date_created": "2024-12-05 16:44:43.800000", + "date_modified": "2024-12-05 16:44:43.800000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2409,43 +2298,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": 
"urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index c2289f954a36ee..b36188405e7e11 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -112,22 +112,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", @@ -145,31 +129,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -178,11 +137,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", + "job_id": "b8907be7-52f5-4df4-a870-f4fe0679ec45", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-20 15:15:24.483000", - "date_modified": "2024-12-20 15:15:24.653000", + "date_created": "2024-12-19 12:34:45.843000", + "date_modified": "2024-12-19 12:34:46.017000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -201,22 +160,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - 
"systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -252,31 +195,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", @@ -2584,22 +2502,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", @@ -2617,31 +2519,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2655,8 +2532,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-20 15:15:24.290000", - "date_modified": "2024-12-20 15:15:24.290000" + "date_created": "2024-12-19 12:34:45.660000", + "date_modified": "2024-12-19 12:34:45.660000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2671,22 +2548,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2704,31 +2565,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2741,8 +2577,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-20 15:15:24.300000", - "date_modified": "2024-12-20 15:15:24.300000" + "date_created": "2024-12-19 12:34:45.667000", + "date_modified": "2024-12-19 12:34:45.667000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2757,22 +2593,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2790,31 +2610,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": 
"urn:li:container:5631370915311469374ef3cb5f0ebbf0", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index 4db18dae27b7e9..ebcadcc11dcbfa 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ -105,43 +105,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -150,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", + "job_id": "4130c37d-146c-43da-a671-dd9a413a44b3", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-20 15:15:24.483000", - "date_modified": "2024-12-20 15:15:24.653000", + "date_created": "2024-11-22 12:58:03.260000", + "date_modified": "2024-11-22 12:58:03.440000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -173,22 +136,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -207,27 +154,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2177,8 +2103,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW 
Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "is_view": "True" }, "name": "PersonsView", "tags": [] @@ -2343,43 +2269,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2393,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-20 15:15:24.290000", - "date_modified": "2024-12-20 15:15:24.290000" + "date_created": "2024-11-22 12:58:03.137000", + "date_modified": "2024-11-22 12:58:03.137000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2409,43 +2298,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2458,8 +2310,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE 
DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-20 15:15:24.300000", - "date_modified": "2024-12-20 15:15:24.300000" + "date_created": "2024-11-22 12:58:03.140000", + "date_modified": "2024-11-22 12:58:03.140000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2474,43 +2326,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", @@ -4612,8 +4427,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n" + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", + "is_view": "True" }, "name": "View1", "tags": [] From 73dce9e4180d7beef1ea6c9a7c9eeedbc551d18a Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Sun, 22 Dec 2024 10:28:19 -0600 Subject: [PATCH 18/49] =?UTF-8?q?chore(bump):=20bump=20node=20version=20lo?= =?UTF-8?q?ng=20term=20support=20release=20(build=20time=20=E2=80=A6=20(#1?= =?UTF-8?q?2199)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/build-and-test.yml | 2 +- .github/workflows/docker-unified.yml | 2 +- datahub-web-react/build.gradle | 3 +-- datahub-web-react/package.json | 2 +- docs-website/build.gradle | 2 +- smoke-test/build.gradle | 2 +- 6 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 1b10fe6e74372b..98071b536a336a 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -110,7 +110,7 @@ jobs: run: | ./gradlew :datahub-frontend:build :datahub-web-react:build --parallel env: - NODE_OPTIONS: "--max-old-space-size=3072" + NODE_OPTIONS: "--max-old-space-size=4096" - name: Gradle compile (jdk8) for legacy Spark if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }} run: | diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 16a2d29e9fd85e..03a9b3afc3bc58 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -446,7 +446,7 @@ jobs: ./gradlew :datahub-frontend:dist -x test -x yarnTest -x yarnLint --parallel mv 
./datahub-frontend/build/distributions/datahub-frontend-*.zip datahub-frontend.zip env: - NODE_OPTIONS: "--max-old-space-size=3072" + NODE_OPTIONS: "--max-old-space-size=4096" - name: Build and push uses: ./.github/actions/docker-custom-build-and-push with: diff --git a/datahub-web-react/build.gradle b/datahub-web-react/build.gradle index b9fffce173c5c4..bf1aa401e3f560 100644 --- a/datahub-web-react/build.gradle +++ b/datahub-web-react/build.gradle @@ -16,7 +16,7 @@ node { } // Version of node to use. - version = '21.2.0' + version = '22.12.0' // Version of Yarn to use. yarnVersion = '1.22.22' @@ -93,7 +93,6 @@ task yarnLintFix(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { } task yarnBuild(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { - environment = [NODE_OPTIONS: "--max-old-space-size=3072 --openssl-legacy-provider"] args = ['run', 'build'] outputs.cacheIf { true } diff --git a/datahub-web-react/package.json b/datahub-web-react/package.json index 31c10804482f0c..2d1d667a89f14a 100644 --- a/datahub-web-react/package.json +++ b/datahub-web-react/package.json @@ -90,7 +90,7 @@ "analyze": "source-map-explorer 'dist/assets/*.js'", "start": "yarn run generate && vite", "ec2-dev": "yarn run generate && CI=true;export CI;vite", - "build": "yarn run generate && NODE_OPTIONS='--max-old-space-size=3072 --openssl-legacy-provider' CI=false vite build", + "build": "yarn run generate && NODE_OPTIONS='--max-old-space-size=4096 --openssl-legacy-provider' CI=false vite build", "test": "vitest", "generate": "graphql-codegen --config codegen.yml", "lint": "eslint . --ext .ts,.tsx --quiet && yarn format-check && yarn type-check", diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 797863d2019fbd..1be790695e87e6 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -14,7 +14,7 @@ node { } // Version of node to use. - version = '21.2.0' + version = '22.12.0' // Version of Yarn to use. yarnVersion = '1.22.22' diff --git a/smoke-test/build.gradle b/smoke-test/build.gradle index def3e814b2ba0a..73ecdcb08ea149 100644 --- a/smoke-test/build.gradle +++ b/smoke-test/build.gradle @@ -16,7 +16,7 @@ node { } // Version of node to use. - version = '21.2.0' + version = '22.12.0' // Version of Yarn to use. 
yarnVersion = '1.22.22' From 0562c7a190c4548e29c7845fa44e9adf0248e4de Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Mon, 23 Dec 2024 16:56:54 +0530 Subject: [PATCH 19/49] fix(ingest): exclude aspect from migration (#12206) --- .../src/datahub/ingestion/source/datahub/config.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py index a3304334cb1ebc..cd3c2146e6d848 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py @@ -14,6 +14,17 @@ DEFAULT_DATABASE_TABLE_NAME = "metadata_aspect_v2" DEFAULT_KAFKA_TOPIC_NAME = "MetadataChangeLog_Timeseries_v1" DEFAULT_DATABASE_BATCH_SIZE = 10_000 +DEFAULT_EXCLUDE_ASPECTS = { + "dataHubIngestionSourceKey", + "dataHubIngestionSourceInfo", + "datahubIngestionRunSummary", + "datahubIngestionCheckpoint", + "dataHubSecretKey", + "dataHubSecretValue", + "globalSettingsKey", + "globalSettingsInfo", + "testResults", +} class DataHubSourceConfig(StatefulIngestionConfigBase): @@ -44,7 +55,7 @@ class DataHubSourceConfig(StatefulIngestionConfigBase): ) exclude_aspects: Set[str] = Field( - default_factory=set, + default=DEFAULT_EXCLUDE_ASPECTS, description="Set of aspect names to exclude from ingestion", ) From d06980f6f3421ac5d3a3fc21d5c15f3e3057338f Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Mon, 23 Dec 2024 19:11:40 +0530 Subject: [PATCH 20/49] fix(ingest/snowflake): handle empty snowflake column upstreams (#12207) --- .../source/snowflake/snowflake_lineage_v2.py | 6 ++--- .../unit/snowflake/test_snowflake_source.py | 24 +++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 69f28a0e6e595a..b815a6584379ac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -4,7 +4,7 @@ from datetime import datetime from typing import Any, Collection, Iterable, List, Optional, Set, Tuple, Type -from pydantic import BaseModel, validator +from pydantic import BaseModel, Field, validator from datahub.configuration.datetimes import parse_absolute_time from datahub.ingestion.api.closeable import Closeable @@ -72,8 +72,8 @@ class ColumnUpstreamJob(BaseModel): class ColumnUpstreamLineage(BaseModel): - column_name: str - upstreams: List[ColumnUpstreamJob] + column_name: Optional[str] + upstreams: List[ColumnUpstreamJob] = Field(default_factory=list) class UpstreamTableNode(BaseModel): diff --git a/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py index c735feb5396086..2ff85a08f052f9 100644 --- a/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py @@ -18,6 +18,7 @@ DEFAULT_TEMP_TABLES_PATTERNS, SnowflakeV2Config, ) +from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import UpstreamLineageEdge from datahub.ingestion.source.snowflake.snowflake_query import ( SnowflakeQuery, create_deny_regex_sql_filter, @@ -664,3 +665,26 @@ def test_create_snowsight_base_url_ap_northeast_1(): def 
test_snowflake_utils() -> None: assert_doctest(datahub.ingestion.source.snowflake.snowflake_utils) + + +def test_snowflake_query_result_parsing(): + db_row = { + "DOWNSTREAM_TABLE_NAME": "db.schema.downstream_table", + "DOWNSTREAM_TABLE_DOMAIN": "Table", + "UPSTREAM_TABLES": [ + { + "query_id": "01b92f61-0611-c826-000d-0103cf9b5db7", + "upstream_object_domain": "Table", + "upstream_object_name": "db.schema.upstream_table", + } + ], + "UPSTREAM_COLUMNS": [{}], + "QUERIES": [ + { + "query_id": "01b92f61-0611-c826-000d-0103cf9b5db7", + "query_text": "Query test", + "start_time": "2022-12-01 19:56:34", + } + ], + } + assert UpstreamLineageEdge.parse_obj(db_row) From dd23f9e294a72076e2cbe241cd6ce18f205bac68 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Mon, 23 Dec 2024 21:28:18 +0530 Subject: [PATCH 21/49] fix(ui): null dereference (#12193) --- .../styled/ERModelRelationship/ERModelRelationUtils.tsx | 2 +- .../shared/tabs/Dataset/Queries/utils/filterQueries.ts | 6 +++--- .../shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts | 2 +- .../source/executions/ExecutionRequestDetailsModal.tsx | 4 ++-- datahub-web-react/src/app/lineage/utils/titleUtils.ts | 4 ++-- .../src/app/search/context/SearchResultContext.tsx | 2 +- .../src/app/search/matches/MatchedFieldList.tsx | 2 +- .../src/app/search/matches/SearchTextHighlighter.tsx | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/datahub-web-react/src/app/entity/shared/components/styled/ERModelRelationship/ERModelRelationUtils.tsx b/datahub-web-react/src/app/entity/shared/components/styled/ERModelRelationship/ERModelRelationUtils.tsx index 0eb198aec48033..811ebf99b123a2 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/ERModelRelationship/ERModelRelationUtils.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/ERModelRelationship/ERModelRelationUtils.tsx @@ -68,6 +68,6 @@ export function getDatasetName(datainput: any): string { datainput?.editableProperties?.name || datainput?.properties?.name || datainput?.name || - datainput?.urn?.split(',').at(1) + datainput?.urn?.split(',')?.at(1) ); } diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/filterQueries.ts b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/filterQueries.ts index a8ec960ea2e081..fb97c8235cbe60 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/filterQueries.ts +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/filterQueries.ts @@ -10,9 +10,9 @@ export const filterQueries = (filterText, queries: Query[]) => { const lowerFilterText = filterText.toLowerCase(); return queries.filter((query) => { return ( - query.title?.toLowerCase().includes(lowerFilterText) || - query.description?.toLowerCase().includes(lowerFilterText) || - query.query?.toLowerCase().includes(lowerFilterText) + query.title?.toLowerCase()?.includes(lowerFilterText) || + query.description?.toLowerCase()?.includes(lowerFilterText) || + query.query?.toLowerCase()?.includes(lowerFilterText) ); }); }; diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts index 53b76d53f886af..9c0813fc2b85ab 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts @@ -12,7 +12,7 @@ function matchesTagsOrTermsOrDescription(field: 
SchemaField, filterText: string, .toLocaleLowerCase() .includes(filterText), ) || - field.description?.toLocaleLowerCase().includes(filterText) + field.description?.toLocaleLowerCase()?.includes(filterText) ); } diff --git a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx index a7e6f516bb7943..f56eb06b6af14e 100644 --- a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx +++ b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx @@ -129,7 +129,7 @@ export const ExecutionDetailsModal = ({ urn, open, onClose }: Props) => { downloadFile(output, `exec-${urn}.log`); }; - const logs = (showExpandedLogs && output) || output?.split('\n').slice(0, 5).join('\n'); + const logs = (showExpandedLogs && output) || output?.split('\n')?.slice(0, 5)?.join('\n'); const result = data?.executionRequest?.result as Partial; const status = getIngestionSourceStatus(result); @@ -163,7 +163,7 @@ export const ExecutionDetailsModal = ({ urn, open, onClose }: Props) => { } catch (e) { recipeYaml = ''; } - const recipe = showExpandedRecipe ? recipeYaml : recipeYaml?.split('\n').slice(0, 5).join('\n'); + const recipe = showExpandedRecipe ? recipeYaml : recipeYaml?.split('\n')?.slice(0, 5)?.join('\n'); const areLogsExpandable = output?.split(/\r\n|\r|\n/)?.length > 5; const isRecipeExpandable = recipeYaml?.split(/\r\n|\r|\n/)?.length > 5; diff --git a/datahub-web-react/src/app/lineage/utils/titleUtils.ts b/datahub-web-react/src/app/lineage/utils/titleUtils.ts index 6bd4cfea0f09a7..8bd0cbda55b33e 100644 --- a/datahub-web-react/src/app/lineage/utils/titleUtils.ts +++ b/datahub-web-react/src/app/lineage/utils/titleUtils.ts @@ -124,10 +124,10 @@ function truncate(input, length) { function getLastTokenOfTitle(title?: string): string { if (!title) return ''; - const lastToken = title?.split('.').slice(-1)[0]; + const lastToken = title?.split('.')?.slice(-1)?.[0]; // if the last token does not contain any content, the string should not be tokenized on `.` - if (lastToken.replace(/\s/g, '').length === 0) { + if (lastToken?.replace(/\s/g, '')?.length === 0) { return title; } diff --git a/datahub-web-react/src/app/search/context/SearchResultContext.tsx b/datahub-web-react/src/app/search/context/SearchResultContext.tsx index 68adead0051492..961a50c1d4bfe9 100644 --- a/datahub-web-react/src/app/search/context/SearchResultContext.tsx +++ b/datahub-web-react/src/app/search/context/SearchResultContext.tsx @@ -40,7 +40,7 @@ export const useSearchResult = () => { }; export const useEntityType = () => { - return useSearchResultContext()?.searchResult.entity.type; + return useSearchResultContext()?.searchResult?.entity?.type; }; export const useMatchedFields = () => { diff --git a/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx index 0bfe000dea3663..9d77d446ff3b82 100644 --- a/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx +++ b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx @@ -42,7 +42,7 @@ const RenderedField = ({ field: MatchedField; }) => { const entityRegistry = useEntityRegistry(); - const query = useSearchQuery()?.trim().toLowerCase(); + const query = useSearchQuery()?.trim()?.toLowerCase(); const customRenderedField = customFieldRenderer?.(field); if (customRenderedField) return {customRenderedField}; if (isHighlightableEntityField(field)) { 
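
The changes in this patch apply one pattern throughout: any step in a member chain whose result can be undefined (indexing into a slice result, Array.prototype.at, an optional hook return value) gets optional chaining on the accesses that follow it, so the expression short-circuits to undefined instead of throwing a TypeError. A minimal TypeScript sketch of the behavior difference (the lastTokenIsBlank helper below is illustrative only, not part of the codebase):

    // Illustrative sketch of the null-dereference pattern fixed in this patch.
    function lastTokenIsBlank(title?: string): boolean {
        // `?.[0]` is typed `string | undefined`, so later accesses must be guarded.
        const lastToken = title?.split('.')?.slice(-1)?.[0];

        // Pre-patch shape: `lastToken.replace(...)` throws a TypeError
        // whenever lastToken is undefined.
        //   return lastToken.replace(/\s/g, '').length === 0;

        // Post-patch shape: the chain short-circuits, and
        // `undefined === 0` simply evaluates to false.
        return lastToken?.replace(/\s/g, '')?.length === 0;
    }

    console.log(lastTokenIsBlank('db.schema.table')); // false ("table" is non-blank)
    console.log(lastTokenIsBlank(undefined));         // false, and no TypeError

Note the deliberate behavior change: expressions that previously threw now evaluate to undefined (and comparisons against it to false), which is the intent of the fix.
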
diff --git a/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx index d8da1088ea89d1..7a0a0e1e41a4b9 100644 --- a/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx +++ b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx @@ -23,7 +23,7 @@ const SearchTextHighlighter = ({ field, text, enableFullHighlight = false }: Pro const enableNameHighlight = appConfig.config.visualConfig.searchResult?.enableNameHighlight; const matchedFields = useMatchedFieldsByGroup(field); const hasMatchedField = !!matchedFields?.length; - const normalizedSearchQuery = useSearchQuery()?.trim().toLowerCase(); + const normalizedSearchQuery = useSearchQuery()?.trim()?.toLowerCase(); const normalizedText = text.trim().toLowerCase(); const hasSubstring = hasMatchedField && !!normalizedSearchQuery && normalizedText.includes(normalizedSearchQuery); const pattern = enableFullHighlight ? HIGHLIGHT_ALL_PATTERN : undefined; From dc82251afed92ed605ce6dcc7c956396c494ca29 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 23 Dec 2024 13:03:52 -0500 Subject: [PATCH 22/49] fix(ingest): quote asset urns in patch path (#12212) --- metadata-ingestion/src/datahub/specific/dataproduct.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/specific/dataproduct.py b/metadata-ingestion/src/datahub/specific/dataproduct.py index 6b7e695b4d57e7..f9830a4b23df05 100644 --- a/metadata-ingestion/src/datahub/specific/dataproduct.py +++ b/metadata-ingestion/src/datahub/specific/dataproduct.py @@ -131,7 +131,7 @@ def add_asset(self, asset_urn: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, "add", - path=f"/assets/{asset_urn}", + path=f"/assets/{self.quote(asset_urn)}", value=DataProductAssociation(destinationUrn=asset_urn), ) return self @@ -140,7 +140,7 @@ def remove_asset(self, asset_urn: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, "remove", - path=f"/assets/{asset_urn}", + path=f"/assets/{self.quote(asset_urn)}", value={}, ) return self From 4c0b568887c7a3c2aa8a1e1b888ce362ce768485 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 23 Dec 2024 13:04:06 -0500 Subject: [PATCH 23/49] feat(ingest): add sql parser trace mode (#12210) --- .../datahub/sql_parsing/sqlglot_lineage.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index f387618bfaec12..bf28ab0e7b229b 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -66,6 +66,7 @@ "SQL_LINEAGE_TIMEOUT_ENABLED", True ) SQL_LINEAGE_TIMEOUT_SECONDS = 10 +SQL_PARSER_TRACE = get_boolean_env_variable("DATAHUB_SQL_PARSER_TRACE", False) # These rules are a subset of the rules in sqlglot.optimizer.optimizer.RULES. 
@@ -365,10 +366,11 @@ def _sqlglot_force_column_normalizer( return node - # logger.debug( - # "Prior to case normalization sql %s", - # statement.sql(pretty=True, dialect=dialect), - # ) + if SQL_PARSER_TRACE: + logger.debug( + "Prior to case normalization sql %s", + statement.sql(pretty=True, dialect=dialect), + ) statement = statement.transform(_sqlglot_force_column_normalizer, copy=False) # logger.debug( # "Sql after casing normalization %s", @@ -562,7 +564,7 @@ def _select_statement_cll( # noqa: C901 ) ) - # TODO: Also extract referenced columns (aka auxillary / non-SELECT lineage) + # TODO: Also extract referenced columns (aka auxiliary / non-SELECT lineage) except (sqlglot.errors.OptimizeError, ValueError, IndexError) as e: raise SqlUnderstandingError( f"sqlglot failed to compute some lineage: {e}" @@ -1022,6 +1024,14 @@ def _sqlglot_lineage_inner( logger.debug( f"Resolved {total_schemas_resolved} of {total_tables_discovered} table schemas" ) + if SQL_PARSER_TRACE: + for qualified_table, schema_info in table_name_schema_mapping.items(): + logger.debug( + "Table name %s resolved to %s with schema %s", + qualified_table, + table_name_urn_mapping[qualified_table], + schema_info, + ) column_lineage: Optional[List[_ColumnLineageInfo]] = None try: From b6ea974630d68c61eb7c5cd624ee013817de7bd6 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 23 Dec 2024 13:04:15 -0500 Subject: [PATCH 24/49] fix(ingest): preserve certs when converting emitter to graph (#12211) --- metadata-ingestion/src/datahub/ingestion/graph/client.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 4aa937639e9590..ca9a41172e5b6e 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -188,9 +188,12 @@ def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph": retry_max_times=emitter._retry_max_times, extra_headers=emitter._session.headers, disable_ssl_verification=emitter._session.verify is False, - # TODO: Support these headers. 
- # ca_certificate_path=emitter._ca_certificate_path, - # client_certificate_path=emitter._client_certificate_path, + ca_certificate_path=( + emitter._session.verify + if isinstance(emitter._session.verify, str) + else None + ), + client_certificate_path=emitter._session.cert, ) ) From 21ddb5538d08b64279f3526aa250ec489f5497ed Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 23 Dec 2024 16:32:49 -0500 Subject: [PATCH 25/49] fix(ingest/mode): move sql logic to view properties aspect (#12196) --- .../src/datahub/ingestion/source/mode.py | 21 ++++++--- .../integration/mode/mode_mces_golden.json | 43 ++++++++++++++++++- 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mode.py b/metadata-ingestion/src/datahub/ingestion/source/mode.py index c1ab9271ce13ae..ef0b499129f97b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mode.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mode.py @@ -98,6 +98,7 @@ TagPropertiesClass, UpstreamClass, UpstreamLineageClass, + ViewPropertiesClass, ) from datahub.metadata.urns import QueryUrn from datahub.sql_parsing.sqlglot_lineage import ( @@ -930,16 +931,13 @@ def construct_query_or_dataset( dataset_props = DatasetPropertiesClass( name=report_info.get("name") if is_mode_dataset else query_data.get("name"), - description=f"""### Source Code -``` sql -{query_data.get("raw_query")} -``` - """, + description=None, externalUrl=externalUrl, customProperties=self.get_custom_props_from_dict( query_data, [ - "id" "created_at", + "id", + "created_at", "updated_at", "last_run_id", "data_source_id", @@ -949,7 +947,6 @@ def construct_query_or_dataset( ], ), ) - yield ( MetadataChangeProposalWrapper( entityUrn=query_urn, @@ -957,6 +954,16 @@ def construct_query_or_dataset( ).as_workunit() ) + if raw_query := query_data.get("raw_query"): + yield MetadataChangeProposalWrapper( + entityUrn=query_urn, + aspect=ViewPropertiesClass( + viewLogic=raw_query, + viewLanguage=QueryLanguageClass.SQL, + materialized=False, + ), + ).as_workunit() + if is_mode_dataset: space_container_key = self.gen_space_key(space_token) yield from add_dataset_to_container( diff --git a/metadata-ingestion/tests/integration/mode/mode_mces_golden.json b/metadata-ingestion/tests/integration/mode/mode_mces_golden.json index ed00dc5734680d..84dbdbe89f7b50 100644 --- a/metadata-ingestion/tests/integration/mode/mode_mces_golden.json +++ b/metadata-ingestion/tests/integration/mode/mode_mces_golden.json @@ -176,6 +176,7 @@ "datasets": [ "urn:li:dataset:(urn:li:dataPlatform:mode,5450544,PROD)" ], + "dashboards": [], "lastModified": { "created": { "time": 1639169724316, @@ -253,6 +254,8 @@ "aspect": { "json": { "customProperties": { + "id": "19780522", + "created_at": "2024-09-02T07:38:43.755Z", "updated_at": "2024-09-02T07:40:44.046Z", "last_run_id": "3535709679", "data_source_id": "44763", @@ -260,7 +263,6 @@ }, "externalUrl": "https://app.mode.com/acryl/datasets/24f66e1701b6", "name": "Dataset 1", - "description": "### Source Code\n``` sql\n-- Returns first 100 rows from DATAHUB_COMMUNITY.POSTGRES_PUBLIC.COMPANY\n SELECT \n\t\tAGE,\n\t\tID,\n\t\tNAME,\n\t\t_FIVETRAN_DELETED,\n\t\t_FIVETRAN_SYNCED\n FROM DATAHUB_COMMUNITY.POSTGRES_PUBLIC.COMPANY LIMIT 100;\n\n-- Returns first 100 rows from ETHAN_TEST_DB.PUBLIC.ACCOUNT_PHONE_NUMBER\n SELECT \n\t\tCOMMUNICATION_ACCOUNT_ID,\n\t\tID,\n\t\tMMS_CAPABLE,\n\t\tPHONE_NUMBER,\n\t\tSMS_CAPABLE,\n\t\tSTATUS,\n\t\tSTATUS_TLM,\n\t\tTLM,\n\t\tVOICE_CAPABLE,\n\t\tWHEN_CREATED\n FROM 
ETHAN_TEST_DB.PUBLIC.ACCOUNT_PHONE_NUMBER LIMIT 100;\n \n \n```\n ", "tags": [] } }, @@ -270,6 +272,24 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mode,5450544,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "-- Returns first 100 rows from DATAHUB_COMMUNITY.POSTGRES_PUBLIC.COMPANY\n SELECT \n\t\tAGE,\n\t\tID,\n\t\tNAME,\n\t\t_FIVETRAN_DELETED,\n\t\t_FIVETRAN_SYNCED\n FROM DATAHUB_COMMUNITY.POSTGRES_PUBLIC.COMPANY LIMIT 100;\n\n-- Returns first 100 rows from ETHAN_TEST_DB.PUBLIC.ACCOUNT_PHONE_NUMBER\n SELECT \n\t\tCOMMUNICATION_ACCOUNT_ID,\n\t\tID,\n\t\tMMS_CAPABLE,\n\t\tPHONE_NUMBER,\n\t\tSMS_CAPABLE,\n\t\tSTATUS,\n\t\tSTATUS_TLM,\n\t\tTLM,\n\t\tVOICE_CAPABLE,\n\t\tWHEN_CREATED\n FROM ETHAN_TEST_DB.PUBLIC.ACCOUNT_PHONE_NUMBER LIMIT 100;\n \n ", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "mode-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mode,5450544,PROD)", @@ -336,13 +356,14 @@ "aspect": { "json": { "customProperties": { + "id": "10149707", + "created_at": "2021-12-10T20:55:24.361Z", "updated_at": "2021-12-10T23:12:53.273Z", "last_run_id": "1897576958", "data_source_id": "34499" }, "externalUrl": "https://app.mode.com/acryl/reports/9d2da37fa91e/details/queries/6e26a9f3d4e2", "name": "Customer and staff", - "description": "### Source Code\n``` sql\nSELECT rental.*, staff.first_name \"Staff First Name\", staff.last_name \"Staff Last Name\" FROM {{ @join_on_definition as rental }} join staff on staff.staff_id = rental.staff_id where selected_id = {{ selected_id }} \n{% form %}\nselected_id:\n type: text\n default: my_id\n{% endform %}\n```\n ", "tags": [] } }, @@ -352,6 +373,24 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mode,10149707,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "SELECT rental.*, staff.first_name \"Staff First Name\", staff.last_name \"Staff Last Name\" FROM {{ @join_on_definition as rental }} join staff on staff.staff_id = rental.staff_id where selected_id = {{ selected_id }} \n{% form %}\nselected_id:\n type: text\n default: my_id\n{% endform %}", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "mode-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mode,10149707,PROD)", From 047644b888b121fa3feb10a5f33bdef60b1072ce Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Tue, 24 Dec 2024 10:06:35 +0900 Subject: [PATCH 26/49] feat: update mlflow-related metadata models (#12174) Co-authored-by: John Joyce Co-authored-by: John Joyce --- .../src/main/resources/entity.graphql | 196 +++++++++++++++++- .../dataprocess/DataProcessInstanceOutput.pdl | 2 +- .../DataProcessInstanceProperties.pdl | 2 +- .../ml/metadata/MLModelGroupProperties.pdl | 35 ++++ .../ml/metadata/MLModelProperties.pdl | 28 ++- .../ml/metadata/MLTrainingRunProperties.pdl | 36 ++++ .../src/main/resources/entity-registry.yml | 4 + .../com.linkedin.entity.aspects.snapshot.json | 54 +++-- ...com.linkedin.entity.entities.snapshot.json | 99 +++++++-- .../com.linkedin.entity.runs.snapshot.json | 54 +++-- 
...nkedin.operations.operations.snapshot.json | 54 +++-- ...m.linkedin.platform.platform.snapshot.json | 99 +++++++-- 12 files changed, 568 insertions(+), 95 deletions(-) create mode 100644 metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index e086273068ee53..9abf4e16f12dd7 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -262,8 +262,16 @@ type Query { Fetch all Business Attributes """ listBusinessAttributes(input: ListBusinessAttributesInput!): ListBusinessAttributesResult + + """ + Fetch a Data Process Instance by primary key (urn) + """ + dataProcessInstance(urn: String!): DataProcessInstance + + } + """ An ERModelRelationship is a high-level abstraction that dictates what datasets fields are erModelRelationshiped. """ @@ -9832,15 +9840,45 @@ type MLModelGroup implements EntityWithRelationships & Entity & BrowsableEntity privileges: EntityPrivileges } +""" +Properties describing a group of related ML models +""" type MLModelGroupProperties { + """ + Display name of the model group + """ + name: String + """ + Detailed description of the model group's purpose and contents + """ description: String - createdAt: Long + """ + When this model group was created + """ + created: AuditStamp + """ + When this model group was last modified + """ + lastModified: AuditStamp + + """ + Version identifier for this model group + """ version: VersionTag + """ + Custom key-value properties for the model group + """ customProperties: [CustomPropertiesEntry!] + + """ + Deprecated creation timestamp + @deprecated Use the 'created' field instead + """ + createdAt: Long @deprecated(reason: "Use `created` instead") } """ @@ -9990,40 +10028,103 @@ description: String } type MLMetric { + """ + Name of the metric (e.g. accuracy, precision, recall) + """ name: String + """ + Description of what this metric measures + """ description: String + """ + The computed value of the metric + """ value: String + """ + Timestamp when this metric was recorded + """ createdAt: Long } type MLModelProperties { + """ + The display name of the model used in the UI + """ + name: String! + """ + Detailed description of the model's purpose and characteristics + """ description: String - date: Long + """ + When the model was last modified + """ + lastModified: AuditStamp + """ + Version identifier for this model + """ version: String + """ + The type/category of ML model (e.g. classification, regression) + """ type: String + """ + Mapping of hyperparameter configurations + """ hyperParameters: HyperParameterMap - hyperParams: [MLHyperParam] + """ + List of hyperparameter settings used to train this model + """ + hyperParams: [MLHyperParam] + """ + Performance metrics from model training + """ trainingMetrics: [MLMetric] + """ + Names of ML features used by this model + """ mlFeatures: [String!] + """ + Tags for categorizing and searching models + """ tags: [String!] + """ + Model groups this model belongs to + """ groups: [MLModelGroup] + """ + Additional custom properties specific to this model + """ customProperties: [CustomPropertiesEntry!] 
+ """ + URL to view this model in external system + """ externalUrl: String + + """ + When this model was created + """ + created: AuditStamp + + """ + Deprecated timestamp for model creation + @deprecated Use 'created' field instead + """ + date: Long @deprecated(reason: "Use `created` instead") } type MLFeatureProperties { @@ -12804,3 +12905,92 @@ type CronSchedule { """ timezone: String! } + + +""" +Properties describing a data process instance's execution metadata +""" +type DataProcessInstanceProperties { + """ + The display name of this process instance + """ + name: String! + + """ + URL to view this process instance in the external system + """ + externalUrl: String + + """ + When this process instance was created + """ + created: AuditStamp + + """ + Additional custom properties specific to this process instance + """ + customProperties: [CustomPropertiesEntry!] +} + +""" +Properties specific to an ML model training run instance +""" +type MLTrainingRunProperties { + """ + Unique identifier for this training run + """ + id: String + + """ + List of URLs to access training run outputs (e.g. model artifacts, logs) + """ + outputUrls: [String] + + """ + Hyperparameters used in this training run + """ + hyperParams: [MLHyperParam] + + """ + Performance metrics recorded during this training run + """ + trainingMetrics: [MLMetric] +} + +extend type DataProcessInstance { + + """ + Additional read only properties associated with the Data Job + """ + properties: DataProcessInstanceProperties + + """ + The specific instance of the data platform that this entity belongs to + """ + dataPlatformInstance: DataPlatformInstance + + """ + Sub Types that this entity implements + """ + subTypes: SubTypes + + """ + The parent container in which the entity resides + """ + container: Container + + """ + Standardized platform urn where the data process instance is defined + """ + platform: DataPlatform! 
+ + """ + Recursively get the lineage of containers for this entity + """ + parentContainers: ParentContainersResult + + """ + Additional properties when subtype is Training Run + """ + mlTrainingRunProperties: MLTrainingRunProperties +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl index f33c41e63efed6..fe782dbe01ca9b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl @@ -15,7 +15,7 @@ record DataProcessInstanceOutput { @Relationship = { "/*": { "name": "Produces", - "entityTypes": [ "dataset" ] + "entityTypes": [ "dataset", "mlModel" ] } } @Searchable = { diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl index c63cb1a97c017d..5c6bfaecf1ef4d 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl @@ -52,4 +52,4 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc } created: AuditStamp -} +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl index b54e430038082d..81c5e7a240f618 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl @@ -4,6 +4,7 @@ import com.linkedin.common.Urn import com.linkedin.common.Time import com.linkedin.common.VersionTag import com.linkedin.common.CustomProperties +import com.linkedin.common.TimeStamp /** * Properties associated with an ML Model Group @@ -13,6 +14,17 @@ import com.linkedin.common.CustomProperties } record MLModelGroupProperties includes CustomProperties { + /** + * Display name of the MLModelGroup + */ + @Searchable = { + "fieldType": "WORD_GRAM", + "enableAutocomplete": true, + "boostScore": 10.0, + "queryByDefault": true, + } + name: optional string + /** * Documentation of the MLModelGroup */ @@ -25,8 +37,31 @@ record MLModelGroupProperties includes CustomProperties { /** * Date when the MLModelGroup was developed */ + @deprecated createdAt: optional Time + /** + * Time and Actor who created the MLModelGroup + */ + created: optional TimeStamp + + /** + * Date when the MLModelGroup was last modified + */ + lastModified: optional TimeStamp + + /** + * List of jobs (if any) used to train the model group. Visible in Lineage. 
+ */ + @Relationship = { + "/*": { + "name": "TrainedBy", + "entityTypes": [ "dataJob" ], + "isLineage": true + } + } + trainingJobs: optional array[Urn] + /** * Version of the MLModelGroup */ diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl index 621a3e1747b504..d89d07384bba1d 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl @@ -6,6 +6,7 @@ import com.linkedin.common.Time import com.linkedin.common.VersionTag import com.linkedin.common.CustomProperties import com.linkedin.common.ExternalReference +import com.linkedin.common.TimeStamp /** * Properties associated with a ML Model @@ -15,6 +16,18 @@ import com.linkedin.common.ExternalReference } record MLModelProperties includes CustomProperties, ExternalReference { + /** + * Display name of the MLModel + */ + @Searchable = { + "fieldType": "WORD_GRAM", + "enableAutocomplete": true, + "boostScore": 10.0, + "queryByDefault": true, + } + name: optional string + + /** * Documentation of the MLModel */ @@ -27,8 +40,19 @@ record MLModelProperties includes CustomProperties, ExternalReference { /** * Date when the MLModel was developed */ + @deprecated date: optional Time + /** + * Audit stamp containing who created this and when + */ + created: optional TimeStamp + + /** + * Date when the MLModel was last modified + */ + lastModified: optional TimeStamp + /** * Version of the MLModel */ @@ -93,12 +117,12 @@ record MLModelProperties includes CustomProperties, ExternalReference { deployments: optional array[Urn] /** - * List of jobs (if any) used to train the model + * List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect. 
*/ @Relationship = { "/*": { "name": "TrainedBy", - "entityTypes": [ "dataJob" ], + "entityTypes": [ "dataJob", "dataProcessInstance" ], "isLineage": true } } diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl new file mode 100644 index 00000000000000..f8b8eeafe908b7 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl @@ -0,0 +1,36 @@ +namespace com.linkedin.ml.metadata + +import com.linkedin.common.AuditStamp +import com.linkedin.common.CustomProperties +import com.linkedin.common.ExternalReference +import com.linkedin.common.Urn +import com.linkedin.common.JobFlowUrn +import com.linkedin.common.DataJobUrn +/** + * The inputs and outputs of this training run + */ +@Aspect = { + "name": "mlTrainingRunProperties", +} +record MLTrainingRunProperties includes CustomProperties, ExternalReference { + + /** + * Run Id of the ML Training Run + */ + id: optional string + + /** + * List of URLs for the Outputs of the ML Training Run + */ + outputUrls: optional array[string] + + /** + * Hyperparameters of the ML Training Run + */ + hyperParams: optional array[MLHyperParam] + + /** + * Metrics of the ML Training Run + */ + trainingMetrics: optional array[MLMetric] +} \ No newline at end of file diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 1c3eb5b574e204..4fe170ced69f33 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -116,6 +116,10 @@ entities: - dataProcessInstanceRunEvent - status - testResults + - dataPlatformInstance + - subTypes + - container + - mlTrainingRunProperties - name: chart category: core keyAspect: chartKey diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index 827789130d8bbb..1c713fd33884b5 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -3826,12 +3826,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3840,17 +3851,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : 
"Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3866,7 +3888,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -3901,7 +3923,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -3936,7 +3958,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -3944,7 +3966,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -3952,7 +3974,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -3967,7 +3989,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -3975,7 +3997,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -3989,11 +4011,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. 
Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -4004,7 +4026,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -4020,7 +4042,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index b549cef0af84b2..77d4644f3c121a 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -3984,12 +3984,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3998,17 +4009,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -4024,7 +4046,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -4059,7 +4081,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -4094,7 +4116,7 @@ } } }, - "doc" : 
"Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -4102,7 +4124,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -4110,7 +4132,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -4125,7 +4147,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -4133,7 +4155,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -4147,11 +4169,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -4162,7 +4184,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -4178,7 +4200,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { @@ -4981,12 +5003,23 @@ "type" : "record", "name" : "MLModelGroupProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with an ML Model Group", + "doc" : "Properties associated with an ML Model Group\r", "include" : [ "com.linkedin.common.CustomProperties" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModelGroup\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModelGroup", + "doc" : "Documentation of the MLModelGroup\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -4995,12 +5028,38 @@ }, { "name" : "createdAt", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModelGroup was developed", + "doc" : "Date when the MLModelGroup was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Time and Actor who created the MLModelGroup\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModelGroup was last modified\r", "optional" : true + }, { + "name" : "trainingJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs (if any) used to train the model group. 
Visible in Lineage.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob" ], + "isLineage" : true, + "name" : "TrainedBy" + } + } }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModelGroup", + "doc" : "Version of the MLModelGroup\r", "optional" : true } ], "Aspect" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index c8be9d063eaea9..8b6def75f7a665 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -3550,12 +3550,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3564,17 +3575,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3590,7 +3612,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -3625,7 +3647,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -3660,7 +3682,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -3668,7 +3690,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -3676,7 +3698,7 @@ "type" : "array", "items" : 
"com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -3691,7 +3713,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -3699,7 +3721,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -3713,11 +3735,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -3728,7 +3750,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -3744,7 +3766,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index 8c7595c5e505d8..e4cc5c42303ee2 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -3544,12 +3544,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3558,17 +3569,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or 
MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3584,7 +3606,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -3619,7 +3641,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -3654,7 +3676,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -3662,7 +3684,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -3670,7 +3692,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -3685,7 +3707,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -3693,7 +3715,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -3707,11 +3729,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. 
Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -3722,7 +3744,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -3738,7 +3760,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index 75e5c9a559076b..e375ac698ab516 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -3978,12 +3978,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3992,17 +4003,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -4018,7 +4040,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -4053,7 +4075,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -4088,7 +4110,7 @@ } } }, - 
"doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -4096,7 +4118,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -4104,7 +4126,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -4119,7 +4141,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -4127,7 +4149,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -4141,11 +4163,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -4156,7 +4178,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -4172,7 +4194,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { @@ -4975,12 +4997,23 @@ "type" : "record", "name" : "MLModelGroupProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with an ML Model Group", + "doc" : "Properties associated with an ML Model Group\r", "include" : [ "com.linkedin.common.CustomProperties" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModelGroup\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModelGroup", + "doc" : "Documentation of the MLModelGroup\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -4989,12 +5022,38 @@ }, { "name" : "createdAt", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModelGroup was developed", + "doc" : "Date when the MLModelGroup was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Time and Actor who created the MLModelGroup\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModelGroup was last modified\r", "optional" : true + }, { + "name" : "trainingJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs (if any) used to train the model group. 
Visible in Lineage.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob" ], + "isLineage" : true, + "name" : "TrainedBy" + } + } }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModelGroup", + "doc" : "Version of the MLModelGroup\r", "optional" : true } ], "Aspect" : { From 09a9b6eef912d8f855a2cc6fdc03032f5ec7a652 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Mon, 23 Dec 2024 22:39:57 -0800 Subject: [PATCH 27/49] feat(ingest/looker): Do not emit usage for non-ingested dashboards and charts (#11647) --- .../ingestion/source/looker/looker_common.py | 9 + .../ingestion/source/looker/looker_source.py | 22 +- .../ingestion/source/looker/looker_usage.py | 40 +- .../looker/looker_mces_usage_history.json | 364 +++++++++++++++++- .../tests/integration/looker/test_looker.py | 87 ++++- 5 files changed, 482 insertions(+), 40 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index a66962f962255f..1183916e9b3fef 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -1408,6 +1408,15 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport): dashboards_with_activity: LossySet[str] = dataclasses_field( default_factory=LossySet ) + + # Entities that don't seem to exist, so we don't emit usage aspects for them despite having usage data + dashboards_skipped_for_usage: LossySet[str] = dataclasses_field( + default_factory=LossySet + ) + charts_skipped_for_usage: LossySet[str] = dataclasses_field( + default_factory=LossySet + ) + stage_latency: List[StageLatency] = dataclasses_field(default_factory=list) _looker_explore_registry: Optional[LookerExploreRegistry] = None total_explores: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index 815c5dfb1c0147..8487d5113bc1d3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -68,6 +68,7 @@ ViewField, ViewFieldType, gen_model_key, + get_urn_looker_element_id, ) from datahub.ingestion.source.looker.looker_config import LookerDashboardSourceConfig from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI @@ -165,6 +166,9 @@ def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext): # Required, as we do not ingest all folders but only those that have dashboards/looks self.processed_folders: List[str] = [] + # Keep track of ingested chart urns, to omit usage for non-ingested entities + self.chart_urns: Set[str] = set() + @staticmethod def test_connection(config_dict: dict) -> TestConnectionReport: test_report = TestConnectionReport() @@ -642,6 +646,7 @@ def _make_chart_metadata_events( chart_urn = self._make_chart_urn( element_id=dashboard_element.get_urn_element_id() ) + self.chart_urns.add(chart_urn) chart_snapshot = ChartSnapshot( urn=chart_urn, aspects=[Status(removed=False)], @@ -1380,7 +1385,9 @@ def _get_folder_and_ancestors_workunits( yield from self._emit_folder_as_container(folder) def extract_usage_stat( - self, looker_dashboards: List[looker_usage.LookerDashboardForUsage] + self, + looker_dashboards: List[looker_usage.LookerDashboardForUsage], + ingested_chart_urns: 
Set[str], ) -> List[MetadataChangeProposalWrapper]: looks: List[looker_usage.LookerChartForUsage] = [] # filter out look from all dashboard @@ -1391,6 +1398,15 @@ def extract_usage_stat( # dedup looks looks = list({str(look.id): look for look in looks}.values()) + filtered_looks = [] + for look in looks: + if not look.id: + continue + chart_urn = self._make_chart_urn(get_urn_looker_element_id(look.id)) + if chart_urn in ingested_chart_urns: + filtered_looks.append(look) + else: + self.reporter.charts_skipped_for_usage.add(look.id) # Keep stat generators to generate entity stat aspect later stat_generator_config: looker_usage.StatGeneratorConfig = ( @@ -1414,7 +1430,7 @@ def extract_usage_stat( stat_generator_config, self.reporter, self._make_chart_urn, - looks, + filtered_looks, ) mcps: List[MetadataChangeProposalWrapper] = [] @@ -1669,7 +1685,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self.source_config.extract_usage_history: self.reporter.report_stage_start("usage_extraction") usage_mcps: List[MetadataChangeProposalWrapper] = self.extract_usage_stat( - looker_dashboards_for_usage + looker_dashboards_for_usage, self.chart_urns ) for usage_mcp in usage_mcps: yield usage_mcp.as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py index ef7d64e4f42d43..098d7d73a3da84 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py @@ -42,6 +42,7 @@ TimeWindowSizeClass, _Aspect as AspectAbstract, ) +from datahub.utilities.lossy_collections import LossySet logger = logging.getLogger(__name__) @@ -170,7 +171,7 @@ def __init__( self.config = config self.looker_models = looker_models # Later it will help to find out for what are the looker entities from query result - self.id_vs_model: Dict[str, ModelForUsage] = { + self.id_to_model: Dict[str, ModelForUsage] = { self.get_id(looker_object): looker_object for looker_object in looker_models } self.post_filter = len(self.looker_models) > 100 @@ -225,6 +226,10 @@ def get_id(self, looker_object: ModelForUsage) -> str: def get_id_from_row(self, row: dict) -> str: pass + @abstractmethod + def report_skip_set(self) -> LossySet[str]: + pass + def create_mcp( self, model: ModelForUsage, aspect: Aspect ) -> MetadataChangeProposalWrapper: @@ -258,20 +263,11 @@ def _process_entity_timeseries_rows( return entity_stat_aspect - def _process_absolute_aspect(self) -> List[Tuple[ModelForUsage, AspectAbstract]]: - aspects: List[Tuple[ModelForUsage, AspectAbstract]] = [] - for looker_object in self.looker_models: - aspects.append( - (looker_object, self.to_entity_absolute_stat_aspect(looker_object)) - ) - - return aspects - def _fill_user_stat_aspect( self, entity_usage_stat: Dict[Tuple[str, str], Aspect], user_wise_rows: List[Dict], - ) -> Iterable[Tuple[ModelForUsage, Aspect]]: + ) -> Iterable[Tuple[str, Aspect]]: logger.debug("Entering fill user stat aspect") # We first resolve all the users using a threadpool to warm up the cache @@ -300,7 +296,7 @@ def _fill_user_stat_aspect( for row in user_wise_rows: # Confirm looker object was given for stat generation - looker_object = self.id_vs_model.get(self.get_id_from_row(row)) + looker_object = self.id_to_model.get(self.get_id_from_row(row)) if looker_object is None: logger.warning( "Looker object with id({}) was not register with stat generator".format( @@ -338,7 +334,7 @@ def 
_fill_user_stat_aspect( logger.debug("Starting to yield answers for user-wise counts") for (id, _), aspect in entity_usage_stat.items(): - yield self.id_vs_model[id], aspect + yield id, aspect def _execute_query(self, query: LookerQuery, query_name: str) -> List[Dict]: rows = [] @@ -357,7 +353,7 @@ def _execute_query(self, query: LookerQuery, query_name: str) -> List[Dict]: ) if self.post_filter: logger.debug("post filtering") - rows = [r for r in rows if self.get_id_from_row(r) in self.id_vs_model] + rows = [r for r in rows if self.get_id_from_row(r) in self.id_to_model] logger.debug("Filtered down to %d rows", len(rows)) except Exception as e: logger.warning(f"Failed to execute {query_name} query: {e}") @@ -378,7 +374,8 @@ def generate_usage_stat_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: return # yield absolute stat for looker entities - for looker_object, aspect in self._process_absolute_aspect(): # type: ignore + for looker_object in self.looker_models: + aspect = self.to_entity_absolute_stat_aspect(looker_object) yield self.create_mcp(looker_object, aspect) # Execute query and process the raw json which contains stat information @@ -399,10 +396,13 @@ def generate_usage_stat_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: ) user_wise_rows = self._execute_query(user_wise_query_with_filters, "user_query") # yield absolute stat for entity - for looker_object, aspect in self._fill_user_stat_aspect( + for object_id, aspect in self._fill_user_stat_aspect( entity_usage_stat, user_wise_rows ): - yield self.create_mcp(looker_object, aspect) + if object_id in self.id_to_model: + yield self.create_mcp(self.id_to_model[object_id], aspect) + else: + self.report_skip_set().add(object_id) class DashboardStatGenerator(BaseStatGenerator): @@ -425,6 +425,9 @@ def __init__( def get_stats_generator_name(self) -> str: return "DashboardStats" + def report_skip_set(self) -> LossySet[str]: + return self.report.dashboards_skipped_for_usage + def get_filter(self) -> Dict[ViewField, str]: return { HistoryViewField.HISTORY_DASHBOARD_ID: ",".join( @@ -541,6 +544,9 @@ def __init__( def get_stats_generator_name(self) -> str: return "ChartStats" + def report_skip_set(self) -> LossySet[str]: + return self.report.charts_skipped_for_usage + def get_filter(self) -> Dict[ViewField, str]: return { LookViewField.LOOK_ID: ",".join( diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 594983c8fb0f2a..ed0c5401c9029f 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -1,4 +1,66 @@ [ +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(looker,dashboard_elements.3)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.chart.ChartInfo": { + "customProperties": { + "upstream_fields": "" + }, + "title": "", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "chartUrl": "https://looker.company.com/x/", + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": 
"looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.3)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Look" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": { @@ -9,7 +71,9 @@ "customProperties": {}, "title": "foo", "description": "lorem ipsum", - "charts": [], + "charts": [ + "urn:li:chart:(looker,dashboard_elements.3)" + ], "datasets": [], "dashboards": [], "lastModified": { @@ -89,6 +153,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.3)", + "changeType": "UPSERT", + "aspectName": "inputFields", + "aspect": { + "json": { + "fields": [] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(looker,dashboards.1)", @@ -215,6 +295,98 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "looker", + "env": "PROD", + "model_name": "look_data" + }, + "name": "look_data", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "LookML Model" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Explore" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { @@ -389,6 +561,180 @@ "lastRunId": "no-run-id-provided" } }, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Explore/look_data" + ] + } + 
}, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "project": "lkml_samples", + "model": "look_data", + "looker.explore.label": "My Explore View", + "looker.explore.name": "look_view", + "looker.explore.file": "test_source_file.lkml" + }, + "externalUrl": "https://looker.company.com/explore/look_data/look_view", + "name": "My Explore View", + "description": "lorem ipsum", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", + "type": "VIEW" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "look_view", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "dim1", + "nullable": false, + "description": "dimension one description", + "label": "Dimensions One Label", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Explore" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "embed", + "aspect": { + "json": { + "renderUrl": "https://looker.company.com/embed/explore/look_data/look_view" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Explore" + }, + { + "id": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "urn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "proposedSnapshot": { 
"com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { @@ -747,22 +1093,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(looker,dashboard_elements.3)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "looker-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index a39de8384efb23..c96bcc729a95da 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -31,7 +31,10 @@ from datahub.ingestion.api.source import SourceReport from datahub.ingestion.run.pipeline import Pipeline, PipelineInitError from datahub.ingestion.source.looker import looker_common, looker_usage -from datahub.ingestion.source.looker.looker_common import LookerExplore +from datahub.ingestion.source.looker.looker_common import ( + LookerDashboardSourceReport, + LookerExplore, +) from datahub.ingestion.source.looker.looker_config import LookerCommonConfig from datahub.ingestion.source.looker.looker_lib_wrapper import ( LookerAPI, @@ -414,7 +417,9 @@ def setup_mock_dashboard_multiple_charts(mocked_client): ) -def setup_mock_dashboard_with_usage(mocked_client): +def setup_mock_dashboard_with_usage( + mocked_client: mock.MagicMock, skip_look: bool = False +) -> None: mocked_client.all_dashboards.return_value = [Dashboard(id="1")] mocked_client.dashboard.return_value = Dashboard( id="1", @@ -437,7 +442,13 @@ def setup_mock_dashboard_with_usage(mocked_client): ), ), DashboardElement( - id="3", type="", look=LookWithQuery(id="3", view_count=30) + id="3", + type="" if skip_look else "vis", # Looks only ingested if type == `vis` + look=LookWithQuery( + id="3", + view_count=30, + query=Query(model="look_data", view="look_view"), + ), ), ], ) @@ -611,6 +622,12 @@ def side_effect_query_inline( HistoryViewField.HISTORY_DASHBOARD_USER: 1, HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 5, }, + { + HistoryViewField.HISTORY_DASHBOARD_ID: "5", + HistoryViewField.HISTORY_CREATED_DATE: "2022-07-07", + HistoryViewField.HISTORY_DASHBOARD_USER: 1, + HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 5, + }, ] ), looker_usage.QueryId.DASHBOARD_PER_USER_PER_DAY_USAGE_STAT: json.dumps( @@ -790,6 +807,70 @@ def test_looker_ingest_usage_history(pytestconfig, tmp_path, mock_time): ) +@freeze_time(FROZEN_TIME) +def test_looker_filter_usage_history(pytestconfig, tmp_path, mock_time): + mocked_client = mock.MagicMock() + with mock.patch("looker_sdk.init40") as mock_sdk: + mock_sdk.return_value = mocked_client + setup_mock_dashboard_with_usage(mocked_client, skip_look=True) + mocked_client.run_inline_query.side_effect = side_effect_query_inline + setup_mock_explore(mocked_client) + setup_mock_user(mocked_client) + + temp_output_file = f"{tmp_path}/looker_mces.json" + pipeline = Pipeline.create( + { + "run_id": "looker-test", + "source": { + "type": "looker", + "config": { + "base_url": "https://looker.company.com", + "client_id": "foo", + "client_secret": "bar", + "extract_usage_history": True, + "max_threads": 1, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": temp_output_file, + }, + }, + } + ) + pipeline.run() + pipeline.pretty_print_summary() + pipeline.raise_from_status() + + # There should 
be 4 dashboardUsageStatistics aspects (one absolute and 3 timeseries)
+    dashboard_usage_aspect_count = 0
+    # There should be 0 chartUsageStatistics -- filtered by set of ingested charts
+    chart_usage_aspect_count = 0
+    with open(temp_output_file) as f:
+        temp_output_dict = json.load(f)
+        for element in temp_output_dict:
+            if (
+                element.get("entityType") == "dashboard"
+                and element.get("aspectName") == "dashboardUsageStatistics"
+            ):
+                dashboard_usage_aspect_count = dashboard_usage_aspect_count + 1
+            if (
+                element.get("entityType") == "chart"
+                and element.get("aspectName") == "chartUsageStatistics"
+            ):
+                chart_usage_aspect_count = chart_usage_aspect_count + 1
+
+    assert dashboard_usage_aspect_count == 4
+    assert chart_usage_aspect_count == 0
+
+    source_report = cast(LookerDashboardSourceReport, pipeline.source.get_report())
+    # From timeseries query
+    assert str(source_report.dashboards_skipped_for_usage) == str(["5"])
+    # From dashboard element
+    assert str(source_report.charts_skipped_for_usage) == str(["3"])
+
+
 @freeze_time(FROZEN_TIME)
 def test_looker_ingest_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph):
     output_file_name: str = "looker_mces.json"

From 87e7b58ac699005ca5757e6ef47fb853d89a6583 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?=
Date: Tue, 24 Dec 2024 10:46:19 +0100
Subject: [PATCH 28/49] fix(tableau): retry on InternalServerError 504 (#12213)

---
 .../ingestion/source/tableau/tableau.py | 23 ++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
index 984cf9357199d6..2b7aac2bea1d05 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
@@ -35,7 +35,10 @@
     SiteItem,
     TableauAuth,
 )
-from tableauserverclient.server.endpoint.exceptions import NonXMLResponseError
+from tableauserverclient.server.endpoint.exceptions import (
+    InternalServerError,
+    NonXMLResponseError,
+)
 from urllib3 import Retry

 import datahub.emitter.mce_builder as builder
@@ -1196,6 +1199,24 @@ def get_connection_object_page(
                 retry_on_auth_error=False,
                 retries_remaining=retries_remaining - 1,
             )
+
+        except InternalServerError as ise:
+            # In some cases Tableau Server returns a 504 (gateway timeout) error, so it is worth retrying.
+            if ise.code == 504:
+                if retries_remaining <= 0:
+                    raise ise
+                return self.get_connection_object_page(
+                    query=query,
+                    connection_type=connection_type,
+                    query_filter=query_filter,
+                    fetch_size=fetch_size,
+                    current_cursor=current_cursor,
+                    retry_on_auth_error=False,
+                    retries_remaining=retries_remaining - 1,
+                )
+            else:
+                raise ise
+
         except OSError:
             # In tableauserverclient 0.26 (which was yanked and released in 0.28 on 2023-10-04),
             # the request logic was changed to use threads.
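The hunk above keeps the retry bounded by passing retries_remaining - 1 into the recursive call: a persistent 504 still raises once the budget is exhausted, and any other InternalServerError is re-raised immediately. Below is a minimal, self-contained sketch of the same pattern, with hypothetical names (fetch_page and make_flaky stand in for get_connection_object_page and a real Tableau metadata query; this is not the actual DataHub implementation):

class InternalServerError(Exception):
    # Stand-in for the tableauserverclient exception; only `code` matters here.
    def __init__(self, code: int) -> None:
        super().__init__(f"server error {code}")
        self.code = code

def fetch_page(call, retries_remaining: int = 3):
    try:
        return call()
    except InternalServerError as ise:
        # Retry only gateway timeouts (504); once the budget is spent,
        # or for any other server error, let the exception propagate.
        if ise.code == 504 and retries_remaining > 0:
            return fetch_page(call, retries_remaining - 1)
        raise

def make_flaky(failures: int):
    # Returns a callable that raises a 504 `failures` times, then succeeds.
    state = {"left": failures}
    def call():
        if state["left"] > 0:
            state["left"] -= 1
            raise InternalServerError(504)
        return {"data": "ok"}
    return call

print(fetch_page(make_flaky(2)))  # recovers on the third attempt

Carrying the budget in the call arguments rather than on the instance is what lets this path share a single retries_remaining counter with the re-authentication retry shown earlier in the same hunk.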
From 4d990b06bd0df4f51443893e2efb39e09d9818b6 Mon Sep 17 00:00:00 2001
From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
Date: Tue, 24 Dec 2024 18:14:51 +0530
Subject: [PATCH 29/49] fix(ingest/snowflake): always ingest view and external table ddl lineage (#12191)

---
 docs/how/updating-datahub.md                  |  2 +-
 .../source/snowflake/snowflake_config.py      | 28 ++-----------------
 .../source/snowflake/snowflake_lineage_v2.py  | 13 ++-------
 .../source/snowflake/snowflake_query.py       |  9 ------
 .../source/snowflake/snowflake_schema_gen.py  |  6 +---
 .../source/snowflake/snowflake_shares.py      |  2 +-
 .../source/snowflake/snowflake_v2.py          | 20 +++++++++----
 .../source_report/ingestion_stage.py          |  1 +
 .../tests/integration/snowflake/common.py     |  2 --
 .../integration/snowflake/test_snowflake.py   |  2 --
 .../test_snowflake_classification.py          |  1 -
 .../snowflake/test_snowflake_failures.py      |  2 --
 .../snowflake/test_snowflake_tag.py           |  2 --
 .../performance/snowflake/test_snowflake.py   |  1 -
 .../unit/snowflake/test_snowflake_source.py   | 23 +++++++--------
 15 files changed, 36 insertions(+), 78 deletions(-)

diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index 5bc0e66fa2ff1d..a742ebe0cd8968 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -17,7 +17,7 @@ This file documents any backwards-incompatible changes in DataHub and assists people when migrating to a new version.

 ## Next
-
+- #12191 - Configs `include_view_lineage` and `include_view_column_lineage` are removed from the Snowflake ingestion source. View and External Table DDL lineage will always be ingested when definitions are available.
 - #11560 - The PowerBI ingestion source configuration option include_workspace_name_in_dataset_urn determines whether the workspace name is included in the PowerBI dataset's URN.
PowerBI allows semantic models and their tables to have identical names across workspaces, so the semantic model will be overwritten in the case of multi-workspace ingestion.
Entity urn with `include_workspace_name_in_dataset_urn: false` diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 1d1cc3c2af4f08..2b2dcf860cdb07 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -163,26 +163,13 @@ class SnowflakeConfig( default=True, description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.", ) - include_view_lineage: bool = pydantic.Field( - default=True, - description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.", - ) + + _include_view_lineage = pydantic_removed_field("include_view_lineage") + _include_view_column_lineage = pydantic_removed_field("include_view_column_lineage") ignore_start_time_lineage: bool = False upstream_lineage_in_report: bool = False - @pydantic.root_validator(skip_on_failure=True) - def validate_include_view_lineage(cls, values): - if ( - "include_table_lineage" in values - and not values.get("include_table_lineage") - and values.get("include_view_lineage") - ): - raise ValueError( - "include_table_lineage must be True for include_view_lineage to be set." - ) - return values - class SnowflakeV2Config( SnowflakeConfig, @@ -222,11 +209,6 @@ class SnowflakeV2Config( description="Populates table->table and view->table column lineage. Requires appropriate grants given to the role and the Snowflake Enterprise Edition or above.", ) - include_view_column_lineage: bool = Field( - default=True, - description="Populates view->view and table->view column lineage using DataHub's sql parser.", - ) - use_queries_v2: bool = Field( default=False, description="If enabled, uses the new queries extractor to extract queries from snowflake.", @@ -355,10 +337,6 @@ def get_sql_alchemy_url( self, database=database, username=username, password=password, role=role ) - @property - def parse_view_ddl(self) -> bool: - return self.include_view_column_lineage - @validator("shares") def validate_shares( cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index b815a6584379ac..6b200590d7ab63 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -8,7 +8,6 @@ from datahub.configuration.datetimes import parse_absolute_time from datahub.ingestion.api.closeable import Closeable -from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage from datahub.ingestion.source.snowflake.constants import ( LINEAGE_PERMISSION_ERROR, @@ -163,11 +162,11 @@ def get_time_window(self) -> Tuple[datetime, datetime]: self.config.end_time, ) - def get_workunits( + def add_time_based_lineage_to_aggregator( self, discovered_tables: List[str], discovered_views: List[str], - ) -> Iterable[MetadataWorkUnit]: + ) -> None: if not self._should_ingest_lineage(): 
return @@ -177,9 +176,7 @@ def get_workunits( # snowflake view/table -> snowflake table self.populate_table_upstreams(discovered_tables) - for mcp in self.sql_aggregator.gen_metadata(): - yield mcp.as_workunit() - + def update_state(self): if self.redundant_run_skip_handler: # Update the checkpoint state for this run. self.redundant_run_skip_handler.update_state( @@ -337,10 +334,6 @@ def _fetch_upstream_lineages_for_tables(self) -> Iterable[UpstreamLineageEdge]: start_time_millis=int(self.start_time.timestamp() * 1000), end_time_millis=int(self.end_time.timestamp() * 1000), upstreams_deny_pattern=self.config.temporary_tables_pattern, - # The self.config.include_view_lineage setting is about fetching upstreams of views. - # We always generate lineage pointing at views from tables, even if self.config.include_view_lineage is False. - # TODO: Remove this `include_view_lineage` flag, since it's effectively dead code. - include_view_lineage=True, include_column_lineage=self.config.include_column_lineage, ) try: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index 97c398c1962d6b..a94b39476b2c22 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -376,7 +376,6 @@ def view_dependencies() -> str: def table_to_table_lineage_history_v2( start_time_millis: int, end_time_millis: int, - include_view_lineage: bool = True, include_column_lineage: bool = True, upstreams_deny_pattern: List[str] = DEFAULT_TEMP_TABLES_PATTERNS, ) -> str: @@ -385,14 +384,12 @@ def table_to_table_lineage_history_v2( start_time_millis, end_time_millis, upstreams_deny_pattern, - include_view_lineage, ) else: return SnowflakeQuery.table_upstreams_only( start_time_millis, end_time_millis, upstreams_deny_pattern, - include_view_lineage, ) @staticmethod @@ -677,12 +674,9 @@ def table_upstreams_with_column_lineage( start_time_millis: int, end_time_millis: int, upstreams_deny_pattern: List[str], - include_view_lineage: bool = True, ) -> str: allowed_upstream_table_domains = ( SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER - if include_view_lineage - else SnowflakeQuery.ACCESS_HISTORY_TABLE_DOMAINS_FILTER ) upstream_sql_filter = create_deny_regex_sql_filter( @@ -847,12 +841,9 @@ def table_upstreams_only( start_time_millis: int, end_time_millis: int, upstreams_deny_pattern: List[str], - include_view_lineage: bool = True, ) -> str: allowed_upstream_table_domains = ( SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER - if include_view_lineage - else SnowflakeQuery.ACCESS_HISTORY_TABLE_DOMAINS_FILTER ) upstream_sql_filter = create_deny_regex_sql_filter( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index 4b72b09fafe2dd..8a1bf15b7a7bc4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -435,11 +435,7 @@ def _process_schema( ) if self.config.include_views: - if ( - self.aggregator - and self.config.include_view_lineage - and self.config.parse_view_ddl - ): + if self.aggregator: for view in views: view_identifier = self.identifiers.get_dataset_identifier( view.name, schema_name, db_name diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py index 794a6f4a59f46f..606acd53dc3324 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py @@ -72,7 +72,7 @@ def get_shares_workunits( assert len(sibling_dbs) == 1 # SnowflakeLineageExtractor is unaware of database->schema->table hierarchy # hence this lineage code is not written in SnowflakeLineageExtractor - # also this is not governed by configs include_table_lineage and include_view_lineage + # also this is not governed by configs include_table_lineage yield self.get_upstream_lineage_with_primary_sibling( db.name, schema.name, table_name, sibling_dbs[0] ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 884e6c49f5b62a..954e8a29c1a1bd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -82,6 +82,7 @@ LINEAGE_EXTRACTION, METADATA_EXTRACTION, QUERIES_EXTRACTION, + VIEW_PARSING, ) from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator from datahub.utilities.registries.domain_registry import DomainRegistry @@ -103,7 +104,7 @@ @capability(SourceCapability.DESCRIPTIONS, "Enabled by default") @capability( SourceCapability.LINEAGE_COARSE, - "Enabled by default, can be disabled via configuration `include_table_lineage` and `include_view_lineage`", + "Enabled by default, can be disabled via configuration `include_table_lineage`", ) @capability( SourceCapability.LINEAGE_FINE, @@ -512,15 +513,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: discovered_datasets = discovered_tables + discovered_views if self.config.use_queries_v2: - self.report.set_ingestion_stage("*", "View Parsing") - assert self.aggregator is not None + self.report.set_ingestion_stage("*", VIEW_PARSING) yield from auto_workunit(self.aggregator.gen_metadata()) self.report.set_ingestion_stage("*", QUERIES_EXTRACTION) schema_resolver = self.aggregator._schema_resolver - queries_extractor: SnowflakeQueriesExtractor = SnowflakeQueriesExtractor( + queries_extractor = SnowflakeQueriesExtractor( connection=self.connection, config=SnowflakeQueriesExtractorConfig( window=self.config, @@ -546,13 +546,21 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: queries_extractor.close() else: - if self.config.include_table_lineage and self.lineage_extractor: + if self.lineage_extractor: self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION) - yield from self.lineage_extractor.get_workunits( + self.lineage_extractor.add_time_based_lineage_to_aggregator( discovered_tables=discovered_tables, discovered_views=discovered_views, ) + # This would emit view and external table ddl lineage + # as well as query lineage via lineage_extractor + for mcp in self.aggregator.gen_metadata(): + yield mcp.as_workunit() + + if self.lineage_extractor: + self.lineage_extractor.update_state() + if ( self.config.include_usage_stats or self.config.include_operational_stats ) and self.usage_extractor: diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index 92407eaae6e901..42b3b648bd298d 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -15,6 +15,7 @@ USAGE_EXTRACTION_OPERATIONAL_STATS = "Usage Extraction Operational Stats" USAGE_EXTRACTION_USAGE_AGGREGATION = "Usage Extraction Usage Aggregation" EXTERNAL_TABLE_DDL_LINEAGE = "External table DDL Lineage" +VIEW_PARSING = "View Parsing" QUERIES_EXTRACTION = "Queries Extraction" PROFILING = "Profiling" diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 862d27186703a8..7b4f5abe1cd462 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -458,7 +458,6 @@ def default_query_results( # noqa: C901 snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=1654473600000, end_time_millis=1654586220000, - include_view_lineage=True, include_column_lineage=True, ), ): @@ -548,7 +547,6 @@ def default_query_results( # noqa: C901 snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=1654473600000, end_time_millis=1654586220000, - include_view_lineage=True, include_column_lineage=False, ), ): diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 1d7470d24f7689..ef4918a20e640c 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -117,7 +117,6 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_technical_schema=True, include_table_lineage=True, - include_view_lineage=True, include_usage_stats=True, format_sql_queries=True, validate_upstreams_against_patterns=False, @@ -216,7 +215,6 @@ def test_snowflake_private_link_and_incremental_mcps( include_table_lineage=True, include_column_lineage=False, include_views=True, - include_view_lineage=True, include_usage_stats=False, format_sql_queries=True, incremental_lineage=False, diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py index 75a9df4f280512..52453b30f740ab 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py @@ -66,7 +66,6 @@ def test_snowflake_classification_perf(num_workers, num_cols_per_table, num_tabl schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_technical_schema=True, include_table_lineage=False, - include_view_lineage=False, include_column_lineage=False, include_usage_stats=False, include_operational_stats=False, diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index 0b838b0bb59c3a..de6e996a52642b 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -49,7 +49,6 @@ def snowflake_pipeline_config(tmp_path): include_technical_schema=True, match_fully_qualified_names=True, schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), - include_view_lineage=False, 
include_usage_stats=False, start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace( tzinfo=timezone.utc @@ -227,7 +226,6 @@ def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure( snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=1654473600000, end_time_millis=1654586220000, - include_view_lineage=True, include_column_lineage=True, ) ], diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py index d5e265e7838825..9bb598cb0c1c7f 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py @@ -30,7 +30,6 @@ def test_snowflake_tag_pattern(): ), include_technical_schema=True, include_table_lineage=False, - include_view_lineage=False, include_column_lineage=False, include_usage_stats=False, include_operational_stats=False, @@ -74,7 +73,6 @@ def test_snowflake_tag_pattern_deny(): ), include_technical_schema=True, include_table_lineage=False, - include_view_lineage=False, include_column_lineage=False, include_usage_stats=False, include_operational_stats=False, diff --git a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py index 5042c78c2e7b91..984d9e42957452 100644 --- a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py @@ -37,7 +37,6 @@ def run_test(): password="TST_PWD", include_technical_schema=False, include_table_lineage=True, - include_view_lineage=True, include_usage_stats=True, include_operational_stats=True, start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace(tzinfo=timezone.utc), diff --git a/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py index 2ff85a08f052f9..75f32b535eb2e8 100644 --- a/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py @@ -257,17 +257,6 @@ def test_options_contain_connect_args(): assert connect_args is not None -def test_snowflake_config_with_view_lineage_no_table_lineage_throws_error(): - config_dict = default_config_dict.copy() - config_dict["include_view_lineage"] = True - config_dict["include_table_lineage"] = False - with pytest.raises( - ValidationError, - match="include_table_lineage must be True for include_view_lineage to be set", - ): - SnowflakeV2Config.parse_obj(config_dict) - - def test_snowflake_config_with_column_lineage_no_table_lineage_throws_error(): config_dict = default_config_dict.copy() config_dict["include_column_lineage"] = True @@ -667,6 +656,18 @@ def test_snowflake_utils() -> None: assert_doctest(datahub.ingestion.source.snowflake.snowflake_utils) +def test_using_removed_fields_causes_no_error() -> None: + assert SnowflakeV2Config.parse_obj( + { + "account_id": "test", + "username": "snowflake", + "password": "snowflake", + "include_view_lineage": "true", + "include_view_column_lineage": "true", + } + ) + + def test_snowflake_query_result_parsing(): db_row = { "DOWNSTREAM_TABLE_NAME": "db.schema.downstream_table", From d88e6c997713509d8ecdb463c42d072c5c857853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Tue, 24 Dec 2024 16:03:36 +0100 Subject: [PATCH 30/49] fix(tableau): fixes wrong argument when reauthenticating (#12216) --- 
.../ingestion/source/tableau/tableau.py | 48 +++++++++++-------- .../tableau/test_tableau_ingest.py | 10 ++-- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 2b7aac2bea1d05..508500ffe489b9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -621,6 +621,12 @@ def update_table( self.parsed_columns = parsed_columns +@dataclass +class SiteIdContentUrl: + site_id: str + site_content_url: str + + class TableauSourceReport(StaleEntityRemovalSourceReport): get_all_datasources_query_failed: bool = False num_get_datasource_query_failures: int = 0 @@ -773,7 +779,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: config=self.config, ctx=self.ctx, site=site, - site_id=site.id, report=self.report, server=self.server, platform=self.platform, @@ -792,8 +797,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: site_source = TableauSiteSource( config=self.config, ctx=self.ctx, - site=site, - site_id=self.server.site_id, + site=site + if site + else SiteIdContentUrl( + site_id=self.server.site_id, site_content_url=self.config.site + ), report=self.report, server=self.server, platform=self.platform, @@ -826,8 +834,7 @@ def __init__( self, config: TableauConfig, ctx: PipelineContext, - site: Optional[SiteItem], - site_id: Optional[str], + site: Union[SiteItem, SiteIdContentUrl], report: TableauSourceReport, server: Server, platform: str, @@ -838,13 +845,18 @@ def __init__( self.ctx: PipelineContext = ctx self.platform = platform - self.site: Optional[SiteItem] = site - if site_id is not None: - self.site_id: str = site_id + self.site: Optional[SiteItem] = None + if isinstance(site, SiteItem): + self.site = site + assert site.id is not None, "Site ID is required" + self.site_id = site.id + self.site_content_url = site.content_url + elif isinstance(site, SiteIdContentUrl): + self.site = None + self.site_id = site.site_id + self.site_content_url = site.site_content_url else: - assert self.site is not None, "site or site_id is required" - assert self.site.id is not None, "site_id is required when site is provided" - self.site_id = self.site.id + raise AssertionError("site or site id+content_url pair is required") self.database_tables: Dict[str, DatabaseTable] = {} self.tableau_stat_registry: Dict[str, UsageStat] = {} @@ -898,16 +910,14 @@ def dataset_browse_prefix(self) -> str: # datasets also have the env in the browse path return f"/{self.config.env.lower()}{self.no_env_browse_prefix}" - def _re_authenticate(self): + def _re_authenticate(self) -> None: + self.report.info( + message="Re-authenticating to Tableau", + context=f"site='{self.site_content_url}'", + ) # Sign-in again may not be enough because Tableau sometimes caches invalid sessions # so we need to recreate the Tableau Server object - self.server = self.config.make_tableau_client(self.site_id) - - @property - def site_content_url(self) -> Optional[str]: - if self.site and self.site.content_url: - return self.site.content_url - return None + self.server = self.config.make_tableau_client(self.site_content_url) def _populate_usage_stat_registry(self) -> None: if self.server is None: diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 
c3a8880bf20a09..902ff243c802a8 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -26,6 +26,7 @@ from datahub.ingestion.run.pipeline import Pipeline, PipelineContext from datahub.ingestion.source.tableau import tableau_constant as c from datahub.ingestion.source.tableau.tableau import ( + SiteIdContentUrl, TableauConfig, TableauProject, TableauSiteSource, @@ -1008,8 +1009,7 @@ def check_lineage_metadata( config=config, ctx=context, platform="tableau", - site=SiteItem(name="Site 1", content_url="site1"), - site_id="site1", + site=SiteIdContentUrl(site_id="id1", site_content_url="site1"), report=TableauSourceReport(), server=Server("https://test-tableau-server.com"), ) @@ -1313,8 +1313,7 @@ def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph): platform="tableau", config=mock.MagicMock(), ctx=mock.MagicMock(), - site=mock.MagicMock(), - site_id=None, + site=mock.MagicMock(spec=SiteItem, id="Site1", content_url="site1"), server=mock_sdk.return_value, report=reporter, ) @@ -1371,8 +1370,7 @@ def test_extract_project_hierarchy(extract_project_hierarchy, allowed_projects): config=config, ctx=context, platform="tableau", - site=SiteItem(name="Site 1", content_url="site1"), - site_id="site1", + site=mock.MagicMock(spec=SiteItem, id="Site1", content_url="site1"), report=TableauSourceReport(), server=Server("https://test-tableau-server.com"), ) From 48736a03dd56c70a3894efcbef6e95a23d8cbfdd Mon Sep 17 00:00:00 2001 From: sagar-salvi-apptware <159135491+sagar-salvi-apptware@users.noreply.github.com> Date: Wed, 25 Dec 2024 00:57:27 +0530 Subject: [PATCH 31/49] fix(ingest/looker): Add flag for Looker metadata extraction (#12205) Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../src/datahub/sql_parsing/tool_meta_extractor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py index 5af9d9d4f0fffc..d2682252e0fbf5 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py +++ b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py @@ -40,6 +40,7 @@ def _get_last_line(query: str) -> str: class ToolMetaExtractorReport(Report): num_queries_meta_extracted: Dict[str, int] = field(default_factory=int_top_k_dict) failures: List[str] = field(default_factory=list) + looker_user_mapping_missing: Optional[bool] = None class ToolMetaExtractor: @@ -108,7 +109,9 @@ def extract_looker_user_mapping_from_graph( PlatformResource.search_by_filters(query=query, graph_client=graph) ) - if len(platform_resources) > 1: + if len(platform_resources) == 0: + report.looker_user_mapping_missing = True + elif len(platform_resources) > 1: report.failures.append( "Looker user metadata extraction failed. Found more than one looker user id mappings." 
) From f4b33b59d1726dd962db4d3300f085cc60626a81 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 24 Dec 2024 11:33:06 -0800 Subject: [PATCH 32/49] fix(ingest/mode): Handle 204 response and invalid json (#12156) Co-authored-by: Aseem Bansal --- .../src/datahub/ingestion/source/mode.py | 46 +++--- .../tests/integration/mode/test_mode.py | 141 ++++++++++++++++-- 2 files changed, 151 insertions(+), 36 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mode.py b/metadata-ingestion/src/datahub/ingestion/source/mode.py index ef0b499129f97b..68ecc5d8694ac5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mode.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mode.py @@ -5,6 +5,7 @@ from dataclasses import dataclass from datetime import datetime, timezone from functools import lru_cache +from json import JSONDecodeError from typing import Dict, Iterable, List, Optional, Set, Tuple, Union import dateutil.parser as dp @@ -193,6 +194,9 @@ class HTTPError429(HTTPError): pass +ModeRequestError = (HTTPError, JSONDecodeError) + + @dataclass class ModeSourceReport(StaleEntityRemovalSourceReport): filtered_spaces: LossyList[str] = dataclasses.field(default_factory=LossyList) @@ -328,11 +332,11 @@ def __init__(self, ctx: PipelineContext, config: ModeConfig): # Test the connection try: self._get_request_json(f"{self.config.connect_uri}/api/verify") - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Connect", message="Unable to verify connection to mode.", - context=f"Error: {str(http_error)}", + context=f"Error: {str(e)}", ) self.workspace_uri = f"{self.config.connect_uri}/api/{self.config.workspace}" @@ -521,11 +525,11 @@ def _get_creator(self, href: str) -> Optional[str]: if self.config.owner_username_instead_of_email else user_json.get("email") ) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_warning( title="Failed to retrieve Mode creator", message=f"Unable to retrieve user for {href}", - context=f"Reason: {str(http_error)}", + context=f"Reason: {str(e)}", ) return user @@ -571,11 +575,11 @@ def _get_space_name_and_tokens(self) -> dict: logging.debug(f"Skipping space {space_name} due to space pattern") continue space_info[s.get("token", "")] = s.get("name", "") - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Spaces", message="Unable to retrieve spaces / collections for workspace.", - context=f"Workspace: {self.workspace_uri}, Error: {str(http_error)}", + context=f"Workspace: {self.workspace_uri}, Error: {str(e)}", ) return space_info @@ -721,11 +725,11 @@ def _get_data_sources(self) -> List[dict]: try: ds_json = self._get_request_json(f"{self.workspace_uri}/data_sources") data_sources = ds_json.get("_embedded", {}).get("data_sources", []) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to retrieve Data Sources", message="Unable to retrieve data sources from Mode.", - context=f"Error: {str(http_error)}", + context=f"Error: {str(e)}", ) return data_sources @@ -812,11 +816,11 @@ def _get_definition(self, definition_name): if definition.get("name", "") == definition_name: return definition.get("source", "") - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Definition", message="Unable to retrieve definition from Mode.", - context=f"Definition Name: 
{definition_name}, Error: {str(http_error)}", + context=f"Definition Name: {definition_name}, Error: {str(e)}", ) return None @@ -1382,11 +1386,11 @@ def _get_reports(self, space_token: str) -> List[dict]: f"{self.workspace_uri}/spaces/{space_token}/reports" ) reports = reports_json.get("_embedded", {}).get("reports", {}) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Reports for Space", message="Unable to retrieve reports for space token.", - context=f"Space Token: {space_token}, Error: {str(http_error)}", + context=f"Space Token: {space_token}, Error: {str(e)}", ) return reports @@ -1400,11 +1404,11 @@ def _get_datasets(self, space_token: str) -> List[dict]: url = f"{self.workspace_uri}/spaces/{space_token}/datasets" datasets_json = self._get_request_json(url) datasets = datasets_json.get("_embedded", {}).get("reports", []) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Datasets for Space", message=f"Unable to retrieve datasets for space token {space_token}.", - context=f"Error: {str(http_error)}", + context=f"Error: {str(e)}", ) return datasets @@ -1416,11 +1420,11 @@ def _get_queries(self, report_token: str) -> list: f"{self.workspace_uri}/reports/{report_token}/queries" ) queries = queries_json.get("_embedded", {}).get("queries", {}) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Queries", message="Unable to retrieve queries for report token.", - context=f"Report Token: {report_token}, Error: {str(http_error)}", + context=f"Report Token: {report_token}, Error: {str(e)}", ) return queries @@ -1433,11 +1437,11 @@ def _get_last_query_run( f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs{query_run_id}" ) queries = queries_json.get("_embedded", {}).get("queries", {}) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Queries for Report", message="Unable to retrieve queries for report token.", - context=f"Report Token:{report_token}, Error: {str(http_error)}", + context=f"Report Token:{report_token}, Error: {str(e)}", ) return {} return queries @@ -1451,13 +1455,13 @@ def _get_charts(self, report_token: str, query_token: str) -> list: f"/queries/{query_token}/charts" ) charts = charts_json.get("_embedded", {}).get("charts", {}) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Charts", message="Unable to retrieve charts from Mode.", context=f"Report Token: {report_token}, " f"Query token: {query_token}, " - f"Error: {str(http_error)}", + f"Error: {str(e)}", ) return charts @@ -1477,6 +1481,8 @@ def get_request(): response = self.session.get( url, timeout=self.config.api_options.timeout ) + if response.status_code == 204: # No content, don't parse json + return {} return response.json() except HTTPError as http_error: error_response = http_error.response diff --git a/metadata-ingestion/tests/integration/mode/test_mode.py b/metadata-ingestion/tests/integration/mode/test_mode.py index ce7533d5611e49..7f1e3935aa0fa1 100644 --- a/metadata-ingestion/tests/integration/mode/test_mode.py +++ b/metadata-ingestion/tests/integration/mode/test_mode.py @@ -1,11 +1,14 @@ import json import pathlib +from typing import Sequence from unittest.mock import patch +import pytest from freezegun import freeze_time from requests.models 
import HTTPError from datahub.configuration.common import PipelineExecutionError +from datahub.ingestion.api.source import StructuredLogEntry from datahub.ingestion.run.pipeline import Pipeline from tests.test_helpers import mce_helpers @@ -28,7 +31,7 @@ "https://app.mode.com/api/acryl/reports/24f66e1701b6/queries": "dataset_queries_24f66e1701b6.json", } -RESPONSE_ERROR_LIST = ["https://app.mode.com/api/acryl/spaces/75737b70402e/reports"] +ERROR_URL = "https://app.mode.com/api/acryl/spaces/75737b70402e/reports" test_resources_dir = pathlib.Path(__file__).parent @@ -49,6 +52,14 @@ def mount(self, prefix, adaptor): return self def get(self, url, timeout=40): + if self.error_list is not None and self.url in self.error_list: + http_error_msg = "{} Client Error: {} for url: {}".format( + 400, + "Simulate error", + self.url, + ) + raise HTTPError(http_error_msg, response=self) + self.url = url self.timeout = timeout response_json_path = f"{test_resources_dir}/setup/{JSON_RESPONSE_MAP.get(url)}" @@ -57,29 +68,46 @@ def get(self, url, timeout=40): self.json_data = data return self - def raise_for_status(self): - if self.error_list is not None and self.url in self.error_list: - http_error_msg = "{} Client Error: {} for url: {}".format( - 400, - "Simulate error", - self.url, - ) - raise HTTPError(http_error_msg, response=self) + +class MockResponseJson(MockResponse): + def __init__( + self, + status_code: int = 200, + *, + json_empty_list: Sequence[str] = (), + json_error_list: Sequence[str] = (), + ): + super().__init__(None, status_code) + self.json_empty_list = json_empty_list + self.json_error_list = json_error_list + + def json(self): + if self.url in self.json_empty_list: + return json.loads("") # Shouldn't be called + if self.url in self.json_error_list: + return json.loads("{") + return super().json() + + def get(self, url, timeout=40): + response = super().get(url, timeout) + if self.url in self.json_empty_list: + response.status_code = 204 + return response -def mocked_requests_sucess(*args, **kwargs): +def mocked_requests_success(*args, **kwargs): return MockResponse(None, 200) def mocked_requests_failure(*args, **kwargs): - return MockResponse(RESPONSE_ERROR_LIST, 200) + return MockResponse([ERROR_URL], 200) @freeze_time(FROZEN_TIME) def test_mode_ingest_success(pytestconfig, tmp_path): with patch( "datahub.ingestion.source.mode.requests.Session", - side_effect=mocked_requests_sucess, + side_effect=mocked_requests_success, ): pipeline = Pipeline.create( { @@ -142,8 +170,89 @@ def test_mode_ingest_failure(pytestconfig, tmp_path): } ) pipeline.run() - try: + with pytest.raises(PipelineExecutionError) as exec_error: pipeline.raise_from_status() - except PipelineExecutionError as exec_error: - assert exec_error.args[0] == "Source reported errors" - assert len(exec_error.args[1].failures) == 1 + assert exec_error.value.args[0] == "Source reported errors" + assert len(exec_error.value.args[1].failures) == 1 + error_dict: StructuredLogEntry + _level, error_dict = exec_error.value.args[1].failures[0] + error = next(iter(error_dict.context)) + assert "Simulate error" in error + assert ERROR_URL in error + + +@freeze_time(FROZEN_TIME) +def test_mode_ingest_json_empty(pytestconfig, tmp_path): + with patch( + "datahub.ingestion.source.mode.requests.Session", + side_effect=lambda *args, **kwargs: MockResponseJson( + json_empty_list=["https://app.mode.com/api/modeuser"] + ), + ): + global test_resources_dir + test_resources_dir = pytestconfig.rootpath / "tests/integration/mode" + + pipeline = 
Pipeline.create( + { + "run_id": "mode-test", + "source": { + "type": "mode", + "config": { + "token": "xxxx", + "password": "xxxx", + "connect_uri": "https://app.mode.com/", + "workspace": "acryl", + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/mode_mces.json", + }, + }, + } + ) + pipeline.run() + pipeline.raise_from_status(raise_warnings=True) + + +@freeze_time(FROZEN_TIME) +def test_mode_ingest_json_failure(pytestconfig, tmp_path): + with patch( + "datahub.ingestion.source.mode.requests.Session", + side_effect=lambda *args, **kwargs: MockResponseJson( + json_error_list=["https://app.mode.com/api/modeuser"] + ), + ): + global test_resources_dir + test_resources_dir = pytestconfig.rootpath / "tests/integration/mode" + + pipeline = Pipeline.create( + { + "run_id": "mode-test", + "source": { + "type": "mode", + "config": { + "token": "xxxx", + "password": "xxxx", + "connect_uri": "https://app.mode.com/", + "workspace": "acryl", + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/mode_mces.json", + }, + }, + } + ) + pipeline.run() + pipeline.raise_from_status(raise_warnings=False) + with pytest.raises(PipelineExecutionError) as exec_error: + pipeline.raise_from_status(raise_warnings=True) + assert len(exec_error.value.args[1].warnings) > 0 + error_dict: StructuredLogEntry + _level, error_dict = exec_error.value.args[1].warnings[0] + error = next(iter(error_dict.context)) + assert "Expecting property name enclosed in double quotes" in error From 756b199506d57449c60a3a28901f7d22fe89f9f1 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 24 Dec 2024 14:56:35 -0800 Subject: [PATCH 33/49] fix(ingest/glue): Add additional checks and logging when specifying catalog_id (#12168) --- .../src/datahub/ingestion/source/aws/glue.py | 14 +++++- .../tests/unit/glue/glue_mces_golden.json | 2 +- .../glue/glue_mces_golden_table_lineage.json | 2 +- .../glue_mces_platform_instance_golden.json | 2 +- .../tests/unit/glue/test_glue_source.py | 43 +++++++++++++++++-- .../tests/unit/glue/test_glue_source_stubs.py | 8 ++-- 6 files changed, 59 insertions(+), 12 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 37c146218e2633..7a5ed154d40bc7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -52,6 +52,7 @@ platform_name, support_status, ) +from datahub.ingestion.api.report import EntityFilterReport from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws import s3_util @@ -115,7 +116,6 @@ logger = logging.getLogger(__name__) - DEFAULT_PLATFORM = "glue" VALID_PLATFORMS = [DEFAULT_PLATFORM, "athena"] @@ -220,6 +220,7 @@ def platform_validator(cls, v: str) -> str: class GlueSourceReport(StaleEntityRemovalSourceReport): tables_scanned = 0 filtered: List[str] = dataclass_field(default_factory=list) + databases: EntityFilterReport = EntityFilterReport.field(type="database") num_job_script_location_missing: int = 0 num_job_script_location_invalid: int = 0 @@ -668,6 +669,7 @@ def get_datajob_wu(self, node: Dict[str, Any], job_name: str) -> MetadataWorkUni return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce) def get_all_databases(self) -> Iterable[Mapping[str, Any]]: + logger.debug("Getting all databases") # see 
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetDatabases.html paginator = self.glue_client.get_paginator("get_databases") @@ -684,10 +686,18 @@ def get_all_databases(self) -> Iterable[Mapping[str, Any]]: pattern += "[?!TargetDatabase]" for database in paginator_response.search(pattern): - if self.source_config.database_pattern.allowed(database["Name"]): + if (not self.source_config.database_pattern.allowed(database["Name"])) or ( + self.source_config.catalog_id + and database.get("CatalogId") + and database.get("CatalogId") != self.source_config.catalog_id + ): + self.report.databases.dropped(database["Name"]) + else: + self.report.databases.processed(database["Name"]) yield database def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict]: + logger.debug(f"Getting tables from database {database['Name']}") # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetTables.html paginator = self.glue_client.get_paginator("get_tables") database_name = database["Name"] diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json index 87971de12fbb39..71d7c31b222bde 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json @@ -124,7 +124,7 @@ "CreateTime": "June 01, 2021 at 14:55:13" }, "name": "empty-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database", + "qualifiedName": "arn:aws:glue:us-west-2:000000000000:database/empty-database", "env": "PROD" } } diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json index e2dd4cec97c2ec..22bb4b53b91efd 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json @@ -124,7 +124,7 @@ "CreateTime": "June 01, 2021 at 14:55:13" }, "name": "empty-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database", + "qualifiedName": "arn:aws:glue:us-west-2:000000000000:database/empty-database", "env": "PROD" } } diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json index 0b883062763f41..b700335c26e5aa 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json @@ -129,7 +129,7 @@ "CreateTime": "June 01, 2021 at 14:55:13" }, "name": "empty-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database", + "qualifiedName": "arn:aws:glue:us-west-2:000000000000:database/empty-database", "env": "PROD" } } diff --git a/metadata-ingestion/tests/unit/glue/test_glue_source.py b/metadata-ingestion/tests/unit/glue/test_glue_source.py index 693fd6bc336fd3..9e3f260a23f1c8 100644 --- a/metadata-ingestion/tests/unit/glue/test_glue_source.py +++ b/metadata-ingestion/tests/unit/glue/test_glue_source.py @@ -35,8 +35,8 @@ validate_all_providers_have_committed_successfully, ) from tests.unit.glue.test_glue_source_stubs import ( - databases_1, - databases_2, + empty_database, + flights_database, get_bucket_tagging, get_databases_delta_response, get_databases_response, @@ -64,6 +64,7 @@ tables_2, tables_profiling_1, 
target_database_tables, + test_database, ) FROZEN_TIME = "2020-04-14 07:00:00" @@ -310,6 +311,40 @@ def test_config_without_platform(): assert source.platform == "glue" +def test_get_databases_filters_by_catalog(): + def format_databases(databases): + return set(d["Name"] for d in databases) + + all_catalogs_source: GlueSource = GlueSource( + config=GlueSourceConfig(aws_region="us-west-2"), + ctx=PipelineContext(run_id="glue-source-test"), + ) + with Stubber(all_catalogs_source.glue_client) as glue_stubber: + glue_stubber.add_response("get_databases", get_databases_response, {}) + + expected = [flights_database, test_database, empty_database] + actual = all_catalogs_source.get_all_databases() + assert format_databases(actual) == format_databases(expected) + assert all_catalogs_source.report.databases.dropped_entities.as_obj() == [] + + catalog_id = "123412341234" + single_catalog_source: GlueSource = GlueSource( + config=GlueSourceConfig(catalog_id=catalog_id, aws_region="us-west-2"), + ctx=PipelineContext(run_id="glue-source-test"), + ) + with Stubber(single_catalog_source.glue_client) as glue_stubber: + glue_stubber.add_response( + "get_databases", get_databases_response, {"CatalogId": catalog_id} + ) + + expected = [flights_database, test_database] + actual = single_catalog_source.get_all_databases() + assert format_databases(actual) == format_databases(expected) + assert single_catalog_source.report.databases.dropped_entities.as_obj() == [ + "empty-database" + ] + + @freeze_time(FROZEN_TIME) def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): deleted_actor_golden_mcs = "{}/glue_deleted_actor_mces_golden.json".format( @@ -357,8 +392,8 @@ def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): tables_on_first_call = tables_1 tables_on_second_call = tables_2 mock_get_all_databases_and_tables.side_effect = [ - (databases_1, tables_on_first_call), - (databases_2, tables_on_second_call), + ([flights_database], tables_on_first_call), + ([test_database], tables_on_second_call), ] pipeline_run1 = run_and_get_pipeline(pipeline_config_dict) diff --git a/metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py b/metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py index dba1eea3010c2f..43bf62fd4e3b8a 100644 --- a/metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py +++ b/metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py @@ -88,12 +88,14 @@ "Permissions": ["ALL"], } ], - "CatalogId": "123412341234", + "CatalogId": "000000000000", }, ] } -databases_1 = [{"Name": "flights-database", "CatalogId": "123412341234"}] -databases_2 = [{"Name": "test-database", "CatalogId": "123412341234"}] +flights_database = {"Name": "flights-database", "CatalogId": "123412341234"} +test_database = {"Name": "test-database", "CatalogId": "123412341234"} +empty_database = {"Name": "empty-database", "CatalogId": "000000000000"} + tables_1 = [ { "Name": "avro", From 16698da509ec5c9f86188db7a16c38cea19d61bf Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Thu, 26 Dec 2024 18:10:09 +0530 Subject: [PATCH 34/49] fix(ingest/gc): misc fixes in gc source (#12226) --- .../datahub/ingestion/source/gc/datahub_gc.py | 23 +++++-- .../source/gc/execution_request_cleanup.py | 61 +++++++++++++++---- .../source_report/ingestion_stage.py | 1 + 3 files changed, 68 insertions(+), 17 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py index 
4eecbb4d9d7177..168b787b85e8be 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py @@ -34,6 +34,7 @@ SoftDeletedEntitiesCleanupConfig, SoftDeletedEntitiesReport, ) +from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport logger = logging.getLogger(__name__) @@ -86,6 +87,7 @@ class DataHubGcSourceReport( DataProcessCleanupReport, SoftDeletedEntitiesReport, DatahubExecutionRequestCleanupReport, + IngestionStageReport, ): expired_tokens_revoked: int = 0 @@ -139,31 +141,40 @@ def get_workunits_internal( ) -> Iterable[MetadataWorkUnit]: if self.config.cleanup_expired_tokens: try: + self.report.report_ingestion_stage_start("Expired Token Cleanup") self.revoke_expired_tokens() except Exception as e: self.report.failure("While trying to cleanup expired token ", exc=e) if self.config.truncate_indices: try: + self.report.report_ingestion_stage_start("Truncate Indices") self.truncate_indices() except Exception as e: self.report.failure("While trying to truncate indices ", exc=e) if self.config.soft_deleted_entities_cleanup.enabled: try: + self.report.report_ingestion_stage_start( + "Soft Deleted Entities Cleanup" + ) self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities() except Exception as e: self.report.failure( "While trying to cleanup soft deleted entities ", exc=e ) - if self.config.execution_request_cleanup.enabled: - try: - self.execution_request_cleanup.run() - except Exception as e: - self.report.failure("While trying to cleanup execution request ", exc=e) if self.config.dataprocess_cleanup.enabled: try: + self.report.report_ingestion_stage_start("Data Process Cleanup") yield from self.dataprocess_cleanup.get_workunits_internal() except Exception as e: self.report.failure("While trying to cleanup data process ", exc=e) + if self.config.execution_request_cleanup.enabled: + try: + self.report.report_ingestion_stage_start("Execution request Cleanup") + self.execution_request_cleanup.run() + except Exception as e: + self.report.failure("While trying to cleanup execution request ", exc=e) + # Otherwise last stage's duration does not get calculated. 
+ self.report.report_ingestion_stage_start("End") yield from [] def truncate_indices(self) -> None: @@ -281,6 +292,8 @@ def revoke_expired_tokens(self) -> None: list_access_tokens = expired_tokens_res.get("listAccessTokens", {}) tokens = list_access_tokens.get("tokens", []) total = list_access_tokens.get("total", 0) + if tokens == []: + break for token in tokens: self.report.expired_tokens_revoked += 1 token_id = token["id"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py index 3baf858e44cdc8..170a6ada3e336f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py @@ -1,3 +1,4 @@ +import datetime import logging import time from typing import Any, Dict, Iterator, Optional @@ -42,16 +43,28 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel): description="Global switch for this cleanup task", ) + runtime_limit_seconds: int = Field( + default=3600, + description="Maximum runtime in seconds for the cleanup task", + ) + + max_read_errors: int = Field( + default=10, + description="Maximum number of read errors before aborting", + ) + def keep_history_max_milliseconds(self): return self.keep_history_max_days * 24 * 3600 * 1000 class DatahubExecutionRequestCleanupReport(SourceReport): - execution_request_cleanup_records_read: int = 0 - execution_request_cleanup_records_preserved: int = 0 - execution_request_cleanup_records_deleted: int = 0 - execution_request_cleanup_read_errors: int = 0 - execution_request_cleanup_delete_errors: int = 0 + ergc_records_read: int = 0 + ergc_records_preserved: int = 0 + ergc_records_deleted: int = 0 + ergc_read_errors: int = 0 + ergc_delete_errors: int = 0 + ergc_start_time: Optional[datetime.datetime] = None + ergc_end_time: Optional[datetime.datetime] = None class CleanupRecord(BaseModel): @@ -124,6 +137,13 @@ def _scroll_execution_requests( params.update(overrides) while True: + if self._reached_runtime_limit(): + break + if self.report.ergc_read_errors >= self.config.max_read_errors: + self.report.failure( + f"ergc({self.instance_id}): too many read errors, aborting." 
+ ) + break try: url = f"{self.graph.config.server}/openapi/v2/entity/{DATAHUB_EXECUTION_REQUEST_ENTITY_NAME}" response = self.graph._session.get(url, headers=headers, params=params) @@ -141,7 +161,7 @@ def _scroll_execution_requests( logger.error( f"ergc({self.instance_id}): failed to fetch next batch of execution requests: {e}" ) - self.report.execution_request_cleanup_read_errors += 1 + self.report.ergc_read_errors += 1 def _scroll_garbage_records(self): state: Dict[str, Dict] = {} @@ -150,7 +170,7 @@ def _scroll_garbage_records(self): running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000 for entry in self._scroll_execution_requests(): - self.report.execution_request_cleanup_records_read += 1 + self.report.ergc_records_read += 1 key = entry.ingestion_source # Always delete corrupted records @@ -171,7 +191,7 @@ def _scroll_garbage_records(self): # Do not delete if number of requests is below minimum if state[key]["count"] < self.config.keep_history_min_count: - self.report.execution_request_cleanup_records_preserved += 1 + self.report.ergc_records_preserved += 1 continue # Do not delete if number of requests do not exceed allowed maximum, @@ -179,7 +199,7 @@ def _scroll_garbage_records(self): if (state[key]["count"] < self.config.keep_history_max_count) and ( entry.requested_at > state[key]["cutoffTimestamp"] ): - self.report.execution_request_cleanup_records_preserved += 1 + self.report.ergc_records_preserved += 1 continue # Do not delete if status is RUNNING or PENDING and created within last month. If the record is >month old and it did not @@ -188,7 +208,7 @@ def _scroll_garbage_records(self): "RUNNING", "PENDING", ]: - self.report.execution_request_cleanup_records_preserved += 1 + self.report.ergc_records_preserved += 1 continue # Otherwise delete current record @@ -200,7 +220,7 @@ def _scroll_garbage_records(self): f"record timestamp: {entry.requested_at}." ) ) - self.report.execution_request_cleanup_records_deleted += 1 + self.report.ergc_records_deleted += 1 yield entry def _delete_entry(self, entry: CleanupRecord) -> None: @@ -210,17 +230,31 @@ def _delete_entry(self, entry: CleanupRecord) -> None: ) self.graph.delete_entity(entry.urn, True) except Exception as e: - self.report.execution_request_cleanup_delete_errors += 1 + self.report.ergc_delete_errors += 1 logger.error( f"ergc({self.instance_id}): failed to delete ExecutionRequest {entry.request_id}: {e}" ) + def _reached_runtime_limit(self) -> bool: + if ( + self.config.runtime_limit_seconds + and self.report.ergc_start_time + and ( + datetime.datetime.now() - self.report.ergc_start_time + >= datetime.timedelta(seconds=self.config.runtime_limit_seconds) + ) + ): + logger.info(f"ergc({self.instance_id}): max runtime reached.") + return True + return False + def run(self) -> None: if not self.config.enabled: logger.info( f"ergc({self.instance_id}): ExecutionRequest cleaner is disabled." ) return + self.report.ergc_start_time = datetime.datetime.now() logger.info( ( @@ -232,8 +266,11 @@ def run(self) -> None: ) for entry in self._scroll_garbage_records(): + if self._reached_runtime_limit(): + break self._delete_entry(entry) + self.report.ergc_end_time = datetime.datetime.now() logger.info( f"ergc({self.instance_id}): Finished cleanup of ExecutionRequest records." 
         )
diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py
index 42b3b648bd298d..ce683e64b3f468 100644
--- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py
@@ -42,4 +42,5 @@ def report_ingestion_stage_start(self, stage: str) -> None:
             self._timer = PerfTimer()
 
         self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+        logger.info(f"Stage started: {self.ingestion_stage}")
         self._timer.start()

From fe43f076dadc754313864f7a9ca286223ebb0275 Mon Sep 17 00:00:00 2001
From: Chakru <161002324+chakru-r@users.noreply.github.com>
Date: Thu, 26 Dec 2024 21:22:16 +0530
Subject: [PATCH 35/49] Parallelize smoke test (#12225)

---
 .github/workflows/docker-unified.yml         | 50 +++++++++++++------
 smoke-test/.gitignore                        |  4 +-
 smoke-test/build.gradle                      | 38 ++++++--------
 smoke-test/conftest.py                       | 52 ++++++++++++++++++++
 smoke-test/smoke.sh                          | 25 ++++++----
 smoke-test/tests/cypress/integration_test.py | 49 ++++++++++++------
 6 files changed, 155 insertions(+), 63 deletions(-)

diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml
index 03a9b3afc3bc58..47c26068347c07 100644
--- a/.github/workflows/docker-unified.yml
+++ b/.github/workflows/docker-unified.yml
@@ -1011,18 +1011,39 @@ jobs:
     needs: setup
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
+      cypress_batch_count: ${{ steps.set-batch-count.outputs.cypress_batch_count }}
+      python_batch_count: ${{ steps.set-batch-count.outputs.python_batch_count }}
     steps:
+      - id: set-batch-count
+        # Tests are split simply to ensure the configured number of batches for parallelization. This may need to
+        # increase as newly added tests increase the duration to the point where an additional parallel batch helps.
+        # python_batch_count is used to split pytests in the smoke-test (batches of actual test functions).
+        # cypress_batch_count is used to split the collection of cypress test specs into batches.
+        run: |
+          echo "cypress_batch_count=11" >> "$GITHUB_OUTPUT"
+          echo "python_batch_count=5" >> "$GITHUB_OUTPUT"
+
+      - id: set-matrix
+        # For m python batches and n cypress batches, we need a test matrix of python x m + cypress x n.
+        # While the GitHub Actions matrix generation can handle these two parts individually, there isn't a way to
+        # use the two generated matrices for the same job. So, produce that matrix with scripting and use the
+        # include directive to add it to the test matrix.
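+        # Illustrative sketch, not part of the commit: with python_batch_count=2
+        # and cypress_batch_count=2, the script below would assemble a matrix
+        # output along the lines of
+        #   [{"test_strategy":"pytests","batch":"0","batch_count":"2"},
+        #    {"test_strategy":"pytests","batch":"1","batch_count":"2"},
+        #    {"test_strategy":"cypress","batch":"0","batch_count":"2"},
+        #    {"test_strategy":"cypress","batch":"1","batch_count":"2"}]
+        # which fromJson() later expands into one smoke_test job per batch.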
run: | - if [ '${{ needs.setup.outputs.frontend_only }}' == 'true' ]; then - echo 'matrix=["cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT" - elif [ '${{ needs.setup.outputs.ingestion_only }}' == 'true' ]; then - echo 'matrix=["no_cypress_suite0","no_cypress_suite1"]' >> "$GITHUB_OUTPUT" - elif [[ '${{ needs.setup.outputs.backend_change }}' == 'true' || '${{ needs.setup.outputs.smoke_test_change }}' == 'true' ]]; then - echo 'matrix=["no_cypress_suite0","no_cypress_suite1","cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT" - else - echo 'matrix=[]' >> "$GITHUB_OUTPUT" + python_batch_count=${{ steps.set-batch-count.outputs.python_batch_count }} + python_matrix=$(printf "{\"test_strategy\":\"pytests\",\"batch\":\"0\",\"batch_count\":\"$python_batch_count\"}"; for ((i=1;i> "$GITHUB_OUTPUT" smoke_test: name: Run Smoke Tests @@ -1043,8 +1064,7 @@ jobs: ] strategy: fail-fast: false - matrix: - test_strategy: ${{ fromJson(needs.smoke_test_matrix.outputs.matrix) }} + matrix: ${{ fromJson(needs.smoke_test_matrix.outputs.matrix) }} if: ${{ always() && !failure() && !cancelled() && needs.smoke_test_matrix.outputs.matrix != '[]' }} steps: - name: Free up disk space @@ -1220,6 +1240,8 @@ jobs: CYPRESS_RECORD_KEY: ${{ secrets.CYPRESS_RECORD_KEY }} CLEANUP_DATA: "false" TEST_STRATEGY: ${{ matrix.test_strategy }} + BATCH_COUNT: ${{ matrix.batch_count }} + BATCH_NUMBER: ${{ matrix.batch }} run: | echo "$DATAHUB_VERSION" ./gradlew --stop @@ -1230,25 +1252,25 @@ jobs: if: failure() run: | docker ps -a - TEST_STRATEGY="-${{ matrix.test_strategy }}" + TEST_STRATEGY="-${{ matrix.test_strategy }}-${{ matrix.batch }}" source .github/scripts/docker_logs.sh - name: Upload logs uses: actions/upload-artifact@v3 if: failure() with: - name: docker-logs-${{ matrix.test_strategy }} + name: docker-logs-${{ matrix.test_strategy }}-${{ matrix.batch }} path: "docker_logs/*.log" retention-days: 5 - name: Upload screenshots uses: actions/upload-artifact@v3 if: failure() with: - name: cypress-snapshots-${{ matrix.test_strategy }} + name: cypress-snapshots-${{ matrix.test_strategy }}-${{ matrix.batch }} path: smoke-test/tests/cypress/cypress/screenshots/ - uses: actions/upload-artifact@v3 if: always() with: - name: Test Results (smoke tests) ${{ matrix.test_strategy }} + name: Test Results (smoke tests) ${{ matrix.test_strategy }} ${{ matrix.batch }} path: | **/build/reports/tests/test/** **/build/test-results/test/** diff --git a/smoke-test/.gitignore b/smoke-test/.gitignore index b8af2eef535a0b..d8cfd65ff81b9c 100644 --- a/smoke-test/.gitignore +++ b/smoke-test/.gitignore @@ -29,6 +29,8 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST +**/cypress/node_modules + # PyInstaller # Usually these files are written by a python script from a template @@ -132,4 +134,4 @@ dmypy.json # Pyre type checker .pyre/ junit* -tests/cypress/onboarding.json \ No newline at end of file +tests/cypress/onboarding.json diff --git a/smoke-test/build.gradle b/smoke-test/build.gradle index 73ecdcb08ea149..60d08e0206cdab 100644 --- a/smoke-test/build.gradle +++ b/smoke-test/build.gradle @@ -91,39 +91,31 @@ task pythonLintFix(type: Exec, dependsOn: installDev) { * The following tasks assume an already running quickstart. 
 * ./gradlew quickstart (or another variation `quickstartDebug`)
 */
-task noCypressSuite0(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) {
-    environment 'RUN_QUICKSTART', 'false'
-    environment 'TEST_STRATEGY', 'no_cypress_suite0'
-
-    workingDir = project.projectDir
-    commandLine 'bash', '-c',
-        "source ${venv_name}/bin/activate && set -x && " +
-        "./smoke.sh"
-}
+// ./gradlew :smoke-test:pytest -PbatchNumber=2 (default 0)
+task pytest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) {
+    // Get BATCH_NUMBER from command line argument with default value of 0
+    def batchNumber = project.hasProperty('batchNumber') ? project.property('batchNumber') : '0'
 
-task noCypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) {
     environment 'RUN_QUICKSTART', 'false'
-    environment 'TEST_STRATEGY', 'no_cypress_suite1'
+    environment 'TEST_STRATEGY', 'pytests'
+    environment 'BATCH_COUNT', 5
+    environment 'BATCH_NUMBER', batchNumber
 
     workingDir = project.projectDir
     commandLine 'bash', '-c',
        "source ${venv_name}/bin/activate && set -x && " +
-        "./smoke.sh"
+        "./smoke.sh"
 }
 
-task cypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) {
-    environment 'RUN_QUICKSTART', 'false'
-    environment 'TEST_STRATEGY', 'cypress_suite1'
-
-    workingDir = project.projectDir
-    commandLine 'bash', '-c',
-        "source ${venv_name}/bin/activate && set -x && " +
-        "./smoke.sh"
-}
+// ./gradlew :smoke-test:cypressTest -PbatchNumber=2 (default 0)
+task cypressTest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) {
+    // Get BATCH_NUMBER from command line argument with default value of 0
+    def batchNumber = project.hasProperty('batchNumber') ? project.property('batchNumber') : '0'
 
     environment 'RUN_QUICKSTART', 'false'
-    environment 'TEST_STRATEGY', 'cypress_rest'
+    environment 'TEST_STRATEGY', 'cypress'
+    environment 'BATCH_COUNT', 11
+    environment 'BATCH_NUMBER', batchNumber
 
     workingDir = project.projectDir
     commandLine 'bash', '-c',
diff --git a/smoke-test/conftest.py b/smoke-test/conftest.py
index 6d148db9886a48..d48a92b22ab48f 100644
--- a/smoke-test/conftest.py
+++ b/smoke-test/conftest.py
@@ -1,6 +1,8 @@
 import os
 
 import pytest
+from typing import List, Tuple
+from _pytest.nodes import Item
 import requests
 
 from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
@@ -45,3 +47,53 @@ def graph_client(auth_session) -> DataHubGraph:
 def pytest_sessionfinish(session, exitstatus):
     """whole test run finishes."""
     send_message(exitstatus)
+
+
+def get_batch_start_end(num_tests: int) -> Tuple[int, int]:
+    batch_count_env = os.getenv("BATCH_COUNT", 1)
+    batch_count = int(batch_count_env)
+
+    batch_number_env = os.getenv("BATCH_NUMBER", 0)
+    batch_number = int(batch_number_env)
+
+    if batch_count == 0 or batch_count > num_tests:
+        raise ValueError(
+            f"Invalid batch count {batch_count}: must be >0 and <= {num_tests} (num_tests)"
+        )
+    if batch_number >= batch_count:
+        raise ValueError(
+            f"Invalid batch number: {batch_number}, must be less than {batch_count} (zero-based index)"
+        )
+
+    batch_size = round(num_tests / batch_count)
+
+    batch_start = batch_size * batch_number
+    batch_end = batch_start + batch_size
+    # We must have exactly as many batches as specified by BATCH_COUNT.
+    if (
+        num_tests - batch_end < batch_size
+    ):  # Put any remaining tests in the last batch, so we end up with exactly BATCH_COUNT batches.
+        batch_end = num_tests
+
+    if batch_count > 0:
+        print(f"Running tests for batch {batch_number} of {batch_count}")
+
+    return batch_start, batch_end
+
+
+def pytest_collection_modifyitems(
+    session: pytest.Session, config: pytest.Config, items: List[Item]
+) -> None:
+    if os.getenv("TEST_STRATEGY") == "cypress":
+        return  # We launch cypress via pytest, but it needs a different batching mechanism at the cypress level.
+
+    # If the BATCH_COUNT and BATCH_NUMBER env vars are set, this splits the pytests into batches and filters to only
+    # the BATCH_NUMBER batch for execution. Enables multiple parallel launches. The current implementation assumes
+    # all tests are of equal weight for batching. TODO: a weighted batching method could make batch costs more equal.
+    # This is effectively a no-op if BATCH_COUNT=1.
+    start_index, end_index = get_batch_start_end(num_tests=len(items))
+
+    items.sort(key=lambda x: x.nodeid)  # we want the order to be stable across batches
+    # replace items with the filtered list
+    print(f"Running tests for batch {start_index}-{end_index}")
+    items[:] = items[start_index:end_index]
diff --git a/smoke-test/smoke.sh b/smoke-test/smoke.sh
index 888a60f488e1fc..ec8188ebf5f4db 100755
--- a/smoke-test/smoke.sh
+++ b/smoke-test/smoke.sh
@@ -34,15 +34,20 @@ source ./set-cypress-creds.sh
 # set environment variables for the test
 source ./set-test-env-vars.sh
 
-# no_cypress_suite0, no_cypress_suite1, cypress_suite1, cypress_rest
-if [[ -z "${TEST_STRATEGY}" ]]; then
-  pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke.xml
+# TEST_STRATEGY:
+#   if set to pytests, runs all pytests and skips cypress tests (though the cypress test launch is via a pytest).
+#   if set to cypress, runs all cypress tests.
+#   if blank, runs all.
+# When invoked via the GitHub action, the BATCH_COUNT and BATCH_NUMBER env vars are set to run a slice of those tests
+# per worker for parallelism. docker-unified.yml generates a test matrix of pytests/cypress in batches. As the number
+# of tests increases, the batch_count config (in docker-unified.yml) may need adjustment.
+if [[ "${TEST_STRATEGY}" == "pytests" ]]; then
+  # pytests only - the GitHub test matrix runs pytests in one of the runners when applicable.
+  pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke-pytests.xml -k 'not test_run_cypress'
+elif [[ "${TEST_STRATEGY}" == "cypress" ]]; then
+  # run only cypress tests. The test inspects BATCH_COUNT and BATCH_NUMBER and runs only a subset of tests in that batch.
+  # The GitHub workflow test matrix will invoke this in multiple runners for each batch.
+  pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke-cypress${BATCH_NUMBER}.xml tests/cypress/integration_test.py
 else
-  if [ "$TEST_STRATEGY" == "no_cypress_suite0" ]; then
-    pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke_non_cypress.xml -k 'not test_run_cypress' -m 'not no_cypress_suite1'
-  elif [ "$TEST_STRATEGY" == "no_cypress_suite1" ]; then
-    pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke_non_cypress.xml -m 'no_cypress_suite1'
-  else
-    pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke_cypress_${TEST_STRATEGY}.xml tests/cypress/integration_test.py
-  fi
+  pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke-all.xml
 fi
diff --git a/smoke-test/tests/cypress/integration_test.py b/smoke-test/tests/cypress/integration_test.py
index 0d824a96810d05..33c67a923c278d 100644
--- a/smoke-test/tests/cypress/integration_test.py
+++ b/smoke-test/tests/cypress/integration_test.py
@@ -1,10 +1,11 @@
 import datetime
 import os
 import subprocess
-from typing import List, Set
+from typing import List
 
 import pytest
 
+from conftest import get_batch_start_end
 from tests.setup.lineage.ingest_time_lineage import (
     get_time_lineage_urns,
     ingest_time_lineage,
@@ -169,10 +170,29 @@ def ingest_cleanup_data(auth_session, graph_client):
     print("deleted onboarding data")
 
 
-def _get_spec_map(items: Set[str]) -> str:
-    if len(items) == 0:
-        return ""
-    return ",".join([f"**/{item}/*.js" for item in items])
+def _get_js_files(base_path: str):
+    file_paths = []
+    for root, dirs, files in os.walk(base_path):
+        for file in files:
+            if file.endswith(".js"):
+                file_paths.append(os.path.relpath(os.path.join(root, file), base_path))
+    return sorted(file_paths)  # sort to make the order stable across batch runs
+
+
+def _get_cypress_tests_batch():
+    """
+    Batching is configured via the env vars BATCH_COUNT and BATCH_NUMBER. All cypress tests are split into exactly
+    BATCH_COUNT batches. When the BATCH_NUMBER env var is set (zero-based index), that batch alone is run.
+    The GitHub workflow, via the test matrix, runs all batches in parallel to speed up the elapsed test time.
+    If either of these vars is not set, all tests are run sequentially.
+    :return:
+    """
+    all_tests = _get_js_files("tests/cypress/cypress/e2e")
+
+    batch_start, batch_end = get_batch_start_end(num_tests=len(all_tests))
+
+    return all_tests[batch_start:batch_end]
+    # return test_batches[int(batch_number)]  # if BATCH_NUMBER was set, this would run just that one batch
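+
+
+# Illustrative sketch, not part of the commit: using the get_batch_start_end()
+# helper added to conftest.py above, 11 specs with BATCH_COUNT=5 give
+# batch_size = round(11 / 5) = 2, and the remainder lands in the last batch:
+#
+#   os.environ["BATCH_COUNT"] = "5"
+#   batches = []
+#   for n in range(5):
+#       os.environ["BATCH_NUMBER"] = str(n)
+#       batches.append(get_batch_start_end(num_tests=11))
+#   # batches == [(0, 2), (2, 4), (4, 6), (6, 8), (8, 11)]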
def test_run_cypress(auth_session): @@ -182,24 +202,23 @@ def test_run_cypress(auth_session): test_strategy = os.getenv("TEST_STRATEGY", None) if record_key: record_arg = " --record " - tag_arg = f" --tag {test_strategy} " + batch_number = os.getenv("BATCH_NUMBER") + batch_count = os.getenv("BATCH_COUNT") + if batch_number and batch_count: + batch_suffix = f"-{batch_number}{batch_count}" + else: + batch_suffix = "" + tag_arg = f" --tag {test_strategy}{batch_suffix}" else: record_arg = " " rest_specs = set(os.listdir("tests/cypress/cypress/e2e")) cypress_suite1_specs = {"mutations", "search", "views"} rest_specs.difference_update(set(cypress_suite1_specs)) - strategy_spec_map = { - "cypress_suite1": cypress_suite1_specs, - "cypress_rest": rest_specs, - } print(f"test strategy is {test_strategy}") test_spec_arg = "" - if test_strategy is not None: - specs = strategy_spec_map.get(test_strategy) - assert specs is not None - specs_str = _get_spec_map(specs) - test_spec_arg = f" --spec '{specs_str}' " + specs_str = ",".join([f"**/{f}" for f in _get_cypress_tests_batch()]) + test_spec_arg = f" --spec '{specs_str}' " print("Running Cypress tests with command") command = f"NO_COLOR=1 npx cypress run {record_arg} {test_spec_arg} {tag_arg}" From 5708bd9beba6ea08d66a37506867380a45718df3 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Thu, 26 Dec 2024 11:14:15 -0600 Subject: [PATCH 36/49] chore(bump): spring minor version bump 6.1.14 (#12228) --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index a3d807a7333494..8929b4e644972c 100644 --- a/build.gradle +++ b/build.gradle @@ -35,7 +35,7 @@ buildscript { ext.pegasusVersion = '29.57.0' ext.mavenVersion = '3.6.3' ext.versionGradle = '8.11.1' - ext.springVersion = '6.1.13' + ext.springVersion = '6.1.14' ext.springBootVersion = '3.2.9' ext.springKafkaVersion = '3.1.6' ext.openTelemetryVersion = '1.18.0' From a920e9bec8ed8f0aa6ecfd482c8659afa4f3e034 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 26 Dec 2024 15:33:39 -0500 Subject: [PATCH 37/49] fix(ingest/lookml): emit warnings for resolution failures (#12215) --- .../source/looker/looker_dataclasses.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py index 327c9ebf99bd20..d771821a14d88d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py @@ -186,16 +186,16 @@ def resolve_includes( f"traversal_path={traversal_path}, included_files = {included_files}, seen_so_far: {seen_so_far}" ) if "*" not in inc and not included_files: - reporter.report_failure( + reporter.warning( title="Error Resolving Include", - message=f"Cannot resolve include {inc}", - context=f"Path: {path}", + message="Cannot resolve included file", + context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}", ) elif not included_files: - reporter.report_failure( + reporter.warning( title="Error Resolving Include", - message=f"Did not resolve anything for wildcard include {inc}", - context=f"Path: {path}", + message="Did not find anything matching the wildcard include", + context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}", ) # only load files that we haven't seen so far included_files 
= [x for x in included_files if x not in seen_so_far] @@ -231,9 +231,7 @@ def resolve_includes( source_config, reporter, seen_so_far, - traversal_path=traversal_path - + "." - + pathlib.Path(included_file).stem, + traversal_path=f"{traversal_path} -> {pathlib.Path(included_file).stem}", ) ) except Exception as e: From e1998dd371b1002ae8893d7a63d84e5bace079c7 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 26 Dec 2024 15:34:00 -0500 Subject: [PATCH 38/49] chore(ingest): remove `enable_logging` helper (#12222) --- .../powerbi/rest_api_wrapper/data_resolver.py | 31 ++-- .../powerbi/test_admin_only_api.py | 12 -- .../tests/integration/powerbi/test_powerbi.py | 141 +++++++----------- .../integration/powerbi/test_profiling.py | 10 -- .../tableau/test_tableau_ingest.py | 29 ---- 5 files changed, 70 insertions(+), 153 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py index e1301edef10b84..161975fa635fdb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py @@ -84,13 +84,14 @@ def __init__( tenant_id: str, metadata_api_timeout: int, ): - self.__access_token: Optional[str] = None - self.__access_token_expiry_time: Optional[datetime] = None - self.__tenant_id = tenant_id + self._access_token: Optional[str] = None + self._access_token_expiry_time: Optional[datetime] = None + + self._tenant_id = tenant_id # Test connection by generating access token logger.info(f"Trying to connect to {self._get_authority_url()}") # Power-Bi Auth (Service Principal Auth) - self.__msal_client = msal.ConfidentialClientApplication( + self._msal_client = msal.ConfidentialClientApplication( client_id, client_credential=client_secret, authority=DataResolverBase.AUTHORITY + tenant_id, @@ -168,18 +169,18 @@ def _get_app( pass def _get_authority_url(self): - return f"{DataResolverBase.AUTHORITY}{self.__tenant_id}" + return f"{DataResolverBase.AUTHORITY}{self._tenant_id}" def get_authorization_header(self): return {Constant.Authorization: self.get_access_token()} - def get_access_token(self): - if self.__access_token is not None and not self._is_access_token_expired(): - return self.__access_token + def get_access_token(self) -> str: + if self._access_token is not None and not self._is_access_token_expired(): + return self._access_token logger.info("Generating PowerBi access token") - auth_response = self.__msal_client.acquire_token_for_client( + auth_response = self._msal_client.acquire_token_for_client( scopes=[DataResolverBase.SCOPE] ) @@ -193,24 +194,24 @@ def get_access_token(self): logger.info("Generated PowerBi access token") - self.__access_token = "Bearer {}".format( + self._access_token = "Bearer {}".format( auth_response.get(Constant.ACCESS_TOKEN) ) safety_gap = 300 - self.__access_token_expiry_time = datetime.now() + timedelta( + self._access_token_expiry_time = datetime.now() + timedelta( seconds=( max(auth_response.get(Constant.ACCESS_TOKEN_EXPIRY, 0) - safety_gap, 0) ) ) - logger.debug(f"{Constant.PBIAccessToken}={self.__access_token}") + logger.debug(f"{Constant.PBIAccessToken}={self._access_token}") - return self.__access_token + return self._access_token def _is_access_token_expired(self) -> bool: - if not self.__access_token_expiry_time: + if not self._access_token_expiry_time: return True - return 
self.__access_token_expiry_time < datetime.now() + return self._access_token_expiry_time < datetime.now() def get_dashboards(self, workspace: Workspace) -> List[Dashboard]: """ diff --git a/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py b/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py index b636c12cfda064..00dc79ed38cfba 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py +++ b/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py @@ -1,5 +1,3 @@ -import logging -import sys from typing import Any, Dict from unittest import mock @@ -483,12 +481,6 @@ def register_mock_admin_api(request_mock: Any, override_data: dict = {}) -> None ) -def enable_logging(): - # set logging to console - logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) - logging.getLogger().setLevel(logging.DEBUG) - - def mock_msal_cca(*args, **kwargs): class MsalClient: def acquire_token_for_client(self, *args, **kwargs): @@ -527,8 +519,6 @@ def default_source_config(): @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_admin_only_apis(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_admin_api(request_mock=requests_mock) @@ -567,8 +557,6 @@ def test_admin_only_apis(mock_msal, pytestconfig, tmp_path, mock_time, requests_ def test_most_config_and_modified_since( mock_msal, pytestconfig, tmp_path, mock_time, requests_mock ): - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_admin_api(request_mock=requests_mock) diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index edde11ff87d293..739be7cc8408dd 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -1,8 +1,6 @@ import datetime import json -import logging import re -import sys from pathlib import Path from typing import Any, Dict, List, Optional, Union, cast from unittest import mock @@ -31,29 +29,21 @@ FROZEN_TIME = "2022-02-03 07:00:00" -def enable_logging(): - # set logging to console - logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) - logging.getLogger().setLevel(logging.DEBUG) - - -class MsalClient: - call_num = 0 - token: Dict[str, Any] = { - "access_token": "dummy", - } - - @staticmethod - def acquire_token_for_client(*args, **kwargs): - MsalClient.call_num += 1 - return MsalClient.token +def mock_msal_cca(*args, **kwargs): + class MsalClient: + def __init__(self): + self.call_num = 0 + self.token: Dict[str, Any] = { + "access_token": "dummy", + } - @staticmethod - def reset(): - MsalClient.call_num = 0 + def acquire_token_for_client(self, *args, **kwargs): + self.call_num += 1 + return self.token + def reset(self): + self.call_num = 0 -def mock_msal_cca(*args, **kwargs): return MsalClient() @@ -154,8 +144,6 @@ def test_powerbi_ingest( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -199,8 +187,6 @@ def test_powerbi_workspace_type_filter( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / 
"tests/integration/powerbi" register_mock_api( @@ -260,8 +246,6 @@ def test_powerbi_ingest_patch_disabled( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -327,8 +311,6 @@ def test_powerbi_platform_instance_ingest( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -515,8 +497,6 @@ def test_extract_reports( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -561,8 +541,6 @@ def test_extract_lineage( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -660,8 +638,6 @@ def test_admin_access_is_not_allowed( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api( @@ -723,8 +699,6 @@ def test_workspace_container( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -764,85 +738,84 @@ def test_workspace_container( ) -@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_access_token_expiry_with_long_expiry( - mock_msal: MagicMock, pytestconfig: pytest.Config, tmp_path: str, mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) - pipeline = Pipeline.create( - { - "run_id": "powerbi-test", - "source": { - "type": "powerbi", - "config": { - **default_source_config(), + mock_msal = mock_msal_cca() + + with mock.patch("msal.ConfidentialClientApplication", return_value=mock_msal): + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_source_config(), + }, }, - }, - "sink": { - "type": "file", - "config": { - "filename": f"{tmp_path}/powerbi_access_token_mces.json", + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_access_token_mces.json", + }, }, - }, - } - ) + } + ) # for long expiry, the token should only be requested once. 
- MsalClient.token = { + mock_msal.token = { "access_token": "dummy2", "expires_in": 3600, } + mock_msal.reset() - MsalClient.reset() pipeline.run() # We expect the token to be requested twice (once for AdminApiResolver and one for RegularApiResolver) - assert MsalClient.call_num == 2 + assert mock_msal.call_num == 2 -@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_access_token_expiry_with_short_expiry( - mock_msal: MagicMock, pytestconfig: pytest.Config, tmp_path: str, mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) - pipeline = Pipeline.create( - { - "run_id": "powerbi-test", - "source": { - "type": "powerbi", - "config": { - **default_source_config(), + mock_msal = mock_msal_cca() + with mock.patch("msal.ConfidentialClientApplication", return_value=mock_msal): + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_source_config(), + }, }, - }, - "sink": { - "type": "file", - "config": { - "filename": f"{tmp_path}/powerbi_access_token_mces.json", + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_access_token_mces.json", + }, }, - }, - } - ) + } + ) # for short expiry, the token should be requested when expires. - MsalClient.token = { + mock_msal.token = { "access_token": "dummy", "expires_in": 0, } + mock_msal.reset() + pipeline.run() - assert MsalClient.call_num > 2 + assert mock_msal.call_num > 2 def dataset_type_mapping_set_to_all_platform(pipeline: Pipeline) -> None: @@ -940,8 +913,6 @@ def test_dataset_type_mapping_error( def test_server_to_platform_map( mock_msal, pytestconfig, tmp_path, mock_time, requests_mock ): - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" new_config: dict = { **default_source_config(), @@ -1416,8 +1387,6 @@ def test_powerbi_cross_workspace_reference_info_message( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - register_mock_api( pytestconfig=pytestconfig, request_mock=requests_mock, @@ -1495,8 +1464,6 @@ def common_app_ingest( output_mcp_path: str, override_config: dict = {}, ) -> Pipeline: - enable_logging() - register_mock_api( pytestconfig=pytestconfig, request_mock=requests_mock, diff --git a/metadata-ingestion/tests/integration/powerbi/test_profiling.py b/metadata-ingestion/tests/integration/powerbi/test_profiling.py index 4b48bed003b1e8..78d35cf31a26d9 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_profiling.py +++ b/metadata-ingestion/tests/integration/powerbi/test_profiling.py @@ -1,5 +1,3 @@ -import logging -import sys from typing import Any, Dict from unittest import mock @@ -271,12 +269,6 @@ def register_mock_admin_api(request_mock: Any, override_data: dict = {}) -> None ) -def enable_logging(): - # set logging to console - logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) - logging.getLogger().setLevel(logging.DEBUG) - - def mock_msal_cca(*args, **kwargs): class MsalClient: def acquire_token_for_client(self, *args, **kwargs): @@ -311,8 +303,6 @@ def default_source_config(): @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_profiling(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_admin_api(request_mock=requests_mock) 
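# Sketch of the per-test mock pattern introduced in test_powerbi.py above:
# mock_msal_cca() now returns a fresh MsalClient per call, so the call
# counter is instance state rather than shared class state (hypothetical
# usage, assuming the test_powerbi.py variant with call_num/reset):
#
#   client = mock_msal_cca()
#   client.acquire_token_for_client()
#   assert client.call_num == 1   # isolated from any other test's client
#   client.reset()
#   assert client.call_num == 0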
diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 902ff243c802a8..71e5ad10c2fc5e 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -1,7 +1,5 @@ import json -import logging import pathlib -import sys from typing import Any, Dict, List, cast from unittest import mock @@ -88,12 +86,6 @@ } -def enable_logging(): - # set logging to console - logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) - logging.getLogger().setLevel(logging.DEBUG) - - def read_response(file_name): response_json_path = f"{test_resources_dir}/setup/{file_name}" with open(response_json_path) as file: @@ -376,7 +368,6 @@ def tableau_ingest_common( @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_tableau_ingest(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_mces.json" golden_file_name: str = "tableau_mces_golden.json" tableau_ingest_common( @@ -454,7 +445,6 @@ def mock_data() -> List[dict]: @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_tableau_cll_ingest(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_mces_cll.json" golden_file_name: str = "tableau_cll_mces_golden.json" @@ -481,7 +471,6 @@ def test_tableau_cll_ingest(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_project_pattern(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_project_pattern_mces.json" golden_file_name: str = "tableau_mces_golden.json" @@ -505,7 +494,6 @@ def test_project_pattern(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_project_path_pattern(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_project_path_mces.json" golden_file_name: str = "tableau_project_path_mces_golden.json" @@ -529,8 +517,6 @@ def test_project_path_pattern(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_project_hierarchy(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() - output_file_name: str = "tableau_nested_project_mces.json" golden_file_name: str = "tableau_nested_project_mces_golden.json" @@ -554,7 +540,6 @@ def test_project_hierarchy(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_extract_all_project(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_extract_all_project_mces.json" golden_file_name: str = "tableau_extract_all_project_mces_golden.json" @@ -644,7 +629,6 @@ def test_project_path_pattern_deny(pytestconfig, tmp_path, mock_datahub_graph): def test_tableau_ingest_with_platform_instance( pytestconfig, tmp_path, mock_datahub_graph ): - enable_logging() output_file_name: str = "tableau_with_platform_instance_mces.json" golden_file_name: str = "tableau_with_platform_instance_mces_golden.json" @@ -691,7 +675,6 @@ def test_tableau_ingest_with_platform_instance( def test_lineage_overrides(): - enable_logging() # Simple - specify platform instance to presto table assert ( TableauUpstreamReference( @@ -745,7 +728,6 @@ def test_lineage_overrides(): def test_database_hostname_to_platform_instance_map(): - enable_logging() # Simple - snowflake table assert 
( TableauUpstreamReference( @@ -916,7 +898,6 @@ def test_tableau_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph) def test_tableau_no_verify(): - enable_logging() # This test ensures that we can connect to a self-signed certificate # when ssl_verify is set to False. @@ -941,7 +922,6 @@ def test_tableau_no_verify(): @freeze_time(FROZEN_TIME) @pytest.mark.integration_batch_2 def test_tableau_signout_timeout(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_signout_timeout_mces.json" golden_file_name: str = "tableau_signout_timeout_mces_golden.json" tableau_ingest_common( @@ -1073,7 +1053,6 @@ def test_get_all_datasources_failure(pytestconfig, tmp_path, mock_datahub_graph) @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_tableau_ingest_multiple_sites(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_mces_multiple_sites.json" golden_file_name: str = "tableau_multiple_sites_mces_golden.json" @@ -1135,7 +1114,6 @@ def test_tableau_ingest_multiple_sites(pytestconfig, tmp_path, mock_datahub_grap @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_tableau_ingest_sites_as_container(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_mces_ingest_sites_as_container.json" golden_file_name: str = "tableau_sites_as_container_mces_golden.json" @@ -1159,7 +1137,6 @@ def test_tableau_ingest_sites_as_container(pytestconfig, tmp_path, mock_datahub_ @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_site_name_pattern(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_site_name_pattern_mces.json" golden_file_name: str = "tableau_site_name_pattern_mces_golden.json" @@ -1183,7 +1160,6 @@ def test_site_name_pattern(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_permission_ingestion(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_permission_ingestion_mces.json" golden_file_name: str = "tableau_permission_ingestion_mces_golden.json" @@ -1209,7 +1185,6 @@ def test_permission_ingestion(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_no_hidden_assets(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_no_hidden_assets_mces.json" golden_file_name: str = "tableau_no_hidden_assets_mces_golden.json" @@ -1232,7 +1207,6 @@ def test_no_hidden_assets(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_ingest_tags_disabled(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_ingest_tags_disabled_mces.json" golden_file_name: str = "tableau_ingest_tags_disabled_mces_golden.json" @@ -1254,7 +1228,6 @@ def test_ingest_tags_disabled(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_hidden_asset_tags(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_hidden_asset_tags_mces.json" golden_file_name: str = "tableau_hidden_asset_tags_mces_golden.json" @@ -1277,8 +1250,6 @@ def test_hidden_asset_tags(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_hidden_assets_without_ingest_tags(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() - new_config = 
config_source_default.copy()
     new_config["tags_for_hidden_assets"] = ["hidden", "private"]
     new_config["ingest_tags"] = False

From 172736a9b3d291d6cb8fcaf775114abdf8853e1f Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Thu, 26 Dec 2024 21:34:31 -0500
Subject: [PATCH 39/49] feat(ingest/dbt): support "Explore" page in dbt cloud
 (#12223)

---
 docs/how/updating-datahub.md                      |  1 +
 .../src/datahub/ingestion/source/dbt/dbt_cloud.py | 13 ++++++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index a742ebe0cd8968..d6620fde0bf794 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -42,6 +42,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
 - #12077: `Kafka` source no longer ingests schemas from schema registry as separate entities by default, set `ingest_schemas_as_entities` to `true` to ingest them
 - OpenAPI Update: PIT Keep Alive parameter added to scroll. NOTE: This parameter requires the `pointInTimeCreationEnabled` feature flag to be enabled and the `elasticSearch.implementation` configuration to be `elasticsearch`. This feature is not supported for OpenSearch at this time and the parameter will not be respected without both of these set.
 - OpenAPI Update 2: Previously there was an incorrectly marked parameter named `sort` on the generic list entities endpoint for v3. This parameter is deprecated and only supports a single string value while the documentation indicates it supports a list of strings. This documentation error has been fixed and the correct field, `sortCriteria`, is now documented which supports a list of strings.
+- #12223: For dbt Cloud ingestion, the "View in dbt" link will point at the "Explore" page in the dbt Cloud UI. You can revert to the old behavior of linking to the dbt Cloud IDE by setting `external_url_mode: ide`.

 ### Breaking Changes

diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
index 66c5ef7179af41..5042f6d69b261a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
@@ -1,7 +1,7 @@
 import logging
 from datetime import datetime
 from json import JSONDecodeError
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Literal, Optional, Tuple
 from urllib.parse import urlparse

 import dateutil.parser
@@ -62,6 +62,11 @@ class DBTCloudConfig(DBTCommonConfig):
         description="The ID of the run to ingest metadata from. If not specified, we'll default to the latest run.",
     )

+    external_url_mode: Literal["explore", "ide"] = Field(
+        default="explore",
+        description='Where should the "View in dbt" link point to - either the "Explore" UI or the dbt Cloud IDE',
+    )
+
     @root_validator(pre=True)
     def set_metadata_endpoint(cls, values: dict) -> dict:
         if values.get("access_url") and not values.get("metadata_endpoint"):
@@ -527,5 +532,7 @@ def _parse_into_dbt_column(
     )

     def get_external_url(self, node: DBTNode) -> Optional[str]:
-        # TODO: Once dbt Cloud supports deep linking to specific files, we can use that.
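# Sketch of the two link styles the change below produces (123/456 are
# hypothetical account/project ids, and the node name is hypothetical too):
#   explore -> {access_url}/explore/123/projects/456/environments/production/details/model.jaffle_shop.customers
#   ide     -> {access_url}/develop/123/projects/456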
- return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}" + if self.config.external_url_mode == "explore": + return f"{self.config.access_url}/explore/{self.config.account_id}/projects/{self.config.project_id}/environments/production/details/{node.dbt_name}" + else: + return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}" From 3ca8d09100eb649a5f191a32f2af8d300424818b Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 27 Dec 2024 11:40:00 +0530 Subject: [PATCH 40/49] feat(ingest/snowflake): support email_as_user_identifier for queries v2 (#12219) --- .../source/snowflake/snowflake_config.py | 19 ++++--- .../source/snowflake/snowflake_queries.py | 45 ++++++++++++--- .../source/snowflake/snowflake_query.py | 6 +- .../source/snowflake/snowflake_usage_v2.py | 7 +-- .../source/snowflake/snowflake_utils.py | 40 ++++++++------ .../snowflake/test_snowflake_queries.py | 55 +++++++++++++++++++ 6 files changed, 132 insertions(+), 40 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 2b2dcf860cdb07..12e5fb72b00de8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -138,12 +138,20 @@ class SnowflakeIdentifierConfig( description="Whether to convert dataset urns to lowercase.", ) - -class SnowflakeUsageConfig(BaseUsageConfig): email_domain: Optional[str] = pydantic.Field( default=None, description="Email domain of your organization so users can be displayed on UI appropriately.", ) + + email_as_user_identifier: bool = Field( + default=True, + description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is " + "provided, generates email addresses for snowflake users with unset emails, based on their " + "username.", + ) + + +class SnowflakeUsageConfig(BaseUsageConfig): apply_view_usage_to_tables: bool = pydantic.Field( default=False, description="Whether to apply view's usage to its base tables. If set to True, usage is applied to base tables only.", @@ -267,13 +275,6 @@ class SnowflakeV2Config( " Map of share name -> details of share.", ) - email_as_user_identifier: bool = Field( - default=True, - description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is " - "provided, generates email addresses for snowflake users with unset emails, based on their " - "username.", - ) - include_assertion_results: bool = Field( default=False, description="Whether to ingest assertion run results for assertions created using Datahub" diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 174aad0bddd4a8..36825dc33fe7dc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -66,6 +66,11 @@ logger = logging.getLogger(__name__) +# Define a type alias +UserName = str +UserEmail = str +UsersMapping = Dict[UserName, UserEmail] + class SnowflakeQueriesExtractorConfig(ConfigModel): # TODO: Support stateful ingestion for the time windows. 
@@ -114,11 +119,13 @@ class SnowflakeQueriesSourceConfig( class SnowflakeQueriesExtractorReport(Report): copy_history_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) query_log_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) + users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) sql_aggregator: Optional[SqlAggregatorReport] = None num_ddl_queries_dropped: int = 0 + num_users: int = 0 @dataclass @@ -225,6 +232,9 @@ def is_allowed_table(self, name: str) -> bool: def get_workunits_internal( self, ) -> Iterable[MetadataWorkUnit]: + with self.report.users_fetch_timer: + users = self.fetch_users() + # TODO: Add some logic to check if the cached audit log is stale or not. audit_log_file = self.local_temp_path / "audit_log.sqlite" use_cached_audit_log = audit_log_file.exists() @@ -248,7 +258,7 @@ def get_workunits_internal( queries.append(entry) with self.report.query_log_fetch_timer: - for entry in self.fetch_query_log(): + for entry in self.fetch_query_log(users): queries.append(entry) with self.report.audit_log_load_timer: @@ -263,6 +273,25 @@ def get_workunits_internal( shared_connection.close() audit_log_file.unlink(missing_ok=True) + def fetch_users(self) -> UsersMapping: + users: UsersMapping = dict() + with self.structured_reporter.report_exc("Error fetching users from Snowflake"): + logger.info("Fetching users from Snowflake") + query = SnowflakeQuery.get_all_users() + resp = self.connection.query(query) + + for row in resp: + try: + users[row["NAME"]] = row["EMAIL"] + self.report.num_users += 1 + except Exception as e: + self.structured_reporter.warning( + "Error parsing user row", + context=f"{row}", + exc=e, + ) + return users + def fetch_copy_history(self) -> Iterable[KnownLineageMapping]: # Derived from _populate_external_lineage_from_copy_history. @@ -298,7 +327,7 @@ def fetch_copy_history(self) -> Iterable[KnownLineageMapping]: yield result def fetch_query_log( - self, + self, users: UsersMapping ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap]]: query_log_query = _build_enriched_query_log_query( start_time=self.config.window.start_time, @@ -319,7 +348,7 @@ def fetch_query_log( assert isinstance(row, dict) try: - entry = self._parse_audit_log_row(row) + entry = self._parse_audit_log_row(row, users) except Exception as e: self.structured_reporter.warning( "Error parsing query log row", @@ -331,7 +360,7 @@ def fetch_query_log( yield entry def _parse_audit_log_row( - self, row: Dict[str, Any] + self, row: Dict[str, Any], users: UsersMapping ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery]]: json_fields = { "DIRECT_OBJECTS_ACCESSED", @@ -430,9 +459,11 @@ def _parse_audit_log_row( ) ) - # TODO: Fetch email addresses from Snowflake to map user -> email - # TODO: Support email_domain fallback for generating user urns. 
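# Sketch of the new flow (hypothetical row): the audit-log user name is
# first resolved against the NAME -> EMAIL mapping fetched from
# SNOWFLAKE.ACCOUNT_USAGE.USERS, then run through the shared identifier
# logic before the urn is built:
#
#   users: UsersMapping = {"JDOE": "jdoe@corp.com"}
#   # res["user_name"] == "JDOE" with email_as_user_identifier enabled
#   #   -> CorpUserUrn("jdoe@corp.com")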
- user = CorpUserUrn(self.identifiers.snowflake_identifier(res["user_name"])) + user = CorpUserUrn( + self.identifiers.get_user_identifier( + res["user_name"], users.get(res["user_name"]) + ) + ) timestamp: datetime = res["query_start_time"] timestamp = timestamp.astimezone(timezone.utc) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index a94b39476b2c22..40bcfb514efd23 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -947,4 +947,8 @@ def dmf_assertion_results(start_time_millis: int, end_time_millis: int) -> str: AND METRIC_NAME ilike '{pattern}' escape '{escape_pattern}' ORDER BY MEASUREMENT_TIME ASC; -""" + """ + + @staticmethod + def get_all_users() -> str: + return """SELECT name as "NAME", email as "EMAIL" FROM SNOWFLAKE.ACCOUNT_USAGE.USERS""" diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index aff15386c50833..4bdf559f293b51 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -342,10 +342,9 @@ def _map_user_counts( filtered_user_counts.append( DatasetUserUsageCounts( user=make_user_urn( - self.get_user_identifier( + self.identifiers.get_user_identifier( user_count["user_name"], user_email, - self.config.email_as_user_identifier, ) ), count=user_count["total"], @@ -453,9 +452,7 @@ def _get_operation_aspect_work_unit( reported_time: int = int(time.time() * 1000) last_updated_timestamp: int = int(start_time.timestamp() * 1000) user_urn = make_user_urn( - self.get_user_identifier( - user_name, user_email, self.config.email_as_user_identifier - ) + self.identifiers.get_user_identifier(user_name, user_email) ) # NOTE: In earlier `snowflake-usage` connector this was base_objects_accessed, which is incorrect diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index 8e0c97aa135e84..885bee1ccdb908 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -300,6 +300,28 @@ def get_quoted_identifier_for_schema(db_name, schema_name): def get_quoted_identifier_for_table(db_name, schema_name, table_name): return f'"{db_name}"."{schema_name}"."{table_name}"' + # Note - decide how to construct user urns. + # Historically urns were created using part before @ from user's email. + # Users without email were skipped from both user entries as well as aggregates. + # However email is not mandatory field in snowflake user, user_name is always present. 
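# The method below resolves identifiers roughly as follows (sketch; user
# "jdoe" with email "jdoe@corp.com" and a configured email_domain of
# "corp.com" are hypothetical, and every result is passed through
# snowflake_identifier, which may lowercase it):
#   email set,  email_as_user_identifier=True                   -> jdoe@corp.com
#   email set,  email_as_user_identifier=False                  -> jdoe
#   email None, email_as_user_identifier=True, email_domain set -> jdoe@corp.com
#   email None, otherwise                                       -> user_name as-is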
+ def get_user_identifier( + self, + user_name: str, + user_email: Optional[str], + ) -> str: + if user_email: + return self.snowflake_identifier( + user_email + if self.identifier_config.email_as_user_identifier is True + else user_email.split("@")[0] + ) + return self.snowflake_identifier( + f"{user_name}@{self.identifier_config.email_domain}" + if self.identifier_config.email_as_user_identifier is True + and self.identifier_config.email_domain is not None + else user_name + ) + class SnowflakeCommonMixin(SnowflakeStructuredReportMixin): platform = "snowflake" @@ -315,24 +337,6 @@ def structured_reporter(self) -> SourceReport: def identifiers(self) -> SnowflakeIdentifierBuilder: return SnowflakeIdentifierBuilder(self.config, self.report) - # Note - decide how to construct user urns. - # Historically urns were created using part before @ from user's email. - # Users without email were skipped from both user entries as well as aggregates. - # However email is not mandatory field in snowflake user, user_name is always present. - def get_user_identifier( - self, - user_name: str, - user_email: Optional[str], - email_as_user_identifier: bool, - ) -> str: - if user_email: - return self.identifiers.snowflake_identifier( - user_email - if email_as_user_identifier is True - else user_email.split("@")[0] - ) - return self.identifiers.snowflake_identifier(user_name) - # TODO: Revisit this after stateful ingestion can commit checkpoint # for failures that do not affect the checkpoint # TODO: Add additional parameters to match the signature of the .warning and .failure methods diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py index 82f5691bcee3de..ae0f23d93215d4 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py @@ -22,3 +22,58 @@ def test_source_close_cleans_tmp(snowflake_connect, tmp_path): # This closes QueriesExtractor which in turn closes SqlParsingAggregator source.close() assert len(os.listdir(tmp_path)) == 0 + + +@patch("snowflake.connector.connect") +def test_user_identifiers_email_as_identifier(snowflake_connect, tmp_path): + source = SnowflakeQueriesSource.create( + { + "connection": { + "account_id": "ABC12345.ap-south-1.aws", + "username": "TST_USR", + "password": "TST_PWD", + }, + "email_as_user_identifier": True, + "email_domain": "example.com", + }, + PipelineContext("run-id"), + ) + assert ( + source.identifiers.get_user_identifier("username", "username@example.com") + == "username@example.com" + ) + assert ( + source.identifiers.get_user_identifier("username", None) + == "username@example.com" + ) + + # We'd do best effort to use email as identifier, but would keep username as is, + # if email can't be formed. 
+ source.identifiers.identifier_config.email_domain = None + + assert ( + source.identifiers.get_user_identifier("username", "username@example.com") + == "username@example.com" + ) + + assert source.identifiers.get_user_identifier("username", None) == "username" + + +@patch("snowflake.connector.connect") +def test_user_identifiers_username_as_identifier(snowflake_connect, tmp_path): + source = SnowflakeQueriesSource.create( + { + "connection": { + "account_id": "ABC12345.ap-south-1.aws", + "username": "TST_USR", + "password": "TST_PWD", + }, + "email_as_user_identifier": False, + }, + PipelineContext("run-id"), + ) + assert ( + source.identifiers.get_user_identifier("username", "username@example.com") + == "username" + ) + assert source.identifiers.get_user_identifier("username", None) == "username" From 29e4528ae5117ccdb6f0685b8571c2afdcc19f57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 27 Dec 2024 11:12:40 +0100 Subject: [PATCH 41/49] fix(tableau): retry if 502 error code (#12233) --- .../datahub/ingestion/source/tableau/tableau.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 508500ffe489b9..df59cae3fad232 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -186,6 +186,15 @@ except ImportError: REAUTHENTICATE_ERRORS = (NonXMLResponseError,) +RETRIABLE_ERROR_CODES = [ + 408, # Request Timeout + 429, # Too Many Requests + 500, # Internal Server Error + 502, # Bad Gateway + 503, # Service Unavailable + 504, # Gateway Timeout +] + logger: logging.Logger = logging.getLogger(__name__) # Replace / with | @@ -287,7 +296,7 @@ def make_tableau_client(self, site: str) -> Server: max_retries=Retry( total=self.max_retries, backoff_factor=1, - status_forcelist=[429, 500, 502, 503, 504], + status_forcelist=RETRIABLE_ERROR_CODES, ) ) server._session.mount("http://", adapter) @@ -1212,9 +1221,11 @@ def get_connection_object_page( except InternalServerError as ise: # In some cases Tableau Server returns 504 error, which is a timeout error, so it worths to retry. - if ise.code == 504: + # Extended with other retryable errors. 
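# Sketch: after this change both retry layers consult the same list.
#   transport level:   Retry(total=self.config.max_retries, backoff_factor=1,
#                            status_forcelist=RETRIABLE_ERROR_CODES)
#   application level: get_connection_object_page below re-issues the query
#                      while ise.code is in RETRIABLE_ERROR_CODES and
#                      retries_remaining > 0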
+ if ise.code in RETRIABLE_ERROR_CODES: if retries_remaining <= 0: raise ise + logger.info(f"Retrying query due to error {ise.code}") return self.get_connection_object_page( query=query, connection_type=connection_type, From d7de7eb2a65385b0a4458f9c26cc8b1a42158cc1 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Fri, 27 Dec 2024 17:51:43 +0530 Subject: [PATCH 42/49] ci: remove qodana (#12227) --- .github/workflows/qodana-scan.yml | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 .github/workflows/qodana-scan.yml diff --git a/.github/workflows/qodana-scan.yml b/.github/workflows/qodana-scan.yml deleted file mode 100644 index 750cf24ad38e57..00000000000000 --- a/.github/workflows/qodana-scan.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Qodana -on: - workflow_dispatch: - pull_request: - push: - branches: - - master - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - qodana: - runs-on: ubuntu-latest - steps: - - uses: acryldata/sane-checkout-action@v3 - - name: "Qodana Scan" - uses: JetBrains/qodana-action@v2022.3.4 - - uses: github/codeql-action/upload-sarif@v2 - with: - sarif_file: ${{ runner.temp }}/qodana/results/qodana.sarif.json - cache-default-branch-only: true From ac8e539457ef984cb61329a449585fa86fc5d3c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 27 Dec 2024 16:14:32 +0100 Subject: [PATCH 43/49] chore(tableau): adjust visibility of info message (#12235) --- .../src/datahub/ingestion/source/tableau/tableau.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index df59cae3fad232..d47e10c9eb5c62 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -920,10 +920,7 @@ def dataset_browse_prefix(self) -> str: return f"/{self.config.env.lower()}{self.no_env_browse_prefix}" def _re_authenticate(self) -> None: - self.report.info( - message="Re-authenticating to Tableau", - context=f"site='{self.site_content_url}'", - ) + logger.info(f"Re-authenticating to Tableau site '{self.site_content_url}'") # Sign-in again may not be enough because Tableau sometimes caches invalid sessions # so we need to recreate the Tableau Server object self.server = self.config.make_tableau_client(self.site_content_url) From ed8639e401d30b842fac66b52636f5c1ab0c71b7 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 27 Dec 2024 13:46:49 -0500 Subject: [PATCH 44/49] chore(python): test with python 3.11 (#11280) Co-authored-by: Tamas Nemeth Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .github/workflows/dagster-plugin.yml | 6 +++--- .github/workflows/metadata-ingestion.yml | 4 ++-- .github/workflows/prefect-plugin.yml | 4 ++-- metadata-ingestion-modules/airflow-plugin/setup.py | 4 ---- metadata-ingestion-modules/dagster-plugin/README.md | 3 +-- metadata-ingestion-modules/dagster-plugin/setup.py | 3 --- metadata-ingestion-modules/gx-plugin/README.md | 3 +-- metadata-ingestion-modules/gx-plugin/setup.py | 3 --- metadata-ingestion-modules/prefect-plugin/README.md | 2 +- metadata-ingestion-modules/prefect-plugin/setup.py | 6 +----- metadata-ingestion/setup.py | 10 ++++------ .../src/datahub/ingestion/source/s3/source.py | 2 +- 
.../tests/integration/feast/test_feast_repository.py | 8 ++++++++ 13 files changed, 24 insertions(+), 34 deletions(-) diff --git a/.github/workflows/dagster-plugin.yml b/.github/workflows/dagster-plugin.yml index d8a9cd7bfd6a35..ae9a0b1605cdf3 100644 --- a/.github/workflows/dagster-plugin.yml +++ b/.github/workflows/dagster-plugin.yml @@ -30,11 +30,11 @@ jobs: DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: - python-version: ["3.9", "3.10"] + python-version: ["3.9", "3.11"] include: - python-version: "3.9" extraPythonRequirement: "dagster>=1.3.3" - - python-version: "3.10" + - python-version: "3.11" extraPythonRequirement: "dagster>=1.3.3" fail-fast: false steps: @@ -57,7 +57,7 @@ jobs: if: always() run: source metadata-ingestion-modules/dagster-plugin/venv/bin/activate && uv pip freeze - uses: actions/upload-artifact@v4 - if: ${{ always() && matrix.python-version == '3.10' && matrix.extraPythonRequirement == 'dagster>=1.3.3' }} + if: ${{ always() && matrix.python-version == '3.11' && matrix.extraPythonRequirement == 'dagster>=1.3.3' }} with: name: Test Results (dagster Plugin ${{ matrix.python-version}}) path: | diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index ad00c6d1551d1d..106cba1473982e 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -33,7 +33,7 @@ jobs: # DATAHUB_LOOKML_GIT_TEST_SSH_KEY: ${{ secrets.DATAHUB_LOOKML_GIT_TEST_SSH_KEY }} strategy: matrix: - python-version: ["3.8", "3.10"] + python-version: ["3.8", "3.11"] command: [ "testQuick", @@ -43,7 +43,7 @@ jobs: ] include: - python-version: "3.8" - - python-version: "3.10" + - python-version: "3.11" fail-fast: false steps: - name: Free up disk space diff --git a/.github/workflows/prefect-plugin.yml b/.github/workflows/prefect-plugin.yml index e4a70426f3a618..d77142a1f00ded 100644 --- a/.github/workflows/prefect-plugin.yml +++ b/.github/workflows/prefect-plugin.yml @@ -30,7 +30,7 @@ jobs: DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.8", "3.9", "3.10", "3.11"] fail-fast: false steps: - name: Set up JDK 17 @@ -52,7 +52,7 @@ jobs: if: always() run: source metadata-ingestion-modules/prefect-plugin/venv/bin/activate && uv pip freeze - uses: actions/upload-artifact@v4 - if: ${{ always() && matrix.python-version == '3.10'}} + if: ${{ always() && matrix.python-version == '3.11'}} with: name: Test Results (Prefect Plugin ${{ matrix.python-version}}) path: | diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 3209233184d55a..2693aab0700da3 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -148,10 +148,6 @@ def get_long_description(): "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: System Administrators", diff --git a/metadata-ingestion-modules/dagster-plugin/README.md b/metadata-ingestion-modules/dagster-plugin/README.md index 8e1460957ed9ff..5113fc37dcc222 100644 --- a/metadata-ingestion-modules/dagster-plugin/README.md +++ b/metadata-ingestion-modules/dagster-plugin/README.md 
@@ -1,4 +1,3 @@ # Datahub Dagster Plugin -See the DataHub Dagster docs for details. - +See the [DataHub Dagster docs](https://datahubproject.io/docs/lineage/dagster/) for details. diff --git a/metadata-ingestion-modules/dagster-plugin/setup.py b/metadata-ingestion-modules/dagster-plugin/setup.py index 0e0685cb378c1b..22c15497bd8070 100644 --- a/metadata-ingestion-modules/dagster-plugin/setup.py +++ b/metadata-ingestion-modules/dagster-plugin/setup.py @@ -107,9 +107,6 @@ def get_long_description(): "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: System Administrators", diff --git a/metadata-ingestion-modules/gx-plugin/README.md b/metadata-ingestion-modules/gx-plugin/README.md index 1ffd87a955432d..9d50235a093d63 100644 --- a/metadata-ingestion-modules/gx-plugin/README.md +++ b/metadata-ingestion-modules/gx-plugin/README.md @@ -1,4 +1,3 @@ # Datahub GX Plugin -See the DataHub GX docs for details. - +See the [DataHub GX docs](https://datahubproject.io/docs/metadata-ingestion/integration_docs/great-expectations) for details. diff --git a/metadata-ingestion-modules/gx-plugin/setup.py b/metadata-ingestion-modules/gx-plugin/setup.py index 73d5d1a9a02f18..40afc81a98f9c8 100644 --- a/metadata-ingestion-modules/gx-plugin/setup.py +++ b/metadata-ingestion-modules/gx-plugin/setup.py @@ -118,9 +118,6 @@ def get_long_description(): "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: System Administrators", diff --git a/metadata-ingestion-modules/prefect-plugin/README.md b/metadata-ingestion-modules/prefect-plugin/README.md index 0896942e78ef61..f21e00b4945135 100644 --- a/metadata-ingestion-modules/prefect-plugin/README.md +++ b/metadata-ingestion-modules/prefect-plugin/README.md @@ -28,7 +28,7 @@ The `prefect-datahub` collection allows you to easily integrate DataHub's metada ## Prerequisites -- Python 3.7+ +- Python 3.8+ - Prefect 2.0.0+ and < 3.0.0+ - A running instance of DataHub diff --git a/metadata-ingestion-modules/prefect-plugin/setup.py b/metadata-ingestion-modules/prefect-plugin/setup.py index 7e56fe8b6ad114..70b0e958195645 100644 --- a/metadata-ingestion-modules/prefect-plugin/setup.py +++ b/metadata-ingestion-modules/prefect-plugin/setup.py @@ -103,10 +103,6 @@ def get_long_description(): "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: System Administrators", @@ -120,7 +116,7 @@ def get_long_description(): ], # Package info. 
zip_safe=False, - python_requires=">=3.7", + python_requires=">=3.8", package_dir={"": "src"}, packages=setuptools.find_namespace_packages(where="./src"), entry_points=entry_points, diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index c6994dd6d5aa65..986dc189cb29ba 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -298,8 +298,8 @@ } data_lake_profiling = { - "pydeequ~=1.1.0", - "pyspark~=3.3.0", + "pydeequ>=1.1.0", + "pyspark~=3.5.0", } delta_lake = { @@ -318,7 +318,7 @@ # 0.1.11 appears to have authentication issues with azure databricks # 0.22.0 has support for `include_browse` in metadata list apis "databricks-sdk>=0.30.0", - "pyspark~=3.3.0", + "pyspark~=3.5.0", "requests", # Version 2.4.0 includes sqlalchemy dialect, 2.8.0 includes some bug fixes # Version 3.0.0 required SQLAlchemy > 2.0.21 @@ -874,9 +874,6 @@ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: System Administrators", @@ -917,6 +914,7 @@ "sync-file-emitter", "sql-parser", "iceberg", + "feast", } else set() ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 3ddf47b70cdf80..ceac9e96d1ddd0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -225,7 +225,7 @@ def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext): self.init_spark() def init_spark(self): - os.environ.setdefault("SPARK_VERSION", "3.3") + os.environ.setdefault("SPARK_VERSION", "3.5") spark_version = os.environ["SPARK_VERSION"] # Importing here to avoid Deequ dependency for non profiling use cases diff --git a/metadata-ingestion/tests/integration/feast/test_feast_repository.py b/metadata-ingestion/tests/integration/feast/test_feast_repository.py index 7f04337145dc36..80d7c6311a9589 100644 --- a/metadata-ingestion/tests/integration/feast/test_feast_repository.py +++ b/metadata-ingestion/tests/integration/feast/test_feast_repository.py @@ -1,3 +1,6 @@ +import sys + +import pytest from freezegun import freeze_time from datahub.ingestion.run.pipeline import Pipeline @@ -6,6 +9,11 @@ FROZEN_TIME = "2020-04-14 07:00:00" +# The test is skipped for python 3.11 due to conflicting dependencies in installDev +# setup that requires pydantic < 2 for majority plugins. 
Note that the test works with +# python 3.11 if run with standalone virtual env setup with feast plugin alone using +# `pip install acryl-datahub[feast]` since it allows pydantic > 2 +@pytest.mark.skipif(sys.version_info > (3, 11), reason="Skipped on Python 3.11+") @freeze_time(FROZEN_TIME) def test_feast_repository_ingest(pytestconfig, tmp_path, mock_time): test_resources_dir = pytestconfig.rootpath / "tests/integration/feast" From d0423547ba559c6059ffc35f9ed153036bf0e45d Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 27 Dec 2024 13:50:28 -0500 Subject: [PATCH 45/49] feat(ingest): add parse_ts_millis helper (#12231) --- .../assertion_circuit_breaker.py | 9 ++--- .../src/datahub/emitter/mce_builder.py | 18 +++++++++- .../src/datahub/emitter/mcp_builder.py | 9 ++--- .../src/datahub/emitter/mcp_patch_builder.py | 4 +-- .../src/datahub/emitter/rest_emitter.py | 4 +-- .../datahub/ingestion/api/source_helpers.py | 8 ++--- .../source/bigquery_v2/bigquery_schema.py | 25 +++---------- .../source/datahub/datahub_kafka_reader.py | 3 +- .../source/sql/sql_generic_profiler.py | 11 +++--- .../ingestion/source/state/checkpoint.py | 3 +- .../datahub/ingestion/source/unity/proxy.py | 35 +++++-------------- .../src/datahub/utilities/time.py | 11 ++++-- .../dbt_enabled_with_schemas_mces_golden.json | 10 +++--- .../dbt_test_column_meta_mapping_golden.json | 10 +++--- ...test_prefer_sql_parser_lineage_golden.json | 34 +++++++++--------- ...bt_test_test_model_performance_golden.json | 34 +++++++++--------- ...th_complex_owner_patterns_mces_golden.json | 10 +++--- ...th_data_platform_instance_mces_golden.json | 10 +++--- ...h_non_incremental_lineage_mces_golden.json | 10 +++--- ..._target_platform_instance_mces_golden.json | 10 +++--- .../tests/unit/sdk/test_mce_builder.py | 17 +++++++++ .../tests/unit/serde/test_codegen.py | 6 ++-- smoke-test/smoke.sh | 2 ++ 23 files changed, 145 insertions(+), 148 deletions(-) diff --git a/metadata-ingestion/src/datahub/api/circuit_breaker/assertion_circuit_breaker.py b/metadata-ingestion/src/datahub/api/circuit_breaker/assertion_circuit_breaker.py index 9d2a65663ba37d..283cdaa8333338 100644 --- a/metadata-ingestion/src/datahub/api/circuit_breaker/assertion_circuit_breaker.py +++ b/metadata-ingestion/src/datahub/api/circuit_breaker/assertion_circuit_breaker.py @@ -1,6 +1,6 @@ import logging from dataclasses import dataclass -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from typing import Any, Dict, List, Optional from pydantic import Field @@ -10,6 +10,7 @@ CircuitBreakerConfig, ) from datahub.api.graphql import Assertion, Operation +from datahub.emitter.mce_builder import parse_ts_millis logger: logging.Logger = logging.getLogger(__name__) @@ -49,7 +50,7 @@ def get_last_updated(self, urn: str) -> Optional[datetime]: if not operations: return None else: - return datetime.fromtimestamp(operations[0]["lastUpdatedTimestamp"] / 1000) + return parse_ts_millis(operations[0]["lastUpdatedTimestamp"]) def _check_if_assertion_failed( self, assertions: List[Dict[str, Any]], last_updated: Optional[datetime] = None @@ -93,7 +94,7 @@ class AssertionResult: logger.info(f"Found successful assertion: {assertion_urn}") result = False if last_updated is not None: - last_run = datetime.fromtimestamp(last_assertion.time / 1000) + last_run = parse_ts_millis(last_assertion.time) if last_updated > last_run: logger.error( f"Missing assertion run for {assertion_urn}. 
The dataset was updated on {last_updated} but the last assertion run was at {last_run}" @@ -117,7 +118,7 @@ def is_circuit_breaker_active(self, urn: str) -> bool: ) if not last_updated: - last_updated = datetime.now() - self.config.time_delta + last_updated = datetime.now(tz=timezone.utc) - self.config.time_delta logger.info( f"Dataset {urn} doesn't have last updated or check_last_assertion_time is false, using calculated min assertion date {last_updated}" ) diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py index 69946c575908b5..110624aa61cb89 100644 --- a/metadata-ingestion/src/datahub/emitter/mce_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py @@ -6,7 +6,7 @@ import os import re import time -from datetime import datetime +from datetime import datetime, timezone from enum import Enum from typing import ( TYPE_CHECKING, @@ -103,6 +103,22 @@ def make_ts_millis(ts: Optional[datetime]) -> Optional[int]: return int(ts.timestamp() * 1000) +@overload +def parse_ts_millis(ts: float) -> datetime: + ... + + +@overload +def parse_ts_millis(ts: None) -> None: + ... + + +def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]: + if ts is None: + return None + return datetime.fromtimestamp(ts / 1000, tz=timezone.utc) + + def make_data_platform_urn(platform: str) -> str: if platform.startswith("urn:li:dataPlatform:"): return platform diff --git a/metadata-ingestion/src/datahub/emitter/mcp_builder.py b/metadata-ingestion/src/datahub/emitter/mcp_builder.py index 293157f8a1ed05..c8eb62a2e1de23 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mcp_builder.py @@ -4,8 +4,8 @@ from pydantic.main import BaseModel from datahub.cli.env_utils import get_boolean_env_variable -from datahub.emitter.enum_helpers import get_enum_options from datahub.emitter.mce_builder import ( + ALL_ENV_TYPES, Aspect, datahub_guid, make_container_urn, @@ -25,7 +25,6 @@ ContainerClass, DomainsClass, EmbedClass, - FabricTypeClass, GlobalTagsClass, MetadataChangeEventClass, OwnerClass, @@ -206,11 +205,7 @@ def gen_containers( # Extra validation on the env field. # In certain cases (mainly for backwards compatibility), the env field will actually # have a platform instance name. 
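# Sketch of the simplified check below: same behavior, but a precomputed
# set-membership test instead of re-deriving the enum options on each call.
#   container_key.env == "PROD"        -> env = "PROD"
#   container_key.env == "instance-1"  -> env = None  # not a FabricType value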
- env = ( - container_key.env - if container_key.env in get_enum_options(FabricTypeClass) - else None - ) + env = container_key.env if container_key.env in ALL_ENV_TYPES else None container_urn = container_key.as_urn() diff --git a/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py b/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py index 779b42e1e1ee99..1ed8ce1d5a6158 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py @@ -2,7 +2,7 @@ import time from collections import defaultdict from dataclasses import dataclass -from typing import Any, Dict, Iterable, List, Optional, Sequence, Union +from typing import Any, Dict, List, Optional, Sequence, Union from datahub.emitter.aspect import JSON_PATCH_CONTENT_TYPE from datahub.emitter.serialization_helper import pre_json_transform @@ -75,7 +75,7 @@ def _add_patch( # TODO: Validate that aspectName is a valid aspect for this entityType self.patches[aspect_name].append(_Patch(op, path, value)) - def build(self) -> Iterable[MetadataChangeProposalClass]: + def build(self) -> List[MetadataChangeProposalClass]: return [ MetadataChangeProposalClass( entityUrn=self.urn, diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index 675717b5ec4829..04242c8bf45d2b 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -3,7 +3,7 @@ import logging import os from json.decoder import JSONDecodeError -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Union import requests from deprecated import deprecated @@ -288,7 +288,7 @@ def emit_mcp( def emit_mcps( self, - mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]], + mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]], async_flag: Optional[bool] = None, ) -> int: logger.debug("Attempting to emit batch mcps") diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py index 7791ea2797be34..f3e5b1db6a1c85 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py @@ -1,5 +1,4 @@ import logging -from datetime import datetime, timezone from typing import ( TYPE_CHECKING, Dict, @@ -14,7 +13,7 @@ ) from datahub.configuration.time_window_config import BaseTimeWindowConfig -from datahub.emitter.mce_builder import make_dataplatform_instance_urn +from datahub.emitter.mce_builder import make_dataplatform_instance_urn, parse_ts_millis from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import entity_supports_aspect from datahub.ingestion.api.workunit import MetadataWorkUnit @@ -479,10 +478,7 @@ def auto_empty_dataset_usage_statistics( if invalid_timestamps: logger.warning( f"Usage statistics with unexpected timestamps, bucket_duration={config.bucket_duration}:\n" - ", ".join( - str(datetime.fromtimestamp(ts / 1000, tz=timezone.utc)) - for ts in invalid_timestamps - ) + ", ".join(str(parse_ts_millis(ts)) for ts in invalid_timestamps) ) for bucket in bucket_timestamps: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py 
b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 3ce34be8dc89df..cbe1f6eb978247 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -1,7 +1,7 @@ import logging from collections import defaultdict from dataclasses import dataclass, field -from datetime import datetime, timezone +from datetime import datetime from functools import lru_cache from typing import Any, Dict, FrozenSet, Iterable, Iterator, List, Optional @@ -15,6 +15,7 @@ TimePartitioningType, ) +from datahub.emitter.mce_builder import parse_ts_millis from datahub.ingestion.api.source import SourceReport from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.ingestion.source.bigquery_v2.bigquery_helper import parse_labels @@ -393,13 +394,7 @@ def _make_bigquery_table( name=table.table_name, created=table.created, table_type=table.table_type, - last_altered=( - datetime.fromtimestamp( - table.get("last_altered") / 1000, tz=timezone.utc - ) - if table.get("last_altered") is not None - else None - ), + last_altered=parse_ts_millis(table.get("last_altered")), size_in_bytes=table.get("bytes"), rows_count=table.get("row_count"), comment=table.comment, @@ -460,11 +455,7 @@ def _make_bigquery_view(view: bigquery.Row) -> BigqueryView: return BigqueryView( name=view.table_name, created=view.created, - last_altered=( - datetime.fromtimestamp(view.get("last_altered") / 1000, tz=timezone.utc) - if view.get("last_altered") is not None - else None - ), + last_altered=(parse_ts_millis(view.get("last_altered"))), comment=view.comment, view_definition=view.view_definition, materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW, @@ -705,13 +696,7 @@ def _make_bigquery_table_snapshot(snapshot: bigquery.Row) -> BigqueryTableSnapsh return BigqueryTableSnapshot( name=snapshot.table_name, created=snapshot.created, - last_altered=( - datetime.fromtimestamp( - snapshot.get("last_altered") / 1000, tz=timezone.utc - ) - if snapshot.get("last_altered") is not None - else None - ), + last_altered=parse_ts_millis(snapshot.get("last_altered")), comment=snapshot.comment, ddl=snapshot.ddl, snapshot_time=snapshot.snapshot_time, diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py index 56a3d55abb184f..ba073533eccfb5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py @@ -12,6 +12,7 @@ from confluent_kafka.schema_registry.avro import AvroDeserializer from datahub.configuration.kafka import KafkaConsumerConnectionConfig +from datahub.emitter.mce_builder import parse_ts_millis from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.datahub.config import DataHubSourceConfig @@ -92,7 +93,7 @@ def _poll_partition( if mcl.created and mcl.created.time > stop_time.timestamp() * 1000: logger.info( f"Stopped reading from kafka, reached MCL " - f"with audit stamp {datetime.fromtimestamp(mcl.created.time / 1000)}" + f"with audit stamp {parse_ts_millis(mcl.created.time)}" ) break diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py 
b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py index bd6c23cc2d4644..c91be9b494c006 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py @@ -7,7 +7,10 @@ from sqlalchemy import create_engine, inspect from sqlalchemy.engine.reflection import Inspector -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mce_builder import ( + make_dataset_urn_with_platform_instance, + parse_ts_millis, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.ge_data_profiler import ( @@ -245,11 +248,7 @@ def is_dataset_eligible_for_profiling( # If profiling state exists we have to carry over to the new state self.state_handler.add_to_state(dataset_urn, last_profiled) - threshold_time: Optional[datetime] = ( - datetime.fromtimestamp(last_profiled / 1000, timezone.utc) - if last_profiled - else None - ) + threshold_time: Optional[datetime] = parse_ts_millis(last_profiled) if ( not threshold_time and self.config.profiling.profile_if_updated_since_days is not None diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py b/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py index 5bfd48eb754d53..2c7a4a8b6c137d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py @@ -12,6 +12,7 @@ import pydantic from datahub.configuration.common import ConfigModel +from datahub.emitter.mce_builder import parse_ts_millis from datahub.metadata.schema_classes import ( DatahubIngestionCheckpointClass, IngestionCheckpointStateClass, @@ -144,7 +145,7 @@ def create_from_checkpoint_aspect( ) logger.info( f"Successfully constructed last checkpoint state for job {job_name} " - f"with timestamp {datetime.fromtimestamp(checkpoint_aspect.timestampMillis/1000, tz=timezone.utc)}" + f"with timestamp {parse_ts_millis(checkpoint_aspect.timestampMillis)}" ) return checkpoint return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py index 11827bace4b5a1..9b96953794dcd5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py @@ -4,7 +4,7 @@ import dataclasses import logging -from datetime import datetime, timezone +from datetime import datetime from typing import Any, Dict, Iterable, List, Optional, Union, cast from unittest.mock import patch @@ -27,6 +27,7 @@ from databricks.sdk.service.workspace import ObjectType import datahub +from datahub.emitter.mce_builder import parse_ts_millis from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy from datahub.ingestion.source.unity.proxy_profiling import ( UnityCatalogProxyProfilingMixin, @@ -211,16 +212,8 @@ def workspace_notebooks(self) -> Iterable[Notebook]: id=obj.object_id, path=obj.path, language=obj.language, - created_at=( - datetime.fromtimestamp(obj.created_at / 1000, tz=timezone.utc) - if obj.created_at - else None - ), - modified_at=( - datetime.fromtimestamp(obj.modified_at / 1000, tz=timezone.utc) - if obj.modified_at - else None - ), + created_at=parse_ts_millis(obj.created_at), + modified_at=parse_ts_millis(obj.modified_at), ) def query_history( @@ 
-452,17 +445,9 @@ def _create_table( properties=obj.properties or {}, owner=obj.owner, generation=obj.generation, - created_at=( - datetime.fromtimestamp(obj.created_at / 1000, tz=timezone.utc) - if obj.created_at - else None - ), + created_at=(parse_ts_millis(obj.created_at) if obj.created_at else None), created_by=obj.created_by, - updated_at=( - datetime.fromtimestamp(obj.updated_at / 1000, tz=timezone.utc) - if obj.updated_at - else None - ), + updated_at=(parse_ts_millis(obj.updated_at) if obj.updated_at else None), updated_by=obj.updated_by, table_id=obj.table_id, comment=obj.comment, @@ -500,12 +485,8 @@ def _create_query(self, info: QueryInfo) -> Optional[Query]: query_id=info.query_id, query_text=info.query_text, statement_type=info.statement_type, - start_time=datetime.fromtimestamp( - info.query_start_time_ms / 1000, tz=timezone.utc - ), - end_time=datetime.fromtimestamp( - info.query_end_time_ms / 1000, tz=timezone.utc - ), + start_time=parse_ts_millis(info.query_start_time_ms), + end_time=parse_ts_millis(info.query_end_time_ms), user_id=info.user_id, user_name=info.user_name, executed_as_user_id=info.executed_as_user_id, diff --git a/metadata-ingestion/src/datahub/utilities/time.py b/metadata-ingestion/src/datahub/utilities/time.py index 0df7afb19935f7..e8338ce068c844 100644 --- a/metadata-ingestion/src/datahub/utilities/time.py +++ b/metadata-ingestion/src/datahub/utilities/time.py @@ -1,6 +1,8 @@ import time from dataclasses import dataclass -from datetime import datetime, timezone +from datetime import datetime + +from datahub.emitter.mce_builder import make_ts_millis, parse_ts_millis def get_current_time_in_seconds() -> int: @@ -9,12 +11,15 @@ def get_current_time_in_seconds() -> int: def ts_millis_to_datetime(ts_millis: int) -> datetime: """Converts input timestamp in milliseconds to a datetime object with UTC timezone""" - return datetime.fromtimestamp(ts_millis / 1000, tz=timezone.utc) + return parse_ts_millis(ts_millis) def datetime_to_ts_millis(dt: datetime) -> int: """Converts a datetime object to timestamp in milliseconds""" - return int(round(dt.timestamp() * 1000)) + # TODO: Deprecate these helpers in favor of make_ts_millis and parse_ts_millis. + # The other ones support None with a typing overload. + # Also possibly move those helpers to this file. 
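+    # Note: make_ts_millis truncates (int(ts.timestamp() * 1000)) where the
+    # removed body rounded; that change is consistent with the dbt golden files
+    # below, where several timestamps shift down by one millisecond
+    # (e.g. ...997 -> ...996).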
+ return make_ts_millis(dt) @dataclass diff --git a/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json index dc8c400b291574..fb25531e685265 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json @@ -2658,7 +2658,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -2930,7 +2930,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3180,7 +3180,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3430,7 +3430,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3680,7 +3680,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json index 60f5bf4fbca9a1..69c4b9cce0b17b 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json @@ -3024,7 +3024,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3296,7 +3296,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3546,7 +3546,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3796,7 +3796,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -4046,7 +4046,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_prefer_sql_parser_lineage_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_prefer_sql_parser_lineage_golden.json index 42a416473ae243..0361e899b5b390 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_prefer_sql_parser_lineage_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_prefer_sql_parser_lineage_golden.json @@ -564,7 +564,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an-aliased-view-for-monthly-billing,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -636,7 +636,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { 
"partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -657,7 +657,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1019,7 +1019,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an_aliased_view_for_payments,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -1095,7 +1095,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1116,7 +1116,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1347,7 +1347,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payments_by_customer_by_month,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -1418,7 +1418,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1439,7 +1439,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1871,7 +1871,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -1942,7 +1942,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1963,7 +1963,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -3140,7 +3140,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3341,7 +3341,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3523,7 +3523,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3705,7 +3705,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3887,7 +3887,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_test_model_performance_golden.json 
b/metadata-ingestion/tests/integration/dbt/dbt_test_test_model_performance_golden.json index c281ea3eed0fa0..c59620f010343d 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_test_model_performance_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_test_model_performance_golden.json @@ -564,7 +564,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an-aliased-view-for-monthly-billing,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -636,7 +636,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -657,7 +657,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1019,7 +1019,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an_aliased_view_for_payments,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -1095,7 +1095,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1116,7 +1116,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1347,7 +1347,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payments_by_customer_by_month,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -1418,7 +1418,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1439,7 +1439,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1871,7 +1871,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -1942,7 +1942,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1963,7 +1963,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -3504,7 +3504,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3773,7 +3773,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 
1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -4023,7 +4023,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -4273,7 +4273,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -4523,7 +4523,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json index 495fa32569f569..23b5525b712d09 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json @@ -2598,7 +2598,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -2867,7 +2867,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3117,7 +3117,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3367,7 +3367,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3617,7 +3617,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json index 20b7cf4a1c26ca..da22458f5624c1 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json @@ -2610,7 +2610,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -2880,7 +2880,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3131,7 +3131,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3382,7 +3382,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3633,7 +3633,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json 
b/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json index 80ca85a5e6c61b..0b44fe77cd62ae 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json @@ -2599,7 +2599,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -2868,7 +2868,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3118,7 +3118,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3368,7 +3368,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3618,7 +3618,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json index 1e6e4d8ba94a2e..3174847dd7e7ad 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json @@ -2599,7 +2599,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -2868,7 +2868,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3118,7 +3118,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3368,7 +3368,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3618,7 +3618,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/unit/sdk/test_mce_builder.py b/metadata-ingestion/tests/unit/sdk/test_mce_builder.py index d7c84f7863b407..3bdbf07bf28b7d 100644 --- a/metadata-ingestion/tests/unit/sdk/test_mce_builder.py +++ b/metadata-ingestion/tests/unit/sdk/test_mce_builder.py @@ -1,3 +1,5 @@ +from datetime import datetime, timezone + import datahub.emitter.mce_builder as builder from datahub.metadata.schema_classes import ( DataFlowInfoClass, @@ -55,3 +57,18 @@ def test_make_group_urn() -> None: assert ( builder.make_group_urn("urn:li:corpuser:someUser") == "urn:li:corpuser:someUser" ) + + +def test_ts_millis() -> None: + assert builder.make_ts_millis(None) is None + assert builder.parse_ts_millis(None) is None + + assert ( + builder.make_ts_millis(datetime(2024, 1, 1, 2, 3, 4, 5, timezone.utc)) + == 1704074584000 + ) + + # We only have millisecond precision, don't support microseconds. 
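+    # (Sub-millisecond detail cannot survive the integer round-trip, since
+    # make_ts_millis keeps only whole milliseconds; zeroing microsecond below
+    # keeps the equality assertion exact.)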
+ ts = datetime.now(timezone.utc).replace(microsecond=0) + ts_millis = builder.make_ts_millis(ts) + assert builder.parse_ts_millis(ts_millis) == ts diff --git a/metadata-ingestion/tests/unit/serde/test_codegen.py b/metadata-ingestion/tests/unit/serde/test_codegen.py index 98d62d5643ff2d..b49f7153129136 100644 --- a/metadata-ingestion/tests/unit/serde/test_codegen.py +++ b/metadata-ingestion/tests/unit/serde/test_codegen.py @@ -6,11 +6,10 @@ import pytest import typing_inspect -from datahub.emitter.enum_helpers import get_enum_options +from datahub.emitter.mce_builder import ALL_ENV_TYPES from datahub.metadata.schema_classes import ( ASPECT_CLASSES, KEY_ASPECTS, - FabricTypeClass, FineGrainedLineageClass, MetadataChangeEventClass, OwnershipClass, @@ -164,8 +163,7 @@ def _err(msg: str) -> None: def test_enum_options(): # This is mainly a sanity check to ensure that it doesn't do anything too crazy. - env_options = get_enum_options(FabricTypeClass) - assert "PROD" in env_options + assert "PROD" in ALL_ENV_TYPES def test_urn_types() -> None: diff --git a/smoke-test/smoke.sh b/smoke-test/smoke.sh index ec8188ebf5f4db..1d209b4ba82195 100755 --- a/smoke-test/smoke.sh +++ b/smoke-test/smoke.sh @@ -22,7 +22,9 @@ else echo "datahub:datahub" > ~/.datahub/plugins/frontend/auth/user.props python3 -m venv venv + set +x source venv/bin/activate + set -x python -m pip install --upgrade 'uv>=0.1.10' uv pip install -r requirements.txt fi From 4e3103e2661f3149f823d1cdda0980fffb7010d3 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 27 Dec 2024 13:50:43 -0500 Subject: [PATCH 46/49] fix(ingest): use `typing_extensions.Self` (#12230) --- metadata-ingestion/scripts/avro_codegen.py | 7 +++---- metadata-ingestion/setup.py | 2 +- .../src/datahub/configuration/common.py | 7 ++----- .../src/datahub/ingestion/api/closeable.py | 6 +++--- .../api/ingestion_job_checkpointing_provider_base.py | 11 ++++------- .../src/datahub/ingestion/api/report.py | 5 ++++- metadata-ingestion/src/datahub/ingestion/api/sink.py | 7 ++++--- .../src/datahub/utilities/urns/_urn_base.py | 12 +++++------- 8 files changed, 26 insertions(+), 31 deletions(-) diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py index e5792da32fb5d7..2841985ad07808 100644 --- a/metadata-ingestion/scripts/avro_codegen.py +++ b/metadata-ingestion/scripts/avro_codegen.py @@ -154,7 +154,6 @@ def merge_schemas(schemas_obj: List[dict]) -> str: # Patch add_name method to NOT complain about duplicate names. 
class NamesWithDups(avro.schema.Names): def add_name(self, name_attr, space_attr, new_schema): - to_add = avro.schema.Name(name_attr, space_attr, self.default_namespace) assert to_add.name assert to_add.space @@ -626,7 +625,7 @@ def generate_urn_class(entity_type: str, key_aspect: dict) -> str: class {class_name}(_SpecificUrn): ENTITY_TYPE: ClassVar[str] = "{entity_type}" - URN_PARTS: ClassVar[int] = {arg_count} + _URN_PARTS: ClassVar[int] = {arg_count} def __init__(self, {init_args}, *, _allow_coercion: bool = True) -> None: if _allow_coercion: @@ -640,8 +639,8 @@ def __init__(self, {init_args}, *, _allow_coercion: bool = True) -> None: @classmethod def _parse_ids(cls, entity_ids: List[str]) -> "{class_name}": - if len(entity_ids) != cls.URN_PARTS: - raise InvalidUrnError(f"{class_name} should have {{cls.URN_PARTS}} parts, got {{len(entity_ids)}}: {{entity_ids}}") + if len(entity_ids) != cls._URN_PARTS: + raise InvalidUrnError(f"{class_name} should have {{cls._URN_PARTS}} parts, got {{len(entity_ids)}}: {{entity_ids}}") return cls({parse_ids_mapping}, _allow_coercion=False) @classmethod diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 986dc189cb29ba..8357262537bcf8 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -15,7 +15,7 @@ base_requirements = { # Our min version of typing_extensions is somewhat constrained by Airflow. - "typing_extensions>=3.10.0.2", + "typing_extensions>=4.2.0", # Actual dependencies. "typing-inspect", # pydantic 1.8.2 is incompatible with mypy 0.910. diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 7df007e087979c..08817d9d5fdb93 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -10,7 +10,6 @@ List, Optional, Type, - TypeVar, Union, runtime_checkable, ) @@ -19,14 +18,12 @@ from cached_property import cached_property from pydantic import BaseModel, Extra, ValidationError from pydantic.fields import Field -from typing_extensions import Protocol +from typing_extensions import Protocol, Self from datahub.configuration._config_enum import ConfigEnum as ConfigEnum # noqa: I250 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 from datahub.utilities.dedup_list import deduplicate_list -_ConfigSelf = TypeVar("_ConfigSelf", bound="ConfigModel") - REDACT_KEYS = { "password", "token", @@ -109,7 +106,7 @@ def _schema_extra(schema: Dict[str, Any], model: Type["ConfigModel"]) -> None: schema_extra = _schema_extra @classmethod - def parse_obj_allow_extras(cls: Type[_ConfigSelf], obj: Any) -> _ConfigSelf: + def parse_obj_allow_extras(cls, obj: Any) -> Self: if PYDANTIC_VERSION_2: try: with unittest.mock.patch.dict( diff --git a/metadata-ingestion/src/datahub/ingestion/api/closeable.py b/metadata-ingestion/src/datahub/ingestion/api/closeable.py index 80a5008ed63683..7b8e1a36162c92 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/closeable.py +++ b/metadata-ingestion/src/datahub/ingestion/api/closeable.py @@ -1,9 +1,9 @@ from abc import abstractmethod from contextlib import AbstractContextManager from types import TracebackType -from typing import Optional, Type, TypeVar +from typing import Optional, Type -_Self = TypeVar("_Self", bound="Closeable") +from typing_extensions import Self class Closeable(AbstractContextManager): @@ -11,7 +11,7 @@ class Closeable(AbstractContextManager): def close(self) -> None: pass - def 
__enter__(self: _Self) -> _Self: + def __enter__(self) -> Self: # This method is mainly required for type checking. return self diff --git a/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py b/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py index 3680546d307d97..c1a49ce82e6e05 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +++ b/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py @@ -1,6 +1,8 @@ from abc import abstractmethod from dataclasses import dataclass -from typing import Any, Dict, NewType, Optional, Type, TypeVar +from typing import Any, Dict, NewType, Optional + +from typing_extensions import Self import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigModel @@ -17,9 +19,6 @@ class IngestionCheckpointingProviderConfig(ConfigModel): pass -_Self = TypeVar("_Self", bound="IngestionCheckpointingProviderBase") - - @dataclass() class IngestionCheckpointingProviderBase(StatefulCommittable[CheckpointJobStatesMap]): """ @@ -32,9 +31,7 @@ def __init__(self, name: str, commit_policy: CommitPolicy = CommitPolicy.ALWAYS) @classmethod @abstractmethod - def create( - cls: Type[_Self], config_dict: Dict[str, Any], ctx: PipelineContext - ) -> "_Self": + def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> Self: pass @abstractmethod diff --git a/metadata-ingestion/src/datahub/ingestion/api/report.py b/metadata-ingestion/src/datahub/ingestion/api/report.py index ade2832f1b669d..32810189acd00b 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/report.py +++ b/metadata-ingestion/src/datahub/ingestion/api/report.py @@ -42,7 +42,10 @@ def to_pure_python_obj(some_val: Any) -> Any: return some_val.as_obj() elif isinstance(some_val, pydantic.BaseModel): return Report.to_pure_python_obj(some_val.dict()) - elif dataclasses.is_dataclass(some_val): + elif dataclasses.is_dataclass(some_val) and not isinstance(some_val, type): + # The `is_dataclass` function returns `True` for both instances and classes. + # We need an extra check to ensure an instance was passed in. 
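+        # For example, is_dataclass(SomeDataclass) and is_dataclass(SomeDataclass())
+        # are both True, but dataclasses.asdict() raises TypeError when handed the
+        # class itself, so only instances may fall through to asdict() here.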
+ # https://docs.python.org/3/library/dataclasses.html#dataclasses.is_dataclass return dataclasses.asdict(some_val) elif isinstance(some_val, list): return [Report.to_pure_python_obj(v) for v in some_val if v is not None] diff --git a/metadata-ingestion/src/datahub/ingestion/api/sink.py b/metadata-ingestion/src/datahub/ingestion/api/sink.py index 62feb7b5a02e66..655e6bb22fa8d1 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/sink.py +++ b/metadata-ingestion/src/datahub/ingestion/api/sink.py @@ -3,6 +3,8 @@ from dataclasses import dataclass, field from typing import Any, Generic, Optional, Type, TypeVar, cast +from typing_extensions import Self + from datahub.configuration.common import ConfigModel from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit @@ -79,7 +81,6 @@ def on_failure( SinkReportType = TypeVar("SinkReportType", bound=SinkReport, covariant=True) SinkConfig = TypeVar("SinkConfig", bound=ConfigModel, covariant=True) -Self = TypeVar("Self", bound="Sink") class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta): @@ -90,7 +91,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta): report: SinkReportType @classmethod - def get_config_class(cls: Type[Self]) -> Type[SinkConfig]: + def get_config_class(cls) -> Type[SinkConfig]: config_class = get_class_from_annotation(cls, Sink, ConfigModel) assert config_class, "Sink subclasses must define a config class" return cast(Type[SinkConfig], config_class) @@ -112,7 +113,7 @@ def __post_init__(self) -> None: pass @classmethod - def create(cls: Type[Self], config_dict: dict, ctx: PipelineContext) -> "Self": + def create(cls, config_dict: dict, ctx: PipelineContext) -> "Self": return cls(ctx, cls.get_config_class().parse_obj(config_dict)) def handle_work_unit_start(self, workunit: WorkUnit) -> None: diff --git a/metadata-ingestion/src/datahub/utilities/urns/_urn_base.py b/metadata-ingestion/src/datahub/utilities/urns/_urn_base.py index 7dadd16fb7f1c2..7996fe0d7b89b7 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/_urn_base.py +++ b/metadata-ingestion/src/datahub/utilities/urns/_urn_base.py @@ -1,9 +1,10 @@ import functools import urllib.parse from abc import abstractmethod -from typing import ClassVar, Dict, List, Optional, Type, TypeVar +from typing import ClassVar, Dict, List, Optional, Type from deprecated import deprecated +from typing_extensions import Self from datahub.utilities.urns.error import InvalidUrnError @@ -42,9 +43,6 @@ def _split_entity_id(entity_id: str) -> List[str]: return parts -_UrnSelf = TypeVar("_UrnSelf", bound="Urn") - - @functools.total_ordering class Urn: """ @@ -88,7 +86,7 @@ def entity_ids(self) -> List[str]: return self._entity_ids @classmethod - def from_string(cls: Type[_UrnSelf], urn_str: str) -> "_UrnSelf": + def from_string(cls, urn_str: str) -> Self: """ Creates an Urn from its string representation. 
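The pattern this patch applies across these files, as a sketch (Base and Child are illustrative names, not classes from the codebase):

    from typing_extensions import Self

    class Base:
        @classmethod
        def parse(cls, raw: str) -> Self:
            # Self binds to whichever class the method is called on, so the old
            # `_Self = TypeVar("_Self", bound="Base")` plus `cls: Type[_Self]`
            # boilerplate is no longer needed to keep subclass return types precise.
            return cls()

    class Child(Base):
        pass

    child: Child = Child.parse("x")  # type-checks as Child, not Base

Self lives in the standard typing module only from Python 3.11 onward; importing it from typing_extensions (bumped to >=4.2.0 in setup.py above) keeps older interpreters working.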
@@ -174,7 +172,7 @@ def __hash__(self) -> int: @classmethod @deprecated(reason="prefer .from_string") - def create_from_string(cls: Type[_UrnSelf], urn_str: str) -> "_UrnSelf": + def create_from_string(cls, urn_str: str) -> Self: return cls.from_string(urn_str) @deprecated(reason="prefer .entity_ids") @@ -270,5 +268,5 @@ def underlying_key_aspect_type(cls) -> Type: @classmethod @abstractmethod - def _parse_ids(cls: Type[_UrnSelf], entity_ids: List[str]) -> _UrnSelf: + def _parse_ids(cls, entity_ids: List[str]) -> Self: raise NotImplementedError() From 6b6d820eea3e7c1297381b2b9ad9b37e22cd9c5d Mon Sep 17 00:00:00 2001 From: deepgarg-visa <149145061+deepgarg-visa@users.noreply.github.com> Date: Sat, 28 Dec 2024 01:49:15 +0530 Subject: [PATCH 47/49] feat(businessAttribute): generate platform events on association/removal with schemaField (#12224) --- metadata-io/build.gradle | 2 +- ...hemaFieldBusinessAttributeChangeEvent.java | 38 ++++++ ...usinessAttributesChangeEventGenerator.java | 98 ++++++++++++++ ...essAttributesChangeEventGeneratorTest.java | 124 ++++++++++++++++++ .../event/EntityChangeEventGeneratorHook.java | 22 +++- .../src/main/resources/application.yaml | 1 + ...tyChangeEventGeneratorRegistryFactory.java | 2 + 7 files changed, 279 insertions(+), 8 deletions(-) create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/timeline/data/dataset/schema/SchemaFieldBusinessAttributeChangeEvent.java create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGenerator.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGeneratorTest.java diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index 516a77d59d50bd..88bbfa2e10c4c1 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -102,7 +102,7 @@ dependencies { testImplementation(testFixtures(project(":entity-registry"))) testAnnotationProcessor externalDependency.lombok - + testImplementation project(':mock-entity-registry') constraints { implementation(externalDependency.log4jCore) { because("previous versions are vulnerable to CVE-2021-45105") diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeline/data/dataset/schema/SchemaFieldBusinessAttributeChangeEvent.java b/metadata-io/src/main/java/com/linkedin/metadata/timeline/data/dataset/schema/SchemaFieldBusinessAttributeChangeEvent.java new file mode 100644 index 00000000000000..1f1252e2085452 --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeline/data/dataset/schema/SchemaFieldBusinessAttributeChangeEvent.java @@ -0,0 +1,38 @@ +package com.linkedin.metadata.timeline.data.dataset.schema; + +import com.google.common.collect.ImmutableMap; +import com.linkedin.common.AuditStamp; +import com.linkedin.common.urn.Urn; +import com.linkedin.metadata.timeline.data.ChangeCategory; +import com.linkedin.metadata.timeline.data.ChangeEvent; +import com.linkedin.metadata.timeline.data.ChangeOperation; +import com.linkedin.metadata.timeline.data.SemanticChangeType; +import lombok.Builder; + +public class SchemaFieldBusinessAttributeChangeEvent extends ChangeEvent { + @Builder(builderMethodName = "schemaFieldBusinessAttributeChangeEventBuilder") + public SchemaFieldBusinessAttributeChangeEvent( + String entityUrn, + ChangeCategory category, + ChangeOperation operation, + String modifier, + AuditStamp auditStamp, + SemanticChangeType semVerChange, + String description, + Urn parentUrn, + Urn 
businessAttributeUrn, + Urn datasetUrn) { + super( + entityUrn, + category, + operation, + modifier, + ImmutableMap.of( + "parentUrn", parentUrn.toString(), + "businessAttributeUrn", businessAttributeUrn.toString(), + "datasetUrn", datasetUrn.toString()), + auditStamp, + semVerChange, + description); + } +}
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGenerator.java b/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGenerator.java new file mode 100644 index 00000000000000..69d20f2f41bd56 --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGenerator.java @@ -0,0 +1,98 @@ +package com.linkedin.metadata.timeline.eventgenerator; + +import com.linkedin.businessattribute.BusinessAttributeAssociation; +import com.linkedin.businessattribute.BusinessAttributes; +import com.linkedin.common.AuditStamp; +import com.linkedin.common.urn.Urn; +import com.linkedin.metadata.timeline.data.ChangeCategory; +import com.linkedin.metadata.timeline.data.ChangeEvent; +import com.linkedin.metadata.timeline.data.ChangeOperation; +import com.linkedin.metadata.timeline.data.SemanticChangeType; +import com.linkedin.metadata.timeline.data.dataset.schema.SchemaFieldBusinessAttributeChangeEvent; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import javax.annotation.Nonnull; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class BusinessAttributesChangeEventGenerator + extends EntityChangeEventGenerator<BusinessAttributes> { + + private static final String BUSINESS_ATTRIBUTE_ADDED_FORMAT = + "BusinessAttribute '%s' added to entity '%s'."; + private static final String BUSINESS_ATTRIBUTE_REMOVED_FORMAT = + "BusinessAttribute '%s' removed from entity '%s'."; + + @Override + public List<ChangeEvent> getChangeEvents( + @Nonnull Urn urn, + @Nonnull String entityName, + @Nonnull String aspectName, + @Nonnull Aspect<BusinessAttributes> from, + @Nonnull Aspect<BusinessAttributes> to, + @Nonnull AuditStamp auditStamp) { + log.debug( + "Calling BusinessAttributesChangeEventGenerator for entity {} and aspect {}", + entityName, + aspectName); + return computeDiff(urn, entityName, aspectName, from.getValue(), to.getValue(), auditStamp); + } + + private List<ChangeEvent> computeDiff( + Urn urn, + String entityName, + String aspectName, + BusinessAttributes previousValue, + BusinessAttributes newValue, + AuditStamp auditStamp) { + List<ChangeEvent> changeEvents = new ArrayList<>(); + + BusinessAttributeAssociation previousAssociation = + previousValue != null ? previousValue.getBusinessAttribute() : null; + BusinessAttributeAssociation newAssociation = + newValue != null ? newValue.getBusinessAttribute() : null; + + if (Objects.nonNull(previousAssociation) && Objects.isNull(newAssociation)) { + changeEvents.add( + createChangeEvent( + previousAssociation, + urn, + ChangeOperation.REMOVE, + BUSINESS_ATTRIBUTE_REMOVED_FORMAT, + auditStamp)); + + } else if (Objects.isNull(previousAssociation) && Objects.nonNull(newAssociation)) { + changeEvents.add( + createChangeEvent( + newAssociation, + urn, + ChangeOperation.ADD, + BUSINESS_ATTRIBUTE_ADDED_FORMAT, + auditStamp)); + } + return changeEvents; + } + + private ChangeEvent createChangeEvent( + BusinessAttributeAssociation businessAttributeAssociation, + Urn entityUrn, + ChangeOperation changeOperation, + String format, + AuditStamp auditStamp) { + return SchemaFieldBusinessAttributeChangeEvent.schemaFieldBusinessAttributeChangeEventBuilder() + .entityUrn(entityUrn.toString()) + .category(ChangeCategory.BUSINESS_ATTRIBUTE) + .operation(changeOperation) + .modifier(businessAttributeAssociation.getBusinessAttributeUrn().toString()) + .auditStamp(auditStamp) + .semVerChange(SemanticChangeType.MINOR) + .description( + String.format( + format, businessAttributeAssociation.getBusinessAttributeUrn().getId(), entityUrn)) + .parentUrn(entityUrn) + .businessAttributeUrn(businessAttributeAssociation.getBusinessAttributeUrn()) + .datasetUrn(entityUrn.getIdAsUrn()) + .build(); + } +}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGeneratorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGeneratorTest.java new file mode 100644 index 00000000000000..fb4c5ca3f96881 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGeneratorTest.java @@ -0,0 +1,124 @@ +package com.linkedin.metadata.timeline.eventgenerator; + +import static org.testng.AssertJUnit.assertEquals; + +import com.linkedin.businessattribute.BusinessAttributeAssociation; +import com.linkedin.businessattribute.BusinessAttributes; +import com.linkedin.common.AuditStamp; +import com.linkedin.common.urn.BusinessAttributeUrn; +import com.linkedin.common.urn.Urn; +import com.linkedin.data.ByteString; +import com.linkedin.data.template.RecordTemplate; +import com.linkedin.metadata.Constants; +import com.linkedin.metadata.models.AspectSpec; +import com.linkedin.metadata.timeline.data.ChangeEvent; +import com.linkedin.metadata.timeline.data.ChangeOperation; +import com.linkedin.metadata.utils.GenericRecordUtils; +import com.linkedin.mxe.SystemMetadata; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.List; +import mock.MockEntitySpec; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.Test; + +public class BusinessAttributesChangeEventGeneratorTest extends AbstractTestNGSpringContextTests { + + private static Urn getSchemaFieldUrn() throws URISyntaxException { + return Urn.createFromString( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD),user_id)"); + } + + private static final String BUSINESS_ATTRIBUTE_URN = + "urn:li:businessAttribute:cypressTestAttribute"; + + private static AuditStamp getTestAuditStamp() throws URISyntaxException { + return new AuditStamp() + .setActor(Urn.createFromString("urn:li:corpuser:__datahub_system")) + .setTime(1683829509553L); + } + + private static Aspect<BusinessAttributes> getBusinessAttributes( BusinessAttributeAssociation association) { + return new Aspect<>( + new BusinessAttributes().setBusinessAttribute(association), new SystemMetadata()); + } + + private static Aspect<BusinessAttributes> getNullBusinessAttributes() { + MockEntitySpec mockEntitySpec = new MockEntitySpec("schemaField"); + BusinessAttributes businessAttributes = new BusinessAttributes(); + final AspectSpec aspectSpec = + mockEntitySpec.createAspectSpec(businessAttributes, Constants.BUSINESS_ATTRIBUTE_ASPECT); + final RecordTemplate nullAspect = + GenericRecordUtils.deserializeAspect( + ByteString.copyString("{}", StandardCharsets.UTF_8), "application/json", aspectSpec); + return new Aspect(nullAspect, new SystemMetadata()); + } + + @Test + public void testBusinessAttributeAddition() throws Exception { + BusinessAttributesChangeEventGenerator businessAttributesChangeEventGenerator = + new BusinessAttributesChangeEventGenerator(); + + Urn urn = getSchemaFieldUrn(); + String entity = "schemaField"; + String aspect = "businessAttributes"; + AuditStamp auditStamp = getTestAuditStamp(); + + Aspect<BusinessAttributes> from = getNullBusinessAttributes(); + Aspect<BusinessAttributes> to = + getBusinessAttributes( + new BusinessAttributeAssociation() + .setBusinessAttributeUrn(new BusinessAttributeUrn(BUSINESS_ATTRIBUTE_URN))); + + List<ChangeEvent> actual = + businessAttributesChangeEventGenerator.getChangeEvents( + urn, entity, aspect, from, to, auditStamp); + assertEquals(1, actual.size()); + assertEquals(ChangeOperation.ADD.name(), actual.get(0).getOperation().name()); + assertEquals(getSchemaFieldUrn(), Urn.createFromString(actual.get(0).getEntityUrn())); + } + + @Test + public void testBusinessAttributeRemoval() throws Exception { + BusinessAttributesChangeEventGenerator test = new BusinessAttributesChangeEventGenerator(); + + Urn urn = getSchemaFieldUrn(); + String entity = "schemaField"; + String aspect = "businessAttributes"; + AuditStamp auditStamp = getTestAuditStamp(); + + Aspect<BusinessAttributes> from = + getBusinessAttributes( + new BusinessAttributeAssociation() + .setBusinessAttributeUrn(new BusinessAttributeUrn(BUSINESS_ATTRIBUTE_URN))); + Aspect<BusinessAttributes> to = getNullBusinessAttributes(); + + List<ChangeEvent> actual = test.getChangeEvents(urn, entity, aspect, from, to, auditStamp); + assertEquals(1, actual.size()); + assertEquals(ChangeOperation.REMOVE.name(), actual.get(0).getOperation().name()); + assertEquals(getSchemaFieldUrn(), Urn.createFromString(actual.get(0).getEntityUrn())); + } + + @Test + public void testNoChange() throws Exception { + BusinessAttributesChangeEventGenerator test = new BusinessAttributesChangeEventGenerator(); + + Urn urn = getSchemaFieldUrn(); + String entity = "schemaField"; + String aspect = "businessAttributes"; + AuditStamp auditStamp = getTestAuditStamp(); + + Aspect<BusinessAttributes> from = + getBusinessAttributes( + new BusinessAttributeAssociation() + .setBusinessAttributeUrn(new BusinessAttributeUrn(BUSINESS_ATTRIBUTE_URN))); + Aspect<BusinessAttributes> to = + getBusinessAttributes( + new BusinessAttributeAssociation() + .setBusinessAttributeUrn(new BusinessAttributeUrn(BUSINESS_ATTRIBUTE_URN))); + + List<ChangeEvent> actual = test.getChangeEvents(urn, entity, aspect, from, to, auditStamp); + assertEquals(0, actual.size()); + } +}
diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java index de570cc91b2fe7..17e34f151ae018 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java @@ -1,7 +1,5 @@ package com.linkedin.metadata.kafka.hook.event; -import static com.linkedin.metadata.Constants.SCHEMA_FIELD_ENTITY_NAME; - import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableSet; import com.linkedin.common.AuditStamp; @@ -27,6 +25,7 @@ import com.linkedin.platform.event.v1.Parameters; import io.datahubproject.metadata.context.OperationContext; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Objects; import java.util.Set; @@ -65,6 +64,7 @@ public class EntityChangeEventGeneratorHook implements MetadataChangeLogHook { Constants.ASSERTION_RUN_EVENT_ASPECT_NAME, Constants.DATA_PROCESS_INSTANCE_RUN_EVENT_ASPECT_NAME, Constants.BUSINESS_ATTRIBUTE_INFO_ASPECT_NAME, + Constants.BUSINESS_ATTRIBUTE_ASPECT, // Entity Lifecycle Event Constants.DATASET_KEY_ASPECT_NAME, @@ -83,13 +83,12 @@ public class EntityChangeEventGeneratorHook implements MetadataChangeLogHook { private static final Set<String> SUPPORTED_OPERATIONS = ImmutableSet.of("CREATE", "UPSERT", "DELETE"); - private static final Set<String> ENTITY_EXCLUSIONS = ImmutableSet.of(SCHEMA_FIELD_ENTITY_NAME); - private final EntityChangeEventGeneratorRegistry entityChangeEventGeneratorRegistry; private final OperationContext systemOperationContext; private final SystemEntityClient systemEntityClient; private final Boolean isEnabled; @Getter private final String consumerGroupSuffix; + private final List<String> entityExclusions; @Autowired public EntityChangeEventGeneratorHook( @@ -98,13 +97,16 @@ public EntityChangeEventGeneratorHook( final EntityChangeEventGeneratorRegistry entityChangeEventGeneratorRegistry, @Nonnull final SystemEntityClient entityClient, @Nonnull @Value("${entityChangeEvents.enabled:true}") Boolean isEnabled, - @Nonnull @Value("${entityChangeEvents.consumerGroupSuffix}") String consumerGroupSuffix) { + @Nonnull @Value("${entityChangeEvents.consumerGroupSuffix}") String consumerGroupSuffix, + @Nonnull @Value("#{'${entityChangeEvents.entityExclusions}'.split(',')}") + List<String> entityExclusions) { this.systemOperationContext = systemOperationContext; this.entityChangeEventGeneratorRegistry = Objects.requireNonNull(entityChangeEventGeneratorRegistry); this.systemEntityClient = Objects.requireNonNull(entityClient); this.isEnabled = isEnabled; this.consumerGroupSuffix = consumerGroupSuffix; + this.entityExclusions = entityExclusions; } @VisibleForTesting @@ -113,7 +115,13 @@ public EntityChangeEventGeneratorHook( @Nonnull final EntityChangeEventGeneratorRegistry entityChangeEventGeneratorRegistry, @Nonnull final SystemEntityClient entityClient, @Nonnull Boolean isEnabled) { - this(systemOperationContext, entityChangeEventGeneratorRegistry, entityClient, isEnabled, ""); + this( + systemOperationContext, + entityChangeEventGeneratorRegistry, + entityClient, + isEnabled, + "", + Collections.emptyList()); } @Override @@ -202,7 +210,7 @@ private List<ChangeEvent> generateChangeEvents( private boolean isEligibleForProcessing(final MetadataChangeLog log) { return SUPPORTED_OPERATIONS.contains(log.getChangeType().toString()) && SUPPORTED_ASPECT_NAMES.contains(log.getAspectName()) - && !ENTITY_EXCLUSIONS.contains(log.getEntityType()); + && !entityExclusions.contains(log.getEntityType()); } private void emitPlatformEvent( diff --git
diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml
index b997bc108e4ba1..f6fa4a37fdadbc 100644
--- a/metadata-service/configuration/src/main/resources/application.yaml
+++ b/metadata-service/configuration/src/main/resources/application.yaml
@@ -467,6 +467,7 @@ featureFlags:
 entityChangeEvents:
   enabled: ${ENABLE_ENTITY_CHANGE_EVENTS_HOOK:true}
   consumerGroupSuffix: ${ECE_CONSUMER_GROUP_SUFFIX:}
+  entityExclusions: ${ECE_ENTITY_EXCLUSIONS:schemaField} # provides a comma separated list of entities to exclude from the ECE hook
 
 views:
   enabled: ${VIEWS_ENABLED:true}
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/eventgenerator/EntityChangeEventGeneratorRegistryFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/eventgenerator/EntityChangeEventGeneratorRegistryFactory.java
index cd8eb4f1218db4..10770b83ad8811 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/eventgenerator/EntityChangeEventGeneratorRegistryFactory.java
+++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/eventgenerator/EntityChangeEventGeneratorRegistryFactory.java
@@ -6,6 +6,7 @@
 import com.linkedin.metadata.timeline.eventgenerator.AssertionRunEventChangeEventGenerator;
 import com.linkedin.metadata.timeline.eventgenerator.BusinessAttributeAssociationChangeEventGenerator;
 import com.linkedin.metadata.timeline.eventgenerator.BusinessAttributeInfoChangeEventGenerator;
+import com.linkedin.metadata.timeline.eventgenerator.BusinessAttributesChangeEventGenerator;
 import com.linkedin.metadata.timeline.eventgenerator.DataProcessInstanceRunEventChangeEventGenerator;
 import com.linkedin.metadata.timeline.eventgenerator.DatasetPropertiesChangeEventGenerator;
 import com.linkedin.metadata.timeline.eventgenerator.DeprecationChangeEventGenerator;
@@ -59,6 +60,7 @@ protected EntityChangeEventGeneratorRegistry entityChangeEventGeneratorRegistry(
         BUSINESS_ATTRIBUTE_INFO_ASPECT_NAME, new BusinessAttributeInfoChangeEventGenerator());
     registry.register(
         BUSINESS_ATTRIBUTE_ASSOCIATION, new BusinessAttributeAssociationChangeEventGenerator());
+    registry.register(BUSINESS_ATTRIBUTE_ASPECT, new BusinessAttributesChangeEventGenerator());
 
     // Entity Lifecycle Differs
     registry.register(DATASET_KEY_ASPECT_NAME, new EntityKeyChangeEventGenerator<>());

From b79857fd948d29e41611b29678b8c66a91c6f62b Mon Sep 17 00:00:00 2001
From: sagar-salvi-apptware <159135491+sagar-salvi-apptware@users.noreply.github.com>
Date: Sun, 29 Dec 2024 18:52:05 +0530
Subject: [PATCH 48/49] fix(ingest/sql-common): sql_common to use SqlParsingAggregator (#12220)

---
 .../src/datahub/ingestion/source/sql/hive.py  |  15 +
 .../ingestion/source/sql/hive_metastore.py    |   7 +
 .../ingestion/source/sql/mssql/source.py      |   2 +-
 .../ingestion/source/sql/sql_common.py        | 143 ++---
 .../ingestion/source/sql/sql_report.py        |   2 +
 .../hive_metastore_mces_golden_1.json         | 120 ++--
 .../hive_metastore_mces_golden_3.json         | 120 ++--
 .../hive_metastore_mces_golden_5.json         | 120 ++--
 .../hive/hive_mces_all_db_golden.json         | 541 +++++++++++++++++-
 .../integration/hive/hive_mces_golden.json    | 537 ++++++++++++++++-
 .../tests/integration/hive/hive_setup.sql     |   2 +
 .../mysql/mysql_mces_no_db_golden.json        | 144 ++++-
 .../golden_test_ingest_with_database.json     | 318 ++++++++--
 .../golden_test_ingest_with_out_database.json | 456 ++++++++++++---
 .../postgres_all_db_mces_with_db_golden.json  | 186 ++++--
 .../postgres_mces_with_db_golden.json         | 154 ++++-
 .../golden_mces_mssql_no_db_to_file.json      | 232 +++++++-
 .../golden_mces_mssql_no_db_with_filter.json  | 106 +++-
 .../golden_mces_mssql_to_file.json            | 106 +++-
 ...golden_mces_mssql_with_lower_case_urn.json | 284 +++++++--
 .../trino_hive_instance_mces_golden.json      | 166 ++++--
 .../trino/trino_hive_mces_golden.json         | 166 ++++--
 22 files changed, 3254 insertions(+), 673 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py
index fad54fda453786..6d67ab29b3a3d8 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py
@@ -838,3 +838,18 @@ def _process_view(
                 entityUrn=dataset_urn,
                 aspect=view_properties_aspect,
             ).as_workunit()
+
+        if view_definition and self.config.include_view_lineage:
+            default_db = None
+            default_schema = None
+            try:
+                default_db, default_schema = self.get_db_schema(dataset_name)
+            except ValueError:
+                logger.warning(f"Invalid view identifier: {dataset_name}")
+
+            self.aggregator.add_view_definition(
+                view_urn=dataset_urn,
+                view_definition=view_definition,
+                default_db=default_db,
+                default_schema=default_schema,
+            )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive_metastore.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive_metastore.py
index adb171d4ad54b6..60ecbaf38838a6 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive_metastore.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive_metastore.py
@@ -123,6 +123,10 @@ class HiveMetastore(BasicSQLAlchemyConfig):
         description="Dataset Subtype name to be 'Table' or 'View' Valid options: ['True', 'False']",
     )
 
+    include_view_lineage: bool = Field(
+        default=False, description="", hidden_from_docs=True
+    )
+
     include_catalog_name_in_ids: bool = Field(
         default=False,
         description="Add the Presto catalog name (e.g. hive) to the generated dataset urns. `urn:li:dataset:(urn:li:dataPlatform:hive,hive.user.logging_events,PROD)` versus `urn:li:dataset:(urn:li:dataPlatform:hive,user.logging_events,PROD)`",
@@ -160,6 +164,9 @@ def get_sql_alchemy_url(
 @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 @capability(SourceCapability.DATA_PROFILING, "Not Supported", False)
 @capability(SourceCapability.CLASSIFICATION, "Not Supported", False)
+@capability(
+    SourceCapability.LINEAGE_COARSE, "View lineage is not supported", supported=False
+)
 class HiveMetastoreSource(SQLAlchemySource):
     """
     This plugin extracts the following:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py
index 9d8b67041998ce..a2338f14196d77 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py
@@ -724,7 +724,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         ):
             yield from auto_workunit(
                 generate_procedure_lineage(
-                    schema_resolver=self.schema_resolver,
+                    schema_resolver=self.get_schema_resolver(),
                     procedure=procedure,
                     procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
                     is_temp_table=self.is_temp_table,
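The sql_common.py diff below is the heart of this commit: instead of caching view definitions and running its own SQL parser, the source now registers schemas and view definitions with the shared SqlParsingAggregator and emits whatever it produces at the end of the run. A condensed sketch of that flow, using only the aggregator calls visible in the diff (the platform, urns, and SQL text are illustrative):

    # Sketch of the aggregator flow adopted below; all values are illustrative.
    from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator

    aggregator = SqlParsingAggregator(
        platform="hive",
        generate_lineage=True,
        generate_usage_statistics=False,
        generate_operations=False,
    )
    # _process_table/_process_view register schemas so column-level lineage can resolve:
    #   aggregator.register_schema(dataset_urn, schema_metadata)
    aggregator.add_view_definition(
        view_urn="urn:li:dataset:(urn:li:dataPlatform:hive,db1.my_view,PROD)",
        view_definition="CREATE VIEW db1.my_view AS SELECT id FROM db1.my_table",
        default_db="db1",
    )
    # get_workunits_internal() drains the aggregator once scanning is done:
    workunits = [mcp.as_workunit() for mcp in aggregator.gen_metadata()]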
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
index 4e22930e7a2a0b..a0bd9ce0760bd1 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
@@ -11,7 +11,6 @@
     Dict,
     Iterable,
     List,
-    MutableMapping,
     Optional,
     Set,
     Tuple,
@@ -36,7+35,6 @@
     make_tag_urn,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import capability
 from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage
@@ -79,7 +77,6 @@
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
-from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
@@ -106,17 +103,11 @@
     GlobalTagsClass,
     SubTypesClass,
     TagAssociationClass,
-    UpstreamClass,
     ViewPropertiesClass,
 )
 from datahub.sql_parsing.schema_resolver import SchemaResolver
-from datahub.sql_parsing.sqlglot_lineage import (
-    SqlParsingResult,
-    sqlglot_lineage,
-    view_definition_lineage_helper,
-)
+from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
 from datahub.telemetry import telemetry
-from datahub.utilities.file_backed_collections import FileBackedDict
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.sqlalchemy_type_converter import (
     get_native_data_type_for_sqlalchemy_type,
@@ -347,17 +338,19 @@ def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str)
         )
 
         self.views_failed_parsing: Set[str] = set()
-        self.schema_resolver: SchemaResolver = SchemaResolver(
+
+        self.discovered_datasets: Set[str] = set()
+        self.aggregator = SqlParsingAggregator(
             platform=self.platform,
             platform_instance=self.config.platform_instance,
             env=self.config.env,
+            graph=self.ctx.graph,
+            generate_lineage=self.include_lineage,
+            generate_usage_statistics=False,
+            generate_operations=False,
+            eager_graph_load=False,
         )
-        self.discovered_datasets: Set[str] = set()
-        self._view_definition_cache: MutableMapping[str, str]
-        if self.config.use_file_backed_cache:
-            self._view_definition_cache = FileBackedDict[str]()
-        else:
-            self._view_definition_cache = {}
+        self.report.sql_aggregator = self.aggregator.report
 
     @classmethod
     def test_connection(cls, config_dict: dict) -> TestConnectionReport:
@@ -572,36 +565,9 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit
                 profile_requests, profiler, platform=self.platform
             )
 
-        if self.config.include_view_lineage:
-            yield from self.get_view_lineage()
-
-    def get_view_lineage(self) -> Iterable[MetadataWorkUnit]:
-        builder = SqlParsingBuilder(
-            generate_lineage=True,
-            generate_usage_statistics=False,
-            generate_operations=False,
-        )
-        for dataset_name in self._view_definition_cache.keys():
-            # TODO: Ensure that the lineage generated from the view definition
-            # matches the dataset_name.
-            view_definition = self._view_definition_cache[dataset_name]
-            result = self._run_sql_parser(
-                dataset_name,
-                view_definition,
-                self.schema_resolver,
-            )
-            if result and result.out_tables:
-                # This does not yield any workunits but we use
-                # yield here to execute this method
-                yield from builder.process_sql_parsing_result(
-                    result=result,
-                    query=view_definition,
-                    is_view_ddl=True,
-                    include_column_lineage=self.config.include_view_column_lineage,
-                )
-            else:
-                self.views_failed_parsing.add(dataset_name)
-        yield from builder.gen_workunits()
+        # Generate workunit for aggregated SQL parsing results
+        for mcp in self.aggregator.gen_metadata():
+            yield mcp.as_workunit()
 
     def get_identifier(
         self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
@@ -760,16 +726,6 @@ def _process_table(
         )
         dataset_snapshot.aspects.append(dataset_properties)
 
-        if self.config.include_table_location_lineage and location_urn:
-            external_upstream_table = UpstreamClass(
-                dataset=location_urn,
-                type=DatasetLineageTypeClass.COPY,
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_snapshot.urn,
-                aspect=UpstreamLineage(upstreams=[external_upstream_table]),
-            ).as_workunit()
-
         extra_tags = self.get_extra_tags(inspector, schema, table)
         pk_constraints: dict = inspector.get_pk_constraint(table, schema)
         partitions: Optional[List[str]] = self.get_partitions(inspector, schema, table)
@@ -795,7 +751,7 @@
         dataset_snapshot.aspects.append(schema_metadata)
 
         if self._save_schema_to_resolver():
-            self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+            self.aggregator.register_schema(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)
 
         db_name = self.get_db_name(inspector)
@@ -815,6 +771,13 @@
             ),
         )
 
+        if self.config.include_table_location_lineage and location_urn:
+            self.aggregator.add_known_lineage_mapping(
+                upstream_urn=location_urn,
+                downstream_urn=dataset_snapshot.urn,
+                lineage_type=DatasetLineageTypeClass.COPY,
+            )
+
         if self.config.domain:
             assert self.domain_registry
             yield from get_domain_wu(
@@ -1089,6 +1052,7 @@ def _process_view(
             self.config.platform_instance,
             self.config.env,
         )
+
         try:
             columns = inspector.get_columns(view, schema)
         except KeyError:
@@ -1108,7 +1072,7 @@
             canonical_schema=schema_fields,
         )
         if self._save_schema_to_resolver():
-            self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+            self.aggregator.register_schema(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)
 
         description, properties, _ = self.get_table_properties(inspector, schema, view)
@@ -1117,7 +1081,18 @@
         view_definition = self._get_view_definition(inspector, schema, view)
         properties["view_definition"] = view_definition
         if view_definition and self.config.include_view_lineage:
-            self._view_definition_cache[dataset_name] = view_definition
+            default_db = None
+            default_schema = None
+            try:
+                default_db, default_schema = self.get_db_schema(dataset_name)
+            except ValueError:
+                logger.warning(f"Invalid view identifier: {dataset_name}")
+            self.aggregator.add_view_definition(
+                view_urn=dataset_urn,
+                view_definition=view_definition,
+                default_db=default_db,
+                default_schema=default_schema,
+            )
 
         dataset_snapshot = DatasetSnapshot(
             urn=dataset_urn,
@@ -1169,48 +1144,9 @@ def _save_schema_to_resolver(self):
             hasattr(self.config, "include_lineage") and self.config.include_lineage
         )
 
-    def _run_sql_parser(
-        self, view_identifier: str, query: str, schema_resolver: SchemaResolver
-    ) -> Optional[SqlParsingResult]:
-        try:
-            database, schema = self.get_db_schema(view_identifier)
-        except ValueError:
-            logger.warning(f"Invalid view identifier: {view_identifier}")
-            return None
-        raw_lineage = sqlglot_lineage(
-            query,
-            schema_resolver=schema_resolver,
-            default_db=database,
-            default_schema=schema,
-        )
-        view_urn = make_dataset_urn_with_platform_instance(
-            self.platform,
-            view_identifier,
-            self.config.platform_instance,
-            self.config.env,
-        )
-
-        if raw_lineage.debug_info.table_error:
-            logger.debug(
-                f"Failed to parse lineage for view {view_identifier}: "
-                f"{raw_lineage.debug_info.table_error}"
-            )
-            self.report.num_view_definitions_failed_parsing += 1
-            self.report.view_definitions_parsing_failures.append(
-                f"Table-level sql parsing error for view {view_identifier}: {raw_lineage.debug_info.table_error}"
-            )
-            return None
-
-        elif raw_lineage.debug_info.column_error:
-            self.report.num_view_definitions_failed_column_parsing += 1
-            self.report.view_definitions_parsing_failures.append(
-                f"Column-level sql parsing error for view {view_identifier}: {raw_lineage.debug_info.column_error}"
-            )
-        else:
-            self.report.num_view_definitions_parsed += 1
-            if raw_lineage.out_tables != [view_urn]:
-                self.report.num_view_definitions_view_urn_mismatch += 1
-        return view_definition_lineage_helper(raw_lineage, view_urn)
+    @property
+    def include_lineage(self):
+        return self.config.include_view_lineage
 
     def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
         database, schema, _view = dataset_identifier.split(".", 2)
@@ -1411,5 +1347,8 @@ def prepare_profiler_args(
             schema=schema, table=table, partition=partition, custom_sql=custom_sql
         )
 
+    def get_schema_resolver(self) -> SchemaResolver:
+        return self.aggregator._schema_resolver
+
     def get_report(self):
         return self.report
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_report.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_report.py
index c445ce44a91449..785972b88a49d7 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_report.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_report.py
@@ -5,6 +5,7 @@
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
 from datahub.utilities.lossy_collections import LossyList
 from
datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport from datahub.utilities.stats_collections import TopKDict, int_top_k_dict @@ -52,6 +53,7 @@ class SQLSourceReport( num_view_definitions_failed_parsing: int = 0 num_view_definitions_failed_column_parsing: int = 0 view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList) + sql_aggregator: Optional[SqlAggregatorReport] = None def report_entity_scanned(self, name: str, ent_type: str = "table") -> None: """ diff --git a/metadata-ingestion/tests/integration/hive-metastore/hive_metastore_mces_golden_1.json b/metadata-ingestion/tests/integration/hive-metastore/hive_metastore_mces_golden_1.json index 3ba795a5d044a3..8d2e29078880d3 100644 --- a/metadata-ingestion/tests/integration/hive-metastore/hive_metastore_mces_golden_1.json +++ b/metadata-ingestion/tests/integration/hive-metastore/hive_metastore_mces_golden_1.json @@ -87,6 +87,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:9ba2e350c97c893a91bcaee4838cdcae", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:1cfce89b5a05e1da5092d88ad9eb4589" + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "hive-metastore-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:9ba2e350c97c893a91bcaee4838cdcae", @@ -160,22 +176,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:9ba2e350c97c893a91bcaee4838cdcae", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:1cfce89b5a05e1da5092d88ad9eb4589" - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "hive-metastore-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:9ba2e350c97c893a91bcaee4838cdcae", @@ -238,7 +238,7 @@ { "op": "add", "path": "/customProperties/transient_lastDdlTime", - "value": "1715258696" + "value": "1735298453" }, { "op": "add", @@ -268,7 +268,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -428,7 +428,7 @@ { "op": "add", "path": "/customProperties/transient_lastDdlTime", - "value": "1715258696" + "value": "1735298453" }, { "op": "add", @@ -463,7 +463,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -672,10 +672,15 @@ "path": "/name", "value": "nested_struct_test" }, + { + "op": "add", + "path": "/customProperties/totalSize", + "value": "0" + }, { "op": "add", "path": "/customProperties/transient_lastDdlTime", - "value": "1715258695" + "value": "1735298453" }, { "op": "add", @@ -697,11 +702,6 @@ "path": "/customProperties/numRows", "value": "0" }, - { - "op": "add", - "path": "/customProperties/totalSize", - "value": "0" - }, { "op": "add", "path": "/customProperties/table_type", @@ -715,7 +715,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -926,11 +926,6 @@ "path": "/customProperties/another.comment", "value": "This table has no partitions" }, - { - "op": "add", - "path": "/customProperties/numFiles", - "value": "1" - }, { "op": "add", "path": "/customProperties/numRows", @@ -943,13 +938,18 @@ }, { "op": "add", - "path": "/customProperties/transient_lastDdlTime", - "value": 
"1715258689" + "path": "/customProperties/totalSize", + "value": "33" }, { "op": "add", - "path": "/customProperties/totalSize", - "value": "33" + "path": "/customProperties/numFiles", + "value": "1" + }, + { + "op": "add", + "path": "/customProperties/transient_lastDdlTime", + "value": "1735298448" }, { "op": "add", @@ -974,7 +974,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -1164,6 +1164,11 @@ "path": "/customProperties/numRows", "value": "0" }, + { + "op": "add", + "path": "/customProperties/transient_lastDdlTime", + "value": "1735298442" + }, { "op": "add", "path": "/customProperties/numFiles", @@ -1174,11 +1179,6 @@ "path": "/customProperties/COLUMN_STATS_ACCURATE", "value": "{\"BASIC_STATS\":\"true\"}" }, - { - "op": "add", - "path": "/customProperties/transient_lastDdlTime", - "value": "1715258680" - }, { "op": "add", "path": "/customProperties/rawDataSize", @@ -1202,7 +1202,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -1386,6 +1386,11 @@ "path": "/customProperties/numRows", "value": "0" }, + { + "op": "add", + "path": "/customProperties/transient_lastDdlTime", + "value": "1735298441" + }, { "op": "add", "path": "/customProperties/numFiles", @@ -1396,11 +1401,6 @@ "path": "/customProperties/COLUMN_STATS_ACCURATE", "value": "{\"BASIC_STATS\":\"true\"}" }, - { - "op": "add", - "path": "/customProperties/transient_lastDdlTime", - "value": "1715258680" - }, { "op": "add", "path": "/customProperties/rawDataSize", @@ -1424,7 +1424,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -1576,7 +1576,7 @@ { "op": "add", "path": "/customProperties/transient_lastDdlTime", - "value": "1715258672" + "value": "1735298433" }, { "op": "add", @@ -1591,7 +1591,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" }, { "op": "add", @@ -1637,31 +1637,31 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=string].baz", + "fieldPath": "[version=2.0].[type=int].foo", "nullable": true, "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "string", + "nativeDataType": "int", "recursive": false, "isPartOfKey": false, - "isPartitioningKey": true, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { - "fieldPath": "[version=2.0].[type=int].foo", + "fieldPath": "[version=2.0].[type=string].baz", "nullable": true, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "int", + "nativeDataType": "string", "recursive": false, "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + "isPartitioningKey": true, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].bar", diff --git a/metadata-ingestion/tests/integration/hive-metastore/hive_metastore_mces_golden_3.json b/metadata-ingestion/tests/integration/hive-metastore/hive_metastore_mces_golden_3.json index a9bf2cb26da49f..f408c6c0648486 100644 --- a/metadata-ingestion/tests/integration/hive-metastore/hive_metastore_mces_golden_3.json +++ 
b/metadata-ingestion/tests/integration/hive-metastore/hive_metastore_mces_golden_3.json @@ -87,6 +87,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:9ba2e350c97c893a91bcaee4838cdcae", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:1cfce89b5a05e1da5092d88ad9eb4589" + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "hive-metastore-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:9ba2e350c97c893a91bcaee4838cdcae", @@ -160,22 +176,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:9ba2e350c97c893a91bcaee4838cdcae", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:1cfce89b5a05e1da5092d88ad9eb4589" - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "hive-metastore-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:9ba2e350c97c893a91bcaee4838cdcae", @@ -238,7 +238,7 @@ { "op": "add", "path": "/customProperties/transient_lastDdlTime", - "value": "1715258696" + "value": "1735298453" }, { "op": "add", @@ -268,7 +268,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -428,7 +428,7 @@ { "op": "add", "path": "/customProperties/transient_lastDdlTime", - "value": "1715258696" + "value": "1735298453" }, { "op": "add", @@ -463,7 +463,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -672,10 +672,15 @@ "path": "/name", "value": "nested_struct_test" }, + { + "op": "add", + "path": "/customProperties/totalSize", + "value": "0" + }, { "op": "add", "path": "/customProperties/transient_lastDdlTime", - "value": "1715258695" + "value": "1735298453" }, { "op": "add", @@ -697,11 +702,6 @@ "path": "/customProperties/numRows", "value": "0" }, - { - "op": "add", - "path": "/customProperties/totalSize", - "value": "0" - }, { "op": "add", "path": "/customProperties/table_type", @@ -715,7 +715,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -926,11 +926,6 @@ "path": "/customProperties/another.comment", "value": "This table has no partitions" }, - { - "op": "add", - "path": "/customProperties/numFiles", - "value": "1" - }, { "op": "add", "path": "/customProperties/numRows", @@ -943,13 +938,18 @@ }, { "op": "add", - "path": "/customProperties/transient_lastDdlTime", - "value": "1715258689" + "path": "/customProperties/totalSize", + "value": "33" }, { "op": "add", - "path": "/customProperties/totalSize", - "value": "33" + "path": "/customProperties/numFiles", + "value": "1" + }, + { + "op": "add", + "path": "/customProperties/transient_lastDdlTime", + "value": "1735298448" }, { "op": "add", @@ -974,7 +974,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -1164,6 +1164,11 @@ "path": "/customProperties/numRows", "value": "0" }, + { + "op": "add", + "path": "/customProperties/transient_lastDdlTime", + "value": "1735298442" + }, { "op": "add", "path": "/customProperties/numFiles", @@ -1174,11 +1179,6 @@ "path": "/customProperties/COLUMN_STATS_ACCURATE", "value": "{\"BASIC_STATS\":\"true\"}" }, - { - "op": "add", - "path": 
"/customProperties/transient_lastDdlTime", - "value": "1715258680" - }, { "op": "add", "path": "/customProperties/rawDataSize", @@ -1202,7 +1202,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -1386,6 +1386,11 @@ "path": "/customProperties/numRows", "value": "0" }, + { + "op": "add", + "path": "/customProperties/transient_lastDdlTime", + "value": "1735298441" + }, { "op": "add", "path": "/customProperties/numFiles", @@ -1396,11 +1401,6 @@ "path": "/customProperties/COLUMN_STATS_ACCURATE", "value": "{\"BASIC_STATS\":\"true\"}" }, - { - "op": "add", - "path": "/customProperties/transient_lastDdlTime", - "value": "1715258680" - }, { "op": "add", "path": "/customProperties/rawDataSize", @@ -1424,7 +1424,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -1576,7 +1576,7 @@ { "op": "add", "path": "/customProperties/transient_lastDdlTime", - "value": "1715258672" + "value": "1735298433" }, { "op": "add", @@ -1591,7 +1591,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" }, { "op": "add", @@ -1637,31 +1637,31 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=string].baz", + "fieldPath": "[version=2.0].[type=int].foo", "nullable": true, "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "string", + "nativeDataType": "int", "recursive": false, "isPartOfKey": false, - "isPartitioningKey": true, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { - "fieldPath": "[version=2.0].[type=int].foo", + "fieldPath": "[version=2.0].[type=string].baz", "nullable": true, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "int", + "nativeDataType": "string", "recursive": false, "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + "isPartitioningKey": true, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].bar", diff --git a/metadata-ingestion/tests/integration/hive-metastore/hive_metastore_mces_golden_5.json b/metadata-ingestion/tests/integration/hive-metastore/hive_metastore_mces_golden_5.json index 1937550e1bcbd0..7604a96aef8251 100644 --- a/metadata-ingestion/tests/integration/hive-metastore/hive_metastore_mces_golden_5.json +++ b/metadata-ingestion/tests/integration/hive-metastore/hive_metastore_mces_golden_5.json @@ -87,6 +87,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:9ba2e350c97c893a91bcaee4838cdcae", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:1cfce89b5a05e1da5092d88ad9eb4589" + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "hive-metastore-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:9ba2e350c97c893a91bcaee4838cdcae", @@ -160,22 +176,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:9ba2e350c97c893a91bcaee4838cdcae", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": 
"urn:li:container:1cfce89b5a05e1da5092d88ad9eb4589" - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "hive-metastore-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:9ba2e350c97c893a91bcaee4838cdcae", @@ -238,7 +238,7 @@ { "op": "add", "path": "/customProperties/transient_lastDdlTime", - "value": "1715258696" + "value": "1735298453" }, { "op": "add", @@ -268,7 +268,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -428,7 +428,7 @@ { "op": "add", "path": "/customProperties/transient_lastDdlTime", - "value": "1715258696" + "value": "1735298453" }, { "op": "add", @@ -463,7 +463,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -672,10 +672,15 @@ "path": "/name", "value": "nested_struct_test" }, + { + "op": "add", + "path": "/customProperties/totalSize", + "value": "0" + }, { "op": "add", "path": "/customProperties/transient_lastDdlTime", - "value": "1715258695" + "value": "1735298453" }, { "op": "add", @@ -697,11 +702,6 @@ "path": "/customProperties/numRows", "value": "0" }, - { - "op": "add", - "path": "/customProperties/totalSize", - "value": "0" - }, { "op": "add", "path": "/customProperties/table_type", @@ -715,7 +715,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -926,11 +926,6 @@ "path": "/customProperties/another.comment", "value": "This table has no partitions" }, - { - "op": "add", - "path": "/customProperties/numFiles", - "value": "1" - }, { "op": "add", "path": "/customProperties/numRows", @@ -943,13 +938,18 @@ }, { "op": "add", - "path": "/customProperties/transient_lastDdlTime", - "value": "1715258689" + "path": "/customProperties/totalSize", + "value": "33" }, { "op": "add", - "path": "/customProperties/totalSize", - "value": "33" + "path": "/customProperties/numFiles", + "value": "1" + }, + { + "op": "add", + "path": "/customProperties/transient_lastDdlTime", + "value": "1735298448" }, { "op": "add", @@ -974,7 +974,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -1164,6 +1164,11 @@ "path": "/customProperties/numRows", "value": "0" }, + { + "op": "add", + "path": "/customProperties/transient_lastDdlTime", + "value": "1735298442" + }, { "op": "add", "path": "/customProperties/numFiles", @@ -1174,11 +1179,6 @@ "path": "/customProperties/COLUMN_STATS_ACCURATE", "value": "{\"BASIC_STATS\":\"true\"}" }, - { - "op": "add", - "path": "/customProperties/transient_lastDdlTime", - "value": "1715258680" - }, { "op": "add", "path": "/customProperties/rawDataSize", @@ -1202,7 +1202,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -1386,6 +1386,11 @@ "path": "/customProperties/numRows", "value": "0" }, + { + "op": "add", + "path": "/customProperties/transient_lastDdlTime", + "value": "1735298441" + }, { "op": "add", "path": "/customProperties/numFiles", @@ -1396,11 +1401,6 @@ "path": "/customProperties/COLUMN_STATS_ACCURATE", "value": "{\"BASIC_STATS\":\"true\"}" }, - { - "op": "add", - "path": "/customProperties/transient_lastDdlTime", - "value": "1715258680" - }, { "op": "add", "path": "/customProperties/rawDataSize", @@ -1424,7 +1424,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" } ] }, @@ -1576,7 
+1576,7 @@ { "op": "add", "path": "/customProperties/transient_lastDdlTime", - "value": "1715258672" + "value": "1735298433" }, { "op": "add", @@ -1591,7 +1591,7 @@ { "op": "add", "path": "/customProperties/create_date", - "value": "2024-05-09" + "value": "2024-12-27" }, { "op": "add", @@ -1637,31 +1637,31 @@ }, "fields": [ { - "fieldPath": "baz", + "fieldPath": "foo", "nullable": true, "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "string", + "nativeDataType": "int", "recursive": false, "isPartOfKey": false, - "isPartitioningKey": true, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { - "fieldPath": "foo", + "fieldPath": "baz", "nullable": true, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "int", + "nativeDataType": "string", "recursive": false, "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + "isPartitioningKey": true, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "bar", diff --git a/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json b/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json index b3922f76d7b0c7..a7716f7e10e55b 100644 --- a/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json +++ b/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json @@ -118,7 +118,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:23 UTC 2024", + "CreateTime:": "Thu Dec 26 13:11:56 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore", @@ -128,7 +128,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1724166683", + "Table Parameters: transient_lastDdlTime": "1735218716", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -268,7 +268,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:23 UTC 2024", + "CreateTime:": "Thu Dec 26 13:11:56 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test", @@ -280,7 +280,7 @@ "Table Parameters: numRows": "1", "Table Parameters: rawDataSize": "32", "Table Parameters: totalSize": "33", - "Table Parameters: transient_lastDdlTime": "1724166687", + "Table Parameters: transient_lastDdlTime": "1735218720", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -458,11 +458,11 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:30 UTC 2024", + "CreateTime:": "Thu Dec 26 13:12:03 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Table Type:": "VIRTUAL_VIEW", - "Table Parameters: transient_lastDdlTime": "1724166690", + "Table Parameters: transient_lastDdlTime": "1735218723", 
"SerDe Library:": "null", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -608,6 +608,187 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Thu Dec 26 13:12:03 UTC 2024", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Table Type:": "VIRTUAL_VIEW", + "Table Parameters: transient_lastDdlTime": "1735218723", + "SerDe Library:": "null", + "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "View Original Text:": "select * from db1.array_struct_test_view", + "View Expanded Text:": "select `array_struct_test_view`.`property_id`, `array_struct_test_view`.`service` from `db1`.`array_struct_test_view`", + "View Rewrite Enabled:": "No" + }, + "name": "array_struct_test_view_2", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.array_struct_test_view_2", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "record" + ] + } + } + }, + "nativeDataType": "array>>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array>>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=string].type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=array].[type=int].provider", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "int" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": 
"{\"native_data_type\": \"array\"}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:ded36d15fcfbbb939830549697122661", + "urn": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.map_test,PROD)", @@ -639,7 +820,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:31 UTC 2024", + "CreateTime:": "Thu Dec 26 13:12:04 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test", @@ -649,7 +830,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1724166691", + "Table Parameters: transient_lastDdlTime": "1735218724", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -793,7 +974,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:30 UTC 2024", + "CreateTime:": "Thu Dec 26 13:12:03 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test", @@ -803,7 +984,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1724166690", + "Table Parameters: transient_lastDdlTime": "1735218723", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -996,7 +1177,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:20 UTC 2024", + "CreateTime:": "Thu Dec 26 13:11:53 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes", @@ -1006,7 +1187,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "5812", - "Table Parameters: transient_lastDdlTime": "1724166680", + "Table Parameters: transient_lastDdlTime": "1735218713", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -1158,7 +1339,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:23 UTC 
2024", + "CreateTime:": "Thu Dec 26 13:11:56 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test", @@ -1168,7 +1349,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1724166683", + "Table Parameters: transient_lastDdlTime": "1735218716", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -1339,14 +1520,14 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:30 UTC 2024", + "CreateTime:": "Thu Dec 26 13:12:02 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test_view_materialized", "Table Type:": "MATERIALIZED_VIEW", "Table Parameters: numFiles": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1724166690", + "Table Parameters: transient_lastDdlTime": "1735218722", "SerDe Library:": "org.apache.hadoop.hive.ql.io.orc.OrcSerde", "InputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", @@ -1519,7 +1700,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:30 UTC 2024", + "CreateTime:": "Thu Dec 26 13:12:03 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test", @@ -1529,7 +1710,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1724166690", + "Table Parameters: transient_lastDdlTime": "1735218723", "SerDe Library:": "org.apache.hadoop.hive.ql.io.orc.OrcSerde", "InputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", @@ -1756,6 +1937,24 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW `db1.array_struct_test_view_2` AS select `array_struct_test_view`.`property_id`, `array_struct_test_view`.`service` from `db1`.`array_struct_test_view`", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:8cc876554899e33efe67c389aaf29c4b", @@ -1875,7 +2074,7 @@ "customProperties": { "Database:": "db2", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:22 UTC 2024", + "CreateTime:": "Thu Dec 26 13:11:55 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db2.db/pokes", @@ -1884,7 +2083,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "5812", - "Table Parameters: transient_lastDdlTime": "1724166683", + "Table Parameters: transient_lastDdlTime": "1735218716", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", 
"OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -2080,5 +2279,307 @@ "runId": "hive-test", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD),property_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),property_id)" + ], + "confidenceScore": 0.35, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD),service)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),service)" + ], + "confidenceScore": 0.35, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW `db1.array_struct_test_view` AS\nSELECT\n `array_struct_test`.`property_id`,\n `array_struct_test`.`service`\nFROM `db1`.`array_struct_test`", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1586847600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD),property_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD),service)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),property_id)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),service)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:hive" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view_2%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),property_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD),property_id)" + ], + "confidenceScore": 0.35, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view_2%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),service)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD),service)" + ], + "confidenceScore": 0.35, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view_2%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view_2%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW `db1.array_struct_test_view_2` AS\nSELECT\n `array_struct_test_view`.`property_id`,\n `array_struct_test_view`.`service`\nFROM `db1`.`array_struct_test_view`", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1586847600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view_2%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": 
"urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),property_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),service)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD),property_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD),service)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view_2%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:hive" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view_2%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/hive/hive_mces_golden.json b/metadata-ingestion/tests/integration/hive/hive_mces_golden.json index 4a0a4886d606ac..d24226e3f45449 100644 --- a/metadata-ingestion/tests/integration/hive/hive_mces_golden.json +++ b/metadata-ingestion/tests/integration/hive/hive_mces_golden.json @@ -118,7 +118,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:23 UTC 2024", + "CreateTime:": "Thu Dec 26 13:11:56 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore", @@ -128,7 +128,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1724166683", + "Table Parameters: transient_lastDdlTime": "1735218716", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -268,7 +268,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:23 UTC 2024", + "CreateTime:": "Thu Dec 26 13:11:56 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test", @@ -280,7 +280,7 @@ "Table Parameters: numRows": "1", "Table Parameters: rawDataSize": 
"32", "Table Parameters: totalSize": "33", - "Table Parameters: transient_lastDdlTime": "1724166687", + "Table Parameters: transient_lastDdlTime": "1735218720", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -458,11 +458,11 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:30 UTC 2024", + "CreateTime:": "Thu Dec 26 13:12:03 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Table Type:": "VIRTUAL_VIEW", - "Table Parameters: transient_lastDdlTime": "1724166690", + "Table Parameters: transient_lastDdlTime": "1735218723", "SerDe Library:": "null", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -608,6 +608,187 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Thu Dec 26 13:12:03 UTC 2024", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Table Type:": "VIRTUAL_VIEW", + "Table Parameters: transient_lastDdlTime": "1735218723", + "SerDe Library:": "null", + "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "View Original Text:": "select * from db1.array_struct_test_view", + "View Expanded Text:": "select `array_struct_test_view`.`property_id`, `array_struct_test_view`.`service` from `db1`.`array_struct_test_view`", + "View Rewrite Enabled:": "No" + }, + "name": "array_struct_test_view_2", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.array_struct_test_view_2", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "record" + ] + } + } + }, + "nativeDataType": "array>>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": 
"{\"native_data_type\": \"array>>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=string].type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=array].[type=int].provider", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "int" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:ded36d15fcfbbb939830549697122661", + "urn": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.map_test,PROD)", @@ -639,7 +820,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:31 UTC 2024", + "CreateTime:": "Thu Dec 26 13:12:04 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test", @@ -649,7 +830,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1724166691", + "Table Parameters: transient_lastDdlTime": "1735218724", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -793,7 +974,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:30 UTC 2024", + "CreateTime:": "Thu Dec 26 13:12:03 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test", @@ -803,7 +984,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1724166690", + "Table Parameters: transient_lastDdlTime": "1735218723", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -996,7 +1177,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:20 UTC 2024", + 
"CreateTime:": "Thu Dec 26 13:11:53 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes", @@ -1006,7 +1187,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "5812", - "Table Parameters: transient_lastDdlTime": "1724166680", + "Table Parameters: transient_lastDdlTime": "1735218713", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -1158,7 +1339,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:23 UTC 2024", + "CreateTime:": "Thu Dec 26 13:11:56 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test", @@ -1168,7 +1349,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1724166683", + "Table Parameters: transient_lastDdlTime": "1735218716", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -1339,14 +1520,14 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:30 UTC 2024", + "CreateTime:": "Thu Dec 26 13:12:02 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test_view_materialized", "Table Type:": "MATERIALIZED_VIEW", "Table Parameters: numFiles": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1724166690", + "Table Parameters: transient_lastDdlTime": "1735218722", "SerDe Library:": "org.apache.hadoop.hive.ql.io.orc.OrcSerde", "InputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", @@ -1519,7 +1700,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Aug 20 15:11:30 UTC 2024", + "CreateTime:": "Thu Dec 26 13:12:03 UTC 2024", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test", @@ -1529,7 +1710,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1724166690", + "Table Parameters: transient_lastDdlTime": "1735218723", "SerDe Library:": "org.apache.hadoop.hive.ql.io.orc.OrcSerde", "InputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", @@ -1755,5 +1936,325 @@ "runId": "hive-test", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW `db1.array_struct_test_view_2` AS select `array_struct_test_view`.`property_id`, `array_struct_test_view`.`service` from `db1`.`array_struct_test_view`", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ 
+ "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD),property_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),property_id)" + ], + "confidenceScore": 0.35, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD),service)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),service)" + ], + "confidenceScore": 0.35, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW `db1.array_struct_test_view` AS\nSELECT\n `array_struct_test`.`property_id`,\n `array_struct_test`.`service`\nFROM `db1`.`array_struct_test`", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1586847600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD),property_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD),service)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),property_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),service)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": 
"hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:hive" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view_2%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),property_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD),property_id)" + ], + "confidenceScore": 0.35, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view_2%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),service)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD),service)" + ], + "confidenceScore": 0.35, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view_2%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view_2%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW `db1.array_struct_test_view_2` AS\nSELECT\n `array_struct_test_view`.`property_id`,\n `array_struct_test_view`.`service`\nFROM `db1`.`array_struct_test_view`", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1586847600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view_2%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),property_id)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD),service)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD),property_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view_2,PROD),service)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view_2%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:hive" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cdb1.array_struct_test_view_2%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/hive/hive_setup.sql b/metadata-ingestion/tests/integration/hive/hive_setup.sql index 323a78e24d10b3..c027c174c93553 100644 --- a/metadata-ingestion/tests/integration/hive/hive_setup.sql +++ b/metadata-ingestion/tests/integration/hive/hive_setup.sql @@ -42,6 +42,8 @@ select * from test_data; CREATE MATERIALIZED VIEW db1.struct_test_view_materialized as select * from db1.struct_test; CREATE VIEW db1.array_struct_test_view as select * from db1.array_struct_test; +CREATE VIEW db1.array_struct_test_view_2 as select * from db1.array_struct_test_view; + CREATE TABLE IF NOT EXISTS db1.nested_struct_test ( property_id INT, diff --git a/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json b/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json index 14b03619de4c1b..974d10d535861d 100644 --- a/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json +++ b/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json @@ -1550,8 +1550,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `metadata_index_view` AS select `metadata_index`.`id` AS `id`,`metadata_index`.`urn` AS `urn`,`metadata_index`.`path` AS `path`,`metadata_index`.`doubleVal` AS `doubleVal` from `metadata_index`", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `metadata_index_view` AS select `metadata_index`.`id` AS `id`,`metadata_index`.`urn` AS `urn`,`metadata_index`.`path` AS 
`path`,`metadata_index`.`doubleVal` AS `doubleVal` from `metadata_index`" }, "name": "metadata_index_view", "tags": [] @@ -2701,35 +2701,42 @@ "upstreams": [ { "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { "time": 0, - "actor": "urn:li:corpuser:unknown" + "actor": "urn:li:corpuser:_ingestion" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD)", - "type": "VIEW" + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amysql%2Cmetagalaxy.metadata_index_view%2CPROD%29" } ], "fineGrainedLineages": [ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),doubleVal)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),id)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),doubleVal)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),id)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amysql%2Cmetagalaxy.metadata_index_view%2CPROD%29" }, { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),id)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),urn)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),id)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),urn)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amysql%2Cmetagalaxy.metadata_index_view%2CPROD%29" }, { "upstreamType": "FIELD_SET", @@ -2740,18 +2747,95 @@ "downstreams": [ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),path)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amysql%2Cmetagalaxy.metadata_index_view%2CPROD%29" }, { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),urn)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),doubleVal)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),urn)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),doubleVal)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amysql%2Cmetagalaxy.metadata_index_view%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amysql%2Cmetagalaxy.metadata_index_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE 
ALGORITHM=UNDEFINED\nDEFINER=\"root\"@\"localhost\"\nSQL SECURITY DEFINER VIEW `metadata_index_view` AS\nSELECT\n `metadata_index`.`id` AS `id`,\n `metadata_index`.`urn` AS `urn`,\n `metadata_index`.`path` AS `path`,\n `metadata_index`.`doubleVal` AS `doubleVal`\nFROM `metadata_index`", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1586847600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amysql%2Cmetagalaxy.metadata_index_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),doubleVal)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),path)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),urn)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),urn)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),path)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),doubleVal)" } ] } @@ -2762,6 +2846,38 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amysql%2Cmetagalaxy.metadata_index_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mysql" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amysql%2Cmetagalaxy.metadata_index_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "glossaryTerm", "entityUrn": "urn:li:glossaryTerm:Email_Address", diff --git a/metadata-ingestion/tests/integration/oracle/golden_test_ingest_with_database.json b/metadata-ingestion/tests/integration/oracle/golden_test_ingest_with_database.json index abd9b2350638a2..8cf69535f30f66 100644 --- a/metadata-ingestion/tests/integration/oracle/golden_test_ingest_with_database.json +++ b/metadata-ingestion/tests/integration/oracle/golden_test_ingest_with_database.json @@ -17,7 +17,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": 
"oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -33,7 +33,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -49,7 +49,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -67,7 +67,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -83,7 +83,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -99,7 +99,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -122,7 +122,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -138,7 +138,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -154,7 +154,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -172,7 +172,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -193,7 +193,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -209,7 +209,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -272,7 +272,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -290,7 +290,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -315,7 +315,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -331,7 +331,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -394,7 +394,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -412,7 +412,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": 
"oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -437,7 +437,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -453,7 +453,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -470,8 +470,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW mock_view AS\n SELECT\n mock_column1,\n mock_column2\n FROM mock_table", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW mock_view AS\n SELECT\n mock_column1,\n mock_column2\n FROM mock_table" }, "name": "view1", "description": "Some mock comment here ...", @@ -519,7 +519,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -537,7 +537,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -555,7 +555,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -580,7 +580,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -596,7 +596,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -619,7 +619,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -635,7 +635,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -651,7 +651,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -669,7 +669,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -690,7 +690,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -706,7 +706,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -769,7 +769,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -787,7 +787,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": 
"oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -812,7 +812,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -828,7 +828,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -891,7 +891,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -909,7 +909,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -934,7 +934,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -950,7 +950,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -967,8 +967,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW mock_view AS\n SELECT\n mock_column1,\n mock_column2\n FROM mock_table", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW mock_view AS\n SELECT\n mock_column1,\n mock_column2\n FROM mock_table" }, "name": "view1", "description": "Some mock comment here ...", @@ -1016,7 +1016,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -1034,7 +1034,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -1052,7 +1052,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -1077,7 +1077,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -1091,11 +1091,16 @@ "upstreams": [ { "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { "time": 0, - "actor": "urn:li:corpuser:unknown" + "actor": "urn:li:corpuser:_ingestion" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:oracle,oradoc.schema1.mock_table,PROD)", - "type": "VIEW" + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2COraDoc.schema1.view1%2CPROD%29" } ], "fineGrainedLineages": [ @@ -1108,7 +1113,8 @@ "downstreams": [ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,OraDoc.schema1.view1,PROD),MOCK_COLUMN1)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.2, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2COraDoc.schema1.view1%2CPROD%29" }, { "upstreamType": "FIELD_SET", @@ -1119,14 +1125,94 @@ "downstreams": [ 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,OraDoc.schema1.view1,PROD),MOCK_COLUMN2)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.2, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2COraDoc.schema1.view1%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-38ppfw", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2COraDoc.schema1.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW mock_view AS\nSELECT\n mock_column1,\n mock_column2\nFROM mock_table", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-38ppfw", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2COraDoc.schema1.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:oracle,oradoc.schema1.mock_table,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,oradoc.schema1.mock_table,PROD),MOCK_COLUMN1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,oradoc.schema1.mock_table,PROD),MOCK_COLUMN2)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:oracle,OraDoc.schema1.view1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,OraDoc.schema1.view1,PROD),MOCK_COLUMN1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,OraDoc.schema1.view1,PROD),MOCK_COLUMN2)" } ] } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2COraDoc.schema1.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:oracle" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } }, @@ -1140,11 +1226,16 @@ "upstreams": [ { "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { "time": 0, - "actor": "urn:li:corpuser:unknown" + "actor": "urn:li:corpuser:_ingestion" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:oracle,oradoc.schema2.mock_table,PROD)", - "type": "VIEW" + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2COraDoc.schema2.view1%2CPROD%29" } ], "fineGrainedLineages": [ @@ -1157,7 +1248,8 @@ "downstreams": [ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,OraDoc.schema2.view1,PROD),MOCK_COLUMN1)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.2, + "query": 
"urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2COraDoc.schema2.view1%2CPROD%29" }, { "upstreamType": "FIELD_SET", @@ -1168,14 +1260,126 @@ "downstreams": [ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,OraDoc.schema2.view1,PROD),MOCK_COLUMN2)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.2, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2COraDoc.schema2.view1%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-38ppfw", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2COraDoc.schema2.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW mock_view AS\nSELECT\n mock_column1,\n mock_column2\nFROM mock_table", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-38ppfw", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2COraDoc.schema2.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:oracle,oradoc.schema2.mock_table,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,oradoc.schema2.mock_table,PROD),MOCK_COLUMN1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,oradoc.schema2.mock_table,PROD),MOCK_COLUMN2)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:oracle,OraDoc.schema2.view1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,OraDoc.schema2.view1,PROD),MOCK_COLUMN1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,OraDoc.schema2.view1,PROD),MOCK_COLUMN2)" } ] } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00-uzcdxn", + "runId": "oracle-2022_02_03-07_00_00-38ppfw", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2COraDoc.schema2.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:oracle" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-38ppfw", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2COraDoc.schema1.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-38ppfw", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2COraDoc.schema2.view1%2CPROD%29", + "changeType": "UPSERT", + 
"aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-38ppfw", "lastRunId": "no-run-id-provided" } } diff --git a/metadata-ingestion/tests/integration/oracle/golden_test_ingest_with_out_database.json b/metadata-ingestion/tests/integration/oracle/golden_test_ingest_with_out_database.json index dc0208586d1a19..d57ea0e21479da 100644 --- a/metadata-ingestion/tests/integration/oracle/golden_test_ingest_with_out_database.json +++ b/metadata-ingestion/tests/integration/oracle/golden_test_ingest_with_out_database.json @@ -17,7 +17,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -33,7 +33,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -49,7 +49,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -67,7 +67,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -83,7 +83,23 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0e497517e191d344b0c403231bc708d0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -106,7 +122,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -122,7 +138,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -138,7 +154,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -156,23 +172,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:0e497517e191d344b0c403231bc708d0" - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -193,7 +193,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -209,7 +209,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": 
"oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -272,7 +272,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -290,7 +290,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -315,7 +315,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -331,7 +331,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -394,7 +394,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -412,7 +412,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -437,7 +437,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -453,7 +453,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -470,8 +470,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW mock_view AS\n SELECT\n mock_column1,\n mock_column2\n FROM mock_table", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW mock_view AS\n SELECT\n mock_column1,\n mock_column2\n FROM mock_table" }, "name": "view1", "description": "Some mock comment here ...", @@ -519,7 +519,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -537,7 +537,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -555,7 +555,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -580,7 +580,23 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0e497517e191d344b0c403231bc708d0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -603,7 +619,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": 
"no-run-id-provided" } }, @@ -619,7 +635,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -635,7 +651,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -653,23 +669,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:0e497517e191d344b0c403231bc708d0" - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -690,7 +690,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -706,7 +706,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -769,7 +769,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -787,7 +787,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -812,7 +812,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -828,7 +828,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -891,7 +891,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -909,7 +909,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -934,7 +934,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -950,7 +950,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -967,8 +967,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW mock_view AS\n SELECT\n mock_column1,\n mock_column2\n FROM mock_table", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW mock_view AS\n SELECT\n mock_column1,\n mock_column2\n FROM mock_table" }, "name": "view1", "description": "Some mock comment here ...", @@ -1016,7 +1016,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - 
"runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -1034,7 +1034,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -1052,7 +1052,7 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } }, @@ -1077,7 +1077,309 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "oracle-2022_02_03-07_00_00", + "runId": "oracle-2022_02_03-07_00_00-ss8owb", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema1.view1%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN1)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD),MOCK_COLUMN1)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema1.view1%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN2)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD),MOCK_COLUMN2)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema1.view1%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-ss8owb", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema1.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW mock_view AS\nSELECT\n mock_column1,\n mock_column2\nFROM mock_table", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-ss8owb", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema1.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN2)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD),MOCK_COLUMN1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD),MOCK_COLUMN2)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-ss8owb", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema1.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:oracle" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-ss8owb", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema2.view1%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN1)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD),MOCK_COLUMN1)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema2.view1%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN2)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD),MOCK_COLUMN2)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema2.view1%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-ss8owb", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema2.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW mock_view AS\nSELECT\n mock_column1,\n mock_column2\nFROM mock_table", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-ss8owb", + "lastRunId": "no-run-id-provided" + } +}, +{ + 
"entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema2.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN2)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD),MOCK_COLUMN1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD),MOCK_COLUMN2)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-ss8owb", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema2.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:oracle" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-ss8owb", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema1.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-ss8owb", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema2.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-ss8owb", "lastRunId": "no-run-id-provided" } } diff --git a/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json b/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json index 21898ca246b656..dea5123e93b144 100644 --- a/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json +++ b/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json @@ -87,6 +87,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a208486b83be39fa411922e07701d984", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0202f800c992262c01ae6bbd5ee313f7" + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:a208486b83be39fa411922e07701d984", @@ -160,22 +176,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:a208486b83be39fa411922e07701d984", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:0202f800c992262c01ae6bbd5ee313f7" - } - }, - 
"systemMetadata": { - "lastObserved": 1646575200000, - "runId": "postgres-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:a208486b83be39fa411922e07701d984", @@ -285,6 +285,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:51904fc8cd5cc729bc630decff284525", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a6097853edba03be190d99ece4b307ff" + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:51904fc8cd5cc729bc630decff284525", @@ -358,22 +374,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:51904fc8cd5cc729bc630decff284525", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a6097853edba03be190d99ece4b307ff" - } - }, - "systemMetadata": { - "lastObserved": 1646575200000, - "runId": "postgres-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:51904fc8cd5cc729bc630decff284525", @@ -640,8 +640,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": " SELECT metadata_aspect_v2.urn,\n metadata_aspect_v2.aspect\n FROM metadata_aspect_v2\n WHERE (metadata_aspect_v2.version = 0);", - "is_view": "True" + "is_view": "True", + "view_definition": " SELECT metadata_aspect_v2.urn,\n metadata_aspect_v2.aspect\n FROM metadata_aspect_v2\n WHERE (metadata_aspect_v2.version = 0);" }, "name": "metadata_aspect_view", "tags": [] @@ -856,35 +856,105 @@ "upstreams": [ { "auditStamp": { + "time": 1646575200000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { "time": 0, - "actor": "urn:li:corpuser:unknown" + "actor": "urn:li:corpuser:_ingestion" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)", - "type": "VIEW" + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Apostgres%2Cpostgrestest.public.metadata_aspect_view%2CPROD%29" } ], "fineGrainedLineages": [ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),aspect)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),urn)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),aspect)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),urn)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Apostgres%2Cpostgrestest.public.metadata_aspect_view%2CPROD%29" }, { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),urn)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),aspect)" ], "downstreamType": "FIELD", "downstreams": [ - 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),urn)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),aspect)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Apostgres%2Cpostgrestest.public.metadata_aspect_view%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Apostgres%2Cpostgrestest.public.metadata_aspect_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "SELECT\n metadata_aspect_v2.urn,\n metadata_aspect_v2.aspect\nFROM metadata_aspect_v2\nWHERE\n (\n metadata_aspect_v2.version = 0\n )", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1646575200000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Apostgres%2Cpostgrestest.public.metadata_aspect_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),aspect)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),urn)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),urn)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),aspect)" } ] } @@ -894,5 +964,37 @@ "runId": "postgres-test", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Apostgres%2Cpostgrestest.public.metadata_aspect_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:postgres" + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Apostgres%2Cpostgrestest.public.metadata_aspect_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json 
b/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json index fc4a0affac5618..e75ea679cecf26 100644 --- a/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json +++ b/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json @@ -87,6 +87,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:51904fc8cd5cc729bc630decff284525", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a6097853edba03be190d99ece4b307ff" + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:51904fc8cd5cc729bc630decff284525", @@ -160,22 +176,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:51904fc8cd5cc729bc630decff284525", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a6097853edba03be190d99ece4b307ff" - } - }, - "systemMetadata": { - "lastObserved": 1646575200000, - "runId": "postgres-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:51904fc8cd5cc729bc630decff284525", @@ -464,8 +464,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": " SELECT metadata_aspect_v2.urn,\n metadata_aspect_v2.aspect\n FROM metadata_aspect_v2\n WHERE (metadata_aspect_v2.version = 0);", - "is_view": "True" + "is_view": "True", + "view_definition": " SELECT metadata_aspect_v2.urn,\n metadata_aspect_v2.aspect\n FROM metadata_aspect_v2\n WHERE (metadata_aspect_v2.version = 0);" }, "name": "metadata_aspect_view", "tags": [] @@ -622,35 +622,105 @@ "upstreams": [ { "auditStamp": { + "time": 1646575200000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { "time": 0, - "actor": "urn:li:corpuser:unknown" + "actor": "urn:li:corpuser:_ingestion" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)", - "type": "VIEW" + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Apostgres%2Cpostgrestest.public.metadata_aspect_view%2CPROD%29" } ], "fineGrainedLineages": [ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),aspect)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),urn)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),aspect)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),urn)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Apostgres%2Cpostgrestest.public.metadata_aspect_view%2CPROD%29" }, { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),urn)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),aspect)" ], "downstreamType": "FIELD", "downstreams": [ - 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),urn)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),aspect)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Apostgres%2Cpostgrestest.public.metadata_aspect_view%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Apostgres%2Cpostgrestest.public.metadata_aspect_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "SELECT\n metadata_aspect_v2.urn,\n metadata_aspect_v2.aspect\nFROM metadata_aspect_v2\nWHERE\n (\n metadata_aspect_v2.version = 0\n )", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1646575200000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Apostgres%2Cpostgrestest.public.metadata_aspect_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),aspect)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),urn)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),urn)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),aspect)" } ] } @@ -661,6 +731,38 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Apostgres%2Cpostgrestest.public.metadata_aspect_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:postgres" + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Apostgres%2Cpostgrestest.public.metadata_aspect_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "glossaryTerm", "entityUrn": "urn:li:glossaryTerm:URN", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json 
b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index 72dcda25c1296c..0d9386dcda0cdb 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -113,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", + "job_id": "a06cfdca-b65e-42de-8db2-8c21c183c5dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-05 16:44:43.910000", - "date_modified": "2024-12-05 16:44:44.043000", + "date_created": "2024-12-26 12:03:35.420000", + "date_modified": "2024-12-26 12:03:35.590000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -2103,8 +2103,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2282,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-05 16:44:43.800000", - "date_modified": "2024-12-05 16:44:43.800000" + "date_created": "2024-12-26 12:03:35.230000", + "date_modified": "2024-12-26 12:03:35.230000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2310,8 +2310,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-05 16:44:43.803000", - "date_modified": "2024-12-05 16:44:43.803000" + "date_created": "2024-12-26 12:03:35.237000", + "date_modified": "2024-12-26 12:03:35.237000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -4427,8 +4427,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n" }, "name": "View1", "tags": [] @@ -4891,11 +4891,16 @@ "upstreams": [ { "auditStamp": { + "time": 1615443388097, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { "time": 0, - "actor": "urn:li:corpuser:unknown" + "actor": "urn:li:corpuser:_ingestion" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", - "type": "VIEW" + "type": "VIEW", + "query": 
"urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CDemoData.Foo.PersonsView%2CPROD%29" } ] } @@ -4906,6 +4911,73 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CDemoData.Foo.PersonsView%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW Foo.PersonsView AS\nSELECT\n *\nFROM Foo.Persons", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1735214618898, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CDemoData.Foo.PersonsView%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CDemoData.Foo.PersonsView%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD)", @@ -4916,35 +4988,105 @@ "upstreams": [ { "auditStamp": { + "time": 1615443388097, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { "time": 0, - "actor": "urn:li:corpuser:unknown" + "actor": "urn:li:corpuser:_ingestion" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)", - "type": "VIEW" + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CNewData.FooNew.View1%2CPROD%29" } ], "fineGrainedLineages": [ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),firstname)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),lastname)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD),firstname)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD),lastname)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.2, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CNewData.FooNew.View1%2CPROD%29" }, { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),lastname)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),firstname)" ], "downstreamType": "FIELD", "downstreams": [ - 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD),lastname)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD),firstname)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.2, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CNewData.FooNew.View1%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CNewData.FooNew.View1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW FooNew.View1 AS\nSELECT\n LastName,\n FirstName\nFROM FooNew.PersonsNew\nWHERE\n Age > 18", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1735214618906, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CNewData.FooNew.View1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),firstname)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),lastname)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD),lastname)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD),firstname)" } ] } @@ -4955,6 +5097,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CNewData.FooNew.View1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", @@ -5034,5 +5192,37 @@ "runId": "mssql-test", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CDemoData.Foo.PersonsView%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CNewData.FooNew.View1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + 
"runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index 0df89ff1eb94d7..07098f0161fc3d 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -113,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", + "job_id": "a06cfdca-b65e-42de-8db2-8c21c183c5dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-05 16:44:43.910000", - "date_modified": "2024-12-05 16:44:44.043000", + "date_created": "2024-12-26 12:03:35.420000", + "date_modified": "2024-12-26 12:03:35.590000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -2103,8 +2103,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2282,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-05 16:44:43.800000", - "date_modified": "2024-12-05 16:44:43.800000" + "date_created": "2024-12-26 12:03:35.230000", + "date_modified": "2024-12-26 12:03:35.230000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2638,11 +2638,16 @@ "upstreams": [ { "auditStamp": { + "time": 1615443388097, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { "time": 0, - "actor": "urn:li:corpuser:unknown" + "actor": "urn:li:corpuser:_ingestion" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", - "type": "VIEW" + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CDemoData.Foo.PersonsView%2CPROD%29" } ] } @@ -2653,6 +2658,73 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CDemoData.Foo.PersonsView%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW Foo.PersonsView AS\nSELECT\n *\nFROM Foo.Persons", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1735214621644, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CDemoData.Foo.PersonsView%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)" 
+ } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CDemoData.Foo.PersonsView%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", @@ -2716,5 +2788,21 @@ "runId": "mssql-test", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2CDemoData.Foo.PersonsView%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index b36188405e7e11..bf30448469c309 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -137,11 +137,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "b8907be7-52f5-4df4-a870-f4fe0679ec45", + "job_id": "a06cfdca-b65e-42de-8db2-8c21c183c5dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-19 12:34:45.843000", - "date_modified": "2024-12-19 12:34:46.017000", + "date_created": "2024-12-26 12:03:35.420000", + "date_modified": "2024-12-26 12:03:35.590000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -2532,8 +2532,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-19 12:34:45.660000", - "date_modified": "2024-12-19 12:34:45.660000" + "date_created": "2024-12-26 12:03:35.230000", + "date_modified": "2024-12-26 12:03:35.230000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2577,8 +2577,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-19 12:34:45.667000", - "date_modified": "2024-12-19 12:34:45.667000" + "date_created": "2024-12-26 
12:03:35.237000", + "date_modified": "2024-12-26 12:03:35.237000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2968,11 +2968,67 @@ "upstreams": [ { "auditStamp": { + "time": 1615443388097, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { "time": 0, - "actor": "urn:li:corpuser:unknown" + "actor": "urn:li:corpuser:_ingestion" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.demodata.foo.persons,PROD)", - "type": "VIEW" + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cmy-instance.DemoData.Foo.PersonsView%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cmy-instance.DemoData.Foo.PersonsView%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW Foo.PersonsView AS\nSELECT\n *\nFROM Foo.Persons", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1735214620908, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cmy-instance.DemoData.Foo.PersonsView%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.demodata.foo.persons,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)" } ] } @@ -2983,6 +3039,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cmy-instance.DemoData.Foo.PersonsView%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", @@ -3062,5 +3134,21 @@ "runId": "mssql-test", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cmy-instance.DemoData.Foo.PersonsView%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index ebcadcc11dcbfa..ff27989d71de1b 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ 
-113,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "4130c37d-146c-43da-a671-dd9a413a44b3", + "job_id": "a06cfdca-b65e-42de-8db2-8c21c183c5dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-11-22 12:58:03.260000", - "date_modified": "2024-11-22 12:58:03.440000", + "date_created": "2024-12-26 12:03:35.420000", + "date_modified": "2024-12-26 12:03:35.590000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -2103,8 +2103,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2282,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-11-22 12:58:03.137000", - "date_modified": "2024-11-22 12:58:03.137000" + "date_created": "2024-12-26 12:03:35.230000", + "date_modified": "2024-12-26 12:03:35.230000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2310,8 +2310,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-11-22 12:58:03.140000", - "date_modified": "2024-11-22 12:58:03.140000" + "date_created": "2024-12-26 12:03:35.237000", + "date_modified": "2024-12-26 12:03:35.237000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -4427,8 +4427,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n" }, "name": "View1", "tags": [] @@ -4891,57 +4891,66 @@ "upstreams": [ { "auditStamp": { + "time": 1615443388097, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { "time": 0, - "actor": "urn:li:corpuser:unknown" + "actor": "urn:li:corpuser:_ingestion" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", - "type": "VIEW" + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cdemodata.foo.personsview%2CPROD%29" } ], "fineGrainedLineages": [ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),Age)" + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),ID)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),Age)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),ID)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cdemodata.foo.personsview%2CPROD%29" }, { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),FirstName)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),LastName)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),FirstName)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),LastName)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cdemodata.foo.personsview%2CPROD%29" }, { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),ID)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),FirstName)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),ID)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),FirstName)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cdemodata.foo.personsview%2CPROD%29" }, { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),LastName)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),Age)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),LastName)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),Age)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cdemodata.foo.personsview%2CPROD%29" } ] } @@ -4952,6 +4961,97 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cdemodata.foo.personsview%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW Foo.PersonsView AS\nSELECT\n *\nFROM Foo.Persons", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1735214622805, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cdemodata.foo.personsview%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + 
"aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),Age)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),FirstName)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),ID)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),LastName)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),ID)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),LastName)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),FirstName)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),Age)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cdemodata.foo.personsview%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", @@ -4962,35 +5062,105 @@ "upstreams": [ { "auditStamp": { + "time": 1615443388097, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { "time": 0, - "actor": "urn:li:corpuser:unknown" + "actor": "urn:li:corpuser:_ingestion" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)", - "type": "VIEW" + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cnewdata.foonew.view1%2CPROD%29" } ], "fineGrainedLineages": [ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),FirstName)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),LastName)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD),FirstName)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD),LastName)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cnewdata.foonew.view1%2CPROD%29" }, { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),LastName)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),FirstName)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD),LastName)" + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD),FirstName)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cnewdata.foonew.view1%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cnewdata.foonew.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW FooNew.View1 AS\nSELECT\n LastName,\n FirstName\nFROM FooNew.PersonsNew\nWHERE\n Age > 18", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1735214622810, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cnewdata.foonew.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),FirstName)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),LastName)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD),LastName)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD),FirstName)" } ] } @@ -5001,6 +5171,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cnewdata.foonew.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -5151,5 +5337,37 @@ "runId": "mssql-test", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cdemodata.foo.personsview%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Amssql%2Cnewdata.foonew.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } } ] \ No newline 
at end of file diff --git a/metadata-ingestion/tests/integration/trino/trino_hive_instance_mces_golden.json b/metadata-ingestion/tests/integration/trino/trino_hive_instance_mces_golden.json index 6745268ea2c249..fe85b6b4396fb8 100644 --- a/metadata-ingestion/tests/integration/trino/trino_hive_instance_mces_golden.json +++ b/metadata-ingestion/tests/integration/trino/trino_hive_instance_mces_golden.json @@ -94,6 +94,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:46baa6eebd802861e5ee3d043456e171", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b" + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-instance-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:46baa6eebd802861e5ee3d043456e171", @@ -169,22 +185,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:46baa6eebd802861e5ee3d043456e171", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:f311add3fdc7c16e8a50a63fe1dcce8b" - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-instance-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:46baa6eebd802861e5ee3d043456e171", @@ -246,7 +246,7 @@ "numrows": "1", "rawdatasize": "32", "totalsize": "33", - "transient_lastddltime": "1724180599" + "transient_lastddltime": "1735206396" }, "name": "array_struct_test", "description": "This table has array of structs", @@ -507,7 +507,7 @@ "numrows": "3", "rawdatasize": "94", "totalsize": "97", - "transient_lastddltime": "1724180605" + "transient_lastddltime": "1735206403" }, "name": "classification_test", "tags": [] @@ -766,7 +766,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1724180602" + "transient_lastddltime": "1735206400" }, "name": "map_test", "tags": [] @@ -993,7 +993,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1724180602" + "transient_lastddltime": "1735206400" }, "name": "nested_struct_test", "tags": [] @@ -1264,7 +1264,7 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "transient_lastddltime": "1724180591" + "transient_lastddltime": "1735206384" }, "name": "pokes", "tags": [] @@ -1499,7 +1499,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1724180595" + "transient_lastddltime": "1735206390" }, "name": "struct_test", "tags": [] @@ -1750,7 +1750,7 @@ "customProperties": { "numfiles": "0", "totalsize": "0", - "transient_lastddltime": "1724180601" + "transient_lastddltime": "1735206399" }, "name": "struct_test_view_materialized", "tags": [] @@ -2004,7 +2004,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1724180595" + "transient_lastddltime": "1735206390" }, "name": "_test_table_underscore", "tags": [] @@ -2227,7 +2227,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1724180602" + "transient_lastddltime": "1735206400" }, "name": "union_test", "tags": [] @@ -2529,9 +2529,9 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "transient_lastddltime": "1724180602", - "view_definition": "SELECT \"property_id\", \"service\"\nFROM 
\"db1\".\"array_struct_test\"", - "is_view": "True" + "transient_lastddltime": "1735206400", + "is_view": "True", + "view_definition": "SELECT \"property_id\", \"service\"\nFROM \"db1\".\"array_struct_test\"" }, "name": "array_struct_test_view", "tags": [] @@ -2758,11 +2758,16 @@ "upstreams": [ { "auditStamp": { + "time": 1632398400000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { "time": 0, - "actor": "urn:li:corpuser:unknown" + "actor": "urn:li:corpuser:_ingestion" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.array_struct_test,PROD)", - "type": "VIEW" + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Atrino%2Cproduction_warehouse.hivedb.db1.array_struct_test_view%2CPROD%29" } ], "fineGrainedLineages": [ @@ -2775,7 +2780,8 @@ "downstreams": [ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.array_struct_test_view,PROD),property_id)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Atrino%2Cproduction_warehouse.hivedb.db1.array_struct_test_view%2CPROD%29" }, { "upstreamType": "FIELD_SET", @@ -2786,7 +2792,8 @@ "downstreams": [ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.array_struct_test_view,PROD),service)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Atrino%2Cproduction_warehouse.hivedb.db1.array_struct_test_view%2CPROD%29" } ] } @@ -2797,6 +2804,85 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Atrino%2Cproduction_warehouse.hivedb.db1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "SELECT\n \"property_id\",\n \"service\"\nFROM \"db1\".\"array_struct_test\"", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1632398400000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-instance-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Atrino%2Cproduction_warehouse.hivedb.db1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.array_struct_test,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.array_struct_test,PROD),property_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.array_struct_test,PROD),service)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.array_struct_test_view,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.array_struct_test_view,PROD),property_id)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.array_struct_test_view,PROD),service)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-instance-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Atrino%2Cproduction_warehouse.hivedb.db1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:trino" + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-instance-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,local_server.db1._test_table_underscore,PROD)", @@ -2956,5 +3042,21 @@ "runId": "trino-hive-instance-test", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Atrino%2Cproduction_warehouse.hivedb.db1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-instance-test", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json b/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json index 34acf6a6e369bc..d68f014c93ac88 100644 --- a/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json +++ b/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json @@ -87,6 +87,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7" + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", @@ -160,22 +176,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "container", - "entityUrn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:c7a81f6ed9a7cdd0c74436ac2dc4d1f7" - } - }, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:304fd7ad57dc0ab32fb2cb778cbccd84", @@ -233,7 +233,7 @@ "numrows": "1", "rawdatasize": "32", "totalsize": "33", - "transient_lastddltime": "1724180599" + "transient_lastddltime": "1735206396" }, "name": "array_struct_test", "description": "This table has array of structs", @@ -473,7 +473,7 @@ "numrows": "3", "rawdatasize": "94", "totalsize": "97", - "transient_lastddltime": "1724180605" + "transient_lastddltime": "1735206403" }, "name": "classification_test", "tags": [] @@ -755,7 +755,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1724180602" + "transient_lastddltime": "1735206400" }, "name": "map_test", "tags": [] @@ -961,7 +961,7 @@ "numrows": "0", 
"rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1724180602" + "transient_lastddltime": "1735206400" }, "name": "nested_struct_test", "tags": [] @@ -1211,7 +1211,7 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "transient_lastddltime": "1724180591" + "transient_lastddltime": "1735206384" }, "name": "pokes", "tags": [] @@ -1425,7 +1425,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1724180595" + "transient_lastddltime": "1735206390" }, "name": "struct_test", "tags": [] @@ -1655,7 +1655,7 @@ "customProperties": { "numfiles": "0", "totalsize": "0", - "transient_lastddltime": "1724180601" + "transient_lastddltime": "1735206399" }, "name": "struct_test_view_materialized", "tags": [] @@ -1888,7 +1888,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1724180595" + "transient_lastddltime": "1735206390" }, "name": "_test_table_underscore", "tags": [] @@ -2090,7 +2090,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1724180602" + "transient_lastddltime": "1735206400" }, "name": "union_test", "tags": [] @@ -2371,9 +2371,9 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "transient_lastddltime": "1724180602", - "view_definition": "SELECT \"property_id\", \"service\"\nFROM \"db1\".\"array_struct_test\"", - "is_view": "True" + "transient_lastddltime": "1735206400", + "is_view": "True", + "view_definition": "SELECT \"property_id\", \"service\"\nFROM \"db1\".\"array_struct_test\"" }, "name": "array_struct_test_view", "tags": [] @@ -2579,11 +2579,16 @@ "upstreams": [ { "auditStamp": { + "time": 1632398400000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { "time": 0, - "actor": "urn:li:corpuser:unknown" + "actor": "urn:li:corpuser:_ingestion" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD)", - "type": "VIEW" + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Atrino%2Chivedb.db1.array_struct_test_view%2CPROD%29" } ], "fineGrainedLineages": [ @@ -2596,7 +2601,8 @@ "downstreams": [ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD),property_id)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Atrino%2Chivedb.db1.array_struct_test_view%2CPROD%29" }, { "upstreamType": "FIELD_SET", @@ -2607,7 +2613,71 @@ "downstreams": [ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD),service)" ], - "confidenceScore": 1.0 + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Atrino%2Chivedb.db1.array_struct_test_view%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Atrino%2Chivedb.db1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "SELECT\n \"property_id\",\n \"service\"\nFROM \"db1\".\"array_struct_test\"", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1632398400000, 
+ "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Atrino%2Chivedb.db1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD),property_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD),service)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD),property_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD),service)" } ] } @@ -2618,6 +2688,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Atrino%2Chivedb.db1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:trino" + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1._test_table_underscore,PROD)", @@ -2778,6 +2864,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Atrino%2Chivedb.db1.array_struct_test_view%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "glossaryTerm", "entityUrn": "urn:li:glossaryTerm:Age", From 3723a3e4bcad1fdf2ab8a812b60b52139da398f5 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Mon, 30 Dec 2024 21:06:48 +0530 Subject: [PATCH 49/49] fix(ingest/gc): reduce logging, remove unnecessary sleeps (#12238) --- .../source/gc/dataprocess_cleanup.py | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py index 6d16aaab2d7980..3f7a1fc453bcdb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py @@ -170,6 +170,8 @@ class DataProcessCleanupReport(SourceReport): sample_removed_aspects_by_type: TopKDict[str, LossyList[str]] = field( default_factory=TopKDict ) + num_data_flows_found: int = 0 + num_data_jobs_found: int = 0 class DataProcessCleanup: @@ -265,13 +267,17 @@ def keep_last_n_dpi( self.report.report_failure( f"Exception while deleting DPI: {e}", exc=e ) - if deleted_count_last_n % self.config.batch_size == 0: + if ( + deleted_count_last_n % self.config.batch_size == 0 + and deleted_count_last_n 
> 0 + ): logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}") if self.config.delay: logger.info(f"Sleeping for {self.config.delay} seconds") time.sleep(self.config.delay) - logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}") + if deleted_count_last_n > 0: + logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}") def delete_entity(self, urn: str, type: str) -> None: assert self.ctx.graph @@ -351,7 +357,10 @@ def remove_old_dpis( except Exception as e: self.report.report_failure(f"Exception while deleting DPI: {e}", exc=e) - if deleted_count_retention % self.config.batch_size == 0: + if ( + deleted_count_retention % self.config.batch_size == 0 + and deleted_count_retention > 0 + ): logger.info( f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention" ) @@ -393,6 +402,7 @@ def get_data_flows(self) -> Iterable[DataFlowEntity]: scrollAcrossEntities = result.get("scrollAcrossEntities") if not scrollAcrossEntities: raise ValueError("Missing scrollAcrossEntities in response") + self.report.num_data_flows_found += scrollAcrossEntities.get("count") logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities") scroll_id = scrollAcrossEntities.get("nextScrollId") @@ -415,8 +425,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: assert self.ctx.graph dataFlows: Dict[str, DataFlowEntity] = {} - for flow in self.get_data_flows(): - dataFlows[flow.urn] = flow + if self.config.delete_empty_data_flows: + for flow in self.get_data_flows(): + dataFlows[flow.urn] = flow scroll_id: Optional[str] = None previous_scroll_id: Optional[str] = None @@ -443,6 +454,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if not scrollAcrossEntities: raise ValueError("Missing scrollAcrossEntities in response") + self.report.num_data_jobs_found += scrollAcrossEntities.get("count") logger.info(f"Got {scrollAcrossEntities.get('count')} DataJob entities") scroll_id = scrollAcrossEntities.get("nextScrollId") @@ -481,7 +493,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: previous_scroll_id = scroll_id - logger.info(f"Deleted {deleted_jobs} DataJobs") + if deleted_jobs > 0: + logger.info(f"Deleted {deleted_jobs} DataJobs") # Delete empty dataflows if needed if self.config.delete_empty_data_flows: deleted_data_flows: int = 0
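
Note on the guarded-logging pattern in the hunks above: 0 % batch_size == 0 evaluates true for any positive batch_size, so the unguarded modulo checks logged "Deleted 0 DPIs ..." (and, when a delay was configured, slept) even on iterations where nothing had been deleted yet; the added "and ... > 0" conditions suppress both. A minimal sketch of the guarded pattern, assuming hypothetical items/delete_fn stand-ins rather than the plugin's real API:

    import logging
    import time

    logger = logging.getLogger(__name__)

    def delete_in_batches(items, delete_fn, batch_size=10, delay=None):
        # Illustrative only: the function name, arguments, and helpers here
        # are assumptions, not the dataprocess_cleanup.py API.
        deleted = 0
        for item in items:
            try:
                delete_fn(item)
                deleted += 1
            except Exception as e:
                logger.warning(f"Failed to delete {item}: {e}")
            # Without the `deleted > 0` guard, a run of failed deletions
            # would log "Deleted 0 items" and sleep on every iteration,
            # since 0 % batch_size == 0 for any positive batch_size.
            if deleted % batch_size == 0 and deleted > 0:
                logger.info(f"Deleted {deleted} items so far")
                if delay:
                    time.sleep(delay)
        if deleted > 0:
            logger.info(f"Deleted {deleted} items in total")
        return deleted

The other changes in this patch follow the same motive: get_data_flows() is now invoked only when delete_empty_data_flows is enabled, so runs that never delete flows skip those scroll queries entirely, and the per-job summary lines are logged only when at least one entity was actually deleted.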