Skip to content

Commit

Permalink
Merge pull request #185 from genkey6/fuzzy_matching_from_prefix_provided_by_metadata
Browse files Browse the repository at this point in the history

Fuzzy matching on prefix provided by metadata
  • Loading branch information
z3z1ma authored Dec 11, 2024
2 parents 34780d4 + 9c3296e commit b92355f
Show file tree
Hide file tree
Showing 4 changed files with 173 additions and 2 deletions.
4 changes: 4 additions & 0 deletions demo_duckdb/models/schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ models:
- name: customer_lifetime_value
data_type: DOUBLE
description: ''

- name: customer_rank
data_type: VARCHAR
description: ''
- name: orders
description: This table has basic information about orders, as well as some derived facts based on payments

Expand Down
3 changes: 3 additions & 0 deletions demo_duckdb/models/staging/schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ models:
- name: last_name
data_type: VARCHAR
description: ''
- name: rank
data_type: VARCHAR
description: ''
- name: stg_orders
columns:
- name: order_id
Expand Down
16 changes: 14 additions & 2 deletions src/dbt_osmosis/core/column_level_knowledge_propagator.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,15 +209,27 @@ def update_undocumented_columns_with_prior_knowledge(

changes_committed = 0
for column in undocumented_columns:
prior_knowledge: ColumnLevelKnowledge = get_prior_knowledge(knowledge, column)
original_knowledge = ColumnLevelKnowledgePropagator._get_original_knowledge(
node, column
)
if original_knowledge["meta"].get("osmosis_prefix", None):
column_without_prefix = column.removeprefix(
original_knowledge["meta"]["osmosis_prefix"]
)
else:
column_without_prefix = column

prior_knowledge: ColumnLevelKnowledge = get_prior_knowledge(
knowledge, column_without_prefix
)
progenitor = prior_knowledge.pop("progenitor", None)
prior_knowledge: ColumnLevelKnowledge = {
k: v for k, v in prior_knowledge.items() if k in inheritables
}

ColumnLevelKnowledgePropagator._merge_prior_knowledge_with_original_knowledge(
prior_knowledge,
ColumnLevelKnowledgePropagator._get_original_knowledge(node, column),
original_knowledge,
add_progenitor_to_meta,
progenitor,
)
Expand Down
152 changes: 152 additions & 0 deletions tests/test_column_level_knowledge_propagator.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ def test_inherit_column_level_knowledge():
"constraints": [],
"quote": None,
},
"rank": {
"progenitor": "model.jaffle_shop_duckdb.stg_customers",
"generation": "generation_0",
"name": "rank",
"data_type": "VARCHAR",
"constraints": [],
"quote": None,
},
"order_id": {
"progenitor": "model.jaffle_shop_duckdb.stg_orders",
"generation": "generation_0",
Expand Down Expand Up @@ -561,6 +569,150 @@ def test_update_undocumented_columns_with_prior_knowledge_with_add_inheritance_f
assert set(target_node.columns["customer_id"]._extra["policy_tags"]) == set(["my_policy_tag1"])


def test_update_undocumented_columns_with_osmosis_prefix_meta_with_prior_knowledge():
    """Check inheritance for a column that declares ``osmosis_prefix`` in its meta.

    ``customers.customer_rank`` carries ``osmosis_prefix: customer_`` and the
    upstream ``stg_customers.rank`` column is given a description, meta and
    tags. With tag and meta merging enabled, the test expects the upstream
    description, the merged meta (upstream value wins for ``my_key``) and the
    union of both tag sets to land in the YAML section and on the node.
    """
    inherited_description = "THIS COLUMN IS UPDATED FOR TESTING"
    manifest = load_manifest()

    # Upstream column that should be matched once the prefix is stripped.
    upstream_column = manifest.nodes["model.jaffle_shop_duckdb.stg_customers"].columns["rank"]
    upstream_column.description = inherited_description
    upstream_column.meta = {
        "my_key": "my_value",
    }
    upstream_column.tags = [
        "my_tag1",
        "my_tag2",
    ]

    # Downstream column whose name only matches upstream without its prefix.
    target_node = manifest.nodes["model.jaffle_shop_duckdb.customers"]
    prefixed_column = target_node.columns["customer_rank"]
    prefixed_column.tags = {"my_tag3", "my_tag4"}
    prefixed_column.meta = {
        "my_key": "my_old_value",
        "my_new_key": "my_new_value",
        "osmosis_prefix": "customer_",
    }

    knowledge = ColumnLevelKnowledgePropagator.get_node_columns_with_inherited_knowledge(
        manifest, target_node, placeholders=[""]
    )
    yaml_file_model_section = {"columns": [{"name": "customer_rank"}]}
    undocumented_columns = target_node.columns.keys()

    ColumnLevelKnowledgePropagator.update_undocumented_columns_with_prior_knowledge(
        undocumented_columns,
        target_node,
        yaml_file_model_section,
        knowledge,
        skip_add_tags=False,
        skip_merge_meta=False,
        add_progenitor_to_meta=False,
    )

    expected_meta = {
        "my_key": "my_value",
        "my_new_key": "my_new_value",
        "osmosis_prefix": "customer_",
    }
    expected_tags = {"my_tag1", "my_tag2", "my_tag3", "my_tag4"}

    # The YAML section must reflect the inherited knowledge...
    yaml_column = yaml_file_model_section["columns"][0]
    assert yaml_column["name"] == "customer_rank"
    assert yaml_column["description"] == inherited_description
    assert yaml_column["meta"] == expected_meta
    assert set(yaml_column["tags"]) == expected_tags

    # ...and so must the in-memory node (looked up again after the update).
    node_column = target_node.columns["customer_rank"]
    assert node_column.description == inherited_description
    assert node_column.meta == expected_meta
    assert set(node_column.tags) == expected_tags


def test_update_undocumented_columns_with_osmosis_prefix_meta_with_prior_knowledge_with_osmosis_keep_description():
    """Check that ``osmosis_keep_description`` wins over prefix-based inheritance.

    ``customers.customer_rank`` declares both ``osmosis_prefix: customer_`` and
    ``osmosis_keep_description: True`` and already has a description. With tag
    and meta merging skipped, the test expects the column's own description,
    meta and tags to be left as they were, both in the YAML section and on the
    node.
    """
    manifest = load_manifest()

    # Upstream column carrying knowledge that must NOT overwrite the target.
    upstream_column = manifest.nodes["model.jaffle_shop_duckdb.stg_customers"].columns["rank"]
    upstream_column.description = "THIS COLUMN IS UPDATED FOR TESTING"
    upstream_column.meta = {
        "my_key": "my_value",
    }
    upstream_column.tags = [
        "my_tag1",
        "my_tag2",
    ]

    column_description_not_updated = (
        "This column will not be updated as it has the 'osmosis_keep_description' attribute"
    )

    target_node = manifest.nodes["model.jaffle_shop_duckdb.customers"]
    prefixed_column = target_node.columns["customer_rank"]
    prefixed_column.description = column_description_not_updated
    prefixed_column.tags = {"my_tag3", "my_tag4"}
    prefixed_column.meta = {
        "my_key": "my_value",
        "osmosis_prefix": "customer_",
        "osmosis_keep_description": True,
    }

    knowledge = ColumnLevelKnowledgePropagator.get_node_columns_with_inherited_knowledge(
        manifest, target_node, placeholders=[""]
    )
    yaml_file_model_section = {"columns": [{"name": "customer_rank"}]}
    undocumented_columns = target_node.columns.keys()

    ColumnLevelKnowledgePropagator.update_undocumented_columns_with_prior_knowledge(
        undocumented_columns,
        target_node,
        yaml_file_model_section,
        knowledge,
        skip_add_tags=True,
        skip_merge_meta=True,
        add_progenitor_to_meta=False,
    )

    expected_meta = {
        "my_key": "my_value",
        "osmosis_keep_description": True,
        "osmosis_prefix": "customer_",
    }
    expected_tags = {"my_tag3", "my_tag4"}

    # The YAML section keeps the target column's own knowledge...
    yaml_column = yaml_file_model_section["columns"][0]
    assert yaml_column["name"] == "customer_rank"
    assert yaml_column["description"] == column_description_not_updated
    assert yaml_column["meta"] == expected_meta
    assert set(yaml_column["tags"]) == expected_tags

    # ...and so does the in-memory node (looked up again after the update).
    node_column = target_node.columns["customer_rank"]
    assert node_column.description == column_description_not_updated
    assert node_column.meta == expected_meta
    assert set(node_column.tags) == expected_tags


@pytest.mark.parametrize("use_unrendered_descriptions", [True, False])
def test_use_unrendered_descriptions(use_unrendered_descriptions):
manifest = load_manifest()
Expand Down

0 comments on commit b92355f

Please sign in to comment.