Skip to content

Commit

Permalink
Refactoring to make testing easier & adding tests (#96)
Browse files Browse the repository at this point in the history
* Add pytest package

* Use tool.poetry.group.dev.dependencies instead of tool.poetry.dev-dependencies

* make build_node_ancestor_tree staticmethod for testing

* Add jaffle_shop's manifest.json for testing

* Split files

* Add tests for knowledge propagation

* Add types

* Add test for get_prior_knowledge

* run pytest on github actions
  • Loading branch information
syou6162 authored Sep 21, 2023
1 parent e262c11 commit 0c2c8d6
Show file tree
Hide file tree
Showing 9 changed files with 11,716 additions and 85 deletions.
36 changes: 36 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
name: Tests

on:
- push
- pull_request

jobs:
tests:
name: Run pytest
runs-on: ubuntu-latest
steps:
- name: Check out the repository
uses: actions/checkout@v3

- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: "3.10"

- name: Upgrade pip
run: |
pip install --constraint=.github/workflows/constraints.txt pip
pip --version
- name: Install Poetry
run: |
pip install --constraint=.github/workflows/constraints.txt poetry
poetry --version
- name: Install required packages
run: |
poetry install
- name: Run pytest
run: |
poetry run python -m pytest
64 changes: 63 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 6 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,6 @@ dbt-duckdb = { version = ">=1.0.0", optional = true }
dbt-sqlite = { version = ">=1.0.0", optional = true }
dbt-postgres = { version = ">=1.0.0", optional = true }

[tool.poetry.dev-dependencies]
black = ">=21.9b0"
mypy = ">=0.910"
pylint = ">=2.11.1"

[tool.poetry.extras]
duckdb = ["dbt-duckdb"]
sqlite = ["dbt-sqlite"]
Expand All @@ -56,6 +51,12 @@ workbench = [
"streamlit-elements-fluence",
]

[tool.poetry.group.dev.dependencies]
black = ">=21.9b0"
mypy = ">=0.910"
pylint = ">=2.11.1"
pytest = "^7.4.2"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
Expand Down
94 changes: 94 additions & 0 deletions src/dbt_osmosis/core/column_level_knowledge_propagator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
from typing import (
Any,
Dict,
List,
Optional,
)

from dbt_osmosis.vendored.dbt_core_interface.project import (
ManifestNode,
)


ColumnLevelKnowledge = Dict[str, Any]
Knowledge = Dict[str, ColumnLevelKnowledge]


def _build_node_ancestor_tree(
manifest: ManifestNode,
node: ManifestNode,
family_tree: Optional[Dict[str, List[str]]] = None,
members_found: Optional[List[str]] = None,
depth: int = 0,
) -> Dict[str, List[str]]:
"""Recursively build dictionary of parents in generational order"""
if family_tree is None:
family_tree = {}
if members_found is None:
members_found = []
if not hasattr(node, "depends_on"):
return family_tree
for parent in getattr(node.depends_on, "nodes", []):
member = manifest.nodes.get(parent, manifest.sources.get(parent))
if member and parent not in members_found:
family_tree.setdefault(f"generation_{depth}", []).append(parent)
members_found.append(parent)
# Recursion
family_tree = _build_node_ancestor_tree(
manifest, member, family_tree, members_found, depth + 1
)
return family_tree


def _inherit_column_level_knowledge(
manifest: ManifestNode,
family_tree: Dict[str, Any],
placeholders: List[str],
) -> Knowledge:
"""Inherit knowledge from ancestors in reverse insertion order to ensure that the most
recent ancestor is always the one to inherit from
"""
knowledge: Knowledge = {}
for generation in reversed(family_tree):
for ancestor in family_tree[generation]:
member: ManifestNode = manifest.nodes.get(ancestor, manifest.sources.get(ancestor))
if not member:
continue
for name, info in member.columns.items():
knowledge_default = {"progenitor": ancestor, "generation": generation}
knowledge.setdefault(name, knowledge_default)
deserialized_info = info.to_dict()
# Handle Info:
# 1. tags are additive
# 2. descriptions are overriden
# 3. meta is merged
# 4. tests are ignored until I am convinced those shouldn't be
# hand curated with love
if deserialized_info["description"] in placeholders:
deserialized_info.pop("description", None)
deserialized_info["tags"] = list(
set(deserialized_info.pop("tags", []) + knowledge[name].get("tags", []))
)
if not deserialized_info["tags"]:
deserialized_info.pop("tags") # poppin' tags like Macklemore
deserialized_info["meta"] = {
**knowledge[name].get("meta", {}),
**deserialized_info["meta"],
}
if not deserialized_info["meta"]:
deserialized_info.pop("meta")
knowledge[name].update(deserialized_info)
return knowledge


class ColumnLevelKnowledgePropagator:
@staticmethod
def get_node_columns_with_inherited_knowledge(
manifest: ManifestNode,
node: ManifestNode,
placeholders: List[str],
) -> Knowledge:
"""Build a knowledgebase for the model based on iterating through ancestors"""
family_tree = _build_node_ancestor_tree(manifest, node)
knowledge = _inherit_column_level_knowledge(manifest, family_tree, placeholders)
return knowledge
90 changes: 11 additions & 79 deletions src/dbt_osmosis/core/osmosis.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@
MissingOsmosisConfig,
)
from dbt_osmosis.core.log_controller import logger
from dbt_osmosis.core.column_level_knowledge_propagator import (
ColumnLevelKnowledgePropagator,
ColumnLevelKnowledge,
Knowledge,
)
from dbt_osmosis.vendored.dbt_core_interface.project import (
ColumnInfo,
DbtProject,
Expand Down Expand Up @@ -762,81 +767,6 @@ def pretty_print_restructure_plan(blueprint: Dict[Path, SchemaFileMigration]) ->
)
)

def build_node_ancestor_tree(
self,
node: ManifestNode,
family_tree: Optional[Dict[str, List[str]]] = None,
members_found: Optional[List[str]] = None,
depth: int = 0,
) -> Dict[str, List[str]]:
"""Recursively build dictionary of parents in generational order"""
if family_tree is None:
family_tree = {}
if members_found is None:
members_found = []
if not hasattr(node, "depends_on"):
return family_tree
for parent in getattr(node.depends_on, "nodes", []):
member = self.manifest.nodes.get(parent, self.manifest.sources.get(parent))
if member and parent not in members_found:
family_tree.setdefault(f"generation_{depth}", []).append(parent)
members_found.append(parent)
# Recursion
family_tree = self.build_node_ancestor_tree(
member, family_tree, members_found, depth + 1
)
return family_tree

def inherit_column_level_knowledge(
self,
family_tree: Dict[str, Any],
) -> Dict[str, Dict[str, Any]]:
"""Inherit knowledge from ancestors in reverse insertion order to ensure that the most
recent ancestor is always the one to inherit from
"""
knowledge: Dict[str, Dict[str, Any]] = {}
for generation in reversed(family_tree):
for ancestor in family_tree[generation]:
member: ManifestNode = self.manifest.nodes.get(
ancestor, self.manifest.sources.get(ancestor)
)
if not member:
continue
for name, info in member.columns.items():
knowledge_default = {"progenitor": ancestor, "generation": generation}
knowledge.setdefault(name, knowledge_default)
deserialized_info = info.to_dict()
# Handle Info:
# 1. tags are additive
# 2. descriptions are overriden
# 3. meta is merged
# 4. tests are ignored until I am convinced those shouldn't be
# hand curated with love
if deserialized_info["description"] in self.placeholders:
deserialized_info.pop("description", None)
deserialized_info["tags"] = list(
set(deserialized_info.pop("tags", []) + knowledge[name].get("tags", []))
)
if not deserialized_info["tags"]:
deserialized_info.pop("tags") # poppin' tags like Macklemore
deserialized_info["meta"] = {
**knowledge[name].get("meta", {}),
**deserialized_info["meta"],
}
if not deserialized_info["meta"]:
deserialized_info.pop("meta")
knowledge[name].update(deserialized_info)
return knowledge

def get_node_columns_with_inherited_knowledge(
self,
node: ManifestNode,
) -> Dict[str, Dict[str, Any]]:
"""Build a knowledgebase for the model based on iterating through ancestors"""
family_tree = self.build_node_ancestor_tree(node)
knowledge = self.inherit_column_level_knowledge(family_tree)
return knowledge

@staticmethod
def get_column_sets(
database_columns: Iterable[str],
Expand Down Expand Up @@ -1032,9 +962,9 @@ def remove_columns_not_in_database(

@staticmethod
def get_prior_knowledge(
knowledge: Dict[str, Dict[str, Any]],
knowledge: Knowledge,
column: str,
) -> Dict[str, Any]:
) -> ColumnLevelKnowledge:
camel_column = re.sub("_(.)", lambda m: m.group(1).upper(), column)
prior_knowledge_candidates = list(filter(lambda k: k, [
knowledge.get(column),
Expand Down Expand Up @@ -1064,7 +994,9 @@ def update_undocumented_columns_with_prior_knowledge(
) -> int:
"""Update undocumented columns with prior knowledge in node and model simultaneously
THIS MUTATES THE NODE AND MODEL OBJECTS so that state is always accurate"""
knowledge = self.get_node_columns_with_inherited_knowledge(node)
knowledge: Knowledge = ColumnLevelKnowledgePropagator.get_node_columns_with_inherited_knowledge(
self.manifest, node, self.placeholders
)

inheritables = ["description"]
if not self.skip_add_tags:
Expand All @@ -1074,7 +1006,7 @@ def update_undocumented_columns_with_prior_knowledge(

changes_committed = 0
for column in undocumented_columns:
prior_knowledge = self.get_prior_knowledge(knowledge, column)
prior_knowledge: ColumnLevelKnowledge = self.get_prior_knowledge(knowledge, column)
progenitor = prior_knowledge.pop("progenitor", "Unknown")
prior_knowledge = {k: v for k, v in prior_knowledge.items() if k in inheritables}
if not prior_knowledge:
Expand Down
Empty file added tests/__init__.py
Empty file.
Loading

0 comments on commit 0c2c8d6

Please sign in to comment.