Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CT 2251 model yaml frontmatter #7100

Draft
wants to merge 25 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9ae7a70
Add yaml frontmatter code in yaml_helper.py
gshank Mar 1, 2023
7163a57
Split up yaml frontmatter in model parser
gshank Mar 1, 2023
063cad6
Process yaml frontmatter from model parser
gshank Mar 1, 2023
1b16597
Add test
gshank Mar 1, 2023
106a8ec
Tweak
gshank Mar 1, 2023
592d892
Fix some unit tests
gshank Mar 1, 2023
efec14f
Fix artifacts test
gshank Mar 1, 2023
882e2cd
Add test for partial parsing
gshank Mar 1, 2023
13d961c
Changie and tweak check for frontmatter because Windows
gshank Mar 2, 2023
cc1b521
Throw error on config in both model and schema file
gshank Mar 2, 2023
04e2767
fix test_parser.py
gshank Mar 2, 2023
dc2b73a
Fix separator regex to work on Windows
gshank Mar 2, 2023
f57301c
Add test running tests, fix "write_node" to specify directory different
gshank Mar 3, 2023
ade4e1e
Some comments
gshank Mar 4, 2023
e30671e
Also check for .py
gshank Mar 4, 2023
897b7a1
Merge branch 'main' into ct-2251-model_yaml_frontmatter
gshank Mar 22, 2023
e24fae5
test formatting
gshank Mar 22, 2023
c38a49c
Merge branch 'main' into ct-2251-model_yaml_frontmatter
gshank May 3, 2023
30aaa1d
Update manifest/v10.json
gshank May 3, 2023
e8d7d8b
Update expected_manifest.py
gshank May 3, 2023
82ed393
Merge branch 'main' into ct-2251-model_yaml_frontmatter
gshank May 18, 2023
9634808
Skip partial parsing check for patch from schema file
gshank May 22, 2023
aae8a75
Merge branch 'main' into ct-2251-model_yaml_frontmatter
gshank Jun 5, 2023
45637f7
Merge branch 'main' into ct-2251-model_yaml_frontmatter
gshank Jul 20, 2023
825738a
Merge branch 'main' into ct-2251-model_yaml_frontmatter
gshank Oct 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changes/unreleased/Features-20230302-095613.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
kind: Features
body: Support embedded yaml frontmatter in model nodes
time: 2023-03-02T09:56:13.106933-05:00
custom:
Author: gshank
Issue: "7099"
45 changes: 44 additions & 1 deletion core/dbt/clients/yaml_helper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
import dbt.exceptions
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, Tuple
import yaml

# the C version is faster, but it doesn't always exist
Expand All @@ -8,6 +9,9 @@
except ImportError:
from yaml import Loader, SafeLoader, Dumper # type: ignore # noqa: F401

FRONTMATTER_CHECK = ["---\n", "---\r\n"]
FRONTMATTER_DELIMITER = re.compile(r"^---", re.MULTILINE)
NON_WHITESPACE = re.compile(r"\S")

YAML_ERROR_MESSAGE = """
Syntax error near line {line_number}
Expand Down Expand Up @@ -61,3 +65,42 @@ def load_yaml_text(contents, path=None):
error = str(e)

raise dbt.exceptions.DbtValidationError(error)


def split_yaml_frontmatter(content: str) -> Tuple[Optional[str], str]:
"""Splits `content` into raw YAML frontmatter (as a raw string) and everything else proceeding.

Frontmatter is defined as a block of YAML appearing between two `---` tokens in an otherwise non-YAML document.
The frontmatter must be placed at the beginning of the file: anything other than whitespace preceding the first `---`
will cause the frontmatter block to be ignored, with a warning.
"""
parts = FRONTMATTER_DELIMITER.split(content, 2)
if len(parts) != 3:
# Zero or one `---` token, so return the original string
return None, content
elif NON_WHITESPACE.search(parts[0]) is not None:
# No frontmatter section or non-whitespace preceding the first `---`, so skip frontmatter
return None, content

frontmatter_content, after_footer = parts[1:]
return frontmatter_content, after_footer


def parse_yaml_frontmatter(frontmatter_content: str, original_content: str):
try:
parsed_yaml = safe_load(frontmatter_content)
except (yaml.scanner.ScannerError, yaml.YAMLError) as e:
if hasattr(e, "problem_mark"):
error = contextualized_yaml_error(original_content, e)
else:
error = str(e)
error = f"Error parsing YAML frontmatter: {error}"
raise dbt.exceptions.DbtValidationError(error)

return parsed_yaml


def has_yaml_frontmatter(content: str) -> bool:
"""Check first line for yaml frontmatter"""

return content.startswith(FRONTMATTER_CHECK[0]) or content.startswith(FRONTMATTER_CHECK[1])

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think content could start with whitespace (including newlines) and this method would return False, while split_yaml_frontmatter would skip over leading whitespace.

Perhaps this would be better?

Suggested change
return content.startswith(FRONTMATTER_CHECK[0]) or content.startswith(FRONTMATTER_CHECK[1])
stripped = content.lstrip()
return stripped.startswith(FRONTMATTER_CHECK[0]) or stripped.startswith(FRONTMATTER_CHECK[1])

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Content should be stripped already, I think, when it's loaded in read_files. Did you actually see an error? Wouldn't hurt to double up on that though.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope! I honestly just drove by and looked over your approach. Cool that it's already stripped. It might be worth a test case?

17 changes: 16 additions & 1 deletion core/dbt/contracts/graph/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,8 @@ def patch(self, patch: "ParsedNodePatch"):
# explicitly pick out the parts to update so we don't inadvertently
# step on the model name or anything
# Note: config should already be updated
self.patch_path: Optional[str] = patch.file_id
if patch.file_id != self.file_id:
self.patch_path: Optional[str] = patch.file_id
# update created_at so process_docs will run in partial parsing
self.created_at = time.time()
self.description = patch.description
Expand Down Expand Up @@ -487,6 +488,7 @@ class HookNode(CompiledNode):
class ModelNode(CompiledNode):
resource_type: NodeType = field(metadata={"restrict": [NodeType.Model]})
access: AccessType = AccessType.Protected
yaml_config_dict: Dict[str, Any] = field(default_factory=dict)


# TODO: rm?
Expand Down Expand Up @@ -684,6 +686,19 @@ def same_contents(self, other) -> bool:
def test_node_type(self):
return "generic"

def write_node(self, target_path: str, subdirectory: str, payload: str):
# If this test came from yaml frontmatter, it will try to write to a directory
# with the same name as the model file, which won't work. So change the
# directory to model file name + .yaml.
file_path = self.original_file_path
if file_path.endswith(".sql") or file_path.endswith(".py"):
file_path = file_path + ".yaml"
path = os.path.join(file_path, self.path)
full_path = os.path.join(target_path, subdirectory, self.package_name, path)

write_file(full_path, payload)
return full_path


# ====================================
# Snapshot node
Expand Down
13 changes: 10 additions & 3 deletions core/dbt/parser/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,9 +393,13 @@ def parse_node(self, block: ConfiguredBlockType) -> FinalNode:
fqn=fqn,
)
self.render_update(node, config)
result = self.transform(node)
self.add_result_node(block, result)
return result
# Snapshot is converted from IntermediateSnapshotNode in transform
final_node = self.transform(node)
self.add_result_node(block, final_node)
# This handles calling the schema model parser for the model parser.
# It needs to be after "add_result_node" has been called.
self.post_parse_node(final_node)
return final_node

def _update_node_relation_name(self, node: ManifestNode):
# Seed and Snapshot nodes and Models that are not ephemeral,
Expand All @@ -418,6 +422,9 @@ def parse_file(self, file_block: FileBlock) -> None:
def transform(self, node: IntermediateNode) -> FinalNode:
pass

def post_parse_node(self, node):
pass


class SimpleParser(
ConfiguredParser[ConfiguredBlockType, FinalNode, FinalNode],
Expand Down
41 changes: 41 additions & 0 deletions core/dbt/parser/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,14 @@
from dbt.node_types import NodeType, ModelLanguage
from dbt.parser.base import SimpleSQLParser
from dbt.parser.search import FileBlock
from dbt.parser.schemas import SchemaParser, TestablePatchParser
from dbt.parser.generic_test_builders import YamlBlock
from dbt.clients.jinja import get_rendered
from dbt.clients.yaml_helper import (
has_yaml_frontmatter,
split_yaml_frontmatter,
parse_yaml_frontmatter,
)
import dbt.tracking as tracking
from dbt import utils
from dbt_extractor import ExtractionError, py_extract_from_source # type: ignore
Expand Down Expand Up @@ -174,10 +181,44 @@ def verify_python_model_code(node):

class ModelParser(SimpleSQLParser[ModelNode]):
def parse_from_dict(self, dct, validate=True) -> ModelNode:

# change raw_code to not have the yaml frontmatter
if dct["raw_code"] and has_yaml_frontmatter(dct["raw_code"]):
yaml_frontmatter, rest_of_contents = split_yaml_frontmatter(dct["raw_code"])
if yaml_frontmatter:
yaml_config_dict = parse_yaml_frontmatter(yaml_frontmatter, dct["raw_code"])
dct["yaml_config_dict"] = yaml_config_dict
dct["raw_code"] = rest_of_contents.strip()

if validate:
ModelNode.validate(dct)
return ModelNode.from_dict(dct)

def post_parse_node(self, node: ModelNode):
# This handles parsing yaml frontmatter by calling the
# schema model parser
if node.yaml_config_dict:
# The generic test processing expects to be able to lookup the node, so we add it here.
# The node should already have been added to the manifest by "add_result_node"
# in the base "parse_node" method.
self.manifest.ref_lookup.add_node(node)

# Create the YamlBlock that the schema model parser needs.
source_file = self.manifest.files[node.file_id]
block = FileBlock(file=source_file)
entry = deepcopy(node.yaml_config_dict)
# Make a dictionary like the schema model parser expects.
entry["name"] = node.name
dct = {"models": [entry]}
yaml_block = YamlBlock.from_file_block(block, dct)

# Create the SchemaParser needed by the "TestablePatchParser"
schema_parser = SchemaParser(self.project, self.manifest, self.root_project)
model_parser = TestablePatchParser(schema_parser, yaml_block, "models")
# parse the schema model patch and create the tests
for test_block in model_parser.parse():
schema_parser.parse_tests(test_block)

@property
def resource_type(self) -> NodeType:
return NodeType.Model
Expand Down
3 changes: 2 additions & 1 deletion core/dbt/parser/partial.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,8 @@ def update_mssat_in_saved(self, new_source_file, old_source_file):
if self.already_scheduled_for_parsing(old_source_file):
return

# These files only have one node except for snapshots
# These files only have one node except for snapshots and
# model files with yaml frontmatter
unique_ids = []
if old_source_file.nodes:
unique_ids = old_source_file.nodes
Expand Down
28 changes: 21 additions & 7 deletions core/dbt/parser/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,8 +863,8 @@ def parse_patch(self, block: TargetBlock[NodeTarget], refs: ParserRef) -> None:
config=block.target.config,
access=block.target.access,
)
assert isinstance(self.yaml.file, SchemaSourceFile)
source_file: SchemaSourceFile = self.yaml.file
# This used to assert SchemaSourceFile, but removed to support yaml frontmatter
source_file = self.yaml.file
if patch.yaml_key in ["models", "seeds", "snapshots"]:
unique_id = self.manifest.ref_lookup.get_unique_id(patch.name, None)
if unique_id:
Expand Down Expand Up @@ -907,9 +907,11 @@ def parse_patch(self, block: TargetBlock[NodeTarget], refs: ParserRef) -> None:

# all nodes in the disabled dict have the same unique_id so just grab the first one
# to append with the uniqe id
source_file.append_patch(patch.yaml_key, found_nodes[0].unique_id)
if isinstance(source_file, SchemaSourceFile):
source_file.append_patch(patch.yaml_key, found_nodes[0].unique_id)
for node in found_nodes:
node.patch_path = source_file.file_id
if isinstance(source_file, SchemaSourceFile):
node.patch_path = source_file.file_id
# re-calculate the node config with the patch config. Always do this
# for the case when no config is set to ensure the default of true gets captured
if patch.config:
Expand All @@ -929,11 +931,23 @@ def parse_patch(self, block: TargetBlock[NodeTarget], refs: ParserRef) -> None:
# patches can't be overwritten
node = self.manifest.nodes.get(unique_id)
if node:
if node.patch_path:
package_name, existing_file_path = node.patch_path.split("://")
# If patch_path is set, then a schema file has already been processed for this node.
# If the node has a yaml_config_dict and THIS invocation of the schema parser is from
# a different file, then this is a schema file duplicate of config already done in the model
# via yaml frontmatter.
if node.patch_path or (
hasattr(node, "yaml_config_dict")
and node.yaml_config_dict
and node.file_id != source_file.file_id
):
if node.patch_path:
_, existing_file_path = node.patch_path.split("://")
else: # yaml frontmatter
existing_file_path = node.original_file_path
raise DuplicatePatchPathError(patch, existing_file_path)

source_file.append_patch(patch.yaml_key, node.unique_id)
if isinstance(source_file, SchemaSourceFile):
source_file.append_patch(patch.yaml_key, node.unique_id)
# re-calculate the node config with the patch config. Always do this
# for the case when no config is set to ensure the default of true gets captured
if patch.config:
Expand Down
Loading