Skip to content

Commit

Permalink
Configure Airflow tasks using dbt model meta (#1339)
Browse files Browse the repository at this point in the history
The various dbt models have unique characteristics, and some may require
the use of custom pools, queues, or other specific configurations. To
support such cases, this update introduces the ability to add necessary
information in the meta section of the dbt model.yaml. This metadata is
then passed as kwargs to the corresponding Airflow tasks, enabling
model-specific customization and enhanced task configuration.


**here is sample**

DbtTaskGroup - default_args for all dbt models
```python
    dbt_task_group = DbtTaskGroup(
        project_config=,
        profile_config=ProfileConfig,
        default_args={'pool': dbt_pool}
    )
```


```yaml
version: 2

models:
  - name: name
    description: description
    meta:
      owner: '[email protected]'
      cosmos:
        pool: abcd
```

**result**

general pool
![스크린샷 2024-11-25 오후 10 15
26](https://github.com/user-attachments/assets/ea492a19-488f-4dbb-b4b9-2be20dc191be)
custom pool
![스크린샷 2024-11-25 오후 10 15
40](https://github.com/user-attachments/assets/f6055cc0-94f6-4789-9b05-4e02e34bcd5f)


## Related Issue(s)

Closes: #881
Closes: #1325
  • Loading branch information
wornjs authored Dec 17, 2024
1 parent 4a3e12b commit cbd8622
Show file tree
Hide file tree
Showing 7 changed files with 110 additions and 3 deletions.
4 changes: 4 additions & 0 deletions cosmos/airflow/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ def create_test_task_metadata(
extra_context = {}

task_owner = ""
airflow_task_config = {}
if test_indirect_selection != TestIndirectSelection.EAGER:
task_args["indirect_selection"] = test_indirect_selection.value
if node is not None:
Expand All @@ -111,6 +112,7 @@ def create_test_task_metadata(

extra_context = {"dbt_node_config": node.context_dict}
task_owner = node.owner
airflow_task_config = node.airflow_task_config

elif render_config is not None: # TestBehavior.AFTER_ALL
task_args["select"] = render_config.select
Expand All @@ -120,6 +122,7 @@ def create_test_task_metadata(
return TaskMetadata(
id=test_task_name,
owner=task_owner,
airflow_task_config=airflow_task_config,
operator_class=calculate_operator_class(
execution_mode=execution_mode,
dbt_class="DbtTest",
Expand Down Expand Up @@ -214,6 +217,7 @@ def create_task_metadata(
task_metadata = TaskMetadata(
id=task_id,
owner=node.owner,
airflow_task_config=node.airflow_task_config,
operator_class=calculate_operator_class(
execution_mode=execution_mode, dbt_class=dbt_resource_to_class[node.resource_type]
),
Expand Down
3 changes: 3 additions & 0 deletions cosmos/core/airflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ def get_airflow_task(task: Task, dag: DAG, task_group: TaskGroup | None = None)
if task.owner != "":
task_kwargs["owner"] = task.owner

for k, v in task.airflow_task_config.items():
task_kwargs[k] = v

airflow_task = Operator(
task_id=task.id,
dag=dag,
Expand Down
1 change: 1 addition & 0 deletions cosmos/core/graph/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class Task(CosmosEntity):
"""

owner: str = ""
airflow_task_config: Dict[str, Any] = field(default_factory=dict)
operator_class: str = "airflow.operators.empty.EmptyOperator"
arguments: Dict[str, Any] = field(default_factory=dict)
extra_context: Dict[str, Any] = field(default_factory=dict)
29 changes: 28 additions & 1 deletion cosmos/dbt/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from functools import cached_property
from pathlib import Path
from subprocess import PIPE, Popen
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any, Dict, Optional

from airflow.models import Variable

Expand Down Expand Up @@ -67,6 +67,33 @@ class DbtNode:
has_freshness: bool = False
has_test: bool = False

@property
def airflow_task_config(self) -> Dict[str, Any]:
"""
This method is designed to extend the dbt project's functionality by incorporating Airflow-related metadata into the dbt YAML configuration.
Since dbt projects are independent of Airflow, adding Airflow-specific information to the `meta` field within the dbt YAML allows Airflow tasks to
utilize this information during execution.
Examples: pool, pool_slots, queue, ...
Returns:
Dict[str, Any]: A dictionary containing custom metadata configurations for integration with Airflow.
"""

if "meta" in self.config:
meta = self.config["meta"]
if "cosmos" in meta:
cosmos = meta["cosmos"]
if isinstance(cosmos, dict):
if "operator_kwargs" in cosmos:
operator_kwargs = cosmos["operator_kwargs"]
if isinstance(operator_kwargs, dict):
return operator_kwargs
else:
logger.error(f"Invalid type: 'operator_kwargs' in meta.cosmos must be a dict.")
else:
logger.error(f"Invalid type: 'cosmos' in meta must be a dict.")
return {}

@property
def resource_name(self) -> str:
"""
Expand Down
Binary file added docs/_static/custom_airflow_pool.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
33 changes: 33 additions & 0 deletions docs/getting_started/custom-airflow-properties.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
.. _custom-airflow-properties:

Airflow Configuration Overrides with Astronomer Cosmos
======================================================

**Astronomer Cosmos** allows you to override Airflow configurations for each dbt task (dbt operator) via the dbt YAML file.

Sample dbt Model YAML
++++++++++++

Check warning on line 9 in docs/getting_started/custom-airflow-properties.rst

View workflow job for this annotation

GitHub Actions / pages

Title underline too short.

.. code-block:: yaml
version: 2
models:
- name: name
description: description
meta:
cosmos:
operator_args:
pool: abcd
Explanation
++++++++++++

By adding Airflow configurations under **cosmos** in the **meta** field, you can set independent Airflow configurations for each task.
For example, in the YAML above, the **pool** setting is applied to the specific dbt task.
This approach allows for more granular control over Airflow settings per task within your dbt model definitions.

.. image:: ../_static/custom_airflow_pool.png
:alt: Result of applying Custom Airflow Pool
43 changes: 41 additions & 2 deletions tests/airflow/test_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
depends_on=[parent_node.unique_id],
file_path=SAMPLE_PROJ_PATH / "gen3/models/child.sql",
tags=["nightly"],
config={"materialized": "table"},
config={"materialized": "table", "meta": {"cosmos": {"operator_kwargs": {"queue": "custom_queue"}}}},
)

child2_node = DbtNode(
Expand All @@ -71,7 +71,7 @@
depends_on=[parent_node.unique_id],
file_path=SAMPLE_PROJ_PATH / "gen3/models/child2_v2.sql",
tags=["nightly"],
config={"materialized": "table"},
config={"materialized": "table", "meta": {"cosmos": {"operator_kwargs": {"pool": "custom_pool"}}}},
)

sample_nodes_list = [parent_seed, parent_node, test_parent_node, child_node, child2_node]
Expand Down Expand Up @@ -750,3 +750,42 @@ def test_owner(dbt_extra_config, expected_owner):

assert len(output.leaves) == 1
assert output.leaves[0].owner == expected_owner


def test_custom_meta():
with DAG("test-id", start_date=datetime(2022, 1, 1)) as dag:
task_args = {
"project_dir": SAMPLE_PROJ_PATH,
"conn_id": "fake_conn",
"profile_config": ProfileConfig(
profile_name="default",
target_name="default",
profile_mapping=PostgresUserPasswordProfileMapping(
conn_id="fake_conn",
profile_args={"schema": "public"},
),
),
}
build_airflow_graph(
nodes=sample_nodes,
dag=dag,
execution_mode=ExecutionMode.LOCAL,
test_indirect_selection=TestIndirectSelection.EAGER,
task_args=task_args,
render_config=RenderConfig(
test_behavior=TestBehavior.AFTER_EACH,
source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR,
),
dbt_project_name="astro_shop",
)
# test custom meta (queue, pool)
for task in dag.tasks:
if task.task_id == "child2_v2_run":
assert task.pool == "custom_pool"
else:
assert task.pool == "default_pool"

if task.task_id == "child_run":
assert task.queue == "custom_queue"
else:
assert task.queue == "default"

0 comments on commit cbd8622

Please sign in to comment.