diff --git a/.buildkite/dagster-buildkite/dagster_buildkite/package_spec.py b/.buildkite/dagster-buildkite/dagster_buildkite/package_spec.py index 84528cbeb9163..03a58aee22b2f 100644 --- a/.buildkite/dagster-buildkite/dagster_buildkite/package_spec.py +++ b/.buildkite/dagster-buildkite/dagster_buildkite/package_spec.py @@ -125,6 +125,7 @@ class PackageSpec: queue: Optional[BuildkiteQueue] = None run_pytest: bool = True always_run_if: Optional[Callable[[], bool]] = None + skip_if: Optional[Callable[[], Optional[str]]] = None def __post_init__(self): if not self.name: @@ -161,9 +162,13 @@ def build_steps(self) -> List[BuildkiteTopLevelStep]: if v not in unsupported_python_versions ] - pytest_python_versions = sorted( - list(set(default_python_versions) - set(unsupported_python_versions)) - ) + pytest_python_versions = [ + AvailablePythonVersion(v) + for v in sorted( + set(e.value for e in default_python_versions) + - set(e.value for e in unsupported_python_versions) + ) + ] # Use highest supported python version if no defaults_match if len(pytest_python_versions) == 0: pytest_python_versions = [supported_python_versions[-1]] @@ -243,22 +248,39 @@ def requirements(self): @property def skip_reason(self) -> Optional[str]: - # Memoize so we don't log twice - if self._should_skip is False: - return None - - if self.always_run_if and self.always_run_if(): - self._should_skip = False - return None - - if self._skip_reason: + """Provides a message if this package's steps should be skipped on this run, and no message if the package's steps should be run. + We actually use this to determine whether or not to run the package. + + Because we use an archaic version of python to build our images, we can't use `cached_property`, and so we reinvent the wheel here with + self._should_skip and self._skip_reason. When we determine definitively that a package should or shouldn't be skipped, we cache the result on self._should_skip + as a boolean (it starts out as None), and cache the skip reason (or lack thereof) on self._skip_reason. + """ + # If self._should_skip is not None, then the result is cached on self._skip_reason and we can return it. + if self._should_skip is not None: + if self._should_skip is True: + assert ( + self._skip_reason is not None + ), "Expected skip reason to be set if self._should_skip is True." return self._skip_reason + # If the result is not cached, check for NO_SKIP signifier first, so that it always + # takes precedence. if message_contains("NO_SKIP"): logging.info(f"Building {self.name} because NO_SKIP set") self._should_skip = False return None + if self.always_run_if and self.always_run_if(): + self._should_skip = False + self._skip_reason = None + return None + if self.skip_if and self.skip_if(): + self._skip_reason = self.skip_if() + self._should_skip = True + return self._skip_reason + # Take account of feature_branch changes _after_ skip_if so that skip_if + # takes precedence. This way, integration tests can run on branch but won't be + # forced to run on every master commit. 
if not is_feature_branch(os.getenv("BUILDKITE_BRANCH", "")): logging.info(f"Building {self.name} we're not on a feature branch") self._should_skip = False diff --git a/.buildkite/dagster-buildkite/dagster_buildkite/pipelines/dagster_oss_main.py b/.buildkite/dagster-buildkite/dagster_buildkite/pipelines/dagster_oss_main.py index fc8dc0426c6a2..a79cfcde575d1 100644 --- a/.buildkite/dagster-buildkite/dagster_buildkite/pipelines/dagster_oss_main.py +++ b/.buildkite/dagster-buildkite/dagster_buildkite/pipelines/dagster_oss_main.py @@ -52,6 +52,7 @@ def build_dagster_oss_main_steps() -> List[BuildkiteStep]: ), "DAGSTER_CHECKOUT_DEPTH": _get_setting("DAGSTER_CHECKOUT_DEPTH") or "100", "OSS_COMPAT_SLIM": "1" if oss_compat_slim else "", + "DAGSTER_FROM_OSS": "1" if pipeline_name == "internal" else "0", }, ), ) diff --git a/.buildkite/dagster-buildkite/dagster_buildkite/pipelines/dagster_oss_nightly_pipeline.py b/.buildkite/dagster-buildkite/dagster_buildkite/pipelines/dagster_oss_nightly_pipeline.py index 23dc34b5f87ca..d0e307db08400 100644 --- a/.buildkite/dagster-buildkite/dagster_buildkite/pipelines/dagster_oss_nightly_pipeline.py +++ b/.buildkite/dagster-buildkite/dagster_buildkite/pipelines/dagster_oss_nightly_pipeline.py @@ -42,6 +42,28 @@ def build_dagster_oss_nightly_steps() -> List[BuildkiteStep]: ], pytest_extra_cmds=k8s_extra_cmds, ), + PackageSpec( + "examples/experimental/dagster-dlift/kitchen-sink", + name="dbt-cloud-live-tests", + env_vars=[ + "KS_DBT_CLOUD_ACCOUNT_ID", + "KS_DBT_CLOUD_PROJECT_ID", + "KS_DBT_CLOUD_TOKEN", + "KS_DBT_CLOUD_ACCESS_URL", + "KS_DBT_CLOUD_DISCOVERY_API_URL", + ], + ), + PackageSpec( + "examples/experimental/dagster-airlift/examples/dbt-example", + name="airlift-demo-live-tests", + env_vars=[ + "KS_DBT_CLOUD_ACCOUNT_ID", + "KS_DBT_CLOUD_PROJECT_ID", + "KS_DBT_CLOUD_TOKEN", + "KS_DBT_CLOUD_ACCESS_URL", + "KS_DBT_CLOUD_DISCOVERY_API_URL", + ], + ), ] ) diff --git a/.buildkite/dagster-buildkite/dagster_buildkite/pipelines/prerelease_package.py b/.buildkite/dagster-buildkite/dagster_buildkite/pipelines/prerelease_package.py index 7bda9c847b57d..f5611f2048fa4 100644 --- a/.buildkite/dagster-buildkite/dagster_buildkite/pipelines/prerelease_package.py +++ b/.buildkite/dagster-buildkite/dagster_buildkite/pipelines/prerelease_package.py @@ -1,5 +1,3 @@ -import re -from pathlib import Path from typing import List from dagster_buildkite.python_version import AvailablePythonVersion @@ -17,14 +15,6 @@ def build_prerelease_package_steps() -> List[BuildkiteStep]: + _get_uncustomized_pkg_roots("examples/experimental", []) ) - # Get only packages that have a fixed version in setup.py - filtered_packages = [] - for package in packages: - setup_file = Path(package) / "setup.py" - contents = setup_file.read_text() - if re.findall(r"version=\"[\d\.]+\"", contents): - filtered_packages.append(package) - input_step: BlockStep = { "block": ":question: Choose package", "prompt": None, @@ -39,7 +29,7 @@ def build_prerelease_package_steps() -> List[BuildkiteStep]: else package, "value": package, } - for package in filtered_packages + for package in packages ], "hint": None, "default": None, diff --git a/.buildkite/dagster-buildkite/dagster_buildkite/python_version.py b/.buildkite/dagster-buildkite/dagster_buildkite/python_version.py index 529b59c32661c..68b27a661340e 100644 --- a/.buildkite/dagster-buildkite/dagster_buildkite/python_version.py +++ b/.buildkite/dagster-buildkite/dagster_buildkite/python_version.py @@ -5,9 +5,8 @@ from dagster_buildkite.utils import is_release_branch, 
safe_getenv -class AvailablePythonVersion(str, Enum): +class AvailablePythonVersion(Enum): # Ordering is important here, because some steps will take the highest/lowest available version. - V3_8 = "3.8" V3_9 = "3.9" V3_10 = "3.10" V3_11 = "3.11" @@ -21,6 +20,12 @@ def get_all(cls) -> List["AvailablePythonVersion"]: def get_default(cls) -> "AvailablePythonVersion": return cls["V3_11"] + # Useful for providing to `PackageSpec.unsupported_python_versions` when you only want to test + # the default version. + @classmethod + def get_all_except_default(cls) -> List["AvailablePythonVersion"]: + return [v for v in cls.get_all() if v != cls.get_default()] + @classmethod def get_pytest_defaults(cls) -> List["AvailablePythonVersion"]: branch_name = safe_getenv("BUILDKITE_BRANCH") @@ -55,6 +60,6 @@ def from_major_minor(cls, major: int, minor: int) -> "AvailablePythonVersion": @classmethod def to_tox_factor(cls, version: "AvailablePythonVersion") -> str: - ver_parts = version.split(".") + ver_parts = version.value.split(".") major, minor = ver_parts[0], ver_parts[1] return f"py{major}{minor}" diff --git a/.buildkite/dagster-buildkite/dagster_buildkite/step_builder.py b/.buildkite/dagster-buildkite/dagster_buildkite/step_builder.py index f8ab1d790620e..0ef854abf775b 100644 --- a/.buildkite/dagster-buildkite/dagster_buildkite/step_builder.py +++ b/.buildkite/dagster-buildkite/dagster_buildkite/step_builder.py @@ -109,7 +109,7 @@ def on_test_image( raise Exception(f"Unsupported python version for test image: {ver}.") return self.on_python_image( - image=f"buildkite-test:py{ver}-{BUILDKITE_TEST_IMAGE_VERSION}", + image=f"buildkite-test:py{ver.value}-{BUILDKITE_TEST_IMAGE_VERSION}", env=env, ) diff --git a/.buildkite/dagster-buildkite/dagster_buildkite/steps/helm.py b/.buildkite/dagster-buildkite/dagster_buildkite/steps/helm.py index a21df88a5b717..c90ee12e306dd 100644 --- a/.buildkite/dagster-buildkite/dagster_buildkite/steps/helm.py +++ b/.buildkite/dagster-buildkite/dagster_buildkite/steps/helm.py @@ -20,7 +20,6 @@ def build_helm_steps() -> List[BuildkiteStep]: os.path.join("helm", "dagster", "schema"), unsupported_python_versions=[ # run helm schema tests only once, on the latest python version - AvailablePythonVersion.V3_8, AvailablePythonVersion.V3_9, AvailablePythonVersion.V3_10, AvailablePythonVersion.V3_11, diff --git a/.buildkite/dagster-buildkite/dagster_buildkite/steps/integration.py b/.buildkite/dagster-buildkite/dagster_buildkite/steps/integration.py index 74c27cd8e0497..98ea3ec22004e 100644 --- a/.buildkite/dagster-buildkite/dagster_buildkite/steps/integration.py +++ b/.buildkite/dagster-buildkite/dagster_buildkite/steps/integration.py @@ -6,7 +6,11 @@ GCP_CREDS_LOCAL_FILE, LATEST_DAGSTER_RELEASE, ) -from dagster_buildkite.package_spec import PackageSpec, UnsupportedVersionsFunction +from dagster_buildkite.package_spec import ( + PackageSpec, + PytestExtraCommandsFunction, + UnsupportedVersionsFunction, +) from dagster_buildkite.python_version import AvailablePythonVersion from dagster_buildkite.step_builder import BuildkiteQueue from dagster_buildkite.steps.test_project import test_project_depends_fn @@ -60,12 +64,12 @@ def build_backcompat_suite_steps() -> List[BuildkiteTopLevelStep]: ) -def backcompat_extra_cmds(_, factor: str) -> List[str]: +def backcompat_extra_cmds(_, factor: Optional[str]) -> List[str]: tox_factor_map = { "user-code-latest-release": LATEST_DAGSTER_RELEASE, "user-code-earliest-release": EARLIEST_TESTED_RELEASE, } - + assert factor webserver_version = 
DAGSTER_CURRENT_BRANCH webserver_library_version = _get_library_version(webserver_version) user_code_version = tox_factor_map[factor] @@ -163,7 +167,7 @@ def build_auto_materialize_perf_suite_steps(): def daemon_pytest_extra_cmds(version: AvailablePythonVersion, _): return [ - "export DAGSTER_DOCKER_IMAGE_TAG=$${BUILDKITE_BUILD_ID}-" + version, + "export DAGSTER_DOCKER_IMAGE_TAG=$${BUILDKITE_BUILD_ID}-" + version.value, 'export DAGSTER_DOCKER_REPOSITORY="$${AWS_ACCOUNT_ID}.dkr.ecr.us-west-2.amazonaws.com"', "pushd integration_tests/test_suites/daemon-test-suite/monitoring_daemon_tests/", "docker-compose up -d --remove-orphans", @@ -182,7 +186,7 @@ def daemon_pytest_extra_cmds(version: AvailablePythonVersion, _): # ######################## -def build_k8s_suite_steps(): +def build_k8s_suite_steps() -> List[BuildkiteTopLevelStep]: pytest_tox_factors = ["-default", "-subchart"] directory = os.path.join("integration_tests", "test_suites", "k8s-test-suite") return build_integration_suite_steps( @@ -201,7 +205,7 @@ def build_k8s_suite_steps(): def build_integration_suite_steps( directory: str, pytest_tox_factors: Optional[List[str]], - pytest_extra_cmds: Optional[Callable] = None, + pytest_extra_cmds: Optional[PytestExtraCommandsFunction] = None, queue=None, always_run_if: Optional[Callable[[], bool]] = None, unsupported_python_versions: Optional[ @@ -229,19 +233,19 @@ def build_integration_suite_steps( ).build_steps() -def k8s_integration_suite_pytest_extra_cmds(version: str, _) -> List[str]: +def k8s_integration_suite_pytest_extra_cmds(version: AvailablePythonVersion, _) -> List[str]: return [ - "export DAGSTER_DOCKER_IMAGE_TAG=$${BUILDKITE_BUILD_ID}-" + version, + "export DAGSTER_DOCKER_IMAGE_TAG=$${BUILDKITE_BUILD_ID}-" + version.value, 'export DAGSTER_DOCKER_REPOSITORY="$${AWS_ACCOUNT_ID}.dkr.ecr.us-west-2.amazonaws.com"', "aws ecr get-login --no-include-email --region us-west-2 | sh", ] -def celery_k8s_integration_suite_pytest_extra_cmds(version: str, _) -> List[str]: +def celery_k8s_integration_suite_pytest_extra_cmds(version: AvailablePythonVersion, _) -> List[str]: cmds = [ 'export AIRFLOW_HOME="/airflow"', "mkdir -p $${AIRFLOW_HOME}", - "export DAGSTER_DOCKER_IMAGE_TAG=$${BUILDKITE_BUILD_ID}-" + version, + "export DAGSTER_DOCKER_IMAGE_TAG=$${BUILDKITE_BUILD_ID}-" + version.value, 'export DAGSTER_DOCKER_REPOSITORY="$${AWS_ACCOUNT_ID}.dkr.ecr.us-west-2.amazonaws.com"', "aws ecr get-login --no-include-email --region us-west-2 | sh", ] diff --git a/.buildkite/dagster-buildkite/dagster_buildkite/steps/packages.py b/.buildkite/dagster-buildkite/dagster_buildkite/steps/packages.py index 91b271775f46c..ddc6291ff301f 100644 --- a/.buildkite/dagster-buildkite/dagster_buildkite/steps/packages.py +++ b/.buildkite/dagster-buildkite/dagster_buildkite/steps/packages.py @@ -6,6 +6,7 @@ from dagster_buildkite.defines import GCP_CREDS_FILENAME, GCP_CREDS_LOCAL_FILE, GIT_REPO_ROOT from dagster_buildkite.package_spec import PackageSpec from dagster_buildkite.python_version import AvailablePythonVersion +from dagster_buildkite.step_builder import BuildkiteQueue from dagster_buildkite.steps.test_project import test_project_depends_fn from dagster_buildkite.utils import ( BuildkiteStep, @@ -13,6 +14,8 @@ has_dagster_airlift_changes, has_storage_test_fixture_changes, network_buildkite_container, + skip_if_not_airlift_or_dlift_commit, + skip_if_not_dlift_commit, ) @@ -89,7 +92,7 @@ def _get_uncustomized_pkg_roots(root: str, custom_pkg_roots: List[str]) -> List[ # ######################## -def 
airflow_extra_cmds(version: str, _) -> List[str]: +def airflow_extra_cmds(version: AvailablePythonVersion, _) -> List[str]: return [ 'export AIRFLOW_HOME="/airflow"', "mkdir -p $${AIRFLOW_HOME}", @@ -157,9 +160,9 @@ def dagster_graphql_extra_cmds(_, tox_factor: Optional[str]) -> List[str]: ] -def celery_extra_cmds(version: str, _) -> List[str]: +def celery_extra_cmds(version: AvailablePythonVersion, _) -> List[str]: return [ - "export DAGSTER_DOCKER_IMAGE_TAG=$${BUILDKITE_BUILD_ID}-" + version, + "export DAGSTER_DOCKER_IMAGE_TAG=$${BUILDKITE_BUILD_ID}-" + version.value, 'export DAGSTER_DOCKER_REPOSITORY="$${AWS_ACCOUNT_ID}.dkr.ecr.us-west-2.amazonaws.com"', "pushd python_modules/libraries/dagster-celery", # Run the rabbitmq db. We are in docker running docker @@ -175,7 +178,7 @@ def celery_extra_cmds(version: str, _) -> List[str]: ] -def celery_docker_extra_cmds(version: str, _) -> List[str]: +def celery_docker_extra_cmds(version: AvailablePythonVersion, _) -> List[str]: return celery_extra_cmds(version, _) + [ "pushd python_modules/libraries/dagster-celery-docker/dagster_celery_docker_tests/", "docker-compose up -d --remove-orphans", @@ -189,9 +192,9 @@ def celery_docker_extra_cmds(version: str, _) -> List[str]: ] -def docker_extra_cmds(version: str, _) -> List[str]: +def docker_extra_cmds(version: AvailablePythonVersion, _) -> List[str]: return [ - "export DAGSTER_DOCKER_IMAGE_TAG=$${BUILDKITE_BUILD_ID}-" + version, + "export DAGSTER_DOCKER_IMAGE_TAG=$${BUILDKITE_BUILD_ID}-" + version.value, 'export DAGSTER_DOCKER_REPOSITORY="$${AWS_ACCOUNT_ID}.dkr.ecr.us-west-2.amazonaws.com"', "pushd python_modules/libraries/dagster-docker/dagster_docker_tests/", "docker-compose up -d --remove-orphans", @@ -227,9 +230,9 @@ def docker_extra_cmds(version: str, _) -> List[str]: ] -def k8s_extra_cmds(version: str, _) -> List[str]: +def k8s_extra_cmds(version: AvailablePythonVersion, _) -> List[str]: return [ - "export DAGSTER_DOCKER_IMAGE_TAG=$${BUILDKITE_BUILD_ID}-" + version, + "export DAGSTER_DOCKER_IMAGE_TAG=$${BUILDKITE_BUILD_ID}-" + version.value, 'export DAGSTER_DOCKER_REPOSITORY="$${AWS_ACCOUNT_ID}.dkr.ecr.us-west-2.amazonaws.com"', ] @@ -272,7 +275,6 @@ def k8s_extra_cmds(version: str, _) -> List[str]: PackageSpec( "examples/with_airflow", unsupported_python_versions=[ - AvailablePythonVersion.V3_9, AvailablePythonVersion.V3_10, AvailablePythonVersion.V3_11, AvailablePythonVersion.V3_12, @@ -288,15 +290,19 @@ def k8s_extra_cmds(version: str, _) -> List[str]: PackageSpec( "examples/docs_snippets", pytest_extra_cmds=docs_snippets_extra_cmds, - unsupported_python_versions=[ - # dependency on 3.9-incompatible extension libs - AvailablePythonVersion.V3_9, - # dagster-airflow dep - AvailablePythonVersion.V3_12, - ], + # The docs_snippets test suite also installs a ton of packages in the same environment, + # which is liable to cause dependency collisions. It's not necessary to test all these + # snippets in all python versions since we are testing the core code exercised by the + # snippets against all supported python versions. + unsupported_python_versions=AvailablePythonVersion.get_all_except_default(), ), PackageSpec( "examples/docs_beta_snippets", + # The docs_snippets test suite also installs a ton of packages in the same environment, + # which is liable to cause dependency collisions. It's not necessary to test all these + # snippets in all python versions since we are testing the core code exercised by the + # snippets against all supported python versions. 
+ unsupported_python_versions=AvailablePythonVersion.get_all_except_default(), pytest_tox_factors=["all", "integrations"], ), PackageSpec( @@ -307,6 +313,9 @@ def k8s_extra_cmds(version: str, _) -> List[str]: ), PackageSpec( "examples/with_great_expectations", + unsupported_python_versions=[ + AvailablePythonVersion.V3_9, + ], ), PackageSpec( "examples/with_pyspark", @@ -361,9 +370,20 @@ def k8s_extra_cmds(version: str, _) -> List[str]: PackageSpec( "examples/experimental/dagster-airlift", ), + # Runs against live dbt cloud instance, we only want to run on commits and on the + # nightly build PackageSpec( "examples/experimental/dagster-airlift/examples/dbt-example", - always_run_if=has_dagster_airlift_changes, + skip_if=skip_if_not_airlift_or_dlift_commit, + env_vars=[ + "KS_DBT_CLOUD_ACCOUNT_ID", + "KS_DBT_CLOUD_PROJECT_ID", + "KS_DBT_CLOUD_TOKEN", + "KS_DBT_CLOUD_ACCESS_URL", + "KS_DBT_CLOUD_DISCOVERY_API_URL", + ], + timeout_in_minutes=30, + queue=BuildkiteQueue.DOCKER, ), PackageSpec( "examples/experimental/dagster-airlift/examples/perf-harness", @@ -377,6 +397,24 @@ def k8s_extra_cmds(version: str, _) -> List[str]: "examples/experimental/dagster-airlift/examples/kitchen-sink", always_run_if=has_dagster_airlift_changes, ), + PackageSpec( + "examples/experimental/dagster-dlift", + name="dlift", + ), + # Runs against live dbt cloud instance, we only want to run on commits and on the + # nightly build + PackageSpec( + "examples/experimental/dagster-dlift/kitchen-sink", + skip_if=skip_if_not_dlift_commit, + name="dlift-live", + env_vars=[ + "KS_DBT_CLOUD_ACCOUNT_ID", + "KS_DBT_CLOUD_PROJECT_ID", + "KS_DBT_CLOUD_TOKEN", + "KS_DBT_CLOUD_ACCESS_URL", + "KS_DBT_CLOUD_DISCOVERY_API_URL", + ], + ), ] @@ -418,7 +456,10 @@ def tox_factors_for_folder(tests_folder_name: str) -> List[str]: LIBRARY_PACKAGES_WITH_CUSTOM_CONFIG: List[PackageSpec] = [ PackageSpec( "python_modules/automation", - unsupported_python_versions=[AvailablePythonVersion.V3_12], + # automation is internal code that doesn't need to be tested in every python version. The + # test suite also installs a ton of packages in the same environment, which is liable to + # cause dependency collisions. 
+ unsupported_python_versions=AvailablePythonVersion.get_all_except_default(), ), PackageSpec("python_modules/dagster-webserver", pytest_extra_cmds=ui_extra_cmds), PackageSpec( @@ -428,8 +469,7 @@ def tox_factors_for_folder(tests_folder_name: str) -> List[str]: "api_tests", "asset_defs_tests", "cli_tests", - "core_tests_pydantic1", - "core_tests_pydantic2", + "core_tests", "daemon_sensor_tests", "daemon_tests", "definitions_tests", @@ -437,8 +477,7 @@ def tox_factors_for_folder(tests_folder_name: str) -> List[str]: "general_tests_old_protobuf", "launcher_tests", "logging_tests", - "model_tests_pydantic1", - "model_tests_pydantic2", + "model_tests", "scheduler_tests", "storage_tests", "storage_tests_sqlalchemy_1_3", @@ -496,16 +535,12 @@ def tox_factors_for_folder(tests_folder_name: str) -> List[str]: "python_modules/libraries/dagster-dbt", pytest_tox_factors=[ f"{deps_factor}-{command_factor}" - for deps_factor in ["dbt17", "dbt18", "pydantic1"] + for deps_factor in ["dbt17", "dbt18"] for command_factor in ["cloud", "core-main", "core-derived-metadata"] ], ), PackageSpec( "python_modules/libraries/dagster-snowflake", - pytest_tox_factors=[ - "pydantic1", - "pydantic2", - ], env_vars=["SNOWFLAKE_ACCOUNT", "SNOWFLAKE_USER", "SNOWFLAKE_PASSWORD"], ), PackageSpec( @@ -560,10 +595,6 @@ def tox_factors_for_folder(tests_folder_name: str) -> List[str]: ), PackageSpec( "python_modules/libraries/dagster-databricks", - pytest_tox_factors=[ - "pydantic1", - "pydantic2", - ], ), PackageSpec( "python_modules/libraries/dagster-docker", @@ -640,6 +671,9 @@ def tox_factors_for_folder(tests_folder_name: str) -> List[str]: ), PackageSpec( "python_modules/libraries/dagster-ge", + unsupported_python_versions=[ + AvailablePythonVersion.V3_9, + ], ), PackageSpec( "python_modules/libraries/dagster-k8s", diff --git a/.buildkite/dagster-buildkite/dagster_buildkite/steps/test_project.py b/.buildkite/dagster-buildkite/dagster_buildkite/steps/test_project.py index dc73185d23cf2..bbdcd26bf43a5 100644 --- a/.buildkite/dagster-buildkite/dagster_buildkite/steps/test_project.py +++ b/.buildkite/dagster-buildkite/dagster_buildkite/steps/test_project.py @@ -49,16 +49,16 @@ def build_test_project_steps() -> List[GroupStep]: " $${GOOGLE_APPLICATION_CREDENTIALS}", "export" " BASE_IMAGE=$${AWS_ACCOUNT_ID}.dkr.ecr.us-west-2.amazonaws.com/test-project-base:py" - + version + + version.value + "-" + TEST_PROJECT_BASE_IMAGE_VERSION, # build and tag test image "export" " TEST_PROJECT_IMAGE=$${AWS_ACCOUNT_ID}.dkr.ecr.us-west-2.amazonaws.com/test-project:$${BUILDKITE_BUILD_ID}-" - + version, + + version.value, "git config --global --add safe.directory /workdir", "./python_modules/dagster-test/dagster_test/test_project/build.sh " - + version + + version.value + " $${TEST_PROJECT_IMAGE}", # # push the built image @@ -67,7 +67,7 @@ def build_test_project_steps() -> List[GroupStep]: ) .on_python_image( # py version can be bumped when rebuilt - f"buildkite-build-test-project-image:py{AvailablePythonVersion.V3_8}-{BUILDKITE_BUILD_TEST_PROJECT_IMAGE_IMAGE_VERSION}", + f"buildkite-build-test-project-image:py{AvailablePythonVersion.V3_11.value}-{BUILDKITE_BUILD_TEST_PROJECT_IMAGE_IMAGE_VERSION}", [ "AIRFLOW_HOME", "AWS_ACCOUNT_ID", diff --git a/.buildkite/dagster-buildkite/dagster_buildkite/utils.py b/.buildkite/dagster-buildkite/dagster_buildkite/utils.py index 9b1bd174d89b6..d83b5d67a7785 100644 --- a/.buildkite/dagster-buildkite/dagster_buildkite/utils.py +++ b/.buildkite/dagster-buildkite/dagster_buildkite/utils.py @@ -100,7 +100,7 @@ class 
GroupStep(TypedDict): BuildkiteLeafStep = Union[CommandStep, TriggerStep, WaitStep] BuildkiteTopLevelStep = Union[CommandStep, GroupStep] -UV_PIN = "uv==0.4.8" +UV_PIN = "uv==0.4.30" def is_command_step(step: BuildkiteStep) -> TypeGuard[CommandStep]: @@ -307,6 +307,19 @@ def has_dagster_airlift_changes(): return any("dagster-airlift" in str(path) for path in ChangedFiles.all) +@functools.lru_cache(maxsize=None) +def skip_if_not_airlift_or_dlift_commit() -> Optional[str]: + """If no dlift or airlift files are touched, then do NOT run. Even if on master.""" + return ( + None + if ( + any("dagster-dlift" in str(path) for path in ChangedFiles.all) + or any("dagster-airlift" in str(path) for path in ChangedFiles.all) + ) + else "Not an airlift or dlift commit" + ) + + @functools.lru_cache(maxsize=None) def has_storage_test_fixture_changes(): # Attempt to ensure that changes to TestRunStorage and TestEventLogStorage suites trigger integration @@ -316,6 +329,15 @@ def has_storage_test_fixture_changes(): ) +def skip_if_not_dlift_commit() -> Optional[str]: + """If no dlift files are touched, then do NOT run. Even if on master.""" + return ( + None + if any("dagster-dlift" in str(path) for path in ChangedFiles.all) + else "Not a dlift commit" + ) + + def skip_if_no_helm_changes(): if message_contains("NO_SKIP"): return None diff --git a/.buildkite/dagster-buildkite/setup.py b/.buildkite/dagster-buildkite/setup.py index e4cef3a49d6b3..bcbc3c4b08c2e 100644 --- a/.buildkite/dagster-buildkite/setup.py +++ b/.buildkite/dagster-buildkite/setup.py @@ -9,7 +9,6 @@ description="Tools for buildkite automation", url="https://github.com/dagster-io/dagster/tree/master/.buildkite/dagster-buildkite", classifiers=[ - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "License :: OSI Approved :: Apache Software License", diff --git a/CHANGES.md b/CHANGES.md index 5f29c24acda2f..b5622a756c1d9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,182 @@ # Changelog +## 1.9.1 (core) / 0.25.1 (libraries) + +### New + +- `dagster project scaffold` now has an option to create dagster projects from templates with excluded files/filepaths. +- [ui] Filters in the asset catalog now persist when navigating subdirectories. +- [ui] The Run page now displays the partition(s) a run was for. +- [ui] Filtering on owners/groups/tags is now case-insensitive. +- [dagster-tableau] the helper function `parse_tableau_external_and_materializable_asset_specs` is now available to parse a list of Tableau asset specs into a list of external asset specs and materializable asset specs. +- [dagster-looker] Looker assets now by default have owner and URL metadata. +- [dagster-k8s] Added a per_step_k8s_config configuration option to the k8s_job_executor, allowing the k8s configuration of individual steps to be configured at run launch time (thanks @Kuhlwein!) +- [dagster-fivetran] Introduced `DagsterFivetranTranslator` to customize assets loaded from Fivetran. +- [dagster-snowflake] `dagster_snowflake.fetch_last_updated_timestamps` now supports ignoring tables not found in Snowflake instead of raising an error. + +### Bugfixes + +- Fixed issue which would cause a `default_automation_condition_sensor` to be constructed for user code servers running on dagster version < 1.9.0 even if the legacy `auto_materialize: use_sensors` configuration setting was set to `False`. 
+- Fixed an issue where running `dagster instance migrate` on Dagster version 1.9.0 constructed a SQL query that exceeded the maximum allowed depth. +- Fixed an issue where wiping a dynamically partitioned asset causes an error. +- [dagster-polars] `ImportError`s are no longer raised when bigquery libraries are not installed [#25708] + +### Documentation + +- [dagster-dbt] A guide on how to use dbt defer with Dagster branch deployments has been added to the dbt reference. + +# 1.9.0 (core) / 0.25.0 (libraries) + +## Major changes since 1.8.0 (core) / 0.24.0 (libraries) + +### Automation + +- Declarative Automation, the system which enables setting per-asset `AutomationConditions`, is no longer experimental. We now recommend using this system in all cases where asset-centric orchestration is desired. A suite of built-in static constructors has been added for common use cases, such as `AutomationCondition.on_missing()` (which can fill in missing partitions of assets as soon as upstream data is available), and `AutomationCondition.all_deps_blocking_checks_passed()` (which can prevent materialization of assets until all upstream blocking checks have passed). +- You can now assign `AutomationConditions` to asset checks, via the `automation_condition` parameter on `@asset_check` or `AssetCheckSpec`. +- You can now assign `AutomationConditions` to observable source assets, via the `automation_condition` parameter on `@observable_source_asset`. +- [experimental] You can now define custom subclasses of `AutomationCondition` to execute arbitrary Python code in the context of a broader expression. This allows you to compose built-in conditions with custom business logic. +- The `target` arguments on schedules and sensors are now marked stable, allowing a stable way for schedules and sensors to target asset selections without needing to define a job. + +### Integrations + +- Introduced a slate of integrations with business intelligence (BI) tools, enabling dashboards, views, and reports to be represented in the Dagster asset graph. + - [Looker](https://docs.dagster.io/integrations/looker) + - [Power BI](https://docs.dagster.io/integrations/powerbi) + - [Sigma](https://docs.dagster.io/integrations/sigma) + - [Tableau](https://docs.dagster.io/integrations/tableau) +- A rich set of metadata is now automatically collected by our suite of ELT integrations. + - The `dagster/table_name` metadata tag, containing the fully-qualified name of the destination model, has been added for Airbyte, dlt, Fivetran and Sling assets. + - The `dagster/row_count` metadata tag, containing the number of records loaded in the corresponding run, has been added for dlt and Sling assets. + - The `dagster/column_schema` metadata tag, containing column schema information of the destination tables, has been added for Fivetran assets. + - Column lineage information is now collected for Sling assets. +- [dagster-pipes](https://docs.dagster.io/concepts/dagster-pipes) are replacing the now deprecated Step Launchers as the new recommended approach for executing remote Spark jobs. 
Three new [Pipes clients](https://docs.dagster.io/_apidocs/libraries/dagster-aws#clients) for running Spark applications on Amazon Web Services have been added: + - `dagster_aws.pipes.PipesGlueClient` + - `dagster_aws.pipes.PipesEMRServerlessClient` + - `dagster_aws.pipes.PipesEMRClient` + +### UI + +- Several changes have been made to the information architecture to make it easier to find what you’re looking for: + - Backfills have been moved from their own tab underneath the Overview page to entries within the table on the Runs page. This reflects the fact that backfills and runs are similar entities that share most properties. You can continue to use the legacy Runs page with the “Revert to legacy Runs page” user setting. ([GitHub Discussion](https://github.com/dagster-io/dagster/discussions/24898)) + - “Jobs” is now a page reachable from the top-level navigation pane. It replaces the Jobs tab within the Overview page. + - “Automations” is now a page reachable from the top-level navigation pane. It replaces the schedule and sensor tabs within the Overview page. +- `@asset` and `AssetSpec` now have a `kinds` attribute that enables specifying labels that show up on asset nodes in the asset graph in the UI. This supersedes the `compute_kind` attribute. + +## Changes since 1.8.13 (core) / 0.24.13 (libraries) + +### New + +- The `tags` parameter to `@asset` and `AssetSpec` is no longer marked as experimental. +- The `@observable_source_asset` decorator now supports an `automation_condition` argument. +- `AutomationCondition` and associated APIs are no longer marked as experimental. +- Added a new `use_user_code_server` parameter to `AutomationConditionSensorDefinition`. If set, the sensor will be evaluated in the user code server (as traditional sensors are), allowing custom `AutomationCondition` subclasses to be evaluated. +- Added a new column to the BulkActions table, a new column to the Runs table, and a new BackfillTags table to improve the performance of the Runs page. To take advantage of these performance improvements, run `dagster instance migrate`. This migration involves a schema migration to add the new columns and table, and a data migration to populate the new columns for historical backfills and runs. +- Performance improvements when loading definitions with multi-assets with many asset keys. +- [ui] The previously-experimental changes to the top nav are now enabled for all users. +- [ui] Added new code location pages which provide information regarding library versions, metadata, and definitions. +- [ui] The new version of the Runs page is now enabled by default. To use the legacy version of the Runs page, toggle the "Revert to legacy Runs page" user setting. +- [ui] Clicking an asset with failed partitions on the asset health overview now takes you to a list of the failed partitions. +- [ui] The Materialize button runs pre-flight checks more efficiently, resulting in faster run launch times. +- [dagster-pipes] Added support for multi-container log streaming (thanks, [@MattyKuzyk](https://github.com/MattyKuzyk)!) +- [dagster-docker] `container_kwargs.stop_timeout` can now be set when using the `DockerRunLauncher` or `docker_executor` to configure the amount of time that Docker will wait when terminating a run for it to clean up before forcibly stopping it with a SIGKILL signal. +- [dagster-dbt] Performance improvements when loading definitions using `build_dbt_asset_selection`. 
+ +### Bugfixes + +- [ui] Fixed redirect behavior on full pageloads of the legacy auto-materialize overview page. +- [ui] Plots for assets that emit materialization and observation events at different rates no longer display a time period missing the more frequent event type. +- [ui] Fixed issue causing scrolling to misbehave on the concurrency settings page. +- [helm] The blockOpConcurrencyLimitedRuns section of queuedRunCoordinator now correctly templates the appropriate config. +- [dagster-pipes] Fixed issue where k8s ops would fail after 4 hours (thanks, [@MattyKuzyk](https://github.com/MattyKuzyk)!) + +### Documentation + +- [dagster-dbt] Added guide for using dbt defer with Dagster branch deployments. +- [docs] Step Launchers documentation has been removed and replaced with references to Dagster Pipes. +- [docs] Fixed code example in Dagster Essentials (thanks, [@aleexharris](https://github.com/aleexharris)!) + +### Breaking Changes + +- `dagster` no longer supports Python 3.8, which hit EOL on 2024-10-07. +- `dagster` now requires `pydantic>=2`. +- By default, `AutomationConditionSensorDefinitions` will now emit backfills to handle cases where more than one partition of an asset is requested on a given tick. This allows that asset's `BackfillPolicy` to be respected. This feature can be disabled by setting `allow_backfills` to `False`. +- Passing a custom `PartitionsDefinition` subclass into a `Definitions` object now issues an error instead of a deprecation warning. +- `AssetExecutionContext` is no longer a subclass of `OpExecutionContext`. At this release, `AssetExecutionContext` and `OpExecutionContext` implement the same methods, but in the future, the methods implemented by each class may diverge. If you have written helper functions with `OpExecutionContext` type annotations, they may need to be updated to include `AssetExecutionContext` depending on your usage. Explicit calls to `isinstance(context, OpExecutionContext)` will now fail if `context` is an `AssetExecutionContext`. +- The `asset_selection` parameter on `AutomationConditionSensorDefinition` has been renamed to `target`, to align with existing sensor APIs. +- The experimental `freshness_policy_sensor` has been removed, as it relies on the long-deprecated `FreshnessPolicy` API. +- The deprecated `external_assets_from_specs` and `external_asset_from_spec` methods have been removed. Users should use `AssetsDefinition(specs=[...])`, or pass specs directly into the `Definitions` object instead. +- `AssetKey` objects can no longer be iterated over or indexed in to. This behavior was never an intended access pattern and in all observed cases was a mistake. +- The `dagster/relation_identifier` metadata key has been renamed to `dagster/table_name`. +- [dagster-ge] `dagster-ge` now only supports `great_expectations>=0.17.15`. The `ge_validation_op_factory` API has been replaced with the API previously called `ge_validation_op_factory_v3`. +- [dagster-aws] Removed deprecated parameters from `dagster_aws.pipes.PipesGlueClient.run`. +- [dagster-embedded-elt] Removed deprecated parameter `dlt_dagster_translator` from `@dlt_assets`. The `dagster_dlt_translator` parameter should be used instead. +- [dagster-polars] Dropped support for saving storage-level arbitrary metadata via IOManagers. 
+ +### Deprecations + +- The `DataBricksPysparkStepLauncher`, `EmrPySparkStepLauncher`, and any custom subclass of `StepLauncher` have been marked as deprecated, but will not be removed from the codebase until Dagster 2.0 is released, meaning they will continue to function as they currently do for the foreseeable future. Their functionality has been superseded by the interfaces provided by `dagster-pipes`, and so future development work will be focused there. +- The experimental `multi_asset_sensor` has been marked as deprecated, as its main use cases have been superseded by the `AutomationCondition` APIs. However, it will not be removed until version 2.0.0. + +## 1.8.13 (core) / 0.24.13 (libraries) + +### New + +- Performance improvements when loading code locations using multi-assets with many asset keys. +- `AutomationCondition.in_progress()` now will be true if an asset partition is part of an in-progress backfill that has not yet executed it. The prior behavior, which only considered runs, is encapsulated in `AutomationCondition.execution_in_progress()`. +- [ui] Added tag filter to the jobs page. +- [ui] Preserve user login state for a longer period of time. +- [dagster-dbt] Performance improvements when loading definitions using `build_dbt_asset_selection`. +- [dagster-docker] `container_kwargs.stop_timeout` can now be set when using the `DockerRunLauncher` or `docker_executor` to configure the amount of time that Docker will wait when terminating a run for it to clean up before forcibly stopping it with a SIGKILL signal. +- [dagster-sigma] The Sigma integration now fetches initial API responses in parallel, speeding up initial load. +- [dagster-looker] Attempt to naively render liquid templates for derived table sql. +- [dagster-looker] Added support for views and explores that rely on refinements or extends. +- [dagster-looker] When fetching explores and dashboards from the Looker API, retrieve in parallel. + +### Bugfixes + +- Fixed an issue with `AutomationCondition.eager()` that could cause it to attempt to launch a second attempt of an asset in cases where it was skipped or failed during a run where one of its parents successfully materialized. +- Fixed an issue which would cause `AutomationConditionSensorDefinitions` to not be evaluated if the `use_user_code_server` value was toggled after the initial evaluation. +- Fixed an issue where configuration values for aliased pydantic fields would be dropped. +- [ui] Fix an issue in the code locations page where invalid query parameters could crash the page. +- [ui] Fix navigation between deployments when query parameters are present in the URL. +- [helm] the blockOpConcurrencyLimitedRuns section of queuedRunCoordinator now correctly templates the appropriate config. +- [dagster-sigma] Fixed pulling incomplete data for very large workspaces. + +## 1.8.12 (core) / 0.24.12 (libraries) + +### New + +- The `AutomationCondition.eager()`, `AutomationCondition.missing()`, and `AutomationCondition.on_cron` conditions are now compatible with asset checks. +- Added `AssetSelection.materializable()`, which returns only assets that are materializable in an existing selection. +- Added a new `AutomationCondition.all_deps_blocking_checks_passed` condition, which can be used to prevent materialization when any upstream blocking checks have failed. +- Added a `code_version` parameter to the `@graph_asset` decorator. 
+- If a `LaunchPartitionBackfill` mutation is submitted to GQL with invalid partition keys, it will now return an early `PartitionKeysNotFoundError`. +- `AssetSelection.checks_for_assets` now accepts `AssetKey`s and string asset keys, in addition to `AssetsDefinition`s. +- [ui] Added a search bar to partitions tab on the asset details page. +- [ui] Restored docked left nav behavior for wide viewports. +- [dagster-aws] `get_objects` now has a `since_last_modified` parameter that enables only fetching objects modified after a given timestamp. +- [dagster-aws] New AWS EMR Dagster Pipes client (`dagster_aws.pipes.PipesEMRClient`) for running and monitoring AWS EMR jobs from Dagster. +- [dagster-looker] Pinned the looker-sdk dependency below 24.18.0 to avoid this issue: https://github.com/looker-open-source/sdk-codegen/issues/1518. + +### Bugfixes + +- Fixed an issue which could cause incorrect evaluation results when using self-dependent partition mappings with `AutomationConditions` that operate over dependencies. +- [ui] Fixed an issue where the breadcrumb on asset pages would flicker nonstop. +- [dagster-embedded-elt] Fixed extraction of metadata for dlt assets whose source and destination identifiers differ. +- [dagster-databricks] Fixed a permissioning gap that existed with the `DatabricksPySparkStepLauncher`, so that permissions are now set correctly for non-admin users. +- [dagster-dbt] Fixed an issue where column metadata generated with `fetch_column_metadata` did not work properly for models imported through dbt dependencies. + +### Documentation + +- [dagster-k8s] `DagsterK8sPipesClient.run` now shows up in API docs. + +### Dagster Plus + +- [ui] Fixed a bug in the catalog UI where owners filters were not applied correctly. +- [ui] Fixed width of the column lineage dropdown selector on the asset page. +- [ui] Column lineage now correctly renders when set on asset definition metadata. +- [ui] Fixed Settings link on the list of deployments, for users in the legacy navigation flag. + ## 1.8.11 (core) / 0.24.11 (libraries) ### New @@ -5770,7 +5947,7 @@ runLauncher: - [dagit] A “Copy config” button has been added to the run configuration dialog on Run pages. - [dagit] An “Open in Launchpad” button has been added to the run details page. - [dagit] The Run page now surfaces more information about start time and elapsed time in the header. -- [dagster-dbt] The dbt_cloud_resource has a new `get_runs()` function to get a list of runs matching certain paramters from the dbt Cloud API (thanks @[kstennettlull](https://github.com/kstennettlull)!) +- [dagster-dbt] The dbt_cloud_resource has a new `get_runs()` function to get a list of runs matching certain parameters from the dbt Cloud API (thanks @[kstennettlull](https://github.com/kstennettlull)!) - [dagster-snowflake] Added an `authenticator` field to the connection arguments for the `snowflake_resource` (thanks @swotai!). - [celery-docker] The celery docker executor has a new configuration entry `container_kwargs` that allows you to specify additional arguments to pass to your docker containers when they are run. diff --git a/MIGRATION.md b/MIGRATION.md index 519920c737cdd..080c365363019 100644 --- a/MIGRATION.md +++ b/MIGRATION.md @@ -2,6 +2,37 @@ When new releases include breaking changes or deprecations, this document describes how to migrate. +## Migrating to 1.9.0 + +## Database migration + +- This release includes database schema and data migrations to improve the performance of the Runs page. 
We highly recommend running these migrations to avoid slow page loads of the new Runs page. The migration will add a new column to the `runs` table, a new column to the `bulk_actions` table and a new `backfill_tags` table. A data migration will populate the new columns and table. Run `dagster instance migrate` to run the schema and data migration. + +## Notable behavior changes + +- Backfills have been moved from their own tab underneath the Overview page to entries within the table on the Runs page. This reflects the fact that backfills and runs are similar entities that share most properties. You can continue to use the legacy Runs page with the “Revert to legacy Runs page” user setting. ([GitHub Discussion](https://github.com/dagster-io/dagster/discussions/24898)) +- By default, `AutomationConditionSensorDefinitions` will now emit backfills to handle cases where more than one partition of an asset is requested on a given tick. This allows that asset's `BackfillPolicy` to be respected. This feature can be disabled by setting `allow_backfills` to `False` on the sensor definition. + +## Deprecations + +- The `DataBricksPysparkStepLauncher`, `EmrPySparkStepLauncher`, and any custom subclass of `StepLauncher` have been marked as deprecated, but will not be removed from the codebase until Dagster 2.0 is released, meaning they will continue to function as they currently do for the foreseeable future. Their functionality has been superseded by the interfaces provided by `dagster-pipes`, and so future development work will be focused there. +- The experimental `@multi_asset_sensor` has been marked as deprecated, but will not be removed from the codebase until Dagster 2.0 is released, meaning it will continue to function as it currently does for the foreseeable future. Its functionality has been largely superseded by the `AutomationCondition` system. + +## Breaking changes + +- `dagster` no longer supports Python 3.8, which hit EOL on 2024-10-07. +- `dagster` now requires `pydantic>=2` . +- Passing a custom `PartitionsDefinition` subclass into a `Definitions` object now issues an error instead of a deprecation warning. +- `AssetExecutionContext` is no longer a subclass of `OpExecutionContext`. At this release, `AssetExecutionContext` and `OpExecutionContext` implement the same methods, but in the future, the methods implemented by each class may diverge. If you have written helper functions with `OpExecutionContext` type annotations, they may need to be updated to include `AssetExecutionContext` depending on your usage. Explicit calls to `isinstance(context, OpExecutionContext)` will now fail if `context` is an `AssetExecutionContext`. +- The `dagster/relation_identifier` metadata key has been renamed to `dagster/table_name`. +- The `asset_selection` parameter on `AutomationConditionSensorDefinition` has been renamed to `target`, to align with existing sensor APIs. +- The experimental `freshness_policy_sensor` has been removed, as it relies on the long-deprecated `FreshnessPolicy` API. +- The deprecated `external_assets_from_specs` and `external_asset_from_spec` methods have been removed. Users should use `AssetsDefinition(specs=[...])`, or pass specs directly into the `Definitions` object instead. +- `AssetKey` objects can no longer be iterated over or indexed in to. This behavior was never an intended access pattern and in all observed cases was a mistake. +- [dagster-ge] `dagster-ge` now only supports `great_expectations>=0.17.15`. 
The `ge_validation_op_factory` API has been replaced with the API previously called `ge_validation_op_factory_v3`. +- [dagster-aws] Removed deprecated parameters from `dagster_aws.pipes.PipesGlueClient.run`. +- [dagster-embedded-elt] Removed deprecated parameter `dlt_dagster_translator` from `@dlt_assets`. The `dagster_dlt_translator` parameter should be used instead. + ## Migrating to 1.8.0 ### Notable behavior changes diff --git a/Makefile b/Makefile index 31adeb0c25445..1be48b8703620 100644 --- a/Makefile +++ b/Makefile @@ -51,13 +51,13 @@ prettier: ':!:README.md'` --write install_dev_python_modules: - python scripts/install_dev_python_modules.py -qqq + python scripts/install_dev_python_modules.py -q install_dev_python_modules_verbose: python scripts/install_dev_python_modules.py install_dev_python_modules_verbose_m1: - python scripts/install_dev_python_modules.py -qqq --include-prebuilt-grpcio-wheel + python scripts/install_dev_python_modules.py -q --include-prebuilt-grpcio-wheel graphql: cd js_modules/dagster-ui/; make generate-graphql; make generate-perms diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e8acffd0f9c5c..25a540a540bf1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -14,7 +14,6 @@ parameters: default: - api_tests - cli_tests - - core_tests_pydantic1 - general_tests - launcher_tests - daemon_tests diff --git a/docs/content/_apidocs.mdx b/docs/content/_apidocs.mdx index b89795bba5629..1436b4398b6e5 100644 --- a/docs/content/_apidocs.mdx +++ b/docs/content/_apidocs.mdx @@ -610,6 +610,16 @@ Dagster also provides a growing set of optional add-on libraries to integrate wi Includes implementations of run and event log storage built on Postgres. + + + PowerBI ( + dagster-powerbi) + + + Provides an integration to represent a PowerBI Workspace as a graph of + assets. + + Prometheus ( @@ -631,6 +641,16 @@ Dagster also provides a growing set of optional add-on libraries to integrate wi Provides utilities for issuing shell commands from Dagster jobs. + + + Sigma ( + dagster-sigma) + + + Provides an integration to represent a Sigma project as a graph of + assets. + + Slack ( @@ -680,6 +700,13 @@ Dagster also provides a growing set of optional add-on libraries to integrate wi posting files via SFTP. 
+ + + Tableau ( + dagster-tableau) + + Provides a resource for integrating Tableau Workspaces + Twilio ( diff --git a/docs/content/_navigation.json b/docs/content/_navigation.json index 2e2e0cddfc1f2..e6b5afce9452a 100644 --- a/docs/content/_navigation.json +++ b/docs/content/_navigation.json @@ -170,7 +170,7 @@ "path": "/concepts/partitions-schedules-sensors/sensors" }, { - "title": "Declarative Automation (Experimental)", + "title": "Declarative Automation", "children": [ { "title": "Overview", @@ -386,6 +386,14 @@ "title": "Dagster Pipes + AWS Glue", "path": "/concepts/dagster-pipes/aws-glue" }, + { + "title": "Dagster Pipes + AWS EMR", + "path": "/concepts/dagster-pipes/aws-emr" + }, + { + "title": "Dagster Pipes + AWS EMR Serverless", + "path": "/concepts/dagster-pipes/aws-emr-serverless" + }, { "title": "Dagster Pipes + AWS Lambda", "path": "/concepts/dagster-pipes/aws-lambda" @@ -401,6 +409,10 @@ { "title": "Details and customization", "path": "/concepts/dagster-pipes/dagster-pipes-details-and-customization" + }, + { + "title": "Migrating from Step Launchers to Dagster Pipes", + "path": "/guides/migrations/from-step-launchers-to-pipes" } ] }, @@ -900,6 +912,46 @@ } ] }, + { + "title": "Airlift", + "path": "/integrations/airlift", + "children": [ + { + "title": "Airflow Migration Tutorial", + "path": "/integrations/airlift/tutorial/overview", + "children": [ + { + "title": "Part 1: Setup local Airflow", + "path": "/integrations/airlift/tutorial/setup" + }, + { + "title": "Part 2: Peering to the Airflow Instance", + "path": "/integrations/airlift/tutorial/peer" + }, + { + "title": "Part 3: Observing assets", + "path": "/integrations/airlift/tutorial/observe" + }, + { + "title": "Part 4: Migrating assets", + "path": "/integrations/airlift/tutorial/migrate" + }, + { + "title": "Part 5: Decomissioning the Airflow DAG", + "path": "/integrations/airlift/tutorial/decomission" + } + ] + }, + { + "title": "Reference", + "path": "/integrations/airlift/reference" + }, + { + "title": "DAG-level migration", + "path": "/integrations/airlift/full_dag" + } + ] + }, { "title": "Airflow", "path": "/integrations/airflow" @@ -1036,6 +1088,10 @@ } ] }, + { + "title": "Looker", + "path": "/integrations/looker" + }, { "title": "OpenAI", "path": "/integrations/openai" @@ -1049,7 +1105,15 @@ "path": "/integrations/pandera" }, { - "title": "PySpark", + "title": "Power BI", + "path": "/integrations/powerbi" + }, + { + "title": "Sigma", + "path": "/integrations/sigma" + }, + { + "title": "Spark", "path": "/integrations/spark" }, { @@ -1075,6 +1139,10 @@ } ] }, + { + "title": "Tableau", + "path": "/integrations/tableau" + }, { "title": "All integrations", "path": "/integrations" @@ -1242,6 +1310,10 @@ { "title": "Validating data with Dagster Type factories", "path": "/guides/dagster/dagster_type_factories" + }, + { + "title": "Migrating from Spark Step Launchers to Dagster Pipes", + "path": "/guides/migrations/from-step-launchers-to-pipes" } ] }, @@ -1384,6 +1456,10 @@ "title": "Dagster Pipes (dagster-pipes)", "path": "/_apidocs/libraries/dagster-pipes" }, + { + "title": "Airlift (dagster-airlift)", + "path": "/_apidocs/libraries/dagster-airlift" + }, { "title": "Airbyte (dagster-airbyte)", "path": "/_apidocs/libraries/dagster-airbyte" @@ -1532,6 +1608,10 @@ "title": "PostgreSQL (dagster-postgres)", "path": "/_apidocs/libraries/dagster-postgres" }, + { + "title": "PowerBI (dagster-powerbi)", + "path": "/_apidocs/libraries/dagster-powerbi" + }, { "title": "Prometheus (dagster-prometheus)", "path": 
"/_apidocs/libraries/dagster-prometheus" @@ -1544,6 +1624,10 @@ "title": "Shell (dagster-shell)", "path": "/_apidocs/libraries/dagster-shell" }, + { + "title": "Sigma (dagster-sigma)", + "path": "/_apidocs/libraries/dagster-sigma" + }, { "title": "Slack (dagster-slack)", "path": "/_apidocs/libraries/dagster-slack" @@ -1568,6 +1652,10 @@ "title": "SSH / SFTP (dagster-ssh)", "path": "/_apidocs/libraries/dagster-ssh" }, + { + "title": "Tableau (dagster-tableau)", + "path": "/_apidocs/libraries/dagster-tableau" + }, { "title": "Twilio (dagster-twilio)", "path": "/_apidocs/libraries/dagster-twilio" diff --git a/docs/content/about/releases.mdx b/docs/content/about/releases.mdx index ed11d3856432a..22957f0d6f71e 100644 --- a/docs/content/about/releases.mdx +++ b/docs/content/about/releases.mdx @@ -23,12 +23,22 @@ The "experimental" marker allows us to offer new APIs to users and rapidly itera Experimental APIs may change or disappear within any release, but we try to avoid breaking them within minor releases if they have been around for a long time. +### Superseded APIs + +The "superseded" marker indicates that we recommend avoiding an API, usually because there's a preferred option that should be used instead. + +Like non-deprecated public stable APIs, deprecated public stable APIs will not break within any major release after 1.0. + +Superseded APIs do not have a known scheduled removal point, but we recommend avoiding them in new code. + ### Deprecated APIs The "deprecated" marker indicates that we recommend avoiding an API, usually because there's a preferred option that should be used instead. Like non-deprecated public stable APIs, deprecated public stable APIs will not break within any major release after 1.0. +Unlike superseded APIs, deprecated APIs have a known scheduled removal point. We will provide guidance on when to expect the removal. + --- ## Dagster integration libraries diff --git a/docs/content/api/modules.json.gz b/docs/content/api/modules.json.gz index e6ded41ba7968..d5ddeaf4808ce 100644 Binary files a/docs/content/api/modules.json.gz and b/docs/content/api/modules.json.gz differ diff --git a/docs/content/api/searchindex.json.gz b/docs/content/api/searchindex.json.gz index 04ac95d2d0152..47ef82c02e5f2 100644 Binary files a/docs/content/api/searchindex.json.gz and b/docs/content/api/searchindex.json.gz differ diff --git a/docs/content/api/sections.json.gz b/docs/content/api/sections.json.gz index e3dd9e9dae1bf..67c672f040ab3 100644 Binary files a/docs/content/api/sections.json.gz and b/docs/content/api/sections.json.gz differ diff --git a/docs/content/concepts.mdx b/docs/content/concepts.mdx index ec9302948e48a..56d7260d5bec9 100644 --- a/docs/content/concepts.mdx +++ b/docs/content/concepts.mdx @@ -232,6 +232,14 @@ Dagster Pipes is a toolkit for building integrations between Dagster and externa title="Dagster Pipes + AWS Glue" href="/concepts/dagster-pipes/aws-glue" > + + .** [For checks defined in the same operation as assets](#defining-checks-and-assets-together), you can explicitly raise an exception to block downstream execution. -- **Assets with an currently do not respect blocking asset checks** and will execute even if a blocking check on an upstream asset failed. 
--- diff --git a/docs/content/concepts/assets/asset-observations.mdx b/docs/content/concepts/assets/asset-observations.mdx index 37f8fda98565e..4cebdae12fe5e 100644 --- a/docs/content/concepts/assets/asset-observations.mdx +++ b/docs/content/concepts/assets/asset-observations.mdx @@ -119,7 +119,7 @@ observed to have a newer data version than the data version it had when a downstream asset was materialized, then the downstream asset will be given a label in the Dagster UI that indicates that upstream data has changed. - can be used to automatically + can be used to automatically materialize downstream assets when this occurs. The decorator provides a convenient way to define source assets with observation functions. The below observable source asset takes a file hash and returns it as the data version. Every time you run the observation function, a new observation will be generated with this hash set as its data version. diff --git a/docs/content/concepts/assets/external-assets.mdx b/docs/content/concepts/assets/external-assets.mdx index 28d052a7ed371..8e271eeb27a70 100644 --- a/docs/content/concepts/assets/external-assets.mdx +++ b/docs/content/concepts/assets/external-assets.mdx @@ -75,8 +75,6 @@ height={1654} ### External assets with dependencies -External assets can depend only on other external assets. - Dependencies are defined by using the `deps` argument of . This enables Dagster to model entire graphs of assets scheduled and orchestrated by other systems. In the following example, we have two assets: `raw_logs` and `processed_logs`. The `processed_logs` asset is produced by a scheduled computation in another orchestration system. Using external assets allows you to model both assets in Dagster. diff --git a/docs/content/concepts/automation.mdx b/docs/content/concepts/automation.mdx index 09c0d5813a936..2ca86f2ff48fa 100644 --- a/docs/content/concepts/automation.mdx +++ b/docs/content/concepts/automation.mdx @@ -7,7 +7,7 @@ description: "Learn to automatically run your Dagster pipelines." Dagster offers several ways to run data pipelines without manual intervention, including traditional scheduling and event-based triggers. Automating your Dagster pipelines can boost efficiency and ensure that data is produced consistently and reliably. -When one of Dagster's automation methods is triggered, a **tick** is created, which indicates that a **run** should occur. The tick will kick off a run, which is a single instance of a pipeline being executed. +When one of Dagster's automation methods is triggered, a **tick** is created. A tick is an opportunity for one or more **runs** to be launched. A run will either materialize a selection of assets or execute a job. Some schedules and sensors will launch runs on every tick. Others have associated logic that is executed on each tick which determines the runs to be launched. In this guide, we'll cover the available automation methods Dagster provides and when to use each one. @@ -29,11 +29,11 @@ In this section, we'll touch on each of the automation methods currently support ### Schedules -Schedules are Dagster's imperative approach, which allow you to specify when a job should run, such as Mondays at 9:00 AM. Jobs triggered by schedules can contain a subset of [assets][assets] or [ops][ops]. Refer to the [Schedules documentation][schedules] to learn more. +Schedules are Dagster's imperative option for automation. They allow you to specify exactly when a run should be launched, such as Mondays at 9:00 AM. 
Schedules can target a selection of [assets][assets] or a [job][jobs]. Refer to the [Schedules documentation][schedules] to learn more. ### Sensors -You can use sensors to run a job or materialize an asset in response to specific events. Sensors periodically check and execute logic to know whether to kick off a run. They are commonly used for situations where you want to materialize an asset after some externally observable event happens, such as: +Sensors launch runs in response to a detected event. They periodically check and execute logic to detect an event and conditionally launch runs. They are commonly used for situations where you want to materialize an asset on some externally observable trigger, such as: - A new file arrives in a specific location, such as Amazon S3 - A webhook notification is received @@ -54,7 +54,7 @@ Materialization conditions are declared on an asset-by-asset basis. Refer to the ### Asset Sensors -Asset sensors trigger jobs when a specified asset is materialized. Using asset sensors, you can instigate runs across jobs and code locations and keep downstream assets up-to-date with ease. +Asset sensors launch runs when a specified asset is materialized. Using asset sensors, you can instigate runs across jobs and code locations and keep downstream assets up-to-date with ease. Refer to the [Asset Sensor documentation][asset-sensors] to learn more. diff --git a/docs/content/concepts/automation/declarative-automation.mdx b/docs/content/concepts/automation/declarative-automation.mdx index ea8b8a9257709..ae7400358cd51 100644 --- a/docs/content/concepts/automation/declarative-automation.mdx +++ b/docs/content/concepts/automation/declarative-automation.mdx @@ -5,13 +5,9 @@ description: "Dagster can automatically execute assets or checks when criteria a # Declarative Automation - - This feature is currently experimental. - +Dagster automatically tracks a large amount of information about events that impact the status of your assets, and the dependencies between them. Declarative Automation is a framework that lets you access this information to make intelligent decisions. -Dagster can automatically execute assets or checks when criteria are met, enabling a declarative approach to automation. Instead of defining explicit workflows and schedules, you describe the conditions under which they should be executed, and the system executes runs in response. - -Declarative Automation includes pre-built conditions to handle common use cases, such as executing on a periodic schedule or whenever an upstream dependency updates, but conditions can be customized in a fine-grained manner, allowing precise control over when work gets executed. +Pre-built conditions are provided to handle common use cases, such as executing on a periodic schedule or whenever an upstream dependency updates, but conditions can be customized in a fine-grained manner, allowing precise control over when work gets executed. 
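+For example, a brief sketch (the asset names here are hypothetical) of attaching two pre-built conditions directly to asset definitions:
+
+```python
+import dagster as dg
+
+
+# materialize once per day, when a tick of the cron schedule passes
+@dg.asset(automation_condition=dg.AutomationCondition.on_cron("@daily"))
+def upstream_asset(): ...
+
+
+# materialize whenever upstream_asset is updated
+@dg.asset(deps=[upstream_asset], automation_condition=dg.AutomationCondition.eager())
+def downstream_asset(): ...
+```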
---
@@ -21,8 +17,7 @@ Using Declarative Automation helps you:
 - Ensure you're working with the most up-to-date data
 - Optimize resource usage by only materializing assets or executing checks when needed
-- Simplify how your team understands their assets by consolidating all asset logic to a single location
-- Avoid thinking about specific workflow boundaries, such as a [schedule accounting for timezones or Daylight Savings Time](/concepts/automation/schedules/customizing-executing-timezones)
+- Precisely define when specific assets should be updated based on the state of other assets
 ---
@@ -38,7 +33,7 @@ Before continuing, you should be familiar with:
 ## How it works
-Declarative Automation is an automation method that executes runs when conditions are met. This method contains two main components:
+Declarative Automation has two main components:
 - **An automation condition (**), which represents when an individual asset or check should be executed.
 - **A sensor (**), which evaluates each and launches runs in response to their status.
@@ -135,9 +130,9 @@ Automation conditions describe the conditions under which work should be execute
 With assets, automation conditions can be set on the decorator or on an :
 ```python
-from dagster import AssetSpec, AutomationCondition, asset
+import dagster as dg
-@asset(automation_condition=AutomationCondition.eager())
+@dg.asset(automation_condition=dg.AutomationCondition.eager())
 def my_eager_asset(): ...
 AssetSpec("my_cron_asset", automation_condition=AutomationCondition.on_cron("@daily"))
@@ -146,26 +141,25 @@ AssetSpec("my_cron_asset", automation_condition=AutomationCondition.on_cron("@da
 The same is true for asset checks:
 ```python
-from dagster import AssetCheckResult, AssetCheckSpec, AutomationCondition, asset_check
-
+import dagster as dg
-@asset_check(asset=..., automation_condition=AutomationCondition.cron_tick_passed("@daily"))
-def expensive_check() -> AssetCheckResult:
-    return AssetCheckResult(passed=True)
+@dg.asset_check(asset=dg.AssetKey("orders"), automation_condition=dg.AutomationCondition.on_cron("@daily"))
+def my_eager_check() -> dg.AssetCheckResult:
+    return dg.AssetCheckResult(passed=True)
 AssetCheckSpec(
-    "expensive_check",
-    asset=...,
-    automation_condition=AutomationCondition.cron_tick_passed("@daily"),
+    "my_cron_check",
+    asset=dg.AssetKey("orders"),
+    automation_condition=dg.AutomationCondition.on_cron("@daily"),
 )
 ```
-The core framework is extremely flexible, allowing you to build custom conditions from the ground up. Refer to the [Customizing automation conditions guide](/concepts/automation/declarative-automation/customizing-automation-conditions) for more information.
+The core framework is extensible, allowing you to build conditions that fit specific needs. Refer to the [Customizing automation conditions guide](/concepts/automation/declarative-automation/customizing-automation-conditions) for more information.
 ### Sensors
-When automation conditions for an asset are met, a sensor will execute a run to materialize the asset. This sensor, named `default_automation_condition_sensor`, will be available for each code location and monitor all assets within that location. To use multiple sensors or change the properties of the default sensor, refer to the documentation.
+When automation conditions for an asset or check are met, a sensor will execute a run in response. This sensor, named `default_automation_condition_sensor`, will be available for each code location and monitor all assets within that location.
To use multiple sensors or change the properties of the default sensor, refer to the documentation. For an automation condition sensor to run, it must be turned on and an active [`dagster-daemon` process](/deployment/dagster-daemon) must be running. If you used [`dagster dev` to start the Dagster UI/webserver](/guides/running-dagster-locally), the daemon process will be automatically launched alongside the webserver. diff --git a/docs/content/concepts/automation/declarative-automation/customizing-automation-conditions.mdx b/docs/content/concepts/automation/declarative-automation/customizing-automation-conditions.mdx index c8c71de058f79..416a038f95996 100644 --- a/docs/content/concepts/automation/declarative-automation/customizing-automation-conditions.mdx +++ b/docs/content/concepts/automation/declarative-automation/customizing-automation-conditions.mdx @@ -5,11 +5,7 @@ description: "Learn to create your own custom Declarative Automation conditions. # Creating custom Declarative Automation conditions - - Declarative Automation is currently experimental. - - -[Declarative Automation](/concepts/automation/declarative-automation) includes pre-built conditions to handle common use cases, such as executing on a periodic schedule or whenever an upstream dependency updates, but you can also customize conditions. +[Declarative Automation](/concepts/automation/declarative-automation) includes pre-built conditions to handle common use cases, such as executing on a periodic schedule or whenever an upstream dependency updates, but the core system is extremely flexible and can be tailored to your specific needs. By the end of this guide, you'll understand how work and how to create your own custom conditions. @@ -26,7 +22,7 @@ Before continuing, you should be familiar with: ## How it works -Each consists of a set of **operands** and various **operators**. To create conditions that suit your specific needs, you can combine the operators and operands listed below. For example: +Each consists of a set of **operands** and various **operators**. To create conditions that suit your needs, you can combine the operators and operands listed below. For example: ```python from dagster import AutomationCondition @@ -36,23 +32,24 @@ in_progress_or_failed_parents = AutomationCondition.any_deps_match( ) ``` -This condition translates to **Any upstream dependencies (parents) part of an in-progress run or failed during the latest run**. +This condition translates to **Any upstream dependencies (parents) are part of an in-progress run or failed during the latest run**. ### Operands -Operands are base conditions which can be true or false about a given asset partition. 
- -| Operand | Description | -| ------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------- | -| `AutomationCondition.missing` | The asset partition has never been materialized or observed | -| `AutomationCondition.in_progress` | The asset partition is part of an in-progress run | -| `AutomationCondition.failed` | The asset partition failed to be materialized in its latest run | -| `AutomationCondition.newly_updated` | The asset partition was materialized since the previous evaluation | -| `AutomationCondition.newly_requested` | The asset partition was requested on the previous evaluation | -| `AutomationCondition.code_version_changed` | The asset has a new code version since the previous evaluation | -| `AutomationCondition.cron_tick_passed` | A new tick of the provided cron schedule occurred since the previous evaluation | -| `AutomationCondition.in_latest_time_window` | The asset partition falls within the latest time window of the asset’s , if applicable. | -| `AutomationCondition.will_be_requested` | The asset partition will be requested in this tick | +Operands are base conditions which can be true or false about a given target. For partitioned assets, the target will be a given partition of the asset. + +| Operand | Description | +| ------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| `AutomationCondition.missing` | Target has not been executed | +| `AutomationCondition.in_progress` | Target is part of an in-progress run | +| `AutomationCondition.execution_failed` | Target failed to be executed in its latest run | +| `AutomationCondition.newly_updated` | Target was updated since the previous evaluation | +| `AutomationCondition.newly_requested` | Target was requested on the previous evaluation | +| `AutomationCondition.code_version_changed` | Target has a new code version since the previous evaluation | +| `AutomationCondition.cron_tick_passed` | A new tick of the provided cron schedule occurred since the previous evaluation | +| `AutomationCondition.in_latest_time_window` | Target falls within the latest time window of the asset’s , if applicable. | +| `AutomationCondition.will_be_requested` | Target will be requested in this tick | +| `AutomationCondition.initial_evaluation` | This is the first evaluation of this condition | ### Operators @@ -90,7 +87,7 @@ The above conditions can be built into more complex expressions using the follow | (pipe) - OR; either condition must be true; ex: A | B + OR; either condition is true; ex: A | B @@ -98,7 +95,7 @@ The above conditions can be built into more complex expressions using the follow & (ampersand) - AND; both conditions must be true; ex: A & B + AND; both conditions are true; ex: A & B @@ -166,9 +163,9 @@ Finally, there are a set of pre-built conditions which make it easier to constru It's common to have use cases similar to pre-built policies but with minor differences. While it is always possible to copy the base implementation and modify it as needed, it can often be simpler to use the `.without()` method to remove the unwanted sub-conditions or add additional conditions with the `&` operator. 
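+For example, a short sketch (mirroring the eager-condition customization discussed below) that drops one sub-condition from a pre-built policy and adds another with `&`:
+
+```python
+from dagster import AutomationCondition
+
+# Start from the pre-built eager condition, remove the sub-condition that
+# blocks execution when upstream data is missing, and additionally require
+# that a tick of a daily cron schedule has passed.
+condition = AutomationCondition.eager().without(
+    ~AutomationCondition.any_deps_missing()
+) & AutomationCondition.cron_tick_passed("@daily")
+```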
-### `AutomationCondition.eager()`: Ignoring missing dependencies
+### `AutomationCondition.eager()`: Ignoring missing upstream data
-By default, `AutomationCondition.eager()` will not materialize an asset partition if it has any missing dependencies. If it is expected to have missing upstream data, remove `~AutomationCondition.any_deps_missing()` from the eager policy to allow execution:
+By default, `AutomationCondition.eager()` will not materialize a target if it has any missing upstream data. If it is expected to have missing upstream data, remove `~AutomationCondition.any_deps_missing()` from the eager policy to allow execution:
 ```python
 from dagster import AutomationCondition
@@ -220,7 +217,7 @@ Note that these `ignore()` and `allow()` methods also work for composite conditi
 ## Describing conditions with labels
-When there are a large number of sub-conditions that make up an , it can be difficult to understand and troubleshoot the condition. To make conditions easier to understand, you can attach labels to sub-conditions, which will then display in the Dagster UI.
+When there are a large number of sub-conditions that make up an , it can be difficult to understand and troubleshoot the condition. To make conditions easier to understand, you can attach labels to sub-conditions, which will then be displayed in the Dagster UI.
 Arbitrary string labels can be attached to any node in the tree by using the `with_label()` method, allowing you to describe the purpose of a specific sub-condition. For example:
@@ -256,6 +253,62 @@ height={593}
 ---
+## Arbitrary Python AutomationConditions
+
+Some automation use cases require custom business logic that cannot be expressed with off-the-shelf components. In these cases, you can define AutomationConditions which execute arbitrary Python code, and compose them with the built-in conditions.
+
+### Setup
+
+By default, Dagster executes `AutomationConditionSensorDefinitions` in a daemon process that does not have access to your user code. In order to execute arbitrary Python code, you'll need to update this to execute on your user code server. This is the same place that your `@sensor` methods are evaluated.
+
+
+  Automation condition evaluation can be more resource-intensive than a typical
+  sensor. A limit of 500 assets or checks per sensor is enforced.
+
+
+To do this, add an automation condition sensor to your definitions with the `use_user_code_server` flag set to `True`:
+
+```python
+import dagster as dg
+
+defs = dg.Definitions(
+    sensors=[dg.AutomationConditionSensorDefinition("automation_condition_sensor", target=dg.AssetSelection.all(), use_user_code_server=True)]
+)
+```
+
+This will allow your sensor to target automation conditions containing custom Python code.
+
+### Defining a custom condition
+
+You can create your own subclass of `AutomationCondition`, defining the `evaluate()` method. For example, imagine you want to avoid executing anything on a company holiday. To do this, you can first define a condition which detects if it's currently a company holiday:
+
+```python
+import dagster as dg
+
+class IsCompanyHoliday(dg.AutomationCondition):
+    def evaluate(self, context: dg.AutomationContext) -> dg.AutomationResult:
+        if is_company_holiday(context.evaluation_time):
+            true_subset = context.candidate_subset
+        else:
+            true_subset = context.get_empty_subset()
+        return dg.AutomationResult(true_subset, context=context)
+
+```
+
+In this example, we build up a subset of the evaluated asset for which this condition is True.
We use `EntitySubsets`, rather than a pure `True` / `False`, to account for partitioned assets, for which individual partitions may have different results.
+
+In our case, the condition will be applied the same regardless of whether it's partitioned or not, so we don't need to have any special logic to differentiate between these cases. If it's not a company holiday, we can return an empty subset (meaning that this condition is not true for any subset of the asset), and if it is a company holiday, we return the `candidate_subset`, which is the subset of the asset that we need to evaluate. This subset shrinks as we filter partitions out using the `&` condition, so if you have an expression `A & B`, and `A` returns the empty subset, then the candidate subset for `B` will be empty as well. This helps avoid expensive computation in cases where we know it won't impact the final output.
+
+Once this condition is defined, you can use it as part of a broader expression, for example:
+
+```python
+import dagster as dg
+
+condition = dg.AutomationCondition.eager() & ~IsCompanyHoliday()
+```
+
+---
+
 ## Related
diff --git a/docs/content/concepts/automation/schedules.mdx b/docs/content/concepts/automation/schedules.mdx
index 082f70a2da4cb..f4fd2e4e8a2f0 100644
--- a/docs/content/concepts/automation/schedules.mdx
+++ b/docs/content/concepts/automation/schedules.mdx
@@ -5,11 +5,11 @@ description: "Use schedules to run your pipelines at fixed time intervals."
 # Schedules
-Schedules are Dagster's way of supporting traditional methods of [automation](/concepts/automation), which allow you to specify when a [job](/concepts/ops-jobs-graphs/jobs) should run. Using schedules, you can define a fixed time interval to run a pipeline, such as daily, hourly, or Monday at 9:00 AM.
+Schedules are Dagster's approach to imperative [automation](/concepts/automation). Schedules specify a fixed time interval at which to conditionally execute a target. Some example time intervals expressible with a schedule are daily, hourly, or Monday at 9:00 AM.
-Each interval of a schedule is called a **tick**, which is an indication that a job should execute. Ticks kick off **runs**, which is a single instance of a job being executed.
+Each interval of a schedule is called a **tick**, which is an opportunity for one or more runs to be launched. Ticks kick off **runs**, which either materialize a selection of assets or execute a [job](/concepts/ops-jobs-graphs/jobs).
-When viewing a schedule in [Dagster's UI](/concepts/webserver/ui), you can see the schedule's definition, executing timezone, targeted jobs and partitions, as well as its tick and run history.
+When viewing a schedule in [Dagster's UI](/concepts/webserver/ui), you can see the schedule's definition, executing timezone, target, and tick/run history.
 ---
@@ -35,9 +35,9 @@ Before continuing, you should be familiar with:
 ## How it works
-Schedules run jobs at fixed time intervals and have two main components:
+Schedules launch runs at fixed time intervals and have two main components:
-- **A job**, which targets a selection of assets or ops
+- **A target**, which specifies a selection of assets to materialize or a job to execute
- [**A cron expression**](https://en.wikipedia.org/wiki/Cron), which defines when the schedule runs. Simple and complex schedules are supported, allowing you to have fine-grained control over when runs are executed.
With cron syntax, you can: - **Create custom schedules** like `Every hour from 9:00AM - 5:00PM` with cron expressions (`0 9-17 * * *`) diff --git a/docs/content/concepts/automation/schedules/automating-assets-schedules-jobs.mdx b/docs/content/concepts/automation/schedules/automating-assets-schedules-jobs.mdx index 12050dd86c45a..1b382f250b10c 100644 --- a/docs/content/concepts/automation/schedules/automating-assets-schedules-jobs.mdx +++ b/docs/content/concepts/automation/schedules/automating-assets-schedules-jobs.mdx @@ -7,13 +7,12 @@ description: "Learn how to automate asset materialization using schedules and jo After creating some [asset definitions](/concepts/assets/software-defined-assets), you may want to automate their materialization. -In this guide, we'll show you one method of accomplishing this by using schedules and jobs. To do this for ops, refer to the [Automating ops using schedules guide](/concepts/automation/schedules/automating-ops-schedules-jobs). +In this guide, we'll show you one method of accomplishing this by using schedules. To do this for ops, refer to the [Automating ops using schedules guide](/concepts/automation/schedules/automating-ops-schedules-jobs). By the end of this guide, you'll be able to: -- Create a job that materializes assets -- Create a schedule -- Add the new job and schedule to your project's object +- Create a schedule that directly targets some assets +- Add the new schedule to your project's object - Turn the schedule on --- @@ -25,16 +24,13 @@ To follow this guide, you'll need: - **To install Dagster and the Dagster UI.** Refer to the [Installation guide](/getting-started/install) for more info and instructions. - **Familiarity with**: - [Asset definitions](/concepts/assets/software-defined-assets) - - [Jobs](/concepts/ops-jobs-graphs/jobs) - [Code locations](/concepts/code-locations) () --- -## Step 1: Create a job +## Step 1: Define some assets -The first step in creating a schedule is to build a job that materializes some assets. - -Let's assume we already have a few assets in our project in a group named `ecommerce_assets`: +The first step in creating a schedule is to define the target assets we want to materialize. Define a few assets in a group named `ecommerce_assets`: ```python file=concepts/partitions_schedules_sensors/schedules/basic_asset_schedule.py startafter=start_assets endbefore=end_assets @asset(group_name="ecommerce_assets") @@ -47,31 +43,16 @@ def users_asset(): return 2 ``` -To create a job that materializes the assets in this group, we'll use : - -```python file=concepts/partitions_schedules_sensors/schedules/basic_asset_schedule.py startafter=start_job endbefore=end_job -ecommerce_job = define_asset_job( - "ecommerce_job", AssetSelection.groups("ecommerce_assets") -) -``` - -To create the job, we: - -1. Imported and -2. Constructed the job using and name it `ecommerce_job` -3. Selected all assets in the `ecommerce_assets` group using . Only these assets will be materialized when the job runs. - -Refer to the [Asset jobs documentation](/concepts/assets/asset-jobs) for more info and examples. - --- ## Step 2: Define the schedule -Next, we'll construct the schedule using and attach it to the job we created in [Step 1](#step-1-create-a-job). +Next, we'll construct the schedule using and use it to target the assets we created in [Step 1](#step-1-define-some-assets). 
```python file=concepts/partitions_schedules_sensors/schedules/basic_asset_schedule.py startafter=start_schedule endbefore=end_schedule
 ecommerce_schedule = ScheduleDefinition(
-    job=ecommerce_job,
+    name="ecommerce_schedule",
+    target=AssetSelection.groups("ecommerce_assets"),
     cron_schedule="15 5 * * 1-5",
     default_status=DefaultScheduleStatus.RUNNING,
 )
@@ -82,7 +63,7 @@ To build the schedule, we:
 1. Imported `DefaultScheduleStatus` and from `dagster`
 2. Created a schedule using that:
-   - Is attached to the `ecommerce_job` job
+   - Targets the assets we defined in Step 1 using `AssetSelection.groups`
    - Has a cron expression of `15 5 * * 1-5`, which translates to `Every Monday through Friday of every month at 5:15AM`
    - Is turned on by default (`default_status`). We'll discuss this more in [Step 4](#step-4-turn-the-schedule-on).
@@ -90,19 +71,25 @@ To build the schedule, we:
 ## Step 3: Update the Definitions object
-Next, we'll update our project's object to include the new job and schedule. This ensures the job and schedule are available to Dagster processes, such as the Dagster UI.
+Next, we'll update our project's object to include the new assets and schedule. This ensures the assets and schedule are available to Dagster processes, such as the Dagster UI.
 ```python file=concepts/partitions_schedules_sensors/schedules/basic_asset_schedule.py startafter=start_definitions endbefore=end_definitions
 defs = Definitions(
     assets=[orders_asset, users_asset],
-    jobs=[ecommerce_job],
     schedules=[ecommerce_schedule],
 )
 ```
+
+  We also could have passed our asset definitions directly as the `target` of
+  the schedule, and they would be automatically included as assets in the
+  `Definitions` object. Since we targeted them here using
+  `AssetSelection.groups`, we needed to include them separately in `assets`.
+
+
 At this point, your code should look like the following:
-```python file=concepts/partitions_schedules_sensors/schedules/basic_asset_schedule.py lines=3-12,14-23,27-30,35-40,45-49
+```python file=concepts/partitions_schedules_sensors/schedules/basic_asset_schedule.py lines=3-12,14-23,27-33,38-41
 from dagster import (
     AssetSelection,
     DefaultScheduleStatus,
@@ -123,19 +110,15 @@ def users_asset():
     return 2
-ecommerce_job = define_asset_job(
-    "ecommerce_job", AssetSelection.groups("ecommerce_assets")
-)
-
 ecommerce_schedule = ScheduleDefinition(
-    job=ecommerce_job,
+    name="ecommerce_schedule",
+    target=AssetSelection.groups("ecommerce_assets"),
     cron_schedule="15 5 * * 1-5",
     default_status=DefaultScheduleStatus.RUNNING,
 )
 defs = Definitions(
     assets=[orders_asset, users_asset],
-    jobs=[ecommerce_job],
     schedules=[ecommerce_schedule],
 )
 ```
@@ -187,7 +170,8 @@ You can set the schedule's default status using `DefaultScheduleStatus.RUNNING`
 ```python file=concepts/partitions_schedules_sensors/schedules/basic_asset_schedule.py startafter=start_schedule endbefore=end_schedule
 ecommerce_schedule = ScheduleDefinition(
-    job=ecommerce_job,
+    name="ecommerce_schedule",
+    target=AssetSelection.groups("ecommerce_assets"),
     cron_schedule="15 5 * * 1-5",
     default_status=DefaultScheduleStatus.RUNNING,
 )
@@ -216,7 +200,6 @@ That's it! At this point, you should have a working, running schedule in your Da
 | Name | Description |
 | ---------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- |
-| | A function for defining a job from a selection of assets.
| | | A class that defines a selection of assets. Typically used with . |
| | A class that defines a schedule and attaches it to a job. |
| | The object that contains all the definitions defined within a code location. Definitions include assets, jobs, resources, schedules, and sensors. |
@@ -234,10 +217,6 @@ That's it! At this point, you should have a working, running schedule in your Da
   title="Testing schedules"
   href="/concepts/automation/schedules/testing"
 >
- resource, which can be used to launch AWS EMR jobs from Dagster assets and ops. Dagster can receive regular events such as logs, asset checks, or asset materializations from jobs launched with this client. Using it requires minimal code changes to your EMR jobs.
+
+---
+
+## Prerequisites
+
+- **In the Dagster environment**, you'll need to:
+
+  - Install the following packages:
+
+    ```shell
+    pip install dagster dagster-webserver dagster-aws
+    ```
+
+    Refer to the [Dagster installation guide](/getting-started/install) for more info.
+
+  - **AWS authentication credentials configured.** If you don't have this set up already, refer to the [boto3 quickstart](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html).
+
+- **In AWS**:
+
+  - An existing AWS account
+  - Prepared infrastructure such as S3 buckets, IAM roles, and other resources required for your EMR job
+
+---
+
+## Step 1: Install the dagster-pipes module in your EMR environment
+
+Choose one of the [options](https://spark.apache.org/docs/latest/api/python/user_guide/python_packaging.html#python-package-management) to install `dagster-pipes` in the EMR environment.
+
+For example, this `Dockerfile` can be used to package all required dependencies into a single [PEX](https://docs.pex-tool.org/) file (in practice, the most straightforward way to package Python dependencies for EMR jobs):
+
+```Dockerfile file=/guides/dagster/dagster_pipes/emr/Dockerfile
+# this Dockerfile can be used to create a venv archive for PySpark on AWS EMR
+
+FROM amazonlinux:2 AS builder
+
+RUN yum install -y python3
+
+WORKDIR /build
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
+
+ENV VIRTUAL_ENV=/build/.venv
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+RUN uv python install --python-preference only-managed 3.9.16 && uv python pin 3.9.16
+
+RUN uv venv .venv
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install pex dagster-pipes boto3 pyspark
+
+RUN pex dagster-pipes boto3 pyspark -o /output/venv.pex && chmod +x /output/venv.pex
+
+# test imports
+RUN /output/venv.pex -c "import dagster_pipes, pyspark, boto3;"
+
+FROM scratch AS export
+
+COPY --from=builder /output/venv.pex /venv.pex
+```
+
+The build can be launched with:
+
+```shell
+DOCKER_BUILDKIT=1 docker build --output type=local,dest=./output .
+```
+
+Then, upload the produced `output/venv.pex` file to an S3 bucket:
+
+```shell
+aws s3 cp output/venv.pex s3://your-bucket/venv.pex
+```
+
+Finally, use the `--files` and `spark.pyspark.python` options to specify the path to the PEX file in the `spark-submit` command:
+
+```shell
+spark-submit ...
--files s3://your-bucket/venv.pex --conf spark.pyspark.python=./venv.pex
+```
+
+---
+
+## Step 2: Add dagster-pipes to the EMR job script
+
+Call `open_dagster_pipes` in the EMR script to create a context that can be used to send messages to Dagster:
+
+```python file=/guides/dagster/dagster_pipes/emr/script.py
+import boto3
+from dagster_pipes import PipesS3MessageWriter, open_dagster_pipes
+from pyspark.sql import SparkSession
+
+
+def main():
+    with open_dagster_pipes(
+        message_writer=PipesS3MessageWriter(client=boto3.client("s3"))
+    ) as pipes:
+        pipes.log.info("Hello from AWS EMR!")
+
+        spark = SparkSession.builder.appName("HelloWorld").getOrCreate()
+
+        df = spark.createDataFrame(
+            [(1, "Alice", 34), (2, "Bob", 45), (3, "Charlie", 56)],
+            ["id", "name", "age"],
+        )
+
+        # calculate a really important statistic
+        avg_age = float(df.agg({"age": "avg"}).collect()[0][0])
+
+        # attach it to the asset materialization in Dagster
+        pipes.report_asset_materialization(
+            metadata={"average_age": {"raw_value": avg_age, "type": "float"}},
+            data_version="alpha",
+        )
+
+        spark.stop()
+
+        print("Hello from stdout!")
+
+
+if __name__ == "__main__":
+    main()
+```
+
+---
+
+## Step 3: Create an asset using the PipesEMRClient to launch the job
+
+In the Dagster asset/op code, use the `PipesEMRClient` resource to launch the job:
+
+```python file=/guides/dagster/dagster_pipes/emr/dagster_code.py startafter=start_asset_marker endbefore=end_asset_marker
+import os
+
+import boto3
+from dagster_aws.pipes import PipesEMRClient, PipesS3MessageReader
+from mypy_boto3_emr.type_defs import InstanceFleetTypeDef
+
+from dagster import AssetExecutionContext, asset
+
+
+@asset
+def emr_pipes_asset(context: AssetExecutionContext, pipes_emr_client: PipesEMRClient):
+    return pipes_emr_client.run(
+        context=context,
+        # see full reference here: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/emr/client/run_job_flow.html#EMR.Client.run_job_flow
+        run_job_flow_params={},
+    ).get_materialize_result()
+```
+
+This will launch the AWS EMR job and wait for its completion. If the job fails, the Dagster process will raise an exception. If the Dagster process is interrupted while the job is still running, the job will be terminated.
+
+EMR application steps `stdout` and `stderr` will be forwarded to the Dagster process.
+
+---
+
+## Step 4: Create Dagster definitions
+
+Next, add the `PipesEMRClient` resource to your project's object:
+
+```python file=/guides/dagster/dagster_pipes/emr/dagster_code.py startafter=start_definitions_marker endbefore=end_definitions_marker
+from dagster import Definitions  # noqa
+
+
+defs = Definitions(
+    assets=[emr_pipes_asset],
+    resources={
+        "pipes_emr_client": PipesEMRClient(
+            message_reader=PipesS3MessageReader(
+                client=boto3.client("s3"), bucket=os.environ["DAGSTER_PIPES_BUCKET"]
+            )
+        )
+    },
+)
+```
+
+Dagster will now be able to launch the AWS EMR job from the `emr_pipes_asset` asset, and receive logs and events from the job.
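+
+For reference, here is a minimal sketch of what `run_job_flow_params` might look like for the PEX-based setup above. All values (bucket, script path, roles, instance types, and EMR release label) are placeholders; see the boto3 `run_job_flow` reference linked in the asset code for the full schema:
+
+```python
+# hypothetical parameters for pipes_emr_client.run(run_job_flow_params=...)
+run_job_flow_params = {
+    "Name": "dagster-pipes-example",
+    "ReleaseLabel": "emr-7.1.0",
+    "Applications": [{"Name": "Spark"}],
+    "Instances": {
+        "MasterInstanceType": "m5.xlarge",
+        "SlaveInstanceType": "m5.xlarge",
+        "InstanceCount": 2,
+        "KeepJobFlowAliveWhenNoSteps": False,
+    },
+    "JobFlowRole": "EMR_EC2_DefaultRole",
+    "ServiceRole": "EMR_DefaultRole",
+    "Steps": [
+        {
+            "Name": "pipes-script",
+            "ActionOnFailure": "TERMINATE_CLUSTER",
+            "HadoopJarStep": {
+                "Jar": "command-runner.jar",
+                "Args": [
+                    "spark-submit",
+                    "--deploy-mode", "cluster",
+                    # ship the PEX file built in Step 1 and use it as the Python interpreter
+                    "--files", "s3://your-bucket/venv.pex",
+                    "--conf", "spark.pyspark.python=./venv.pex",
+                    "s3://your-bucket/script.py",
+                ],
+            },
+        }
+    ],
+}
+```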
+ +--- + +## Related + + + + + diff --git a/docs/content/concepts/dagster-pipes/aws-glue.mdx b/docs/content/concepts/dagster-pipes/aws-glue.mdx index 38fafb25f61c7..b03761382f22c 100644 --- a/docs/content/concepts/dagster-pipes/aws-glue.mdx +++ b/docs/content/concepts/dagster-pipes/aws-glue.mdx @@ -13,7 +13,7 @@ The [dagster-aws](/\_apidocs/libraries/dagster-aws) integration library provides ## Prerequisites -- **In the orchestration environment**, you'll need to: +- **In the Dagster environment**, you'll need to: - Install the following packages: @@ -28,11 +28,11 @@ The [dagster-aws](/\_apidocs/libraries/dagster-aws) integration library provides - **In AWS**: - An existing AWS account - - An AWS Glue job with a Python 3.8+ runtime environment + - An AWS Glue job with a Python 3.9+ runtime environment --- -## Step 1: Provide the dagster-pipes module +## Step 1: Provide the dagster-pipes module in your Glue environment Provide the `dagster-pipes` module to the AWS Glue job either by installing it in the Glue job environment or packaging it along with the job script. diff --git a/docs/content/concepts/dagster-pipes/aws-lambda.mdx b/docs/content/concepts/dagster-pipes/aws-lambda.mdx index abe50035a5dc6..9b73c7c919f39 100644 --- a/docs/content/concepts/dagster-pipes/aws-lambda.mdx +++ b/docs/content/concepts/dagster-pipes/aws-lambda.mdx @@ -24,7 +24,7 @@ Pipes allows your code to interact with Dagster outside of a full Dagster enviro To use Dagster Pipes with AWS Lambda, you’ll need: -- **In the orchestration environment**, you'll need to: +- **In the Dagster environment**, you'll need to: - Install the following packages: diff --git a/docs/content/concepts/dagster-pipes/databricks.mdx b/docs/content/concepts/dagster-pipes/databricks.mdx index 8d465762a5b29..adaf75b0a1906 100644 --- a/docs/content/concepts/dagster-pipes/databricks.mdx +++ b/docs/content/concepts/dagster-pipes/databricks.mdx @@ -18,7 +18,7 @@ Pipes allows your Databricks jobs to stream logs (including `stdout` and `stderr To use Dagster Pipes with Databricks: -- **In the orchestration environment**, you'll need to install the following packages: +- **In the Dagster environment**, you'll need to install the following packages: ```shell pip install dagster dagster-webserver dagster-databricks diff --git a/docs/content/concepts/dagster-pipes/kubernetes.mdx b/docs/content/concepts/dagster-pipes/kubernetes.mdx index 129d8297062c8..606ada52d504b 100644 --- a/docs/content/concepts/dagster-pipes/kubernetes.mdx +++ b/docs/content/concepts/dagster-pipes/kubernetes.mdx @@ -24,7 +24,7 @@ Pipes allows your code to interact with Dagster outside of a full Dagster enviro To use Dagster Pipes with Kubernetes, you’ll need: -- **In the orchestration environment**, you'll need to install the following packages: +- **In the Dagster environment**, you'll need to install the following packages: ```shell pip install dagster dagster-webserver dagster-k8s diff --git a/docs/content/concepts/metadata-tags/asset-metadata.mdx b/docs/content/concepts/metadata-tags/asset-metadata.mdx index 4929fdc552566..6bacb91932e90 100644 --- a/docs/content/concepts/metadata-tags/asset-metadata.mdx +++ b/docs/content/concepts/metadata-tags/asset-metadata.mdx @@ -473,7 +473,7 @@ The `dagster` prefix indicates that the Dagster package takes responsibility for - dagster/relation_identifier + dagster/table_name