From 4bf7925abc4e1c29453ed40c4ef550f3ad3ea114 Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Mon, 22 Apr 2024 17:51:30 -0400
Subject: [PATCH] Add get_catalog_store_urls + get_github_commit_url (#23)

* Add get_catalog_store_urls + get_github_commit_url

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix pre-commit stuff

* Bugfix in get_catalog_store_urls

* Make inject_attrs optional

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../data_management_transforms.py             | 61 ++++++++++++++++++-
 .../tests/test_data_management_transforms.py  | 32 ++++++++++
 pyproject.toml                                |  1 +
 3 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/leap_data_management_utils/data_management_transforms.py b/leap_data_management_utils/data_management_transforms.py
index 2624015..cd8071b 100644
--- a/leap_data_management_utils/data_management_transforms.py
+++ b/leap_data_management_utils/data_management_transforms.py
@@ -1,14 +1,52 @@
 # Note: All of this code was written by Julius Busecke and copied from this feedstock:
 # https://github.com/leap-stc/cmip6-leap-feedstock/blob/main/feedstock/recipe.py#L262
 
-import datetime
+import subprocess
 from dataclasses import dataclass
+from datetime import datetime, timezone
 from typing import Optional
 
 import apache_beam as beam
 import zarr
 from google.api_core.exceptions import NotFound
 from google.cloud import bigquery
+from ruamel.yaml import YAML
+
+yaml = YAML(typ='safe')
+
+
+def get_github_commit_url() -> Optional[str]:
+    """Get the GitHub commit URL for the current commit"""
+    # Get GitHub Server URL
+    github_server_url = 'https://github.com'
+
+    # Get the repository's remote origin URL
+    try:
+        repo_origin_url = subprocess.check_output(
+            ['git', 'config', '--get', 'remote.origin.url'], text=True
+        ).strip()
+
+        # Extract the repository path from the remote URL
+        repository_path = repo_origin_url.split('github.com/')[-1].replace('.git', '')
+
+        # Get the current commit SHA
+        commit_sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], text=True).strip()
+
+        # Construct the GitHub commit URL
+        git_url_hash = f'{github_server_url}/{repository_path}/commit/{commit_sha}'
+
+        # Output the GitHub commit URL
+        return git_url_hash
+
+    except subprocess.CalledProcessError as e:
+        print('Error executing Git command:', e)
+        return None
+
+
+def get_catalog_store_urls(catalog_yaml_path: str) -> dict[str, str]:
+    with open(catalog_yaml_path) as f:
+        catalog_meta = yaml.load(f)
+    return {d['id']: d['url'] for d in catalog_meta['stores']}
 
 
 @dataclass
@@ -54,7 +92,7 @@ def _get_table(self) -> bigquery.table.Table:
         return self.client.get_table(self.table_id)
 
     def insert(self, fields: dict = {}):
-        timestamp = datetime.datetime.now().isoformat()
+        timestamp = datetime.now().isoformat()
 
         rows_to_insert = [
             fields | {'timestamp': timestamp}  # timestamp is always overridden
@@ -120,6 +158,8 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
 
 @dataclass
 class Copy(beam.PTransform):
+    """Copy a store to a new location. If the target input is False, do nothing."""
+
     target: str
 
     def _copy(self, store: zarr.storage.FSStore) -> zarr.storage.FSStore:
@@ -147,7 +187,22 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
 
 @dataclass
 class InjectAttrs(beam.PTransform):
-    inject_attrs: dict
+    inject_attrs: dict = None
+    add_provenance: bool = True
+
+    # add a post_init method to add the provenance attributes
+    def __post_init__(self):
+        if self.inject_attrs is None:
+            self.inject_attrs = {}
+
+        if self.add_provenance:
+            git_url_hash = get_github_commit_url()
+            timestamp = datetime.now(timezone.utc).isoformat()
+            provenance_dict = {
+                'pangeo_forge_build_git_hash': git_url_hash,
+                'pangeo_forge_build_timestamp': timestamp,
+            }
+            self.inject_attrs.update(provenance_dict)
 
     def _update_zarr_attrs(self, store: zarr.storage.FSStore) -> zarr.storage.FSStore:
         # TODO: Can we get a warning here if the store does not exist?
diff --git a/leap_data_management_utils/tests/test_data_management_transforms.py b/leap_data_management_utils/tests/test_data_management_transforms.py
index 781b23a..56ffc72 100644
--- a/leap_data_management_utils/tests/test_data_management_transforms.py
+++ b/leap_data_management_utils/tests/test_data_management_transforms.py
@@ -1,4 +1,36 @@
+from ruamel.yaml import YAML
+
+from leap_data_management_utils.data_management_transforms import (
+    get_catalog_store_urls,
+    get_github_commit_url,
+)
+
+yaml = YAML(typ='safe')
+
+
 def test_smoke_test():
     assert True
     # This is a bit dumb, but it at least checks the the imports are working
     # again super hard to test code involving bigquery here.
+
+
+def test_get_github_commit_url():
+    url = get_github_commit_url()
+    assert url.startswith('https://github.com/leap-stc/leap-data-management-utils')
+
+
+def test_get_catalog_store_urls(tmp_path):
+    # Create a temporary text file
+    temp_file = tmp_path / 'some-name.yaml'
+    data = {
+        'stores': [{'id': 'a', 'url': 'a-url', 'some_other': 'stuff'}, {'id': 'b', 'url': 'b-url'}]
+    }
+    with open(temp_file, 'w') as f:
+        yaml.dump(data, f)
+
+    # Call the function to read the file
+    content = get_catalog_store_urls(temp_file)
+
+    # Assertions
+    assert content['a'] == 'a-url'
+    assert content['b'] == 'b-url'
diff --git a/pyproject.toml b/pyproject.toml
index c7b0eed..b85a27d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,7 @@ dependencies = [
     "pydantic-core",
     "pydantic>=2",
     "pyyaml",
+    "ruamel.yaml",
     "universal-pathlib",
     "xarray",
     "zarr",