Calculate hash for new datasets when finishing a job

This is configurable with two new options:
- `calculate_dataset_hash`: in which cases Galaxy should calculate
  a hash for a new dataset. Possible values: 'always', 'upload'
  (the default), 'never'.
- `hash_function`: which hash function to use. Possible values: 'md5',
  'sha1', 'sha256' (the default), 'sha512'.

Hashes are calculated via a Celery task, so they are currently computed
only if the 'enable_celery_tasks' option is set to true.
nsoranzo committed Nov 19, 2024
1 parent e69eec0 commit 5009375
Showing 8 changed files with 151 additions and 25 deletions.
27 changes: 27 additions & 0 deletions doc/source/admin/galaxy_options.rst
@@ -4719,6 +4719,33 @@
:Type: float


~~~~~~~~~~~~~~~~~~~~~~~~~~
``calculate_dataset_hash``
~~~~~~~~~~~~~~~~~~~~~~~~~~

:Description:
In which cases Galaxy should calculate a hash for a new dataset.
Dataset hashes can be used by the Galaxy job cache/search to check
if job inputs match. Setting the 'enable_celery_tasks' option to
true is also required for dataset hash calculation. Possible
values are: 'always', 'upload' (the default), 'never'. If set to
'upload', the hash is calculated only for the outputs of upload
jobs.
:Default: ``upload``
:Type: str


~~~~~~~~~~~~~~~~~
``hash_function``
~~~~~~~~~~~~~~~~~

:Description:
Hash function to use if 'calculate_dataset_hash' is enabled.
Possible values are: 'md5', 'sha1', 'sha256', 'sha512'
:Default: ``sha256``
:Type: str


~~~~~~~~~~~~~~~~~~~~~
``metadata_strategy``
~~~~~~~~~~~~~~~~~~~~~
9 changes: 9 additions & 0 deletions lib/galaxy/config/__init__.py
@@ -46,6 +46,7 @@
from galaxy.util.custom_logging import LOGLV_TRACE
from galaxy.util.dynamic import HasDynamicProperties
from galaxy.util.facts import get_facts
from galaxy.util.hash_util import HashFunctionNameEnum
from galaxy.util.properties import (
read_properties_from_file,
running_from_source,
@@ -716,6 +717,7 @@ class GalaxyAppConfiguration(BaseAppConfiguration, CommonConfigurationMixin):
galaxy_data_manager_data_path: str
galaxy_infrastructure_url: str
hours_between_check: int
hash_function: HashFunctionNameEnum
integrated_tool_panel_config: str
involucro_path: str
len_file_path: str
@@ -897,6 +899,13 @@ def _process_config(self, kwargs: Dict[str, Any]) -> None:
self.update_integrated_tool_panel = kwargs.get("update_integrated_tool_panel", True)
self.galaxy_data_manager_data_path = self.galaxy_data_manager_data_path or self.tool_data_path
self.tool_secret = kwargs.get("tool_secret", "")
if self.calculate_dataset_hash not in ("always", "upload", "never"):
raise ConfigurationError(
f"Unrecognized value for calculate_dataset_hash option: {self.calculate_dataset_hash}"
)
if self.hash_function not in HashFunctionNameEnum.__members__:
raise ConfigurationError(f"Unrecognized value for hash_function option: {self.hash_function}")
self.hash_function = HashFunctionNameEnum[self.hash_function]
self.metadata_strategy = kwargs.get("metadata_strategy", "directory")
self.use_remote_user = self.use_remote_user or self.single_user
self.fetch_url_allowlist_ips = parse_allowlist_ips(listify(kwargs.get("fetch_url_allowlist")))
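For context, the validation added above can be reproduced as a minimal standalone sketch. The enum below is a simplified stand-in for `HashFunctionNameEnum` from `galaxy.util.hash_util`, and the helper name is illustrative, not part of the commit:

```python
from enum import Enum


class HashFunctionNameEnum(str, Enum):
    # Simplified stand-in for galaxy.util.hash_util.HashFunctionNameEnum
    md5 = "md5"
    sha1 = "sha1"
    sha256 = "sha256"
    sha512 = "sha512"


def validate_hash_config(calculate_dataset_hash: str, hash_function: str) -> HashFunctionNameEnum:
    """Reject unknown option values, then coerce the hash name to the enum member."""
    if calculate_dataset_hash not in ("always", "upload", "never"):
        raise ValueError(f"Unrecognized value for calculate_dataset_hash option: {calculate_dataset_hash}")
    if hash_function not in HashFunctionNameEnum.__members__:
        raise ValueError(f"Unrecognized value for hash_function option: {hash_function}")
    return HashFunctionNameEnum[hash_function]


assert validate_hash_config("upload", "sha256") is HashFunctionNameEnum.sha256
```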
12 changes: 12 additions & 0 deletions lib/galaxy/config/sample/galaxy.yml.sample
@@ -2545,6 +2545,18 @@ galaxy:
# handler processes. Float values are allowed.
#workflow_monitor_sleep: 1.0

# In which cases Galaxy should calculate a hash for a new dataset.
# Dataset hashes can be used by the Galaxy job cache/search to check
# if job inputs match. Setting the 'enable_celery_tasks' option to
# true is also required for dataset hash calculation. Possible values
# are: 'always', 'upload' (the default), 'never'. If set to 'upload',
# the hash is calculated only for the outputs of upload jobs.
#calculate_dataset_hash: upload

# Hash function to use if 'calculate_dataset_hash' is enabled.
# Possible values are: 'md5', 'sha1', 'sha256', 'sha512'
#hash_function: sha256

# Determines how metadata will be set. Valid values are `directory`,
# `extended`, `directory_celery` and `extended_celery`. In extended
# mode jobs will decide if a tool run failed, the object stores
22 changes: 22 additions & 0 deletions lib/galaxy/config/schemas/config_schema.yml
@@ -2788,6 +2788,7 @@ mapping:
Avoiding making this a boolean because we may add options such as 'in-single-form-view'
or 'in-simplified-workflow-views'. https://github.com/galaxyproject/galaxy/pull/9809/files#r461889109
allow_user_dataset_purge:
type: bool
default: true
@@ -3454,6 +3455,26 @@ mapping:
decreased if extremely high job throughput is necessary, but doing so can increase CPU
usage of handler processes. Float values are allowed.
calculate_dataset_hash:
type: str
default: upload
required: false
enum: ['always', 'upload', 'never']
desc: |
In which cases Galaxy should calculate a hash for a new dataset.
Dataset hashes can be used by the Galaxy job cache/search to check if job inputs match.
Setting the 'enable_celery_tasks' option to true is also required for dataset hash calculation.
Possible values are: 'always', 'upload' (the default), 'never'. If set to 'upload', the
hash is calculated only for the outputs of upload jobs.
hash_function:
type: str
default: sha256
required: false
desc: |
Hash function to use if 'calculate_dataset_hash' is enabled. Possible values
are: 'md5', 'sha1', 'sha256', 'sha512'
metadata_strategy:
type: str
required: false
@@ -3547,6 +3568,7 @@ mapping:
default: always
required: false
reloadable: true
enum: ['always', 'onsuccess', 'never']
desc: |
Clean up various bits of jobs left on the filesystem after completion. These
bits include the job working directory, external metadata temporary files,
20 changes: 20 additions & 0 deletions lib/galaxy/jobs/__init__.py
@@ -74,6 +74,7 @@
ObjectStorePopulator,
serialize_static_object_store_config,
)
from galaxy.schema.tasks import ComputeDatasetHashTaskRequest
from galaxy.structured_app import MinimalManagerApp
from galaxy.tool_util.deps import requirements
from galaxy.tool_util.output_checker import (
@@ -2037,6 +2038,25 @@ def fail(message=job.info, exception=None):
dataset.full_delete()
collected_bytes = 0

# Calculate dataset hash
for dataset_assoc in output_dataset_associations:
dataset = dataset_assoc.dataset.dataset
if not dataset.purged and not dataset.hashes:
if self.app.config.calculate_dataset_hash == "always" or (
self.app.config.calculate_dataset_hash == "upload" and job.tool_id in ("upload1", "__DATA_FETCH__")
):
# Calculate dataset hash via a celery task
if self.app.config.enable_celery_tasks:
from galaxy.celery.tasks import compute_dataset_hash

extra_files_path = dataset.extra_files_path if dataset.extra_files_path_exists() else None
request = ComputeDatasetHashTaskRequest(
dataset_id=dataset.id,
extra_files_path=extra_files_path,
hash_function=self.app.config.hash_function,
)
compute_dataset_hash.delay(request=request)

user = job.user
if user and collected_bytes > 0 and quota_source_info is not None and quota_source_info.use:
user.adjust_total_disk_usage(collected_bytes, quota_source_info.label)
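The condition gating the Celery dispatch above can be restated as a standalone predicate. This is a hedged sketch only; the helper name is hypothetical and does not exist in the commit:

```python
def should_hash_new_dataset(config, job_tool_id: str, dataset) -> bool:
    """Simplified mirror of the check added above in lib/galaxy/jobs/__init__.py:
    hash only unpurged datasets without existing hashes, and only when a Celery
    worker can actually run the task."""
    if dataset.purged or dataset.hashes:
        return False
    if not config.enable_celery_tasks:
        return False
    if config.calculate_dataset_hash == "always":
        return True
    # 'upload' restricts hashing to the outputs of upload jobs; 'never' falls through to False.
    return config.calculate_dataset_hash == "upload" and job_tool_id in ("upload1", "__DATA_FETCH__")
```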
30 changes: 5 additions & 25 deletions lib/galaxy_test/api/test_datasets.py
@@ -757,23 +757,13 @@ def test_composite_datatype_download(self, history_id):

def test_compute_md5_on_primary_dataset(self, history_id):
hda = self.dataset_populator.new_dataset(history_id, wait=True)
hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=hda)
assert "hashes" in hda_details, str(hda_details.keys())
hashes = hda_details["hashes"]
assert len(hashes) == 0

self.dataset_populator.compute_hash(hda["id"])
hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=hda)
self.assert_hash_value(hda_details, "940cbe15c94d7e339dc15550f6bdcf4d", "MD5")

def test_compute_sha1_on_composite_dataset(self, history_id):
output = self.dataset_populator.fetch_hda(history_id, COMPOSITE_DATA_FETCH_REQUEST_1, wait=True)
hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=output)
assert "hashes" in hda_details, str(hda_details.keys())
hashes = hda_details["hashes"]
assert len(hashes) == 0

self.dataset_populator.compute_hash(hda_details["id"], hash_function="SHA-256", extra_files_path="Roadmaps")
self.dataset_populator.compute_hash(output["id"], hash_function="SHA-256", extra_files_path="Roadmaps")
hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=output)
self.assert_hash_value(
hda_details,
@@ -784,30 +774,20 @@ def test_duplicated_hash_requests_on_primary(self, history_id):

def test_duplicated_hash_requests_on_primary(self, history_id):
hda = self.dataset_populator.new_dataset(history_id, wait=True)
hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=hda)
assert "hashes" in hda_details, str(hda_details.keys())
hashes = hda_details["hashes"]
assert len(hashes) == 0

self.dataset_populator.compute_hash(hda["id"])
self.dataset_populator.compute_hash(hda["id"])
hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=hda)
self.assert_hash_value(hda_details, "940cbe15c94d7e339dc15550f6bdcf4d", "MD5")

def test_duplicated_hash_requests_on_extra_files(self, history_id):
output = self.dataset_populator.fetch_hda(history_id, COMPOSITE_DATA_FETCH_REQUEST_1, wait=True)
hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=output)
assert "hashes" in hda_details, str(hda_details.keys())
hashes = hda_details["hashes"]
assert len(hashes) == 0

# 4 unique requests, but make them twice...
for _ in range(2):
self.dataset_populator.compute_hash(hda_details["id"], hash_function="SHA-256", extra_files_path="Roadmaps")
self.dataset_populator.compute_hash(hda_details["id"], hash_function="SHA-1", extra_files_path="Roadmaps")
self.dataset_populator.compute_hash(hda_details["id"], hash_function="MD5", extra_files_path="Roadmaps")
self.dataset_populator.compute_hash(output["id"], hash_function="SHA-256", extra_files_path="Roadmaps")
self.dataset_populator.compute_hash(output["id"], hash_function="SHA-1", extra_files_path="Roadmaps")
self.dataset_populator.compute_hash(output["id"], hash_function="MD5", extra_files_path="Roadmaps")
self.dataset_populator.compute_hash(
hda_details["id"], hash_function="SHA-256", extra_files_path="Sequences"
output["id"], hash_function="SHA-256", extra_files_path="Sequences"
)

hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=output)
7 changes: 7 additions & 0 deletions lib/galaxy_test/base/populators.py
@@ -1394,6 +1394,13 @@ def validated():

return wait_on(validated, "dataset validation")

def wait_for_dataset_hashes(self, history_id: str, dataset_id: str):
def dataset_hashes_present():
hda = self.get_history_dataset_details(history_id=history_id, dataset_id=dataset_id)
return hda["hashes"] or None

return wait_on(dataset_hashes_present, "dataset hash presence")

def setup_history_for_export_testing(self, history_name):
using_requirement("new_history")
history_id = self.new_history(name=history_name)
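`wait_for_dataset_hashes` relies on the populator's existing `wait_on` helper, which polls a predicate until it returns a non-None result (the closure above returns None while hashes are still missing). A rough standalone equivalent, with illustrative timeout and delay values (not necessarily Galaxy's defaults):

```python
import time


def wait_on(predicate, what: str, timeout: float = 60.0, delay: float = 0.25):
    """Poll `predicate` until it returns a non-None value or the timeout expires."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        value = predicate()
        if value is not None:
            return value
        time.sleep(delay)
    raise TimeoutError(f"Timed out waiting on {what}")
```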
49 changes: 49 additions & 0 deletions test/integration/test_dataset_hashing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from typing import Optional

from galaxy_test.base.populators import DatasetPopulator
from galaxy_test.driver import integration_util


class TestDatasetHashingIntegration(integration_util.IntegrationTestCase):
dataset_populator: DatasetPopulator
calculate_dataset_hash: Optional[str] = None

def setUp(self) -> None:
super().setUp()
self.dataset_populator = DatasetPopulator(self.galaxy_interactor)

@classmethod
def handle_galaxy_config_kwds(cls, config) -> None:
super().handle_galaxy_config_kwds(config)
if cls.calculate_dataset_hash is not None:
config["enable_celery_tasks"] = True
config["calculate_dataset_hash"] = cls.calculate_dataset_hash

def test_hashing(self, history_id: str) -> None:
hda = self.dataset_populator.new_dataset(history_id, wait=True)
if self.calculate_dataset_hash in [None, "always", "upload"]:
hashes = self.dataset_populator.wait_for_dataset_hashes(history_id=history_id, dataset_id=hda["id"])
assert hashes[0]["hash_value"] == "a17dcdfd36f47303a4824f1309d43ac14d7491ab3b8abb28782ac8e8d3b680ea"
else:
assert hda["hashes"] == [], hda
inputs = {"input1": {"src": "hda", "id": hda["id"]}}
run_response = self.dataset_populator.run_tool_raw("cat1", inputs=inputs, history_id=history_id)
self.dataset_populator.wait_for_tool_run(history_id=history_id, run_response=run_response)
cat_dataset = self.dataset_populator.get_history_dataset_details(history_id=history_id)
if self.calculate_dataset_hash == "always":
hashes = self.dataset_populator.wait_for_dataset_hashes(history_id=history_id, dataset_id=cat_dataset["id"])
assert hashes[0]["hash_value"] == "a17dcdfd36f47303a4824f1309d43ac14d7491ab3b8abb28782ac8e8d3b680ea"
else:
assert cat_dataset["hashes"] == [], cat_dataset


class TestDatasetHashingAlwaysIntegration(TestDatasetHashingIntegration):
calculate_dataset_hash = "always"


class TestDatasetHashingUploadIntegration(TestDatasetHashingIntegration):
calculate_dataset_hash = "upload"


class TestDatasetHashingNeverIntegration(TestDatasetHashingIntegration):
calculate_dataset_hash = "never"
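
The `hash_value` strings asserted in these tests are hex digests of the dataset's file content. A digest like them can be computed locally with Python's hashlib; a minimal sketch (function name and chunk size are illustrative):

```python
import hashlib


def file_digest(path: str, hash_function: str = "sha256", chunk_size: int = 65536) -> str:
    """Compute the hex digest of a file in chunks, e.g. to compare with a dataset's hash_value."""
    digest = hashlib.new(hash_function)  # accepts 'md5', 'sha1', 'sha256', 'sha512'
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
```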
