Calculate hash for new datasets when finishing a job

This is configurable with two new options:
- `calculate_dataset_hash`: in which cases Galaxy should calculate
  a hash for a new dataset. Possible values: 'always', 'upload'
  (the default), 'never'.
- `hash_function`: which hash function to use. Possible values: 'md5',
  'sha1', 'sha256' (the default), 'sha512'.

Hashes are calculated via a Celery task, so they are currently computed
only if the 'enable_celery_tasks' option is set to true.
nsoranzo committed Nov 19, 2024
1 parent e69eec0 commit 5009375
Showing 8 changed files with 151 additions and 25 deletions.
27 changes: 27 additions & 0 deletions doc/source/admin/galaxy_options.rst
@@ -4719,6 +4719,33 @@
:Type: float


~~~~~~~~~~~~~~~~~~~~~~~~~~
``calculate_dataset_hash``
~~~~~~~~~~~~~~~~~~~~~~~~~~

:Description:
In which cases Galaxy should calculate a hash for a new dataset.
Dataset hashes can be used by the Galaxy job cache/search to check
if job inputs match. Setting the 'enable_celery_tasks' option to
true is also required for dataset hash calculation. Possible
values are: 'always', 'upload' (the default), 'never'. If set to
'upload', the hash is calculated only for the outputs of upload
jobs.
:Default: ``upload``
:Type: str


~~~~~~~~~~~~~~~~~
``hash_function``
~~~~~~~~~~~~~~~~~

:Description:
Hash function to use if 'calculate_dataset_hash' is enabled.
Possible values are: 'md5', 'sha1', 'sha256', 'sha512'
:Default: ``sha256``
:Type: str


~~~~~~~~~~~~~~~~~~~~~
``metadata_strategy``
~~~~~~~~~~~~~~~~~~~~~
9 changes: 9 additions & 0 deletions lib/galaxy/config/__init__.py
@@ -46,6 +46,7 @@
from galaxy.util.custom_logging import LOGLV_TRACE
from galaxy.util.dynamic import HasDynamicProperties
from galaxy.util.facts import get_facts
from galaxy.util.hash_util import HashFunctionNameEnum
from galaxy.util.properties import (
read_properties_from_file,
running_from_source,
@@ -716,6 +717,7 @@ class GalaxyAppConfiguration(BaseAppConfiguration, CommonConfigurationMixin):
galaxy_data_manager_data_path: str
galaxy_infrastructure_url: str
hours_between_check: int
hash_function: HashFunctionNameEnum
integrated_tool_panel_config: str
involucro_path: str
len_file_path: str
@@ -897,6 +899,13 @@ def _process_config(self, kwargs: Dict[str, Any]) -> None:
self.update_integrated_tool_panel = kwargs.get("update_integrated_tool_panel", True)
self.galaxy_data_manager_data_path = self.galaxy_data_manager_data_path or self.tool_data_path
self.tool_secret = kwargs.get("tool_secret", "")
if self.calculate_dataset_hash not in ("always", "upload", "never"):
raise ConfigurationError(
f"Unrecognized value for calculate_dataset_hash option: {self.calculate_dataset_hash}"
)
if self.hash_function not in HashFunctionNameEnum.__members__:
raise ConfigurationError(f"Unrecognized value for hash_function option: {self.hash_function}")
self.hash_function = HashFunctionNameEnum[self.hash_function]
self.metadata_strategy = kwargs.get("metadata_strategy", "directory")
self.use_remote_user = self.use_remote_user or self.single_user
self.fetch_url_allowlist_ips = parse_allowlist_ips(listify(kwargs.get("fetch_url_allowlist")))
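For context, the validation added above can be reproduced as a minimal standalone sketch. The enum below is a simplified stand-in for `HashFunctionNameEnum` from `galaxy.util.hash_util`, and the helper name is illustrative, not part of the commit:

```python
from enum import Enum


class HashFunctionNameEnum(str, Enum):
    # Simplified stand-in for galaxy.util.hash_util.HashFunctionNameEnum
    md5 = "md5"
    sha1 = "sha1"
    sha256 = "sha256"
    sha512 = "sha512"


def validate_hash_config(calculate_dataset_hash: str, hash_function: str) -> HashFunctionNameEnum:
    """Reject unknown option values, then coerce the hash name to the enum member."""
    if calculate_dataset_hash not in ("always", "upload", "never"):
        raise ValueError(f"Unrecognized value for calculate_dataset_hash option: {calculate_dataset_hash}")
    if hash_function not in HashFunctionNameEnum.__members__:
        raise ValueError(f"Unrecognized value for hash_function option: {hash_function}")
    return HashFunctionNameEnum[hash_function]


assert validate_hash_config("upload", "sha256") is HashFunctionNameEnum.sha256
```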
12 changes: 12 additions & 0 deletions lib/galaxy/config/sample/galaxy.yml.sample
@@ -2545,6 +2545,18 @@ galaxy:
# handler processes. Float values are allowed.
#workflow_monitor_sleep: 1.0

# In which cases Galaxy should calculate a hash for a new dataset.
# Dataset hashes can be used by the Galaxy job cache/search to check
# if job inputs match. Setting the 'enable_celery_tasks' option to
# true is also required for dataset hash calculation. Possible values
# are: 'always', 'upload' (the default), 'never'. If set to 'upload',
# the hash is calculated only for the outputs of upload jobs.
#calculate_dataset_hash: upload

# Hash function to use if 'calculate_dataset_hash' is enabled.
# Possible values are: 'md5', 'sha1', 'sha256', 'sha512'
#hash_function: sha256

# Determines how metadata will be set. Valid values are `directory`,
# `extended`, `directory_celery` and `extended_celery`. In extended
# mode jobs will decide if a tool run failed, the object stores
22 changes: 22 additions & 0 deletions lib/galaxy/config/schemas/config_schema.yml
@@ -2788,6 +2788,7 @@ mapping:
Avoiding making this a boolean because we may add options such as 'in-single-form-view'
or 'in-simplified-workflow-views'. https://github.com/galaxyproject/galaxy/pull/9809/files#r461889109
allow_user_dataset_purge:
type: bool
default: true
@@ -3454,6 +3455,26 @@ mapping:
decreased if extremely high job throughput is necessary, but doing so can increase CPU
usage of handler processes. Float values are allowed.
calculate_dataset_hash:
type: str
default: upload
required: false
enum: ['always', 'upload', 'never']
desc: |
In which cases Galaxy should calculate a hash for a new dataset.
Dataset hashes can be used by the Galaxy job cache/search to check if job inputs match.
Setting the 'enable_celery_tasks' option to true is also required for dataset hash calculation.
Possible values are: 'always', 'upload' (the default), 'never'. If set to 'upload', the
hash is calculated only for the outputs of upload jobs.
hash_function:
type: str
default: sha256
required: false
desc: |
Hash function to use if 'calculate_dataset_hash' is enabled. Possible values
are: 'md5', 'sha1', 'sha256', 'sha512'
metadata_strategy:
type: str
required: false
@@ -3547,6 +3568,7 @@ mapping:
default: always
required: false
reloadable: true
enum: ['always', 'onsuccess', 'never']
desc: |
Clean up various bits of jobs left on the filesystem after completion. These
bits include the job working directory, external metadata temporary files,
20 changes: 20 additions & 0 deletions lib/galaxy/jobs/__init__.py
@@ -74,6 +74,7 @@
ObjectStorePopulator,
serialize_static_object_store_config,
)
from galaxy.schema.tasks import ComputeDatasetHashTaskRequest
from galaxy.structured_app import MinimalManagerApp
from galaxy.tool_util.deps import requirements
from galaxy.tool_util.output_checker import (
@@ -2037,6 +2038,25 @@ def fail(message=job.info, exception=None):
dataset.full_delete()
collected_bytes = 0

# Calculate dataset hash
for dataset_assoc in output_dataset_associations:
dataset = dataset_assoc.dataset.dataset
if not dataset.purged and not dataset.hashes:
if self.app.config.calculate_dataset_hash == "always" or (
self.app.config.calculate_dataset_hash == "upload" and job.tool_id in ("upload1", "__DATA_FETCH__")
):
# Calculate dataset hash via a celery task
if self.app.config.enable_celery_tasks:
from galaxy.celery.tasks import compute_dataset_hash

extra_files_path = dataset.extra_files_path if dataset.extra_files_path_exists() else None
request = ComputeDatasetHashTaskRequest(
dataset_id=dataset.id,
extra_files_path=extra_files_path,
hash_function=self.app.config.hash_function,
)
compute_dataset_hash.delay(request=request)

user = job.user
if user and collected_bytes > 0 and quota_source_info is not None and quota_source_info.use:
user.adjust_total_disk_usage(collected_bytes, quota_source_info.label)
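The condition gating the Celery dispatch above can be restated as a standalone predicate. This is a hedged sketch only; the helper name is hypothetical and does not exist in the commit:

```python
def should_hash_new_dataset(config, job_tool_id: str, dataset) -> bool:
    """Simplified mirror of the check added above in lib/galaxy/jobs/__init__.py:
    hash only unpurged datasets without existing hashes, and only when a Celery
    worker can actually run the task."""
    if dataset.purged or dataset.hashes:
        return False
    if not config.enable_celery_tasks:
        return False
    if config.calculate_dataset_hash == "always":
        return True
    # 'upload' restricts hashing to the outputs of upload jobs; 'never' falls through to False.
    return config.calculate_dataset_hash == "upload" and job_tool_id in ("upload1", "__DATA_FETCH__")
```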
30 changes: 5 additions & 25 deletions lib/galaxy_test/api/test_datasets.py
@@ -757,23 +757,13 @@ def test_composite_datatype_download(self, history_id):

def test_compute_md5_on_primary_dataset(self, history_id):
hda = self.dataset_populator.new_dataset(history_id, wait=True)
hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=hda)
assert "hashes" in hda_details, str(hda_details.keys())
hashes = hda_details["hashes"]
assert len(hashes) == 0

self.dataset_populator.compute_hash(hda["id"])
hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=hda)
self.assert_hash_value(hda_details, "940cbe15c94d7e339dc15550f6bdcf4d", "MD5")

def test_compute_sha1_on_composite_dataset(self, history_id):
output = self.dataset_populator.fetch_hda(history_id, COMPOSITE_DATA_FETCH_REQUEST_1, wait=True)
hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=output)
assert "hashes" in hda_details, str(hda_details.keys())
hashes = hda_details["hashes"]
assert len(hashes) == 0

self.dataset_populator.compute_hash(hda_details["id"], hash_function="SHA-256", extra_files_path="Roadmaps")
self.dataset_populator.compute_hash(output["id"], hash_function="SHA-256", extra_files_path="Roadmaps")
hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=output)
self.assert_hash_value(
hda_details,
@@ -784,30 +774,20 @@ def test_duplicated_hash_requests_on_primary(self, history_id):

def test_duplicated_hash_requests_on_primary(self, history_id):
hda = self.dataset_populator.new_dataset(history_id, wait=True)
hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=hda)
assert "hashes" in hda_details, str(hda_details.keys())
hashes = hda_details["hashes"]
assert len(hashes) == 0

self.dataset_populator.compute_hash(hda["id"])
self.dataset_populator.compute_hash(hda["id"])
hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=hda)
self.assert_hash_value(hda_details, "940cbe15c94d7e339dc15550f6bdcf4d", "MD5")

def test_duplicated_hash_requests_on_extra_files(self, history_id):
output = self.dataset_populator.fetch_hda(history_id, COMPOSITE_DATA_FETCH_REQUEST_1, wait=True)
hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=output)
assert "hashes" in hda_details, str(hda_details.keys())
hashes = hda_details["hashes"]
assert len(hashes) == 0

# 4 unique requests, but make them twice...
for _ in range(2):
self.dataset_populator.compute_hash(hda_details["id"], hash_function="SHA-256", extra_files_path="Roadmaps")
self.dataset_populator.compute_hash(hda_details["id"], hash_function="SHA-1", extra_files_path="Roadmaps")
self.dataset_populator.compute_hash(hda_details["id"], hash_function="MD5", extra_files_path="Roadmaps")
self.dataset_populator.compute_hash(output["id"], hash_function="SHA-256", extra_files_path="Roadmaps")
self.dataset_populator.compute_hash(output["id"], hash_function="SHA-1", extra_files_path="Roadmaps")
self.dataset_populator.compute_hash(output["id"], hash_function="MD5", extra_files_path="Roadmaps")
self.dataset_populator.compute_hash(
hda_details["id"], hash_function="SHA-256", extra_files_path="Sequences"
output["id"], hash_function="SHA-256", extra_files_path="Sequences"
)

hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=output)
7 changes: 7 additions & 0 deletions lib/galaxy_test/base/populators.py
@@ -1394,6 +1394,13 @@ def validated():

return wait_on(validated, "dataset validation")

def wait_for_dataset_hashes(self, history_id: str, dataset_id: str):
def dataset_hashes_present():
hda = self.get_history_dataset_details(history_id=history_id, dataset_id=dataset_id)
return hda["hashes"] or None

return wait_on(dataset_hashes_present, "dataset hash presence")

def setup_history_for_export_testing(self, history_name):
using_requirement("new_history")
history_id = self.new_history(name=history_name)
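`wait_for_dataset_hashes` relies on the populator's existing `wait_on` helper, which polls a predicate until it returns a non-None result (the closure above returns None while hashes are still missing). A rough standalone equivalent, with illustrative timeout and delay values (not necessarily Galaxy's defaults):

```python
import time


def wait_on(predicate, what: str, timeout: float = 60.0, delay: float = 0.25):
    """Poll `predicate` until it returns a non-None value or the timeout expires."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        value = predicate()
        if value is not None:
            return value
        time.sleep(delay)
    raise TimeoutError(f"Timed out waiting on {what}")
```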
49 changes: 49 additions & 0 deletions test/integration/test_dataset_hashing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from typing import Optional

from galaxy_test.base.populators import DatasetPopulator
from galaxy_test.driver import integration_util


class TestDatasetHashingIntegration(integration_util.IntegrationTestCase):
dataset_populator: DatasetPopulator
calculate_dataset_hash: Optional[str] = None

def setUp(self) -> None:
super().setUp()
self.dataset_populator = DatasetPopulator(self.galaxy_interactor)

@classmethod
def handle_galaxy_config_kwds(cls, config) -> None:
super().handle_galaxy_config_kwds(config)
if cls.calculate_dataset_hash is not None:
config["enable_celery_tasks"] = True
config["calculate_dataset_hash"] = cls.calculate_dataset_hash

def test_hashing(self, history_id: str) -> None:
hda = self.dataset_populator.new_dataset(history_id, wait=True)
if self.calculate_dataset_hash in [None, "always", "upload"]:
hashes = self.dataset_populator.wait_for_dataset_hashes(history_id=history_id, dataset_id=hda["id"])
assert hashes[0]["hash_value"] == "a17dcdfd36f47303a4824f1309d43ac14d7491ab3b8abb28782ac8e8d3b680ea"
else:
assert hda["hashes"] == [], hda
inputs = {"input1": {"src": "hda", "id": hda["id"]}}
run_response = self.dataset_populator.run_tool_raw("cat1", inputs=inputs, history_id=history_id)
self.dataset_populator.wait_for_tool_run(history_id=history_id, run_response=run_response)
cat_dataset = self.dataset_populator.get_history_dataset_details(history_id=history_id)
if self.calculate_dataset_hash == "always":
hashes = self.dataset_populator.wait_for_dataset_hashes(history_id=history_id, dataset_id=cat_dataset["id"])
assert hashes[0]["hash_value"] == "a17dcdfd36f47303a4824f1309d43ac14d7491ab3b8abb28782ac8e8d3b680ea"
else:
assert cat_dataset["hashes"] == [], cat_dataset


class TestDatasetHashingAlwaysIntegration(TestDatasetHashingIntegration):
calculate_dataset_hash = "always"


class TestDatasetHashingUploadIntegration(TestDatasetHashingIntegration):
calculate_dataset_hash = "upload"


class TestDatasetHashingNeverIntegration(TestDatasetHashingIntegration):
calculate_dataset_hash = "never"
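
The `hash_value` strings asserted in these tests are hex digests of the dataset's file content. A digest like them can be computed locally with Python's hashlib; a minimal sketch (function name and chunk size are illustrative):

```python
import hashlib


def file_digest(path: str, hash_function: str = "sha256", chunk_size: int = 65536) -> str:
    """Compute the hex digest of a file in chunks, e.g. to compare with a dataset's hash_value."""
    digest = hashlib.new(hash_function)  # accepts 'md5', 'sha1', 'sha256', 'sha512'
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
```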
