Commit

tests for publishing
blue442 committed Apr 17, 2024
1 parent 69722ba commit 9af1d5f
Showing 5 changed files with 146 additions and 308 deletions.
97 changes: 27 additions & 70 deletions foundry/foundry.py
@@ -381,20 +381,16 @@ def search_results_to_dataframe(self, results):

     def publish_dataset(self,
                         foundry_dataset: FoundryDataset,
-                        title: str, authors: List[str],
-                        update: bool = False,
-                        publication_year: int = None,
-                        test: bool = False,
-                        **kwargs: Dict[str, Any],) -> Dict[str, Any]:
+                        short_name: str = None,
+                        test: bool = False):
         """Submit a dataset for publication; can choose to submit via HTTPS using `https_data_path` or via Globus
         Transfer using the `globus_data_source` argument. Only one upload method may be specified.
         Args:
-            foundry_metadata (dict): Dict of metadata describing data package
-            title (string): Title of data package
-            authors (list): List of data package author names e.g., Jack Black
-                or Nunez, Victoria
-            update (bool): True if this is an update to a prior data package
-                (default: self.config.metadata_file)
+            short_name (str): Abbreviated name for the dataset. If not specified, the title from the datacite entry
+                will be used.
             test (bool): If True, do not submit the dataset for publication (i.e. transfer to the MDF endpoint).
                 Default is False.
@@ -404,35 +400,28 @@ def publish_dataset(self,
                 of dataset. Contains `source_id`, which can be used to check the
                 status of the submission
         """
-        # strip 'None' values from metadata object and ensure metadata is properly formatted
-        clean_metadata_json = self.remove_none_keys(foundry_dataset.foundry_schema.json())
-        self.validate_metadata(clean_metadata_json)
-
-        # strip 'None' values from datacite object and ensure datacite is properly formatted
-        clean_dc_json = self.remove_none_keys(foundry_dataset.dc.json())
-        self.validate(clean_dc_json)
+        # set dataset name using the title if an abbreviated short_name isn't specified
+        if short_name is None:
+            short_name = foundry_dataset.dc.titles[0].title
 
-        # ensure that one of `https_data_path` or `globus_data_source` have been assigned values
-        if (foundry_dataset.https_data_path and foundry_dataset.globus_data_source) or \
-                (foundry_dataset.https_data_path is None and foundry_dataset.globus_data_source is None):
-            raise ValueError("Must assign either `https_data_path` or `globus_data_source`")
-
-        self.connect_client.create_dc_block(
-            title=title,
-            authors=authors,
-            affiliations=kwargs.get("affiliations", []),
-            subjects=kwargs.get("tags", ["machine learning", "foundry"]),
-            publisher=kwargs.get("publisher", ""),
-            publication_year=publication_year,
-            description=kwargs.get("description", ""),
-            dataset_doi=kwargs.get("dataset_doi", ""),
-            related_dois=kwargs.get("related_dois", [])
-        )
-        self.connect_client.add_organization(self.organization)
-        self.connect_client.set_project_block(self.config.metadata_key)
+        if (not hasattr(foundry_dataset, '_https_data_path') and not hasattr(foundry_dataset, '_globus_data_source')):
+            raise ValueError("Must add data to your FoundryDataset object (use the FoundryDataset.add_data() method) before publishing")
+        if (hasattr(foundry_dataset, '_https_data_path') and hasattr(foundry_dataset, '_globus_data_source')):
+            raise ValueError("Dataset cannot contain both `https_data_path` and `globus_data_source` attributes. "
+                             "Choose one by adding via the FoundryDataset.add() method.")
+        if (hasattr(foundry_dataset, '_https_data_path') and
+                foundry_dataset._https_data_path is None) or \
+                (hasattr(foundry_dataset, '_globus_data_source') and foundry_dataset._globus_data_source is None):
+            raise ValueError("Must assign a value to `https_data_path` OR `globus_data_source` in your FoundryDataset object - "
+                             "use the FoundryDataset.add_data() method (cannot be None)")
 
+        self.connect_client.dc = foundry_dataset.clean_dc_dict()
+        self.connect_client.set_organization(self.organization)
+        self.connect_client.set_project_block("foundry", foundry_dataset)
 
         # upload via HTTPS if specified
-        if foundry_dataset.https_data_path:
+        if foundry_dataset._https_data_path:
             # gather auth'd clients necessary for publication to endpoint
             endpoint_id = "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec"  # NCSA endpoint
             scope = f"https://auth.globus.org/scopes/{endpoint_id}/https"  # lets you HTTPS to specific endpoint
@@ -442,11 +431,13 @@ def publish_dataset(self,
                 endpoint_auth_clients={endpoint_id: AuthClient(authorizer=self.auths[scope])}
             )
             # upload (ie publish) data to endpoint
-            globus_data_source = upload_to_endpoint(pub_auths, foundry_dataset.https_data_path, endpoint_id)
+            globus_data_source = upload_to_endpoint(pub_auths, foundry_dataset._https_data_path, endpoint_id)
         else:
-            # set Globus data source URL with MDF
+            globus_data_source = foundry_dataset._globus_data_source
+        # set Globus data source URL with MDF
         self.connect_client.add_data_source(globus_data_source)
-        # set dataset name using the title if an abbreviated short_name isn't specified
-        self.connect_client.set_source_name(kwargs.get("short_name", title))
+        self.connect_client.set_source_name(short_name)
 
         # do not submit to MDF if this is just a test
         if not test:
@@ -456,32 +447,6 @@ def publish_dataset(self,
             res = None
         return res
 
-    def publish_model(self, title, creators, short_name, servable_type, serv_options, affiliations=None, paper_doi=None):
-        """Simplified publishing method for servables
-        Args:
-            title (string): title for the servable
-            creators (string | list): either the creator's name (FamilyName, GivenName) or a list of the creators' names
-            short_name (string): shorthand name for the servable
-            servable_type (string): the type of the servable, must be a member of ("static_method",
-                                                                                   "class_method",
-                                                                                   "keras",
-                                                                                   "pytorch",
-                                                                                   "tensorflow",
-                                                                                   "sklearn")
-            serv_options (dict): the servable_type specific arguments that are necessary for publishing. arguments can be found at
-                https://dlhub-sdk.readthedocs.io/en/latest/source/dlhub_sdk.models.servables.html under the appropriate
-                ``create_model`` signature. use the argument names as keys and their values as the values.
-            affiliations (list): list of affiliations for each author
-            paper_doi (str): DOI of a paper that describes the servable
-        Returns:
-            (string): task id of this submission, can be used to check for success
-        Raises:
-            ValueError: If the given servable_type is not in the list of acceptable types
-            Exception: If the serv_options are incomplete or the request to publish results in an error
-        """
-        return self.dlhub_client.easy_publish(title, creators, short_name, servable_type, serv_options, affiliations, paper_doi)
 
     def check_status(self, source_id, short=False, raw=False):
         """Check the status of your submission.
@@ -502,11 +467,3 @@
             If ``raw`` is ``True``, *dict*: The full status result.
         """
         return self.connect_client.check_status(source_id, short, raw)
-
-    # def check_model_status(self, res):
-    #     """Check status of model or function publication to DLHub
-    #
-    #     TODO: currently broken on DLHub side of things
-    #     """
-    #     # return self.dlhub_client.get_task_status(res)
-    #     pass
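
Note: taken together, these foundry.py changes replace the kwargs-driven publish call with one that reads everything from a FoundryDataset object. A minimal sketch of the new flow follows; the Foundry() and FoundryDataset() constructor arguments are illustrative assumptions, since only the add_data(), publish_dataset(), and check_status() signatures are pinned down by this diff.

    from foundry.foundry import Foundry
    from foundry.foundry_dataset import FoundryDataset

    # Hypothetical datacite metadata; a real entry must pass FoundryDatacite validation
    dc_entry = {
        "identifier": {"identifier": "10.12345/example-doi", "identifierType": "DOI"},
        "titles": [{"title": "My Example Dataset"}],
    }

    f = Foundry()  # assumed default auth flow
    dataset = FoundryDataset(dataset_name="my_example_dataset", datacite_entry=dc_entry)

    # Attach data via exactly one transport: a local path (uploaded over HTTPS) ...
    dataset.add_data(https_data_path="./data/my_example_dataset")
    # ... or a Globus data source URL; calling add_data() again swaps transports,
    # since each call delattr's the private attribute the other transport set:
    # dataset.add_data(globus_data_source="<globus source URL>")

    # test=True exercises validation and the upload wiring but skips the MDF
    # submission, so the call returns None; a real run returns a result dict
    # whose source_id can be polled
    res = f.publish_dataset(dataset, short_name="my_example_dataset", test=True)
    if res is not None:
        print(f.check_status(res["source_id"], short=True))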
35 changes: 32 additions & 3 deletions foundry/foundry_dataset.py
@@ -1,3 +1,4 @@
+import json
 import logging
 import os

@@ -135,7 +136,7 @@ def add_data(self, https_data_path: str = None, globus_data_source: str = None):
         either a `globus_data_source` or `https_data_path`.
         Arguments:
-            https_data_path (str): Path to the local dataset to publish to Foundry via HTTPS. Creates an HTTPS PUT
+            https_data_path (str): Local path to the dataset used to publish to Foundry via HTTPS. Creates an HTTPS PUT
                 request to upload the data specified to a Globus endpoint (default is NCSA endpoint) before it is
                 transferred to MDF. If None, the user must specify a 'globus_data_source' URL to the location of the
                 data on their own Globus endpoint. User must choose either `globus_data_source` or `https_data_path` to
@@ -146,11 +147,39 @@ def add_data(self, https_data_path: str = None, globus_data_source: str = None):
                 publish their data.
         """
         if https_data_path is None and globus_data_source is None:
             raise ValueError("User must provide either a path to the data on their local machine or a URL to the data "
                              "on their Globus endpoint.")
         if https_data_path is None:
-            self.globus_data_source = globus_data_source
+            self._globus_data_source = globus_data_source
+            if hasattr(self, '_https_data_path'):
+                delattr(self, '_https_data_path')
         if globus_data_source is None:
-            self.https_data_path = https_data_path
+            if os.path.isdir(https_data_path) or os.path.isfile(https_data_path):
+                self._https_data_path = https_data_path
+                if hasattr(self, '_globus_data_source'):
+                    delattr(self, '_globus_data_source')
+            else:
+                raise ValueError("The path provided does not exist or is not a file or directory.")
 
+    def clear_dataset_cache(self):
+        """Deletes the cached data for this specific dataset"""
+        self._foundry_cache.clear_cache(self.dataset_name)
+
+    def clean_dc_dict(self):
+        """Clean the Datacite dictionary of None values"""
+        return self.delete_none(json.loads(self.dc.json()))
+
+    def delete_none(self, _dict):
+        """Delete None values recursively from all of the dictionaries"""
+        for key, value in list(_dict.items()):
+            if isinstance(value, dict):
+                self.delete_none(value)
+            elif value is None:
+                del _dict[key]
+            elif isinstance(value, list):
+                for v_i in value:
+                    if isinstance(v_i, dict):
+                        self.delete_none(v_i)
+
+        return _dict
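
Note: clean_dc_dict() round-trips the pydantic model through JSON and hands the result to delete_none(), which strips None values from dicts at any depth (including dicts nested inside lists) but leaves a bare None that sits directly in a list. A standalone sketch of the same algorithm applied to a throwaway dict:

    # Same traversal as FoundryDataset.delete_none, written as a free function
    def delete_none(d):
        for key, value in list(d.items()):  # list() copy so we can delete while iterating
            if isinstance(value, dict):
                delete_none(value)
            elif value is None:
                del d[key]
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict):
                        delete_none(item)
        return d

    record = {
        "titles": [{"title": "Example", "lang": None}],  # None in a nested dict: removed
        "publisher": None,                               # top-level None: removed
        "subjects": ["materials", None],                 # None directly in a list: kept
    }
    print(delete_none(record))
    # {'titles': [{'title': 'Example'}], 'subjects': ['materials', None]}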
15 changes: 7 additions & 8 deletions foundry/models.py
@@ -107,15 +107,14 @@ class is an auto-generated pydantic version of the json schema; this class exten
     Raises:
         ValidationError: If there is an issue validating the datacite data.
     """
-    def __init__(self, datacite_dict):
+    def __init__(self, datacite_dict, extra=Extra.allow):
         try:
-            # modify the datacite_entry to match the expected format
-            dc = copy.deepcopy(datacite_dict)
-
-            if 'identifier' in dc.keys():
-                if 'identifier' in dc['identifier'].keys():
-                    dc['identifier']['identifier'] = {'__root__': datacite_dict['identifier']['identifier']}
-            super(FoundryDatacite, self).__init__(**dc)
+            dc_dict = copy.deepcopy(datacite_dict)
+            if 'identifier' in dc_dict.keys():
+                if 'identifier' in dc_dict['identifier'].keys():
+                    dc_dict['identifier']['identifier'] = {'__root__': datacite_dict['identifier']['identifier']}
+            super(FoundryDatacite, self).__init__(**dc_dict)
             print("Datacite validation successful.")
         except ValidationError as e:
             print("Datacite validation failed!")
@@ -152,7 +151,7 @@ class FoundryBase(BaseModel, extra=Extra.allow):
     local: Optional[bool] = False
     local_cache_dir = "./data"
     metadata_key: Optional[str] = "foundry"
-    organization: Optional[str] = "foundry"
+    organization: Optional[str] = "Foundry"
 
     def _repr_html_(self):
         return convert(json.loads(self.json()))
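
Note: the rewritten FoundryDatacite.__init__ deep-copies its input and wraps the nested identifier value in a '__root__' key, which is how pydantic v1 represents a custom root type on the auto-generated model. The same transformation on a plain dict, without the pydantic machinery (the DOI is a placeholder):

    import copy

    datacite_dict = {
        "identifier": {"identifier": "10.12345/example-doi", "identifierType": "DOI"},
        "titles": [{"title": "Example dataset"}],
    }

    dc_dict = copy.deepcopy(datacite_dict)
    if 'identifier' in dc_dict.keys():
        if 'identifier' in dc_dict['identifier'].keys():
            dc_dict['identifier']['identifier'] = {'__root__': datacite_dict['identifier']['identifier']}

    print(dc_dict['identifier'])
    # {'identifier': {'__root__': '10.12345/example-doi'}, 'identifierType': 'DOI'}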
