Commit

tests for publishing
blue442 committed Apr 17, 2024
1 parent 69722ba commit 9af1d5f
Showing 5 changed files with 146 additions and 308 deletions.
97 changes: 27 additions & 70 deletions foundry/foundry.py
@@ -381,20 +381,16 @@ def search_results_to_dataframe(self, results):

     def publish_dataset(self,
                         foundry_dataset: FoundryDataset,
-                        title: str, authors: List[str],
-                        update: bool = False,
-                        publication_year: int = None,
-                        test: bool = False,
-                        **kwargs: Dict[str, Any],) -> Dict[str, Any]:
+                        short_name: str = None,
+                        test: bool = False):
         """Submit a dataset for publication; can choose to submit via HTTPS using `https_data_path` or via Globus
         Transfer using the `globus_data_source` argument. Only one upload method may be specified.
         Args:
-            foundry_metadata (dict): Dict of metadata describing data package
-            title (string): Title of data package
-            authors (list): List of data package author names e.g., Jack Black
-                or Nunez, Victoria
-            update (bool): True if this is an update to a prior data package
-                (default: self.config.metadata_file)
+            short_name (str): Abbreviated name for the dataset. If not specified, the title from the datacite entry
+                will be used.
             test (bool): If True, do not submit the dataset for publication (i.e. transfer to the MDF endpoint).
                 Default is False.
@@ -404,35 +400,28 @@ def publish_dataset(self,
                 of dataset. Contains `source_id`, which can be used to check the
                 status of the submission
         """
-        # strip 'None' values from metadata object and ensure metadata is properly formatted
-        clean_metadata_json = self.remove_none_keys(foundry_dataset.foundry_schema.json())
-        self.validate_metadata(clean_metadata_json)
-
-        # strip 'None' values from datacite object and ensure datacite is properly formatted
-        clean_dc_json = self.remove_none_keys(foundry_dataset.dc.json())
-        self.validate(clean_dc_json)
+        # set dataset name using the title if an abbreviated short_name isn't specified
+        if short_name is None:
+            short_name = foundry_dataset.dc.titles[0].title
 
-        # ensure that one of `https_data_path` or `globus_data_source` have been assigned values
-        if (foundry_dataset.https_data_path and foundry_dataset.globus_data_source) or \
-                (foundry_dataset.https_data_path is None and foundry_dataset.globus_data_source is None):
-            raise ValueError("Must assign either `https_data_path` or `globus_data_source`")
-
-        self.connect_client.create_dc_block(
-            title=title,
-            authors=authors,
-            affiliations=kwargs.get("affiliations", []),
-            subjects=kwargs.get("tags", ["machine learning", "foundry"]),
-            publisher=kwargs.get("publisher", ""),
-            publication_year=publication_year,
-            description=kwargs.get("description", ""),
-            dataset_doi=kwargs.get("dataset_doi", ""),
-            related_dois=kwargs.get("related_dois", [])
-        )
-        self.connect_client.add_organization(self.organization)
-        self.connect_client.set_project_block(self.config.metadata_key)
+        if (not hasattr(foundry_dataset, '_https_data_path') and not hasattr(foundry_dataset, '_globus_data_source')):
+            raise ValueError("Must add data to your FoundryDataset object (use the FoundryDataset.add_data() method) before publishing")
+        if (hasattr(foundry_dataset, '_https_data_path') and hasattr(foundry_dataset, '_globus_data_source')):
+            raise ValueError("Dataset cannot contain both `https_data_path` and `globus_data_source` attributes. "
+                             "Choose one by adding via the FoundryDataset.add() method.")
+        if (hasattr(foundry_dataset, '_https_data_path') and
+                foundry_dataset._https_data_path is None) or \
+                (hasattr(foundry_dataset, '_globus_data_source') and foundry_dataset._globus_data_source is None):
+            raise ValueError("Must assign a value to `https_data_path` OR `globus_data_source` in your FoundryDataset object - "
+                             "use the FoundryDataset.add_data() method (cannot be None)")
 
+        self.connect_client.dc = foundry_dataset.clean_dc_dict()
+        self.connect_client.set_organization(self.organization)
+        self.connect_client.set_project_block("foundry", foundry_dataset)
 
         # upload via HTTPS if specified
-        if foundry_dataset.https_data_path:
+        if foundry_dataset._https_data_path:
             # gather auth'd clients necessary for publication to endpoint
             endpoint_id = "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec"  # NCSA endpoint
             scope = f"https://auth.globus.org/scopes/{endpoint_id}/https"  # lets you HTTPS to specific endpoint
@@ -442,11 +431,13 @@ def publish_dataset(self,
                 endpoint_auth_clients={endpoint_id: AuthClient(authorizer=self.auths[scope])}
             )
             # upload (ie publish) data to endpoint
-            globus_data_source = upload_to_endpoint(pub_auths, foundry_dataset.https_data_path, endpoint_id)
+            globus_data_source = upload_to_endpoint(pub_auths, foundry_dataset._https_data_path, endpoint_id)
         else:
-            # set Globus data source URL with MDF
+            globus_data_source = foundry_dataset._globus_data_source
+        # set Globus data source URL with MDF
         self.connect_client.add_data_source(globus_data_source)
-        # set dataset name using the title if an abbreviated short_name isn't specified
-        self.connect_client.set_source_name(kwargs.get("short_name", title))
+        self.connect_client.set_source_name(short_name)
 
         # do not submit to MDF if this is just a test
         if not test:
@@ -456,32 +447,6 @@ def publish_dataset(self,
             res = None
         return res
 
-    def publish_model(self, title, creators, short_name, servable_type, serv_options, affiliations=None, paper_doi=None):
-        """Simplified publishing method for servables
-        Args:
-            title (string): title for the servable
-            creators (string | list): either the creator's name (FamilyName, GivenName) or a list of the creators' names
-            short_name (string): shorthand name for the servable
-            servable_type (string): the type of the servable, must be a member of ("static_method",
-                                                                                   "class_method",
-                                                                                   "keras",
-                                                                                   "pytorch",
-                                                                                   "tensorflow",
-                                                                                   "sklearn")
-            serv_options (dict): the servable_type specific arguments that are necessary for publishing. arguments can be found at
-                https://dlhub-sdk.readthedocs.io/en/latest/source/dlhub_sdk.models.servables.html under the appropriate
-                ``create_model`` signature. use the argument names as keys and their values as the values.
-            affiliations (list): list of affiliations for each author
-            paper_doi (str): DOI of a paper that describes the servable
-        Returns:
-            (string): task id of this submission, can be used to check for success
-        Raises:
-            ValueError: If the given servable_type is not in the list of acceptable types
-            Exception: If the serv_options are incomplete or the request to publish results in an error
-        """
-        return self.dlhub_client.easy_publish(title, creators, short_name, servable_type, serv_options, affiliations, paper_doi)
 
     def check_status(self, source_id, short=False, raw=False):
         """Check the status of your submission.
@@ -502,11 +467,3 @@
             If ``raw`` is ``True``, *dict*: The full status result.
         """
         return self.connect_client.check_status(source_id, short, raw)
-
-    # def check_model_status(self, res):
-    #     """Check status of model or function publication to DLHub
-    #
-    #     TODO: currently broken on DLHub side of things
-    #     """
-    #     # return self.dlhub_client.get_task_status(res)
-    #     pass
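
Note: taken together, these foundry.py changes replace the kwargs-driven publish call with one that reads everything from a FoundryDataset object. A minimal sketch of the new flow follows; the Foundry() and FoundryDataset() constructor arguments are illustrative assumptions, since only the add_data(), publish_dataset(), and check_status() signatures are pinned down by this diff.

    from foundry.foundry import Foundry
    from foundry.foundry_dataset import FoundryDataset

    # Hypothetical datacite metadata; a real entry must pass FoundryDatacite validation
    dc_entry = {
        "identifier": {"identifier": "10.12345/example-doi", "identifierType": "DOI"},
        "titles": [{"title": "My Example Dataset"}],
    }

    f = Foundry()  # assumed default auth flow
    dataset = FoundryDataset(dataset_name="my_example_dataset", datacite_entry=dc_entry)

    # Attach data via exactly one transport: a local path (uploaded over HTTPS) ...
    dataset.add_data(https_data_path="./data/my_example_dataset")
    # ... or a Globus data source URL; calling add_data() again swaps transports,
    # since each call delattr's the private attribute the other transport set:
    # dataset.add_data(globus_data_source="<globus source URL>")

    # test=True exercises validation and the upload wiring but skips the MDF
    # submission, so the call returns None; a real run returns a result dict
    # whose source_id can be polled
    res = f.publish_dataset(dataset, short_name="my_example_dataset", test=True)
    if res is not None:
        print(f.check_status(res["source_id"], short=True))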
35 changes: 32 additions & 3 deletions foundry/foundry_dataset.py
@@ -1,3 +1,4 @@
+import json
 import logging
 import os

@@ -135,7 +136,7 @@ def add_data(self, https_data_path: str = None, globus_data_source: str = None):
         either a `globus_data_source` or `https_data_path`.
         Arguments:
-            https_data_path (str): Path to the local dataset to publish to Foundry via HTTPS. Creates an HTTPS PUT
+            https_data_path (str): Local path to the dataset used to publish to Foundry via HTTPS. Creates an HTTPS PUT
                 request to upload the data specified to a Globus endpoint (default is NCSA endpoint) before it is
                 transferred to MDF. If None, the user must specify a 'globus_data_source' URL to the location of the
                 data on their own Globus endpoint. User must choose either `globus_data_source` or `https_data_path` to
@@ -146,11 +147,39 @@ def add_data(self, https_data_path: str = None, globus_data_source: str = None):
                 publish their data.
         """
         if https_data_path is None and globus_data_source is None:
             raise ValueError("User must provide either a path to the data on their local machine or a URL to the data "
                              "on their Globus endpoint.")
         if https_data_path is None:
-            self.globus_data_source = globus_data_source
+            self._globus_data_source = globus_data_source
+            if hasattr(self, '_https_data_path'):
+                delattr(self, '_https_data_path')
         if globus_data_source is None:
-            self.https_data_path = https_data_path
+            if os.path.isdir(https_data_path) or os.path.isfile(https_data_path):
+                self._https_data_path = https_data_path
+                if hasattr(self, '_globus_data_source'):
+                    delattr(self, '_globus_data_source')
+            else:
+                raise ValueError("The path provided does not exist or is not a file or directory.")
 
+    def clear_dataset_cache(self):
+        """Deletes the cached data for this specific dataset"""
+        self._foundry_cache.clear_cache(self.dataset_name)
+
+    def clean_dc_dict(self):
+        """Clean the Datacite dictionary of None values"""
+        return self.delete_none(json.loads(self.dc.json()))
+
+    def delete_none(self, _dict):
+        """Delete None values recursively from all of the dictionaries"""
+        for key, value in list(_dict.items()):
+            if isinstance(value, dict):
+                self.delete_none(value)
+            elif value is None:
+                del _dict[key]
+            elif isinstance(value, list):
+                for v_i in value:
+                    if isinstance(v_i, dict):
+                        self.delete_none(v_i)
+
+        return _dict
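
Note: clean_dc_dict() round-trips the pydantic model through JSON and hands the result to delete_none(), which strips None values from dicts at any depth (including dicts nested inside lists) but leaves a bare None that sits directly in a list. A standalone sketch of the same algorithm applied to a throwaway dict:

    # Same traversal as FoundryDataset.delete_none, written as a free function
    def delete_none(d):
        for key, value in list(d.items()):  # list() copy so we can delete while iterating
            if isinstance(value, dict):
                delete_none(value)
            elif value is None:
                del d[key]
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict):
                        delete_none(item)
        return d

    record = {
        "titles": [{"title": "Example", "lang": None}],  # None in a nested dict: removed
        "publisher": None,                               # top-level None: removed
        "subjects": ["materials", None],                 # None directly in a list: kept
    }
    print(delete_none(record))
    # {'titles': [{'title': 'Example'}], 'subjects': ['materials', None]}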
15 changes: 7 additions & 8 deletions foundry/models.py
@@ -107,15 +107,14 @@ class is an auto-generated pydantic version of the json schema; this class exten
     Raises:
         ValidationError: If there is an issue validating the datacite data.
     """
-    def __init__(self, datacite_dict):
+    def __init__(self, datacite_dict, extra=Extra.allow):
         try:
-            # modify the datacite_entry to match the expected format
-            dc = copy.deepcopy(datacite_dict)
-
-            if 'identifier' in dc.keys():
-                if 'identifier' in dc['identifier'].keys():
-                    dc['identifier']['identifier'] = {'__root__': datacite_dict['identifier']['identifier']}
-            super(FoundryDatacite, self).__init__(**dc)
+            dc_dict = copy.deepcopy(datacite_dict)
+            if 'identifier' in dc_dict.keys():
+                if 'identifier' in dc_dict['identifier'].keys():
+                    dc_dict['identifier']['identifier'] = {'__root__': datacite_dict['identifier']['identifier']}
+            super(FoundryDatacite, self).__init__(**dc_dict)
             print("Datacite validation successful.")
         except ValidationError as e:
             print("Datacite validation failed!")
@@ -152,7 +151,7 @@ class FoundryBase(BaseModel, extra=Extra.allow):
     local: Optional[bool] = False
     local_cache_dir = "./data"
     metadata_key: Optional[str] = "foundry"
-    organization: Optional[str] = "foundry"
+    organization: Optional[str] = "Foundry"
 
     def _repr_html_(self):
         return convert(json.loads(self.json()))
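
Note: the rewritten FoundryDatacite.__init__ deep-copies its input and wraps the nested identifier value in a '__root__' key, which is how pydantic v1 represents a custom root type on the auto-generated model. The same transformation on a plain dict, without the pydantic machinery (the DOI is a placeholder):

    import copy

    datacite_dict = {
        "identifier": {"identifier": "10.12345/example-doi", "identifierType": "DOI"},
        "titles": [{"title": "Example dataset"}],
    }

    dc_dict = copy.deepcopy(datacite_dict)
    if 'identifier' in dc_dict.keys():
        if 'identifier' in dc_dict['identifier'].keys():
            dc_dict['identifier']['identifier'] = {'__root__': datacite_dict['identifier']['identifier']}

    print(dc_dict['identifier'])
    # {'identifier': {'__root__': '10.12345/example-doi'}, 'identifierType': 'DOI'}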
