Commit 74bc1e1

Merge pull request #5 from ArcanaFramework/id-patterns

Id patterns

tclose authored Apr 11, 2023
2 parents 39cc599 + cb7a334

Showing 8 changed files with 100 additions and 120 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
@@ -33,10 +33,10 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: Update build tools
-        run: python -m pip install --upgrade pip flit_scm
+        run: python -m pip install --upgrade pip
 
       - name: Install Arcana
-        run: python -m pip install .[test]
+        run: python -m pip install .[test] fileformats-testing
 
       - name: Pytest
         run: pytest -vvs --cov arcana.bids --cov-config .coveragerc --cov-report xml
8 changes: 4 additions & 4 deletions README.rst
@@ -9,10 +9,10 @@ Arcana Extension - bids
    :alt: Python versions
 .. image:: https://img.shields.io/pypi/v/arcana-bids.svg
    :target: https://pypi.python.org/pypi/arcana-bids/
-   :alt: Latest Version
-.. image:: https://github.com/ArcanaFramework/arcana/actions/workflows/docs.yml/badge.svg
-   :target: http://arcana.readthedocs.io/en/latest/?badge=latest
-   :alt: Docs
+   :alt: Latest Version
+.. image:: https://readthedocs.org/projects/arcana/badge/?version=latest
+   :target: https://arcanaframework.github.io/arcana
+   :alt: Documentation Status
 
 
 An extension of the Arcana framework to work with Brain Imaging Data Structure (BIDS)
3 changes: 1 addition & 2 deletions arcana/bids/cli.py
@@ -15,8 +15,7 @@ def bids_group():
 @bids_group.command(
     name="app-entrypoint",
     help="""Loads a dataset, or creates one if it is not already present, then applies and
-launches a pipeline in a single command. To be used within the command configuration
-of an XNAT Container Service ready Docker image.
+launches a pipeline in a single command. To be used inside BidsApp images.
 
 DATASET_LOCATOR string containing the nickname of the data store, the ID of the
 dataset (e.g. XNAT project ID or file-system directory) and the dataset's name
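For reference, the locator can be sketched in Python; the "store//id@name" layout and the "bids" store nickname below are assumptions inferred from this help text, not confirmed by the diff:

    # Hypothetical locator composition; the "store//id@name" layout is an assumption.
    store_nickname = "bids"            # assumed nickname of the data store
    dataset_id = "/data/my-dataset"    # file-system directory used as the dataset ID
    dataset_name = "main"              # assumed dataset name
    print(f"{store_nickname}//{dataset_id}@{dataset_name}")  # bids///data/my-dataset@main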
163 changes: 71 additions & 92 deletions arcana/bids/data.py
@@ -4,7 +4,6 @@
 import re
 import logging
 from operator import itemgetter
-from copy import copy
 import attrs
 import jq
 from pathlib import Path
@@ -15,7 +14,7 @@
 from arcana.core.exceptions import ArcanaUsageError
 from arcana.core.data.tree import DataTree
 from arcana.core.data.set import Dataset
-from arcana.core.data.space import Clinical
+from arcana.stdlib import Clinical
 from arcana.core.data.entry import DataEntry
 from arcana.core.data.row import DataRow

@@ -69,11 +68,19 @@ class Bids(LocalStore):
     name: str = "bids"
 
     BIDS_VERSION = "1.0.1"
+    DEFAULT_SPACE = Clinical
 
     PROV_SUFFIX = ".provenance"
     FIELDS_FNAME = "__fields__"
     FIELDS_PROV_FNAME = "__fields_provenance__"
 
+    VALID_HIERARCHIES = (
+        ["subject", "timepoint"],
+        ["session"],
+        ["group", "subject", "timepoint"],
+        ["group", "session"],
+    )
+
     #################################
     # Abstract-method implementations
     #################################
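The new VALID_HIERARCHIES constant drives the guard added to create_data_tree further down in this file. A minimal standalone sketch of that membership check, with the constant copied from the diff and a hypothetical wrapper function:

    VALID_HIERARCHIES = (
        ["subject", "timepoint"],
        ["session"],
        ["group", "subject", "timepoint"],
        ["group", "session"],
    )

    def check_hierarchy(hierarchy: list) -> None:
        # membership in a tuple of lists uses list equality, so order matters
        if hierarchy not in VALID_HIERARCHIES:
            raise ValueError(
                f"Invalid hierarchy {hierarchy}, needs to be one of:\n"
                + "\n".join(str(h) for h in VALID_HIERARCHIES)
            )

    check_hierarchy(["group", "session"])  # passes silently
    # check_hierarchy(["timepoint"])       # would raise ValueError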
@@ -89,11 +96,10 @@ def populate_tree(self, tree: DataTree):
             The dataset to construct the tree dimensions for
         """
         root_dir = Path(tree.dataset.id)
-        participants_fspath = root_dir / "participants.tsv"
-        participants = {}
-        if participants_fspath.exists():
-            with open(participants_fspath) as f:
+        if "group" in tree.dataset.hierarchy:
+            with open(root_dir / "participants.tsv") as f:
                 lines = f.read().splitlines()
+            participants = {}
             if lines:
                 participant_keys = lines[0].split("\t")
                 for line in lines[1:]:
@@ -103,18 +109,17 @@
             if not subject_dir.name.startswith("sub-"):
                 continue
             subject_id = subject_dir.name[len("sub-") :]
-            try:
-                additional_ids = {"group": participants[subject_id]["group"]}
-            except KeyError:
-                additional_ids = {}
+            if "group" in tree.dataset.hierarchy:
+                tree_path = [participants[subject_id]["group"]]
+            else:
+                tree_path = []
+            tree_path.append(subject_id)
             if any(d.name.startswith("ses-") for d in subject_dir.iterdir()):
                 for sess_dir in subject_dir.iterdir():
                     timepoint_id = sess_dir.name[len("ses-") :]
-                    sess_add_ids = copy(additional_ids)
-                    sess_add_ids["session"] = f"sub-{subject_id}_ses-{timepoint_id}"
-                    tree.add_leaf([subject_id, timepoint_id], additional_ids=sess_add_ids)
+                    tree.add_leaf(tree_path + [timepoint_id])
             else:
-                tree.add_leaf([subject_id], additional_ids=additional_ids)
+                tree.add_leaf([subject_id])
 
     def populate_row(self, row: DataRow):
         root_dir = row.dataset.root_dir
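As context for the populate_tree change, a minimal sketch of how participants.tsv can be parsed into the participants mapping the group lookup relies on (the helper name and exact file layout are assumptions):

    from pathlib import Path

    def read_participants(root_dir: Path) -> dict:
        # assumes a tab-separated file whose header starts with "participant_id"
        lines = (root_dir / "participants.tsv").read_text().splitlines()
        keys = lines[0].split("\t")  # e.g. ["participant_id", "group"]
        participants = {}
        for line in lines[1:]:
            values = line.split("\t")
            # strip the "sub-" prefix so keys match bare subject IDs
            participants[values[0][len("sub-"):]] = dict(zip(keys[1:], values[1:]))
        return participants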
@@ -167,7 +172,7 @@ def fileset_uri(self, path: str, datatype: type, row: DataRow) -> str:
         if dataset_name is None:
             base_uri = ""
         elif not dataset_name:
-            base_uri = f"derivatives/{Dataset.EMPTY_NAME}"
+            base_uri = f"derivatives/{self.EMPTY_DATASET_NAME}"
         else:
             base_uri = f"derivatives/{dataset_name}"
         return base_uri + str(
@@ -188,7 +193,7 @@ def field_uri(self, path: str, datatype: type, row: DataRow) -> str:
         if dataset_name is None:
             base_uri = ""
         elif not dataset_name:
-            base_uri = f"derivatives/{Dataset.EMPTY_NAME}"
+            base_uri = f"derivatives/{self.EMPTY_DATASET_NAME}"
         else:
             base_uri = f"derivatives/{dataset_name}"
         try:
@@ -268,51 +273,49 @@ def create_data_tree(
         id: str,
         leaves: list[tuple[str, ...]],
         hierarchy: list[str],
-        id_composition: dict[str, str] = None,
         **kwargs
     ):
+        if hierarchy not in self.VALID_HIERARCHIES:
+            raise ArcanaUsageError(
+                f"Invalid hierarchy {hierarchy} provided to create a new data tree "
+                f"needs to be one of the following:\n"
+                + "\n".join(str(h) for h in self.VALID_HIERARCHIES)
+            )
         root_dir = Path(id)
         root_dir.mkdir(parents=True)
         # Create sub-directories corresponding to rows of the dataset
         group_ids = set()
-        subject_group_ids = {}
+        subjects_group_id = {}
         for ids_tuple in leaves:
             ids = dict(zip(hierarchy, ids_tuple))
-            # Add in composed IDs
-            ids.update(Dataset.decompose_ids(ids, id_composition))
             if "session" in hierarchy:
                 subject_id = ids["session"]
                 timepoint_id = None
                 assert "subject" not in ids
                 assert "timepoint" not in ids
             else:
-                try:
-                    subject_id = ids["subject"]
-                    timepoint_id = ids["timepoint"]
-                    assert "session" not in ids
-                except KeyError:
-                    subject_id = ids["session"]
+                subject_id = ids["subject"]
+                timepoint_id = ids.get("timepoint")
             group_id = ids.get("group")
             if group_id:
                 group_ids.add(group_id)
-                subject_group_ids[subject_id] = group_id
+                subjects_group_id[subject_id] = group_id
             sess_dir_fspath = root_dir / self._entry2fs_path(
                 entry_path=None, subject_id=subject_id, timepoint_id=timepoint_id
             )
-            sess_dir_fspath.mkdir(parents=True)
+            sess_dir_fspath.mkdir(parents=True, exist_ok=True)
         # Add participants.tsv to define the groups if present
         if group_ids:
             with open(root_dir / "participants.tsv", "w") as f:
                 f.write("participant_id\tgroup\n")
-                for subject_id, group_id in subject_group_ids.items():
+                for subject_id, group_id in subjects_group_id.items():
                     f.write(f"sub-{subject_id}\t{group_id}\n")
 
     ####################
     # Overrides of API #
     ####################
 
-    def save_dataset(
-        self, dataset: Dataset, name: str = None, overwrite_bids_metadata: bool = False
-    ):
+    def save_dataset(self, dataset: Dataset, name: str = None):
         super().save_dataset(dataset, name=name)
-        self._save_metadata(dataset, overwrite_bids_metadata=overwrite_bids_metadata)
+        self._save_metadata(dataset)
 
     def create_dataset(
         self,
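A usage sketch of what the rewritten create_data_tree produces for the ["group", "session"] hierarchy; this is a simplified, hypothetical re-implementation for illustration, with invented paths and IDs:

    from pathlib import Path

    def sketch_create_tree(root: Path, leaves, hierarchy):
        # simplified stand-in for Bids.create_data_tree, for illustration only
        subjects_group_id = {}
        for ids_tuple in leaves:
            ids = dict(zip(hierarchy, ids_tuple))
            subject_id = ids["session"] if "session" in hierarchy else ids["subject"]
            if ids.get("group"):
                subjects_group_id[subject_id] = ids["group"]
            (root / f"sub-{subject_id}").mkdir(parents=True, exist_ok=True)
        if subjects_group_id:
            with open(root / "participants.tsv", "w") as f:
                f.write("participant_id\tgroup\n")
                for subject_id, group_id in subjects_group_id.items():
                    f.write(f"sub-{subject_id}\t{group_id}\n")

    # creates sub-01/, sub-02/ and a two-row participants.tsv under /tmp/bids-demo
    sketch_create_tree(Path("/tmp/bids-demo"), [("control", "01"), ("patient", "02")], ["group", "session"])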
@@ -347,81 +350,57 @@ def create_dataset(
         dataset = super().create_dataset(
             id=id, leaves=leaves, hierarchy=hierarchy, space=space, name=name, **kwargs
         )
-        self._save_metadata(dataset, overwrite_bids_metadata=True)
+        self._save_metadata(dataset)
         return dataset
 
     ################
     # Helper methods
     ################
 
-    def _save_metadata(self, dataset: Dataset, overwrite_bids_metadata: bool = False):
+    def _save_metadata(self, dataset: Dataset):
         root_dir = Path(dataset.id)
         dataset_description_fspath = root_dir / "dataset_description.json"
-        if dataset_description_fspath.exists() and not overwrite_bids_metadata:
-            logger.warning(
-                "Not attempting to overwrite existing BIDS dataset description at "
-                "'%s, use 'overwrite_bids_metadata' to "
-                "force.",
-                str(dataset_description_fspath),
-            )
-        else:
-            dataset_description = map_to_bids_names(
-                attrs.asdict(dataset.metadata, recurse=True)
-            )
-            dataset_description["BIDSVersion"] = self.BIDS_VERSION
-            with open(dataset_description_fspath, "w") as f:
-                json.dump(dataset_description, f, indent=" ")
+        dataset_description = map_to_bids_names(
+            attrs.asdict(dataset.metadata, recurse=True)
+        )
+        dataset_description["BIDSVersion"] = self.BIDS_VERSION
+        with open(dataset_description_fspath, "w") as f:
+            json.dump(dataset_description, f, indent=" ")
 
         if dataset.metadata.description is not None:
             readme_path = root_dir / "README"
-            if readme_path.exists() and not overwrite_bids_metadata:
-                logger.warning(
-                    "Not attempting to overwrite existing BIDS dataset description at "
-                    "%s, use 'overwrite_bids_metadata' to "
-                    "force.",
-                    str(readme_path),
-                )
-            else:
-                with open(readme_path, "w") as f:
-                    f.write(dataset.metadata.description)
-        participants_tsv_fspath = dataset.root_dir / "participants.tsv"
+            with open(readme_path, "w") as f:
+                f.write(dataset.metadata.description)
         columns = list(dataset.metadata.row_metadata)
         group_ids = [i for i in dataset.row_ids("group") if i is not None]
         if group_ids or columns:
-            if participants_tsv_fspath.exists() and not overwrite_bids_metadata:
-                logger.warning(
-                    "Not attempting to overwrite existing BIDS participants TSV at "
-                    "%s, use 'overwrite_bids_metadata' to "
-                    "force.",
-                    str(participants_tsv_fspath),
-                )
-            else:
-                with open(dataset.root_dir / "participants.tsv", "w") as f:
-                    f.write("participant_id")
-                    subject_rows = dataset.rows("subject")
+            with open(dataset.root_dir / "participants.tsv", "w") as f:
+                f.write("participant_id")
                 if group_ids:
                     f.write("\tgroup")
                 if columns:
                     f.write("\t" + "\t".join(columns))
                 f.write("\n")
-                    for row in subject_rows:
-                        f.write(
-                            f"sub-{row.id}"
-                        )
-                        if group_ids:
-                            f.write("\tgroup")
-                            f.write("\t" + row.frequency_id('group'))
-                        if columns:
-                            f.write("\t" + "\t".join(columns))
-                            f.write("\t" + "\t".join(row.metadata[k] for k in columns))
-                        f.write("\n")
+                for row in dataset.rows("subject"):
+                    f.write(
+                        f"sub-{row.id}"
+                    )
+                    if group_ids:
+                        f.write("\t" + row.frequency_id('group'))
+                    if columns:
+                        f.write("\t" + "\t".join(row.metadata[k] for k in columns))
+                    f.write("\n")
-                participants_desc = {}
-                if group_ids:
-                    participants_desc["group"] = {
-                        "Description": "the group the participant belonged to",
-                        "Levels": {g: f"{g} group" for g in dataset.row_ids("group")},
-                    }
-                for name, desc in dataset.metadata.row_metadata.items():
-                    participants_desc[name] = {"Description": desc}
-                with open(dataset.root_dir / "participants.json", "w") as f:
-                    json.dump(participants_desc, f)
+            participants_desc = {}
+            if group_ids:
+                participants_desc["group"] = {
+                    "Description": "the group the participant belonged to",
+                    "Levels": {g: f"{g} group" for g in dataset.row_ids("group")},
+                }
+            for name, desc in dataset.metadata.row_metadata.items():
+                participants_desc[name] = {"Description": desc}
+            with open(dataset.root_dir / "participants.json", "w") as f:
+                json.dump(participants_desc, f)
 
     def _fileset_fspath(self, entry: DataEntry) -> Path:
         return Path(entry.row.dataset.id) / entry.uri
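To show what the rewritten _save_metadata emits, a small sketch of the two participant sidecars; the subject, group, and "age" column values are invented for illustration:

    import json

    # invented example data; layout mirrors the writing loops in the diff above
    rows = {"01": {"group": "control", "age": "34"}, "02": {"group": "patient", "age": "29"}}
    columns = ["age"]  # hypothetical row-metadata column

    tsv = ["participant_id\tgroup\t" + "\t".join(columns)]
    for sub_id, meta in rows.items():
        tsv.append(f"sub-{sub_id}\t{meta['group']}\t" + "\t".join(meta[k] for k in columns))
    print("\n".join(tsv))  # participants.tsv contents

    participants_desc = {
        "group": {
            "Description": "the group the participant belonged to",
            "Levels": {g: f"{g} group" for g in ("control", "patient")},
        },
        "age": {"Description": "age of the participant at scan time"},
    }
    print(json.dumps(participants_desc, indent="  "))  # participants.json contents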
5 changes: 2 additions & 3 deletions arcana/bids/tasks.py
@@ -19,7 +19,7 @@
 from arcana.core import __version__
 from arcana.core.data.set import Dataset
 from fileformats.core import FileSet
-from arcana.core.data.space import Clinical
+from arcana.stdlib import Clinical
 from arcana.bids.data import JsonEdit
 from arcana.core.exceptions import ArcanaUsageError
 from arcana.core.utils.serialize import (
@@ -82,8 +82,7 @@ def bids_app(
     outputs : list[ty.Union[AppField, dict[str, str]]]
         The outputs to be extracted from the derivatives directory. Should be a list of tuples
         consisting of the path the file/directory is saved by the app within a BIDS subject/session,
-        e.g. freesurfer/recon-all, and the DataFormat class it is stored in, e.g.
-        arcana.dirtree.data.Directory.
+        e.g. freesurfer/recon-all, and the DataFormat class it is stored in,
     executable : str, optional
         Name of the executable within the image to run (i.e. the entrypoint of the image).
         Required when extending the base image and launching Arcana within it. Defaults to
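A hedged example of an outputs entry as described in this docstring; the fileformats.generic.Directory import stands in for the removed arcana.dirtree.data.Directory reference and is an assumption, not taken from this diff:

    from fileformats.generic import Directory

    outputs = [
        # path the app saves under the BIDS derivatives tree, and its format class
        ("freesurfer/recon-all", Directory),
    ]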
2 changes: 1 addition & 1 deletion arcana/bids/tests/test_cli.py
@@ -5,7 +5,7 @@
 from arcana.testing.data.blueprint import (
     TestDatasetBlueprint, FileSetEntryBlueprint as FileBP
 )
-from arcana.core.data.space import Clinical
+from arcana.stdlib import Clinical
 from fileformats.medimage import NiftiGzX
 from arcana.bids.cli import app_entrypoint
 from arcana.core.utils.serialize import ClassResolver