update api

Co-authored-by: Givanna Putri <[email protected]>
Co-authored-by: Sofie Van Gassen <[email protected]>
Co-authored-by: Katrien Quintelier <[email protected]>
Co-authored-by: Luca Leomazzi <[email protected]>
openproblems-bio · Nov 5, 2024 · 4e651ec · 4e651ec
1 parent d9de3b5
commit 4e651ec
Show file tree

Hide file tree

Showing 11 changed files with 343 additions and 239 deletions.
diff --git a/README.md b/README.md
@@ -38,28 +38,27 @@ should convince readers of the significance and relevance of your task.
 flowchart TB
   file_common_dataset("<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#file-format-common-dataset'>Common Dataset</a>")
   comp_data_processor[/"<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#component-type-data-processor'>Data processor</a>"/]
-  file_solution("<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#file-format-solution'>Solution</a>")
-  file_test("<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#file-format-test-data'>Test data</a>")
-  file_train("<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#file-format-training-data'>Training data</a>")
+  file_unintegrated_censored("<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#file-format-unintegrated-censored'>Unintegrated Censored</a>")
+  file_unintegrated("<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#file-format-unintegrated'>Unintegrated</a>")
+  file_validation("<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#file-format-validation'>Validation</a>")
+  comp_method[/"<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#component-type-method'>Method</a>"/]
   comp_control_method[/"<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#component-type-control-method'>Control Method</a>"/]
   comp_metric[/"<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#component-type-metric'>Metric</a>"/]
-  comp_method[/"<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#component-type-method'>Method</a>"/]
-  file_prediction("<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#file-format-predicted-data'>Predicted data</a>")
+  file_integrated("<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#file-format-integrated'>Integrated</a>")
   file_score("<a href='https://github.com/openproblems-bio/task_cyto_batch_integration#file-format-score'>Score</a>")
   file_common_dataset---comp_data_processor
-  comp_data_processor-->file_solution
-  comp_data_processor-->file_test
-  comp_data_processor-->file_train
-  file_solution---comp_control_method
-  file_solution---comp_metric
-  file_test---comp_control_method
-  file_test---comp_method
-  file_train---comp_control_method
-  file_train---comp_method
-  comp_control_method-->file_prediction
+  comp_data_processor-->file_unintegrated_censored
+  comp_data_processor-->file_unintegrated
+  comp_data_processor-->file_validation
+  file_unintegrated_censored---comp_method
+  file_unintegrated---comp_control_method
+  file_unintegrated---comp_metric
+  file_validation---comp_control_method
+  file_validation---comp_metric
+  comp_method-->file_integrated
+  comp_control_method-->file_integrated
   comp_metric-->file_score
-  comp_method-->file_prediction
-  file_prediction---comp_metric
+  file_integrated---comp_metric
 ```
 
 ## File format: Common Dataset
@@ -116,32 +115,31 @@ Arguments:
 
 <div class="small">
 
-| Name | Type | Description |
-|:---|:---|:---|
-| `--input` | `file` | A subset of the common dataset. |
-| `--output_train` | `file` | (*Output*) The training data in h5ad format. |
-| `--output_test` | `file` | (*Output*) The subset of molecules used for the test dataset. |
-| `--output_solution` | `file` | (*Output*) The solution for the test data. |
+| Name                             | Type   | Description                      |
+|:---------------------------------|:-------|:---------------------------------|
+| `--input`                        | `file` | A subset of the common dataset.  |
+| `--output_unintegrated_censored` | `file` | (*Output*) Unintegrated dataset with censored metadata (no cell type or group annotations). |
+| `--output_unintegrated`          | `file` | (*Output*) Unintegrated dataset. |
+| `--output_validation`            | `file` | (*Output*) Validation dataset.   |
 
 </div>
 
-## File format: Solution
+## File format: Unintegrated Censored
 
-The solution for the test data
+Unintegrated dataset with censored metadata (no cell type or group annotations)
 
 Example file:
-`resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad`
+`resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/train.h5ad`
 
 Format:
 
 <div class="small">
 
     AnnData object
-     obs: 'label', 'batch'
-     var: 'hvg', 'hvg_score'
-     obsm: 'X_pca'
-     layers: 'counts', 'normalized'
-     uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id'
+     obs: 'batch', 'sample', 'donor'
+     var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct'
+     layers: 'preprocessed'
+     uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism'
 
 </div>
 
@@ -151,41 +149,41 @@ Data structure:
 
 | Slot | Type | Description |
 |:---|:---|:---|
-| `obs["label"]` | `string` | Ground truth cell type labels. |
 | `obs["batch"]` | `string` | Batch information. |
-| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. |
-| `var["hvg_score"]` | `double` | A ranking of the features by hvg. |
-| `obsm["X_pca"]` | `double` | The resulting PCA embedding. |
-| `layers["counts"]` | `integer` | Raw counts. |
-| `layers["normalized"]` | `double` | Normalized counts. |
+| `obs["sample"]` | `string` | Sample ID. |
+| `obs["donor"]` | `string` | (*Optional*) Donor ID. |
+| `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. |
+| `var["channel"]` | `string` | The channel / detector of the instrument. |
+| `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. |
+| `var["marker_type"]` | `string` | Whether the marker is a functional or lineage marker. |
+| `var["to_correct"]` | `boolean` | Whether the marker will be batch corrected. |
+| `layers["preprocessed"]` | `double` | Preprocessed data, e.g. already compensated and transformed, with debris/doublets removed. |
 | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. |
 | `uns["dataset_name"]` | `string` | Nicely formatted name. |
 | `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. |
 | `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. |
 | `uns["dataset_summary"]` | `string` | Short description of the dataset. |
 | `uns["dataset_description"]` | `string` | Long description of the dataset. |
 | `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. |
-| `uns["normalization_id"]` | `string` | Which normalization was used. |
 
 </div>
 
-## File format: Test data
+## File format: Unintegrated
 
-The subset of molecules used for the test dataset
+Unintegrated dataset
 
 Example file:
-`resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/test.h5ad`
+`resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/train.h5ad`
 
 Format:
 
 <div class="small">
 
     AnnData object
-     obs: 'batch'
-     var: 'hvg', 'hvg_score'
-     obsm: 'X_pca'
-     layers: 'counts', 'normalized'
-     uns: 'dataset_id', 'normalization_id'
+     obs: 'cell_type', 'batch', 'sample', 'donor', 'group'
+     var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct'
+     layers: 'preprocessed'
+     uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism'
 
 </div>
 
@@ -195,34 +193,43 @@ Data structure:
 
 | Slot | Type | Description |
 |:---|:---|:---|
+| `obs["cell_type"]` | `string` | Cell type information. |
 | `obs["batch"]` | `string` | Batch information. |
-| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. |
-| `var["hvg_score"]` | `double` | A ranking of the features by hvg. |
-| `obsm["X_pca"]` | `double` | The resulting PCA embedding. |
-| `layers["counts"]` | `integer` | Raw counts. |
-| `layers["normalized"]` | `double` | Normalized counts. |
+| `obs["sample"]` | `string` | Sample ID. |
+| `obs["donor"]` | `string` | Donor ID. |
+| `obs["group"]` | `string` | Biological group of the donor. |
+| `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. |
+| `var["channel"]` | `string` | The channel / detector of the instrument. |
+| `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. |
+| `var["marker_type"]` | `string` | Whether the marker is a functional or lineage marker. |
+| `var["to_correct"]` | `boolean` | Whether the marker will be batch corrected. |
+| `layers["preprocessed"]` | `double` | Preprocessed data, e.g. already compensated and transformed, with debris/doublets removed. |
 | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. |
-| `uns["normalization_id"]` | `string` | Which normalization was used. |
+| `uns["dataset_name"]` | `string` | Nicely formatted name. |
+| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. |
+| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. |
+| `uns["dataset_summary"]` | `string` | Short description of the dataset. |
+| `uns["dataset_description"]` | `string` | Long description of the dataset. |
+| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. |
 
 </div>
 
-## File format: Training data
+## File format: Validation
 
-The training data in h5ad format
+Validation dataset
 
 Example file:
-`resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/train.h5ad`
+`resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad`
 
 Format:
 
 <div class="small">
 
     AnnData object
-     obs: 'label', 'batch'
-     var: 'hvg', 'hvg_score'
-     obsm: 'X_pca'
-     layers: 'counts', 'normalized'
-     uns: 'dataset_id', 'normalization_id'
+     obs: 'cell_type', 'batch', 'sample', 'donor', 'group'
+     var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct'
+     layers: 'preprocessed'
+     uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism'
 
 </div>
 
@@ -232,70 +239,78 @@ Data structure:
 
 | Slot | Type | Description |
 |:---|:---|:---|
-| `obs["label"]` | `string` | Ground truth cell type labels. |
+| `obs["cell_type"]` | `string` | Cell type information. |
 | `obs["batch"]` | `string` | Batch information. |
-| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. |
-| `var["hvg_score"]` | `double` | A ranking of the features by hvg. |
-| `obsm["X_pca"]` | `double` | The resulting PCA embedding. |
-| `layers["counts"]` | `integer` | Raw counts. |
-| `layers["normalized"]` | `double` | Normalized counts. |
+| `obs["sample"]` | `string` | Sample ID. |
+| `obs["donor"]` | `string` | Donor ID. |
+| `obs["group"]` | `string` | Biological group of the donor. |
+| `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. |
+| `var["channel"]` | `string` | The channel / detector of the instrument. |
+| `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. |
+| `var["marker_type"]` | `string` | Whether the marker is a functional or lineage marker. |
+| `var["to_correct"]` | `boolean` | Whether the marker will be batch corrected. |
+| `layers["preprocessed"]` | `double` | Preprocessed data, e.g. already compensated and transformed, with debris/doublets removed. |
 | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. |
-| `uns["normalization_id"]` | `string` | Which normalization was used. |
+| `uns["dataset_name"]` | `string` | Nicely formatted name. |
+| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. |
+| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. |
+| `uns["dataset_summary"]` | `string` | Short description of the dataset. |
+| `uns["dataset_description"]` | `string` | Long description of the dataset. |
+| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. |
 
 </div>
 
-## Component type: Control Method
+## Component type: Method
 
-Quality control methods for verifying the pipeline.
+A method.
 
 Arguments:
 
 <div class="small">
 
-| Name | Type | Description |
-|:---|:---|:---|
-| `--input_train` | `file` | The training data in h5ad format. |
-| `--input_test` | `file` | The subset of molecules used for the test dataset. |
-| `--input_solution` | `file` | The solution for the test data. |
-| `--output` | `file` | (*Output*) A predicted dataset as output by a method. |
+| Name       | Type   | Description                    |
+|:-----------|:-------|:-------------------------------|
+| `--input`  | `file` | Unintegrated dataset.          |
+| `--output` | `file` | (*Output*) Integrated dataset. |
 
 </div>
 
-## Component type: Metric
+## Component type: Control Method
 
-A task template metric.
+Quality control methods for verifying the pipeline.
 
 Arguments:
 
 <div class="small">
 
-| Name | Type | Description |
-|:---|:---|:---|
-| `--input_solution` | `file` | The solution for the test data. |
-| `--input_prediction` | `file` | A predicted dataset as output by a method. |
-| `--output` | `file` | (*Output*) File indicating the score of a metric. |
+| Name                   | Type   | Description                    |
+|:-----------------------|:-------|:-------------------------------|
+| `--input_unintegrated` | `file` | Unintegrated dataset.          |
+| `--input_validation`   | `file` | Validation dataset.            |
+| `--output`             | `file` | (*Output*) Integrated dataset. |
 
 </div>
 
-## Component type: Method
+## Component type: Metric
 
-A method.
+A task template metric.
 
 Arguments:
 
 <div class="small">
 
 | Name | Type | Description |
 |:---|:---|:---|
-| `--input_train` | `file` | The training data in h5ad format. |
-| `--input_test` | `file` | The subset of molecules used for the test dataset. |
-| `--output` | `file` | (*Output*) A predicted dataset as output by a method. |
+| `--input_validation` | `file` | Validation dataset. |
+| `--input_unintegrated` | `file` | Unintegrated dataset. |
+| `--input_integrated` | `file` | Integrated dataset. |
+| `--output` | `file` | (*Output*) File indicating the score of a metric. |
 
 </div>
 
-## File format: Predicted data
+## File format: Integrated
 
-A predicted dataset as output by a method.
+Integrated dataset
 
 Example file:
 `resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/prediction.h5ad`

diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml
@@ -12,20 +12,16 @@ info:
       the task, and also as a quality control for the metrics defined
       in the task.
 arguments:
-  - name: --input_train
-    __merge__: file_train.yaml
+  - name: --input_unintegrated
+    __merge__: file_unintegrated.yaml
     required: true
     direction: input
-  - name: --input_test
-    __merge__: file_test.yaml
+  - name: --input_validation
+    __merge__: file_validation.yaml
     required: true
     direction: input
-  - name: "--input_solution"
-    __merge__: file_solution.yaml
-    direction: input
-    required: true
   - name: --output
-    __merge__: file_prediction.yaml
+    __merge__: file_integrated.yaml
     required: true
     direction: output
 # test_resources:

diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml
@@ -11,16 +11,16 @@ arguments:
     __merge__: file_common_dataset.yaml
     direction: input
     required: true
-  - name: "--output_train"
-    __merge__: file_train.yaml
+  - name: "--output_unintegrated_censored"
+    __merge__: file_unintegrated_censored.yaml
     direction: output
     required: true
-  - name: "--output_test"
-    __merge__: file_test.yaml
+  - name: "--output_unintegrated"
+    __merge__: file_unintegrated.yaml
     direction: output
     required: true
-  - name: "--output_solution"
-    __merge__: file_solution.yaml
+  - name: "--output_validation"
+    __merge__: file_validation.yaml
     direction: output
     required: true
 # test_resources:

diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml
@@ -7,16 +7,12 @@ info:
     description: |
       A method to predict the task effects.
 arguments:
-  - name: --input_train
-    __merge__: file_train.yaml
+  - name: --input
+    __merge__: file_unintegrated_censored.yaml
     required: true
     direction: input
-  - name: "--input_test"
-    __merge__: file_test.yaml
-    direction: input
-    required: true
   - name: --output
-    __merge__: file_prediction.yaml
+    __merge__: file_integrated.yaml
     required: true
     direction: output
 # test_resources: