update interfaces

openproblems-bio · Oct 31, 2024 · 120e97e · 120e97e
1 parent 2c95a19
commit 120e97e
Show file tree

Hide file tree

Showing 6 changed files with 242 additions and 11 deletions.
diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml
@@ -27,15 +27,13 @@ arguments:
     direction: output
     default: id_map.csv
   - name: --sc_train
-    type: file
+    __merge__: file_sc_train.yaml
     required: false
     direction: output
-    default: sc_train.h5ad
   - name: --sc_test
-    type: file
+    __merge__: file_sc_test.yaml
     required: false
     direction: output
-    default: sc_test.h5ad
   - name: --pseudobulk_filtered_with_uns
     type: file
     required: false

diff --git a/src/api/file_sc_test.yaml b/src/api/file_sc_test.yaml
@@ -0,0 +1,114 @@
+type: file
+example: resources/datasets/neurips-2023-data/sc_test.h5ad
+label: SC test
+summary: "Single-Cell Perturbation Data for Testing"
+info:
+  format:
+    type: h5ad
+    obs:
+      - name: dose_uM
+        description: "Dose in micromolar."
+        type: integer
+        required: true
+      - name: timepoint_hr
+        description: "Time point measured in hours."
+        type: float
+        required: true
+      - name: raw_cell_id
+        description: "Original cell identifier."
+        type: string
+        required: true
+      - name: hashtag_id
+        description: "Identifier for hashtag oligo."
+        type: string
+        required: true
+      - name: well
+        description: "Well location in the plate."
+        type: string
+        required: true
+      - name: container_format
+        description: "Format of the container (e.g., 96-well plate)."
+        type: string
+        required: true
+      - name: row
+        description: "Row in the plate."
+        type: string
+        required: true
+      - name: col
+        description: "Column in the plate."
+        type: integer
+        required: true
+      - name: plate_name
+        description: "Name of the plate."
+        type: string
+        required: true
+      - name: cell_id
+        description: "Unique cell identifier."
+        type: string
+        required: true
+      - name: cell_type
+        description: "Type of cell (e.g., B cells, T cells CD4+)."
+        type: string
+        required: true
+      - name: split
+        description: "Dataset split the cell belongs to (e.g., train, test)."
+        type: string
+        required: true
+      - name: donor_id
+        description: "Identifier for the donor."
+        type: string
+        required: true
+      - name: sm_name
+        description: "Name of the small molecule used for treatment."
+        type: string
+        required: true
+      - name: control
+        type: boolean
+        description: "Boolean indicating whether this instance was used as a control."
+        required: true
+      - name: SMILES
+        type: string
+        description: |
+          Simplified molecular-input line-entry system (SMILES) representations of the
+          compounds used in the experiment. This is a 1D representation of molecular
+          structure. These SMILES are provided by Cellarity based on the specific
+          compounds ordered for this experiment.
+        required: true
+      - name: sm_lincs_id
+        type: string
+        description: |
+          The global LINCS ID of the (parent) compound (in a standardized representation).
+          This is provided to map the data in this experiment to the LINCS Connectivity
+          Map data.
+        required: true
+    uns:
+      - type: string
+        name: dataset_id
+        description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived.
+        required: true
+      - name: dataset_name
+        type: string
+        description: A human-readable name for the dataset.
+        required: true
+      - type: string
+        name: dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+        multiple: true
+      - name: dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+        multiple: true
diff --git a/src/api/file_sc_train.yaml b/src/api/file_sc_train.yaml
@@ -0,0 +1,114 @@
+type: file
+example: resources/datasets/neurips-2023-data/sc_train.h5ad
+label: SC train
+summary: "Single-Cell Perturbation Data for Training"
+info:
+  format:
+    type: h5ad
+    obs:
+      - name: dose_uM
+        description: "Dose in micromolar."
+        type: integer
+        required: true
+      - name: timepoint_hr
+        description: "Time point measured in hours."
+        type: float
+        required: true
+      - name: raw_cell_id
+        description: "Original cell identifier."
+        type: string
+        required: true
+      - name: hashtag_id
+        description: "Identifier for hashtag oligo."
+        type: string
+        required: true
+      - name: well
+        description: "Well location in the plate."
+        type: string
+        required: true
+      - name: container_format
+        description: "Format of the container (e.g., 96-well plate)."
+        type: string
+        required: true
+      - name: row
+        description: "Row in the plate."
+        type: string
+        required: true
+      - name: col
+        description: "Column in the plate."
+        type: integer
+        required: true
+      - name: plate_name
+        description: "Name of the plate."
+        type: string
+        required: true
+      - name: cell_id
+        description: "Unique cell identifier."
+        type: string
+        required: true
+      - name: cell_type
+        description: "Type of cell (e.g., B cells, T cells CD4+)."
+        type: string
+        required: true
+      - name: split
+        description: "Dataset split the cell belongs to (e.g., train, test)."
+        type: string
+        required: true
+      - name: donor_id
+        description: "Identifier for the donor."
+        type: string
+        required: true
+      - name: sm_name
+        description: "Name of the small molecule used for treatment."
+        type: string
+        required: true
+      - name: control
+        type: boolean
+        description: "Boolean indicating whether this instance was used as a control."
+        required: true
+      - name: SMILES
+        type: string
+        description: |
+          Simplified molecular-input line-entry system (SMILES) representations of the
+          compounds used in the experiment. This is a 1D representation of molecular
+          structure. These SMILES are provided by Cellarity based on the specific
+          compounds ordered for this experiment.
+        required: true
+      - name: sm_lincs_id
+        type: string
+        description: |
+          The global LINCS ID of the (parent) compound (in a standardized representation).
+          This is provided to map the data in this experiment to the LINCS Connectivity
+          Map data.
+        required: true
+    uns:
+      - type: string
+        name: dataset_id
+        description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived.
+        required: true
+      - name: dataset_name
+        type: string
+        description: A human-readable name for the dataset.
+        required: true
+      - type: string
+        name: dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+        multiple: true
+      - name: dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+        multiple: true
diff --git a/src/methods/cpa/config.vsh.yaml b/src/methods/cpa/config.vsh.yaml
@@ -18,7 +18,7 @@ links:
 
 info:
   # we normalize in the script. The model can operate on either raw counts or cpm and log-normalized data.
-  preferred_normalization: raw_counts
+  preferred_normalization: counts
 
 resources:
   - type: python_script

diff --git a/src/methods/cpa/script.py b/src/methods/cpa/script.py
@@ -6,8 +6,8 @@
 
 ## VIASH START
 par = {
-  'sc_train_h5ad': 'resources_test/sc_train.h5ad',
-  'sc_test_h5ad': 'resources_test/sc_test.h5ad',
+  'sc_train': 'resources_test/sc_train.h5ad',
+  'sc_test': 'resources_test/sc_test.h5ad',
   'output_sc': 'output_sc.h5ad',
 }
 meta = {
@@ -16,14 +16,14 @@
 ## VIASH END
 
 print('Reading input files', flush=True)
-sc_train_h5ad = ad.read_h5ad(par['sc_train_h5ad'])
-sc_test_h5ad = ad.read_h5ad(par['sc_test_h5ad'])
+sc_train = ad.read_h5ad(par['sc_train'])
+sc_test = ad.read_h5ad(par['sc_test'])
 
 # remove the counts from the test set to prevent leakage
-sc_test_h5ad.X[:] = 0
+sc_test.X[:] = 0
 
 print('Preprocess data for CPA', flush=True)
-sc_h5ad = ad.concat([sc_train_h5ad, sc_test_h5ad], axis=0)
+sc_h5ad = ad.concat([sc_train, sc_test], axis=0)
 sc_h5ad.obs['control'] = sc_h5ad.obs['sm_name'].eq("Dimethyl Sulfoxide").astype(int)
 sc_h5ad.layers["counts"] = sc_h5ad.X.copy()
 sc.pp.normalize_total(sc_h5ad, target_sum=1e4)

diff --git a/src/process_dataset/split_sc/script.py b/src/process_dataset/split_sc/script.py
@@ -54,6 +54,11 @@
     if col not in ["cell_count_by_well_celltype", "cell_count_by_plate_well", "obs_id"]:
         filtered_sc_counts.obs[col] = filtered_sc_counts.obs[col].astype("category")
 
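+# copy the dataset-level metadata (uns keys prefixed with "dataset_") from the pseudobulk object so the saved sc splits carry it too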
+to_copy_uns = [key for key in pseudobulk_filtered_with_uns.uns.keys() if key.startswith("dataset_")]
+for uns_key in to_copy_uns:
+    filtered_sc_counts.uns[uns_key] = pseudobulk_filtered_with_uns.uns[uns_key]
+
 print(">> Save sc dataset into splits", flush=True)
 filtered_sc_counts[filtered_sc_counts.obs["split"] == "train"].write_h5ad(par["sc_train"], compression="gzip")
 filtered_sc_counts[filtered_sc_counts.obs["split"] == "test"].write_h5ad(par["sc_test"], compression="gzip")