Skip to content

Commit

Permalink
update interfaces
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood committed Oct 31, 2024
1 parent 2c95a19 commit 120e97e
Show file tree
Hide file tree
Showing 6 changed files with 242 additions and 11 deletions.
6 changes: 2 additions & 4 deletions src/api/comp_process_dataset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,13 @@ arguments:
direction: output
default: id_map.csv
- name: --sc_train
type: file
__merge__: file_sc_train.yaml
required: false
direction: output
default: sc_train.h5ad
- name: --sc_test
type: file
__merge__: file_sc_test.yaml
required: false
direction: output
default: sc_test.h5ad
- name: --pseudobulk_filtered_with_uns
type: file
required: false
Expand Down
114 changes: 114 additions & 0 deletions src/api/file_sc_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
type: file
example: resources/datasets/neurips-2023-data/sc_test.h5ad
label: SC test
summary: "Single-Cell Perturbation Data for Testing"
info:
format:
type: h5ad
obs:
- name: dose_uM
description: "Dose in micromolar."
type: integer
required: true
- name: timepoint_hr
description: "Time point measured in hours."
type: float
required: true
- name: raw_cell_id
description: "Original cell identifier."
type: string
required: true
- name: hashtag_id
description: "Identifier for hashtag oligo."
type: string
required: true
- name: well
description: "Well location in the plate."
type: string
required: true
- name: container_format
description: "Format of the container (e.g., 96-well plate)."
type: string
required: true
- name: row
description: "Row in the plate."
type: string
required: true
- name: col
description: "Column in the plate."
type: integer
required: true
- name: plate_name
description: "Name of the plate."
type: string
required: true
- name: cell_id
description: "Unique cell identifier."
type: string
required: true
- name: cell_type
description: "Type of cell (e.g., B cells, T cells CD4+)."
type: string
required: true
- name: split
description: "Dataset split the cell belongs to (e.g., train, test)."
type: string
required: true
- name: donor_id
description: "Identifier for the donor."
type: string
required: true
- name: sm_name
description: "Name of the small molecule used for treatment."
type: string
required: true
- name: control
type: boolean
description: "Boolean indicating whether this instance was used as a control."
required: true
- name: SMILES
type: string
description: |
Simplified molecular-input line-entry system (SMILES) representations of the
compounds used in the experiment. This is a 1D representation of molecular
structure. These SMILES are provided by Cellarity based on the specific
compounds ordered for this experiment.
required: true
- name: sm_lincs_id
type: string
description: |
The global LINCS ID of the (parent) compound, in a standardized representation.
This is provided to map the data in this experiment to the LINCS Connectivity
Map data.
required: true
uns:
- type: string
name: dataset_id
description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived.
required: true
- name: dataset_name
type: string
description: A human-readable name for the dataset.
required: true
- type: string
name: dataset_url
description: Link to the original source of the dataset.
required: false
- name: dataset_reference
type: string
description: BibTeX reference of the paper in which the dataset was published.
required: false
multiple: true
- name: dataset_summary
type: string
description: Short description of the dataset.
required: true
- name: dataset_description
type: string
description: Long description of the dataset.
required: true
- name: dataset_organism
type: string
description: The organism of the sample in the dataset.
required: false
multiple: true
114 changes: 114 additions & 0 deletions src/api/file_sc_train.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
type: file
example: resources/datasets/neurips-2023-data/sc_train.h5ad
label: SC train
summary: "Single-Cell Perturbation Data for Training"
info:
format:
type: h5ad
obs:
- name: dose_uM
description: "Dose in micromolar."
type: integer
required: true
- name: timepoint_hr
description: "Time point measured in hours."
type: float
required: true
- name: raw_cell_id
description: "Original cell identifier."
type: string
required: true
- name: hashtag_id
description: "Identifier for hashtag oligo."
type: string
required: true
- name: well
description: "Well location in the plate."
type: string
required: true
- name: container_format
description: "Format of the container (e.g., 96-well plate)."
type: string
required: true
- name: row
description: "Row in the plate."
type: string
required: true
- name: col
description: "Column in the plate."
type: integer
required: true
- name: plate_name
description: "Name of the plate."
type: string
required: true
- name: cell_id
description: "Unique cell identifier."
type: string
required: true
- name: cell_type
description: "Type of cell (e.g., B cells, T cells CD4+)."
type: string
required: true
- name: split
description: "Dataset split the cell belongs to (e.g., train, test)."
type: string
required: true
- name: donor_id
description: "Identifier for the donor."
type: string
required: true
- name: sm_name
description: "Name of the small molecule used for treatment."
type: string
required: true
- name: control
type: boolean
description: "Boolean indicating whether this instance was used as a control."
required: true
- name: SMILES
type: string
description: |
Simplified molecular-input line-entry system (SMILES) representations of the
compounds used in the experiment. This is a 1D representation of molecular
structure. These SMILES are provided by Cellarity based on the specific
compounds ordered for this experiment.
required: true
- name: sm_lincs_id
type: string
description: |
The global LINCS ID of the (parent) compound, in a standardized representation.
This is provided to map the data in this experiment to the LINCS Connectivity
Map data.
required: true
uns:
- type: string
name: dataset_id
description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived.
required: true
- name: dataset_name
type: string
description: A human-readable name for the dataset.
required: true
- type: string
name: dataset_url
description: Link to the original source of the dataset.
required: false
- name: dataset_reference
type: string
description: BibTeX reference of the paper in which the dataset was published.
required: false
multiple: true
- name: dataset_summary
type: string
description: Short description of the dataset.
required: true
- name: dataset_description
type: string
description: Long description of the dataset.
required: true
- name: dataset_organism
type: string
description: The organism of the sample in the dataset.
required: false
multiple: true
2 changes: 1 addition & 1 deletion src/methods/cpa/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ links:

info:
# We normalize in the script. The model can operate on either raw counts or CPM- and log-normalized data.
preferred_normalization: raw_counts
preferred_normalization: counts

resources:
- type: python_script
Expand Down
12 changes: 6 additions & 6 deletions src/methods/cpa/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@

## VIASH START
par = {
'sc_train_h5ad': 'resources_test/sc_train.h5ad',
'sc_test_h5ad': 'resources_test/sc_test.h5ad',
'sc_train': 'resources_test/sc_train.h5ad',
'sc_test': 'resources_test/sc_test.h5ad',
'output_sc': 'output_sc.h5ad',
}
meta = {
Expand All @@ -16,14 +16,14 @@
## VIASH END

print('Reading input files', flush=True)
sc_train_h5ad = ad.read_h5ad(par['sc_train_h5ad'])
sc_test_h5ad = ad.read_h5ad(par['sc_test_h5ad'])
sc_train = ad.read_h5ad(par['sc_train'])
sc_test = ad.read_h5ad(par['sc_test'])

# remove the counts from the test set to prevent leakage
sc_test_h5ad.X[:] = 0
sc_test.X[:] = 0

print('Preprocess data for CPA', flush=True)
sc_h5ad = ad.concat([sc_train_h5ad, sc_test_h5ad], axis=0)
sc_h5ad = ad.concat([sc_train, sc_test], axis=0)
sc_h5ad.obs['control'] = sc_h5ad.obs['sm_name'].eq("Dimethyl Sulfoxide").astype(int)
sc_h5ad.layers["counts"] = sc_h5ad.X.copy()
sc.pp.normalize_total(sc_h5ad, target_sum=1e4)
Expand Down
5 changes: 5 additions & 0 deletions src/process_dataset/split_sc/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@
if col not in ["cell_count_by_well_celltype", "cell_count_by_plate_well", "obs_id"]:
filtered_sc_counts.obs[col] = filtered_sc_counts.obs[col].astype("category")

# copy uns from pseudobulk
to_copy_uns = [key for key in pseudobulk_filtered_with_uns.uns.keys() if key.startswith("dataset_")]
for uns_key in to_copy_uns:
filtered_sc_counts.uns[uns_key] = pseudobulk_filtered_with_uns.uns[uns_key]

print(">> Save sc dataset into splits", flush=True)
filtered_sc_counts[filtered_sc_counts.obs["split"] == "train"].write_h5ad(par["sc_train"], compression="gzip")
filtered_sc_counts[filtered_sc_counts.obs["split"] == "test"].write_h5ad(par["sc_test"], compression="gzip")

0 comments on commit 120e97e

Please sign in to comment.