diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml index ecfe2bb2..f0aa5af6 100644 --- a/src/api/comp_process_dataset.yaml +++ b/src/api/comp_process_dataset.yaml @@ -27,15 +27,13 @@ arguments: direction: output default: id_map.csv - name: --sc_train - type: file + __merge__: file_sc_train.yaml required: false direction: output - default: sc_train.h5ad - name: --sc_test - type: file + __merge__: file_sc_test.yaml required: false direction: output - default: sc_test.h5ad - name: --pseudobulk_filtered_with_uns type: file required: false diff --git a/src/api/file_sc_test.yaml b/src/api/file_sc_test.yaml new file mode 100644 index 00000000..574df704 --- /dev/null +++ b/src/api/file_sc_test.yaml @@ -0,0 +1,114 @@ +type: file +example: resources/datasets/neurips-2023-data/sc_test.h5ad +label: SC test +summary: "Single-Cell Perturbation Data for Testing" +info: + format: + type: h5ad + obs: + - name: dose_uM + description: "Dose in micromolar." + type: integer + required: true + - name: timepoint_hr + description: "Time point measured in hours." + type: float + required: true + - name: raw_cell_id + description: "Original cell identifier." + type: string + required: true + - name: hashtag_id + description: "Identifier for hashtag oligo." + type: string + required: true + - name: well + description: "Well location in the plate." + type: string + required: true + - name: container_format + description: "Format of the container (e.g., 96-well plate)." + type: string + required: true + - name: row + description: "Row in the plate." + type: string + required: true + - name: col + description: "Column in the plate." + type: integer + required: true + - name: plate_name + description: "Name of the plate." + type: string + required: true + - name: cell_id + description: "Unique cell identifier." + type: string + required: true + - name: cell_type + description: "Type of cell (e.g., B cells, T cells CD4+)." 
+ type: string + required: true + - name: split + description: "Dataset split type (e.g., control, treated)." + type: string + required: true + - name: donor_id + description: "Identifier for the donor." + type: string + required: true + - name: sm_name + description: "Name of the small molecule used for treatment." + type: string + required: true + - name: control + type: boolean + description: "Boolean indicating whether this instance was used as a control." + required: true + - name: SMILES + type: string + description: | + Simplified molecular-input line-entry system (SMILES) representations of the + compounds used in the experiment. This is a 1D representation of molecular + structure. These SMILES are provided by Cellarity based on the specific + compounds ordered for this experiment. + required: true + - name: sm_lincs_id + type: string + description: | + The global LINCS ID of the (parent) compound (in a standardized representation). + This is provided to map the data in this experiment to the LINCS Connectivity + Map data. + required: true + uns: + - type: string + name: dataset_id + description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. + required: true + - name: dataset_name + type: string + description: A human-readable name for the dataset. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: BibTeX reference of the paper in which the dataset was published. + required: false + multiple: true + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. 
+ required: false + multiple: true diff --git a/src/api/file_sc_train.yaml b/src/api/file_sc_train.yaml new file mode 100644 index 00000000..3e9f8371 --- /dev/null +++ b/src/api/file_sc_train.yaml @@ -0,0 +1,114 @@ +type: file +example: resources/datasets/neurips-2023-data/sc_train.h5ad +label: SC train +summary: "Single-Cell Perturbation Data for Training" +info: + format: + type: h5ad + obs: + - name: dose_uM + description: "Dose in micromolar." + type: integer + required: true + - name: timepoint_hr + description: "Time point measured in hours." + type: float + required: true + - name: raw_cell_id + description: "Original cell identifier." + type: string + required: true + - name: hashtag_id + description: "Identifier for hashtag oligo." + type: string + required: true + - name: well + description: "Well location in the plate." + type: string + required: true + - name: container_format + description: "Format of the container (e.g., 96-well plate)." + type: string + required: true + - name: row + description: "Row in the plate." + type: string + required: true + - name: col + description: "Column in the plate." + type: integer + required: true + - name: plate_name + description: "Name of the plate." + type: string + required: true + - name: cell_id + description: "Unique cell identifier." + type: string + required: true + - name: cell_type + description: "Type of cell (e.g., B cells, T cells CD4+)." + type: string + required: true + - name: split + description: "Dataset split type (e.g., control, treated)." + type: string + required: true + - name: donor_id + description: "Identifier for the donor." + type: string + required: true + - name: sm_name + description: "Name of the small molecule used for treatment." + type: string + required: true + - name: control + type: boolean + description: "Boolean indicating whether this instance was used as a control." 
+ required: true + - name: SMILES + type: string + description: | + Simplified molecular-input line-entry system (SMILES) representations of the + compounds used in the experiment. This is a 1D representation of molecular + structure. These SMILES are provided by Cellarity based on the specific + compounds ordered for this experiment. + required: true + - name: sm_lincs_id + type: string + description: | + The global LINCS ID of the (parent) compound (in a standardized representation). + This is provided to map the data in this experiment to the LINCS Connectivity + Map data. + required: true + uns: + - type: string + name: dataset_id + description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. + required: true + - name: dataset_name + type: string + description: A human-readable name for the dataset. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: BibTeX reference of the paper in which the dataset was published. + required: false + multiple: true + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + multiple: true diff --git a/src/methods/cpa/config.vsh.yaml b/src/methods/cpa/config.vsh.yaml index 2f90250b..760dcdcd 100644 --- a/src/methods/cpa/config.vsh.yaml +++ b/src/methods/cpa/config.vsh.yaml @@ -18,7 +18,7 @@ links: info: # we normalize in the script. The model can operate on either raw counts or cpm and log-normalized data. 
- preferred_normalization: raw_counts + preferred_normalization: counts resources: - type: python_script diff --git a/src/methods/cpa/script.py b/src/methods/cpa/script.py index 6004137e..96fd8634 100644 --- a/src/methods/cpa/script.py +++ b/src/methods/cpa/script.py @@ -6,8 +6,8 @@ ## VIASH START par = { - 'sc_train_h5ad': 'resources_test/sc_train.h5ad', - 'sc_test_h5ad': 'resources_test/sc_test.h5ad', + 'sc_train': 'resources_test/sc_train.h5ad', + 'sc_test': 'resources_test/sc_test.h5ad', 'output_sc': 'output_sc.h5ad', } meta = { @@ -16,14 +16,14 @@ ## VIASH END print('Reading input files', flush=True) -sc_train_h5ad = ad.read_h5ad(par['sc_train_h5ad']) -sc_test_h5ad = ad.read_h5ad(par['sc_test_h5ad']) +sc_train = ad.read_h5ad(par['sc_train']) +sc_test = ad.read_h5ad(par['sc_test']) # remove the counts from the test set to prevent leakage -sc_test_h5ad.X[:] = 0 +sc_test.X[:] = 0 print('Preprocess data for CPA', flush=True) -sc_h5ad = ad.concat([sc_train_h5ad, sc_test_h5ad], axis=0) +sc_h5ad = ad.concat([sc_train, sc_test], axis=0) sc_h5ad.obs['control'] = sc_h5ad.obs['sm_name'].eq("Dimethyl Sulfoxide").astype(int) sc_h5ad.layers["counts"] = sc_h5ad.X.copy() sc.pp.normalize_total(sc_h5ad, target_sum=1e4) diff --git a/src/process_dataset/split_sc/script.py b/src/process_dataset/split_sc/script.py index f2c440c1..add41cf4 100644 --- a/src/process_dataset/split_sc/script.py +++ b/src/process_dataset/split_sc/script.py @@ -54,6 +54,11 @@ if col not in ["cell_count_by_well_celltype", "cell_count_by_plate_well", "obs_id"]: filtered_sc_counts.obs[col] = filtered_sc_counts.obs[col].astype("category") +# copy uns from pseudobulk +to_copy_uns = [key for key in pseudobulk_filtered_with_uns.uns.keys() if key.startswith("dataset_")] +for uns_key in to_copy_uns: + filtered_sc_counts.uns[uns_key] = pseudobulk_filtered_with_uns.uns[uns_key] + print(">> Save sc dataset into splits", flush=True) filtered_sc_counts[filtered_sc_counts.obs["split"] == 
"train"].write_h5ad(par["sc_train"], compression="gzip") filtered_sc_counts[filtered_sc_counts.obs["split"] == "test"].write_h5ad(par["sc_test"], compression="gzip")