add sc data preprocessing (#77)
* add sc data preprocessing

* fix task file info that was moved

* refactor scripts

* make uns metadata component an R component for now

* add split_sc to wf

* add pseudobulked data to preprocessing output

* fixes to split_sc and process_dataset wf

* update resource directory

* fix defaults

* make sc columns categorical

* fix control setting in sc adata

* update

---------

Co-authored-by: Robrecht Cannoodt <[email protected]>
szalata and rcannood authored Oct 30, 2024
1 parent 23d44c6 commit cb4543d
Showing 34 changed files with 275 additions and 115 deletions.
4 changes: 2 additions & 2 deletions _viash.yaml
@@ -47,7 +47,7 @@ description: |
 version: dev
 license: MIT
-keywords: [single-cell, perturbation prediction, perturbation, openproblems, benchmark]
+keywords: [single-cell, perturbation prediction, perturbation, benchmark]
 links:
   issue_tracker: https://github.com/openproblems-bio/task_perturbation_prediction/issues
   repository: https://github.com/openproblems-bio/task_perturbation_prediction
@@ -121,7 +121,7 @@ viash_version: 0.9.0
 info:
   test_resources:
     - type: s3
-      path: s3://openproblems-data/resources/perturbation_prediction/datasets
+      path: s3://openproblems-data/resources/task_perturbation_prediction/datasets
       dest: resources/datasets
 
 # set default labels
40 changes: 0 additions & 40 deletions scripts/add_a_method.md

This file was deleted.

3 changes: 0 additions & 3 deletions scripts/build_components.sh

This file was deleted.

2 changes: 2 additions & 0 deletions scripts/create_component/.gitignore
@@ -0,0 +1,2 @@
+# if users change the scripts, the changes should not be committed.
+/create_*_*.s
8 changes: 8 additions & 0 deletions scripts/create_component/create_python_method.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+
+common/scripts/create_component \
+  --name my_python_method \
+  --language python \
+  --type method
8 changes: 8 additions & 0 deletions scripts/create_component/create_python_metric.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+
+common/scripts/create_component \
+  --name my_python_metric \
+  --language python \
+  --type metric
8 changes: 8 additions & 0 deletions scripts/create_component/create_r_method.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+
+common/scripts/create_component \
+  --name my_r_method \
+  --language r \
+  --type method
8 changes: 8 additions & 0 deletions scripts/create_component/create_r_metric.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+
+common/scripts/create_component \
+  --name my_r_metric \
+  --language r \
+  --type metric
6 changes: 3 additions & 3 deletions scripts/create_readme.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
 
-common/create_task_readme/create_task_readme \
-  --task_dir src \
-  --output README.md
+set -e
+
+common/scripts/create_task_readme --input src/api
@@ -54,7 +54,7 @@ viash run src/metrics/mean_rowwise_error/config.vsh.yaml -- \
   --output "$OUT/score.h5ad"
 
 echo ">> Uploading results to S3"
-# aws s3 sync --profile op \
-#   "resources/datasets" \
-#   "s3://openproblems-data/resources/perturbation_prediction/datasets/" \
-#   --delete --dryrun
+aws s3 sync --profile op \
+  "resources/datasets" \
+  "s3://openproblems-data/resources/task_perturbation_prediction/datasets/" \
+  --delete --dryrun
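
For anyone checking a local run, the score file written above can be inspected with a few lines of Python — a minimal sketch, assuming the usual openproblems score layout where metric names and values are stored in uns (the concrete path is a stand-in for "$OUT/score.h5ad"):

# Sketch: inspect the metric output produced by the viash run above.
# The uns keys "metric_ids" and "metric_values" follow the usual
# openproblems score format; treat them as an assumption here.
import anndata as ad

score = ad.read_h5ad("output/score.h5ad")  # stand-in for "$OUT/score.h5ad"
print(dict(zip(score.uns["metric_ids"], score.uns["metric_values"])))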
File renamed without changes.
14 changes: 0 additions & 14 deletions scripts/download_resources.sh

This file was deleted.

3 changes: 0 additions & 3 deletions scripts/init_submodule.sh

This file was deleted.

6 changes: 6 additions & 0 deletions scripts/project/build_all_components.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+# Build all components in a namespace (refer https://viash.io/reference/cli/ns_build.html)
+viash ns build --parallel
7 changes: 7 additions & 0 deletions scripts/project/build_all_docker_containers.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+set -e
+
+# Build all components in a namespace (refer https://viash.io/reference/cli/ns_build.html)
+# and set up the container via a cached build
+viash ns build --parallel --setup cachedbuild
6 changes: 6 additions & 0 deletions scripts/project/test_all_components.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+# Test all components in a namespace (refer https://viash.io/reference/cli/ns_test.html)
+viash ns test --parallel
@@ -1,10 +1,27 @@
 #!/bin/bash
 
-export NXF_VER=23.04.2
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# NOTE: depending on the datasets and components, you may need to launch this workflow
+# on a different compute platform (e.g. an HPC, AWS Cloud, Azure Cloud, Google Cloud).
+# please refer to the nextflow information for more details:
+# https://www.nextflow.io/docs/latest/
 
 set -e
 
+echo "Running benchmark on test data"
+echo " Make sure to run 'scripts/project/build_all_docker_containers.sh'!"
+
+# generate a unique id
 resources_dir="resources"
-publish_dir="output/test_run_benchmark"
+RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
+publish_dir="resources/results/${RUN_ID}"
 
+# write the parameters to file
 cat > /tmp/params.yaml << HERE
 param_list:
   - id: neurips-2023-data
@@ -21,8 +38,10 @@ output_state: "state.yaml"
 publish_dir: "$publish_dir"
 HERE
 
+# run the benchmark
 nextflow run . \
   -main-script target/nextflow/workflows/run_benchmark/main.nf \
   -profile docker \
   -resume \
+  -c common/nextflow_helpers/labels_ci.config \
   -params-file /tmp/params.yaml
@@ -1,9 +1,19 @@
 #!/bin/bash
 
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
 set -e
 
+# generate a unique id
 RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
-resources_dir="s3://openproblems-data/resources/perturbation_prediction/datasets/"
-publish_dir="s3://openproblems-data/resources/perturbation_prediction/results/${RUN_ID}"
+resources_dir="s3://openproblems-data/resources/task_perturbation_prediction/datasets/"
+publish_dir="s3://openproblems-data/resources/task_perturbation_prediction/results/${RUN_ID}"
 
+# write the parameters to file
 cat > /tmp/params.yaml << HERE
 param_list:
   - id: neurips-2023-data
@@ -20,11 +30,13 @@ output_state: "state.yaml"
 publish_dir: "$publish_dir"
 HERE
 
-tw launch openproblems-bio/task_perturbation_prediction \
+tw launch https://github.com/openproblems-bio/task_perturbation_prediction.git \
   --revision build/main \
   --pull-latest \
   --main-script target/nextflow/workflows/run_benchmark/main.nf \
   --workspace 53907369739130 \
   --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
   --params-file /tmp/params.yaml \
-  --config src/common/nextflow_helpers/labels_tw.config
+  --entry-name auto \
+  --config common/nextflow_helpers/labels_tw.config \
+  --labels task_perturbation_prediction,full
File renamed without changes.
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 RUN_ID="stability_$(date +%Y-%m-%d_%H-%M-%S)"
-publish_dir="s3://openproblems-data/resources/perturbation_prediction/results/${RUN_ID}"
+publish_dir="s3://openproblems-data/resources/task_perturbation_prediction/results/${RUN_ID}"
 
 cat > /tmp/params.yaml << HERE
 id: neurips-2023-data
@@ -1,18 +1,18 @@
 #!/bin/bash
 
 aws s3 sync \
-  s3://openproblems-data/resources/perturbation_prediction/results/ \
+  s3://openproblems-data/resources/task_perturbation_prediction/results/ \
   output/benchmark_results/ \
   --delete --dryrun
 
 # sync back modified results
 aws s3 sync \
   output/benchmark_results/ \
-  s3://openproblems-data/resources/perturbation_prediction/results/ \
+  s3://openproblems-data/resources/task_perturbation_prediction/results/ \
   --delete --dryrun
 
 # sync one run
 runid=run_2024-06-01_00-03-09; aws s3 sync \
   output/benchmark_results/${runid}/ \
-  s3://openproblems-data/resources/perturbation_prediction/results/${runid}/ \
+  s3://openproblems-data/resources/task_perturbation_prediction/results/${runid}/ \
   --delete --dryrun
5 changes: 5 additions & 0 deletions scripts/sync_resources.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+set -e
+
+common/scripts/sync_resources
3 changes: 0 additions & 3 deletions scripts/test_components.sh

This file was deleted.

17 changes: 16 additions & 1 deletion src/api/comp_process_dataset.yaml
@@ -25,4 +25,19 @@ arguments:
     __merge__: file_id_map.yaml
     required: true
     direction: output
-    default: id_map.csv
+    default: id_map.csv
+  - name: --sc_train_h5ad
+    type: file
+    required: false
+    direction: output
+    default: sc_train.h5ad
+  - name: --sc_test_h5ad
+    type: file
+    required: false
+    direction: output
+    default: sc_test.h5ad
+  - name: --pseudobulk_filtered_with_uns
+    type: file
+    required: false
+    direction: output
+    default: pseudobulk_filtered_with_uns.h5ad
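
These three optional outputs expose the single-cell train/test splits and the filtered pseudobulk data to downstream components. Below is a minimal sketch of how a process_dataset implementation might write them with anndata — the input path, the split column, and the categorical column names are illustrative assumptions, not taken from the actual split_sc component:

# Sketch only: the real logic lives in the split_sc component.
import anndata as ad

adata = ad.read_h5ad("resources/neurips-2023-raw/sc_counts.h5ad")  # hypothetical input

# the commit makes the sc annotation columns categorical; names are illustrative
for col in ["cell_type", "donor_id", "sm_name"]:
    if col in adata.obs:
        adata.obs[col] = adata.obs[col].astype("category")

# write the train/test splits declared in the API above
is_test = adata.obs["split"] == "test"  # hypothetical split column
adata[~is_test].write_h5ad("sc_train.h5ad", compression="gzip")
adata[is_test].write_h5ad("sc_test.h5ad", compression="gzip")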
6 changes: 3 additions & 3 deletions src/process_dataset/add_uns_metadata/config.vsh.yaml
@@ -47,11 +47,11 @@ arguments:
     direction: output
     example: resources/datasets/neurips-2023-data/pseudobulk_uns.h5ad
 resources:
-  - type: python_script
-    path: script.py
+  - type: r_script
+    path: script.R
 engines:
   - type: docker
-    image: openproblems/base_python:1.0.0
+    image: openproblems/base_r:1.0.0
 runners:
   - type: executable
   - type: nextflow
25 changes: 25 additions & 0 deletions src/process_dataset/add_uns_metadata/script.R
@@ -0,0 +1,25 @@
+library(anndata)
+
+## VIASH START
+par <- list(
+  "input" = "resources/neurips-2023-raw/pseudobulk_cleaned.h5ad",
+  "dataset_id" = "neurips-2023-data",
+  "dataset_name" = "NeurIPS2023 scPerturb DGE",
+  "dataset_url" = "TBD",
+  "dataset_reference" = "TBD",
+  "dataset_summary" = "Differential gene expression ...",
+  "dataset_description" = "For this competition, we designed ...",
+  "dataset_organism" = "homo_sapiens",
+  "output" = "resources/datasets/neurips-2023-data/pseudobulk_uns.h5ad"
+)
+## VIASH END
+
+cat(">> Load dataset\n")
+input <- read_h5ad(par$input)
+
+for (key in c("dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism")) {
+  input$uns[[key]] <- par[[key]]
+}
+
+cat(">> Save filtered bulk dataset\n")
+input$write_h5ad(par$output, compression = "gzip")
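
A quick way to verify this component's output is to read the file back and check the uns fields — a small Python sketch (the path comes from the VIASH START block above):

# Sketch: confirm the metadata written by script.R.
import anndata as ad

out = ad.read_h5ad("resources/datasets/neurips-2023-data/pseudobulk_uns.h5ad")
for key in ["dataset_id", "dataset_name", "dataset_organism"]:
    print(key, "->", out.uns.get(key))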
25 changes: 0 additions & 25 deletions src/process_dataset/add_uns_metadata/script.py

This file was deleted.
