add sc data preprocessing (#77)
* add sc data preprocessing

* fix task file info that was moved

* refactor scripts

* make uns metadata component an R component for now

* add split_sc to wf

* add pseudobulked data to preprocessing output

* fixes to split_sc and process_dataset wf

* update resource directory

* fix defaults

* make sc columns categorical

* fix control setting in sc adata

* update

---------

Co-authored-by: Robrecht Cannoodt <[email protected]>
szalata and rcannood authored Oct 30, 2024
1 parent 23d44c6 commit cb4543d
Showing 34 changed files with 275 additions and 115 deletions.
4 changes: 2 additions & 2 deletions _viash.yaml
@@ -47,7 +47,7 @@ description: |
 version: dev
 license: MIT
-keywords: [single-cell, perturbation prediction, perturbation, openproblems, benchmark]
+keywords: [single-cell, perturbation prediction, perturbation, benchmark]
 links:
   issue_tracker: https://github.com/openproblems-bio/task_perturbation_prediction/issues
   repository: https://github.com/openproblems-bio/task_perturbation_prediction
@@ -121,7 +121,7 @@ viash_version: 0.9.0
 info:
   test_resources:
     - type: s3
-      path: s3://openproblems-data/resources/perturbation_prediction/datasets
+      path: s3://openproblems-data/resources/task_perturbation_prediction/datasets
       dest: resources/datasets
 
 # set default labels
40 changes: 0 additions & 40 deletions scripts/add_a_method.md

This file was deleted.

3 changes: 0 additions & 3 deletions scripts/build_components.sh

This file was deleted.

2 changes: 2 additions & 0 deletions scripts/create_component/.gitignore
@@ -0,0 +1,2 @@
+# if users change the scripts, the changes should not be committed.
+/create_*_*.s
8 changes: 8 additions & 0 deletions scripts/create_component/create_python_method.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+
+common/scripts/create_component \
+  --name my_python_method \
+  --language python \
+  --type method
8 changes: 8 additions & 0 deletions scripts/create_component/create_python_metric.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+
+common/scripts/create_component \
+  --name my_python_metric \
+  --language python \
+  --type metric
8 changes: 8 additions & 0 deletions scripts/create_component/create_r_method.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+
+common/scripts/create_component \
+  --name my_r_method \
+  --language r \
+  --type method
8 changes: 8 additions & 0 deletions scripts/create_component/create_r_metric.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+
+common/scripts/create_component \
+  --name my_r_metric \
+  --language r \
+  --type metric
6 changes: 3 additions & 3 deletions scripts/create_readme.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
 
-common/create_task_readme/create_task_readme \
-  --task_dir src \
-  --output README.md
+set -e
+
+common/scripts/create_task_readme --input src/api
@@ -54,7 +54,7 @@ viash run src/metrics/mean_rowwise_error/config.vsh.yaml -- \
   --output "$OUT/score.h5ad"
 
 echo ">> Uploading results to S3"
-# aws s3 sync --profile op \
-#   "resources/datasets" \
-#   "s3://openproblems-data/resources/perturbation_prediction/datasets/" \
-#   --delete --dryrun
+aws s3 sync --profile op \
+  "resources/datasets" \
+  "s3://openproblems-data/resources/task_perturbation_prediction/datasets/" \
+  --delete --dryrun
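
For anyone checking a local run, the score file written above can be inspected with a few lines of Python — a minimal sketch, assuming the usual openproblems score layout where metric names and values are stored in uns (the concrete path is a stand-in for "$OUT/score.h5ad"):

# Sketch: inspect the metric output produced by the viash run above.
# The uns keys "metric_ids" and "metric_values" follow the usual
# openproblems score format; treat them as an assumption here.
import anndata as ad

score = ad.read_h5ad("output/score.h5ad")  # stand-in for "$OUT/score.h5ad"
print(dict(zip(score.uns["metric_ids"], score.uns["metric_values"])))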
File renamed without changes.
14 changes: 0 additions & 14 deletions scripts/download_resources.sh

This file was deleted.

3 changes: 0 additions & 3 deletions scripts/init_submodule.sh

This file was deleted.

6 changes: 6 additions & 0 deletions scripts/project/build_all_components.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+# Build all components in a namespace (refer https://viash.io/reference/cli/ns_build.html)
+viash ns build --parallel
7 changes: 7 additions & 0 deletions scripts/project/build_all_docker_containers.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+set -e
+
+# Build all components in a namespace (refer https://viash.io/reference/cli/ns_build.html)
+# and set up the container via a cached build
+viash ns build --parallel --setup cachedbuild
6 changes: 6 additions & 0 deletions scripts/project/test_all_components.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+# Test all components in a namespace (refer https://viash.io/reference/cli/ns_test.html)
+viash ns test --parallel
@@ -1,10 +1,27 @@
 #!/bin/bash
 
-export NXF_VER=23.04.2
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# NOTE: depending on the datasets and components, you may need to launch this workflow
+# on a different compute platform (e.g. an HPC, AWS Cloud, Azure Cloud, Google Cloud).
+# please refer to the nextflow information for more details:
+# https://www.nextflow.io/docs/latest/
 
 set -e
 
+echo "Running benchmark on test data"
+echo " Make sure to run 'scripts/project/build_all_docker_containers.sh'!"
+
+# generate a unique id
 resources_dir="resources"
-publish_dir="output/test_run_benchmark"
+RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
+publish_dir="resources/results/${RUN_ID}"
 
+# write the parameters to file
 cat > /tmp/params.yaml << HERE
 param_list:
   - id: neurips-2023-data
@@ -21,8 +38,10 @@ output_state: "state.yaml"
 publish_dir: "$publish_dir"
 HERE
 
+# run the benchmark
 nextflow run . \
   -main-script target/nextflow/workflows/run_benchmark/main.nf \
   -profile docker \
   -resume \
+  -c common/nextflow_helpers/labels_ci.config \
   -params-file /tmp/params.yaml
@@ -1,9 +1,19 @@
 #!/bin/bash
 
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
 set -e
 
+# generate a unique id
 RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
-resources_dir="s3://openproblems-data/resources/perturbation_prediction/datasets/"
-publish_dir="s3://openproblems-data/resources/perturbation_prediction/results/${RUN_ID}"
+resources_dir="s3://openproblems-data/resources/task_perturbation_prediction/datasets/"
+publish_dir="s3://openproblems-data/resources/task_perturbation_prediction/results/${RUN_ID}"
 
+# write the parameters to file
 cat > /tmp/params.yaml << HERE
 param_list:
   - id: neurips-2023-data
@@ -20,11 +30,13 @@ output_state: "state.yaml"
 publish_dir: "$publish_dir"
 HERE
 
-tw launch openproblems-bio/task_perturbation_prediction \
+tw launch https://github.com/openproblems-bio/task_perturbation_prediction.git \
   --revision build/main \
   --pull-latest \
   --main-script target/nextflow/workflows/run_benchmark/main.nf \
   --workspace 53907369739130 \
   --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
   --params-file /tmp/params.yaml \
-  --config src/common/nextflow_helpers/labels_tw.config
+  --entry-name auto \
+  --config common/nextflow_helpers/labels_tw.config \
+  --labels task_perturbation_prediction,full
File renamed without changes.
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 RUN_ID="stability_$(date +%Y-%m-%d_%H-%M-%S)"
-publish_dir="s3://openproblems-data/resources/perturbation_prediction/results/${RUN_ID}"
+publish_dir="s3://openproblems-data/resources/task_perturbation_prediction/results/${RUN_ID}"
 
 cat > /tmp/params.yaml << HERE
 id: neurips-2023-data
@@ -1,18 +1,18 @@
 #!/bin/bash
 
 aws s3 sync \
-  s3://openproblems-data/resources/perturbation_prediction/results/ \
+  s3://openproblems-data/resources/task_perturbation_prediction/results/ \
   output/benchmark_results/ \
   --delete --dryrun
 
 # sync back modified results
 aws s3 sync \
   output/benchmark_results/ \
-  s3://openproblems-data/resources/perturbation_prediction/results/ \
+  s3://openproblems-data/resources/task_perturbation_prediction/results/ \
   --delete --dryrun
 
 # sync one run
 runid=run_2024-06-01_00-03-09; aws s3 sync \
   output/benchmark_results/${runid}/ \
-  s3://openproblems-data/resources/perturbation_prediction/results/${runid}/ \
+  s3://openproblems-data/resources/task_perturbation_prediction/results/${runid}/ \
   --delete --dryrun
5 changes: 5 additions & 0 deletions scripts/sync_resources.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+set -e
+
+common/scripts/sync_resources
3 changes: 0 additions & 3 deletions scripts/test_components.sh

This file was deleted.

17 changes: 16 additions & 1 deletion src/api/comp_process_dataset.yaml
@@ -25,4 +25,19 @@ arguments:
     __merge__: file_id_map.yaml
     required: true
     direction: output
-    default: id_map.csv
+    default: id_map.csv
+  - name: --sc_train_h5ad
+    type: file
+    required: false
+    direction: output
+    default: sc_train.h5ad
+  - name: --sc_test_h5ad
+    type: file
+    required: false
+    direction: output
+    default: sc_test.h5ad
+  - name: --pseudobulk_filtered_with_uns
+    type: file
+    required: false
+    direction: output
+    default: pseudobulk_filtered_with_uns.h5ad
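
These three optional outputs expose the single-cell train/test splits and the filtered pseudobulk data to downstream components. Below is a minimal sketch of how a process_dataset implementation might write them with anndata — the input path, the split column, and the categorical column names are illustrative assumptions, not taken from the actual split_sc component:

# Sketch only: the real logic lives in the split_sc component.
import anndata as ad

adata = ad.read_h5ad("resources/neurips-2023-raw/sc_counts.h5ad")  # hypothetical input

# the commit makes the sc annotation columns categorical; names are illustrative
for col in ["cell_type", "donor_id", "sm_name"]:
    if col in adata.obs:
        adata.obs[col] = adata.obs[col].astype("category")

# write the train/test splits declared in the API above
is_test = adata.obs["split"] == "test"  # hypothetical split column
adata[~is_test].write_h5ad("sc_train.h5ad", compression="gzip")
adata[is_test].write_h5ad("sc_test.h5ad", compression="gzip")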
6 changes: 3 additions & 3 deletions src/process_dataset/add_uns_metadata/config.vsh.yaml
@@ -47,11 +47,11 @@ arguments:
     direction: output
     example: resources/datasets/neurips-2023-data/pseudobulk_uns.h5ad
 resources:
-  - type: python_script
-    path: script.py
+  - type: r_script
+    path: script.R
 engines:
   - type: docker
-    image: openproblems/base_python:1.0.0
+    image: openproblems/base_r:1.0.0
 runners:
   - type: executable
   - type: nextflow
25 changes: 25 additions & 0 deletions src/process_dataset/add_uns_metadata/script.R
@@ -0,0 +1,25 @@
+library(anndata)
+
+## VIASH START
+par <- list(
+  "input" = "resources/neurips-2023-raw/pseudobulk_cleaned.h5ad",
+  "dataset_id" = "neurips-2023-data",
+  "dataset_name" = "NeurIPS2023 scPerturb DGE",
+  "dataset_url" = "TBD",
+  "dataset_reference" = "TBD",
+  "dataset_summary" = "Differential gene expression ...",
+  "dataset_description" = "For this competition, we designed ...",
+  "dataset_organism" = "homo_sapiens",
+  "output" = "resources/datasets/neurips-2023-data/pseudobulk_uns.h5ad"
+)
+## VIASH END
+
+cat(">> Load dataset\n")
+input <- read_h5ad(par$input)
+
+for (key in c("dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism")) {
+  input$uns[[key]] <- par[[key]]
+}
+
+cat(">> Save filtered bulk dataset\n")
+input$write_h5ad(par$output, compression = "gzip")
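
A quick way to verify this component's output is to read the file back and check the uns fields — a small Python sketch (the path comes from the VIASH START block above):

# Sketch: confirm the metadata written by script.R.
import anndata as ad

out = ad.read_h5ad("resources/datasets/neurips-2023-data/pseudobulk_uns.h5ad")
for key in ["dataset_id", "dataset_name", "dataset_organism"]:
    print(key, "->", out.uns.get(key))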
25 changes: 0 additions & 25 deletions src/process_dataset/add_uns_metadata/script.py

This file was deleted.
