diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index efdf563..7bc6da6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,6 +15,8 @@ jobs: steps: - uses: actions/checkout@v4 + with: + submodules: true - uses: viash-io/viash-actions/setup@v5 @@ -83,6 +85,8 @@ jobs: - uses: data-intuitive/reclaim-the-bytes@v2 - uses: actions/checkout@v4 + with: + submodules: true - uses: viash-io/viash-actions/setup@v5 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5f35250..aeabf9e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,6 +40,7 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + submodules: true - uses: viash-io/viash-actions/setup@v5 @@ -86,6 +87,8 @@ jobs: - uses: data-intuitive/reclaim-the-bytes@v2 - uses: actions/checkout@v4 + with: + submodules: true - uses: viash-io/viash-actions/setup@v5 diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..ed29877 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "common"] + path = common + url = https://github.com/openproblems-bio/common-resources.git diff --git a/README.md b/README.md new file mode 100644 index 0000000..40c7287 --- /dev/null +++ b/README.md @@ -0,0 +1,374 @@ +# Spatial decomposition + + + + +Estimation of cell type proportions per spot in 2D space from spatial +transcriptomic data coupled with corresponding single-cell data + +Path to source: +[`src`](https://github.com/openproblems-bio/task-spatial-decomposition/tree/main/src) + +## Motivation + +Spatial decomposition (also often referred to as Spatial deconvolution) +is applicable to spatial transcriptomics data where the transcription +profile of each capture location (spot, voxel, bead, etc.) do not share +a bijective relationship with the cells in the tissue, i.e., multiple +cells may contribute to the same capture location. 
The task of spatial
+decomposition then refers to estimating the composition of cell
+types/states that are present at each capture location. The cell
+type/state estimates are presented as proportion values, representing
+the proportion of the cells at each capture location that belong to a
+given cell type.
+
+## Description
+
+We distinguish between *reference-based* decomposition and *de novo*
+decomposition, where the former leverages external data (e.g., scRNA-seq
+or scNuc-seq) to guide the inference process, while the latter only works
+with the spatial data. We require that all datasets have an associated
+reference single cell data set, but methods are free to ignore this
+information. Due to the lack of real datasets with the necessary
+ground-truth, this task makes use of a simulated dataset generated by
+creating cell-aggregates by sampling from a Dirichlet distribution. The
+ground-truth dataset consists of the spatial expression matrix, XY
+coordinates of the spots, true cell-type proportions for each spot, and
+the reference single-cell data (from which cell aggregates were
+simulated). 
+ +## Authors & contributors + +| name | roles | +|:-----------------|:-------------------| +| Giovanni Palla | author, maintainer | +| Scott Gigante | author | +| Sai Nirmayi Yasa | author | + +## API + +``` mermaid +flowchart LR + file_common_dataset("Common Dataset") + comp_process_dataset[/"Data processor"/] + file_single_cell("Single cell data") + file_spatial_masked("Spatial masked") + file_solution("Solution") + comp_control_method[/"Control method"/] + comp_method[/"Method"/] + comp_metric[/"Metric"/] + file_output("Output") + file_score("Score") + file_common_dataset---comp_process_dataset + comp_process_dataset-->file_single_cell + comp_process_dataset-->file_spatial_masked + comp_process_dataset-->file_solution + file_single_cell---comp_control_method + file_single_cell---comp_method + file_spatial_masked---comp_control_method + file_spatial_masked---comp_method + file_solution---comp_control_method + file_solution---comp_metric + comp_control_method-->file_output + comp_method-->file_output + comp_metric-->file_score + file_output---comp_metric +``` + +## File format: Common Dataset + +A subset of the common dataset. + +Example file: +`resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/dataset_simulated.h5ad` + +Format: + +
+ + AnnData object + obs: 'cell_type', 'batch' + var: 'hvg', 'hvg_score' + obsm: 'X_pca', 'coordinates', 'proportions_true' + layers: 'counts' + uns: 'cell_type_names', 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:----------|:--------------------------------------------------------------------------------------------------------------------| +| `obs["cell_type"]` | `string` | Cell type label IDs. | +| `obs["batch"]` | `string` | A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `obsm["coordinates"]` | `double` | (*Optional*) XY coordinates for each spot. | +| `obsm["proportions_true"]` | `double` | (*Optional*) True cell type proportions for each spot. | +| `layers["counts"]` | `integer` | Raw counts. | +| `uns["cell_type_names"]` | `string` | (*Optional*) Cell type names corresponding to values in `cell_type`. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | + +
+ +## Component type: Data processor + +Path: +[`src/process_dataset`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/process_dataset) + +A spatial decomposition dataset processor. + +Arguments: + +
+ +| Name | Type | Description | +|:--------------------------|:-------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `--input` | `file` | A subset of the common dataset. | +| `--output_single_cell` | `file` | (*Output*) The single-cell data file used as reference for the spatial data. | +| `--output_spatial_masked` | `file` | (*Output*) The spatial data file containing transcription profiles for each capture location, without cell-type proportions for each spot. | +| `--output_solution` | `file` | (*Output*) The spatial data file containing transcription profiles for each capture location, with true cell-type proportions for each spot / capture location. | + +
+ +## File format: Single cell data + +The single-cell data file used as reference for the spatial data + +Example file: +`resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad` + +Format: + +
+ + AnnData object + obs: 'cell_type', 'batch' + layers: 'counts' + uns: 'cell_type_names', 'dataset_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-------------------------|:----------|:---------------------------------------------------------------------------------------------------------------------------------| +| `obs["cell_type"]` | `string` | Cell type label IDs. | +| `obs["batch"]` | `string` | (*Optional*) A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. | +| `layers["counts"]` | `integer` | Raw counts. | +| `uns["cell_type_names"]` | `string` | Cell type names corresponding to values in `cell_type`. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | + +
+ +## File format: Spatial masked + +The spatial data file containing transcription profiles for each capture +location, without cell-type proportions for each spot. + +Example file: +`resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/spatial_masked.h5ad` + +Format: + +
+ + AnnData object + obsm: 'coordinates' + layers: 'counts' + uns: 'cell_type_names', 'dataset_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-------------------------|:----------|:--------------------------------------------------------------------------| +| `obsm["coordinates"]` | `double` | XY coordinates for each spot. | +| `layers["counts"]` | `integer` | Raw counts. | +| `uns["cell_type_names"]` | `string` | Cell type names corresponding to columns of `proportions_pred` in output. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | + +
+ +## File format: Solution + +The spatial data file containing transcription profiles for each capture +location, with true cell-type proportions for each spot / capture +location. + +Example file: +`resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/solution.h5ad` + +Format: + +
+ + AnnData object + obsm: 'coordinates', 'proportions_true' + layers: 'counts' + uns: 'cell_type_names', 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:----------|:-------------------------------------------------------------------------------| +| `obsm["coordinates"]` | `double` | XY coordinates for each spot. | +| `obsm["proportions_true"]` | `double` | True cell type proportions for each spot. | +| `layers["counts"]` | `integer` | Raw counts. | +| `uns["cell_type_names"]` | `string` | Cell type names corresponding to columns of `proportions`. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ +## Component type: Control method + +Path: +[`src/control_methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/control_methods) + +Quality control methods for verifying the pipeline. + +Arguments: + +
+ +| Name | Type | Description | +|:-------------------------|:-------|:-----------------------------------------------------------------------------------------------------------------------------------------------------| +| `--input_single_cell` | `file` | The single-cell data file used as reference for the spatial data. | +| `--input_spatial_masked` | `file` | The spatial data file containing transcription profiles for each capture location, without cell-type proportions for each spot. | +| `--input_solution` | `file` | The spatial data file containing transcription profiles for each capture location, with true cell-type proportions for each spot / capture location. | +| `--output` | `file` | (*Output*) Spatial data with estimated proportions. | + +
+ +## Component type: Method + +Path: +[`src/methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/methods) + +A spatial composition method. + +Arguments: + +
+ +| Name | Type | Description | +|:-------------------------|:-------|:--------------------------------------------------------------------------------------------------------------------------------| +| `--input_single_cell` | `file` | The single-cell data file used as reference for the spatial data. | +| `--input_spatial_masked` | `file` | The spatial data file containing transcription profiles for each capture location, without cell-type proportions for each spot. | +| `--output` | `file` | (*Output*) Spatial data with estimated proportions. | + +
+ +## Component type: Metric + +Path: +[`src/metrics`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/metrics) + +A spatial decomposition metric. + +Arguments: + +
+ +| Name | Type | Description | +|:-------------------|:-------|:-----------------------------------------------------------------------------------------------------------------------------------------------------| +| `--input_method` | `file` | Spatial data with estimated proportions. | +| `--input_solution` | `file` | The spatial data file containing transcription profiles for each capture location, with true cell-type proportions for each spot / capture location. | +| `--output` | `file` | (*Output*) Metric score file. | + +
+ +## File format: Output + +Spatial data with estimated proportions. + +Example file: +`resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/output.h5ad` + +Description: + +Spatial data file with estimated cell type proportions. + +Format: + +
+ + AnnData object + obsm: 'coordinates', 'proportions_pred' + layers: 'counts' + uns: 'cell_type_names', 'dataset_id', 'method_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:---------------------------|:----------|:-----------------------------------------------------------| +| `obsm["coordinates"]` | `double` | XY coordinates for each spot. | +| `obsm["proportions_pred"]` | `double` | Estimated cell type proportions for each spot. | +| `layers["counts"]` | `integer` | Raw counts. | +| `uns["cell_type_names"]` | `string` | Cell type names corresponding to columns of `proportions`. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | + +
+ +## File format: Score + +Metric score file. + +Example file: +`resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/score.h5ad` + +Format: + +
+ + AnnData object + uns: 'dataset_id', 'method_id', 'metric_ids', 'metric_values' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------|:---------|:---------------------------------------------------------------------------------------------| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | +| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | + +
+ diff --git a/common b/common new file mode 160000 index 0000000..81d4268 --- /dev/null +++ b/common @@ -0,0 +1 @@ +Subproject commit 81d42682169dcf3990f26a9865d658baf67e6335 diff --git a/scripts/add_component.sh b/scripts/add_component.sh new file mode 100644 index 0000000..f222478 --- /dev/null +++ b/scripts/add_component.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +echo "This script is not supposed to be run directly." +echo "Please run the script step-by-step." +exit 1 + +# create a new component (types: method, metric, control_method) +type="method" +lang="python" # change this to "r" if need be + +common/create_component/create_component \ + --task spatial_decomposition \ + --type $type \ + --name component_name \ + --language $lang + +# TODO: fill in required fields in src/methods/foo/config.vsh.yaml +# TODO: edit src/methods/foo/script.py/R + +# test the component +viash test src/$type/component_name/config.vsh.yaml diff --git a/scripts/build_components.sh b/scripts/build_components.sh new file mode 100644 index 0000000..60e3270 --- /dev/null +++ b/scripts/build_components.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Build all components in namespace +viash ns build --parallel --setup cb diff --git a/scripts/download_resources.sh b/scripts/download_resources.sh new file mode 100644 index 0000000..9b012f1 --- /dev/null +++ b/scripts/download_resources.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# Downloading test resources +directories=("common" "spatial_decomposition") +for dir in ${directories[@]}; do + common/sync_resources/sync_resources \ + --input "s3://openproblems-data/resources_test/$dir/cxg_mouse_pancreas_atlas" \ + --output "resources_test/$dir/cxg_mouse_pancreas_atlas" +done \ No newline at end of file diff --git a/scripts/render_readme.sh b/scripts/render_readme.sh new file mode 100644 index 0000000..abafdc1 --- /dev/null +++ b/scripts/render_readme.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +common/create_task_readme/create_task_readme \ + --task "spatial_decomposition" \ + 
--task_dir "src/" \ + --github_url "https://github.com/openproblems-bio/task-spatial-decomposition/tree/main/" \ + --output "README.md" \ No newline at end of file diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh new file mode 100644 index 0000000..2aeb014 --- /dev/null +++ b/scripts/run_benchmark.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Process dataset +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/spatial_decomposition +echo "Running process_dataset" +nextflow run . \ + -main-script target/nextflow/workflows/process_datasets/main.nf \ + -profile docker \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + --input_states "$RAW_DATA/**/state.yaml" \ + --rename_keys 'input:output_dataset' \ + --settings '{"output_spatial_masked": "$id/spatial_masked.h5ad", "output_single_cell": "$id/single_cell_ref.h5ad", "output_solution": "$id/solution.h5ad", "alpha": 1.0, "simulated_data": "$id/dataset_simulated.h5ad"}' \ + --publish_dir "$DATASET_DIR" \ + --output_state '$id/state.yaml' + +# Run benchmark +DATASETS_DIR="resources_test/spatial_decomposition" +OUTPUT_DIR="output/temp" +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi +echo "Running the benchmark" +nextflow run . 
\ + -main-script target/nextflow/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + --input_states "$DATASETS_DIR/**/state.yaml" \ + --rename_keys 'input_single_cell:output_single_cell,input_spatial_masked:output_spatial_masked,input_solution:output_solution' \ + --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ + --publish_dir "$OUTPUT_DIR" \ + --output_state "state.yaml" \ No newline at end of file diff --git a/scripts/run_benchmark_tw.sh b/scripts/run_benchmark_tw.sh new file mode 100644 index 0000000..989226f --- /dev/null +++ b/scripts/run_benchmark_tw.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +### Process datasets +# Simulating spot-resolution spatial data with alpha = 1 + +cat > /tmp/params.yaml << 'HERE' +id: spatial_decomposition_process_datasets +input_states: s3://openproblems-data/resources/datasets/**/log_cp10k/state.yaml +settings: '{"output_spatial_masked": "$id/spatial_masked.h5ad", "output_single_cell": "$id/single_cell_ref.h5ad", "output_solution": "$id/solution.h5ad", "alpha": 1.0, "simulated_data": "$id/dataset_simulated.h5ad"}' +rename_keys: 'input:output_dataset' +output_state: "$id/state.yaml" +publish_dir: s3://openproblems-data/resources/spatial_decomposition/datasets +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withName:'.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } + withLabel:highmem { + memory = '350GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/task-spatial-decomposition.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/workflows/process_datasets/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + 
--config /tmp/nextflow.config \ + + + +### Run benchmark + +RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="s3://openproblems-data/resources/spatial_decomposition/results/${RUN_ID}" + +cat > /tmp/params.yaml << HERE +input_states: s3://openproblems-data/resources/spatial_decomposition/datasets/**/log_cp10k/state.yaml +rename_keys: 'input_single_cell:output_single_cell,input_spatial_masked:output_spatial_masked,input_solution:output_solution' +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/task-spatial-decomposition.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config common/nextflow_helpers/labels_tw.config \ diff --git a/scripts/test_components.sh b/scripts/test_components.sh new file mode 100644 index 0000000..b8f78b3 --- /dev/null +++ b/scripts/test_components.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Test all components in a namespace (refer https://viash.io/reference/cli/ns_test.html) +viash ns test --parallel + +# Test individual components +viash test src/methods/nnls/config.vsh.yaml + + +DATASET_DIR=resources_test/spatial_decomposition + +# run one method +viash run src/methods/nnls/config.vsh.yaml -- \ + --input_single_cell $DATASET_DIR/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad \ + --input_spatial_masked $DATASET_DIR/cxg_mouse_pancreas_atlas/spatial_masked.h5ad \ + --output $DATASET_DIR/cxg_mouse_pancreas_atlas/output.h5ad + +# run one metric +viash run src/metrics/r2/config.vsh.yaml -- \ + --input_method $DATASET_DIR/cxg_mouse_pancreas_atlas/output.h5ad \ + --input_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \ + --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml new file mode 100644 
index 0000000..dde7893 --- /dev/null +++ b/src/api/comp_control_method.yaml @@ -0,0 +1,38 @@ +functionality: + namespace: "control_methods" + info: + type: control_method + type_info: + label: Control method + summary: Quality control methods for verifying the pipeline. + description: | + Control methods have the same interface as the regular methods + but also receive the solution object as input. It serves as a + starting point to test the relative accuracy of new methods in + the task, and also as a quality control for the metrics defined + in the task. + arguments: + - name: "--input_single_cell" + __merge__: file_single_cell.yaml + direction: input + required: true + - name: "--input_spatial_masked" + __merge__: file_spatial_masked.yaml + direction: input + required: true + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: "--output" + __merge__: file_output.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /common/component_tests/check_method_config.py + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + dest: resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + - path: /common/library.bib diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml new file mode 100644 index 0000000..dfc13d1 --- /dev/null +++ b/src/api/comp_method.yaml @@ -0,0 +1,29 @@ +functionality: + namespace: "methods" + info: + type: method + type_info: + label: Method + summary: A spatial composition method. 
+ description: "Method to estimate cell type proportions from spatial and single cell data" + arguments: + - name: "--input_single_cell" + __merge__: file_single_cell.yaml + direction: input + required: true + - name: "--input_spatial_masked" + __merge__: file_spatial_masked.yaml + direction: input + required: true + - name: "--output" + __merge__: file_output.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /common/component_tests/check_method_config.py + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + dest: resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + - path: /common/library.bib \ No newline at end of file diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml new file mode 100644 index 0000000..d333524 --- /dev/null +++ b/src/api/comp_metric.yaml @@ -0,0 +1,31 @@ +functionality: + namespace: "metrics" + info: + type: metric + type_info: + label: Metric + summary: A spatial decomposition metric. 
+ description: | + A metric for evaluating accuracy of cell type proportion estimate + arguments: + - name: "--input_method" + __merge__: file_output.yaml + direction: input + required: true + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: "--output" + __merge__: file_score.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /common/component_tests/check_metric_config.py + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + dest: resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + - path: /common/library.bib + \ No newline at end of file diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml new file mode 100644 index 0000000..a72684c --- /dev/null +++ b/src/api/comp_process_dataset.yaml @@ -0,0 +1,33 @@ +functionality: + namespace: process_dataset + info: + type: process_dataset + type_info: + label: Data processor + summary: A spatial decomposition dataset processor. + description: | + Prepare a common dataset for the spatial_decomposition task. 
+ arguments: + - name: "--input" + __merge__: file_common_dataset.yaml + direction: input + required: true + - name: "--output_single_cell" + __merge__: file_single_cell.yaml + direction: output + required: true + - name: "--output_spatial_masked" + __merge__: file_spatial_masked.yaml + direction: output + required: true + - name: "--output_solution" + __merge__: file_solution.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + # - path: /resources_test/common/cxg_mouse_pancreas_atlas + # dest: resources_test/common/cxg_mouse_pancreas_atlas + - path: /resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + dest: resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas \ No newline at end of file diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml new file mode 100644 index 0000000..b5399a3 --- /dev/null +++ b/src/api/file_common_dataset.yaml @@ -0,0 +1,75 @@ +type: file +example: "resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/dataset_simulated.h5ad" +info: + label: "Common Dataset" + summary: A subset of the common dataset. + slots: + layers: + - type: integer + name: counts + description: Raw counts. + required: true + obs: + - type: string + name: cell_type + description: Cell type label IDs. + required: true + - type: string + name: batch + description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + - type: double + name: coordinates + description: XY coordinates for each spot. 
+ required: false + - type: double + name: proportions_true + description: True cell type proportions for each spot + required: false + uns: + - type: string + name: cell_type_names + description: Cell type names corresponding to values in `cell_type` + required: false + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false \ No newline at end of file diff --git a/src/api/file_output.yaml b/src/api/file_output.yaml new file mode 100644 index 0000000..4328ea5 --- /dev/null +++ b/src/api/file_output.yaml @@ -0,0 +1,35 @@ +type: file +example: "resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/output.h5ad" +info: + label: Output + summary: "Spatial data with estimated proportions." + description: "Spatial data file with estimated cell type proportions." 
+ slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obsm: + - type: double + name: coordinates + description: XY coordinates for each spot + required: true + - type: double + name: proportions_pred + description: Estimated cell type proportions for each spot + required: true + uns: + - type: string + name: cell_type_names + description: Cell type names corresponding to columns of `proportions` + required: true + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + \ No newline at end of file diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml new file mode 100644 index 0000000..fea2d39 --- /dev/null +++ b/src/api/file_score.yaml @@ -0,0 +1,25 @@ +type: file +example: "resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/score.h5ad" +info: + label: "Score" + summary: Metric score file. + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." 
+ multiple: true + required: true diff --git a/src/api/file_single_cell.yaml b/src/api/file_single_cell.yaml new file mode 100644 index 0000000..0ec0a94 --- /dev/null +++ b/src/api/file_single_cell.yaml @@ -0,0 +1,29 @@ +type: file +example: "resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad" +info: + label: "Single cell data" + summary: "The single-cell data file used as reference for the spatial data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: cell_type + description: Cell type label IDs + required: true + - type: string + name: batch + description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. + required: false + uns: + - type: string + name: cell_type_names + description: Cell type names corresponding to values in `cell_type` + required: true + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true \ No newline at end of file diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml new file mode 100644 index 0000000..ecd447e --- /dev/null +++ b/src/api/file_solution.yaml @@ -0,0 +1,57 @@ +type: file +example: "resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/solution.h5ad" +info: + label: Solution + summary: "The spatial data file containing transcription profiles for each capture location, with true cell-type proportions for each spot / capture location." 
+ slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obsm: + - type: double + name: coordinates + description: XY coordinates for each spot + required: true + - type: double + name: proportions_true + description: True cell type proportions for each spot + required: true + uns: + - name: cell_type_names + type: string + description: Cell type names corresponding to columns of `proportions` + required: true + - name: dataset_id + type: string + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true \ No newline at end of file diff --git a/src/api/file_spatial_masked.yaml b/src/api/file_spatial_masked.yaml new file mode 100644 index 0000000..77632b5 --- /dev/null +++ b/src/api/file_spatial_masked.yaml @@ -0,0 +1,25 @@ +type: file +example: "resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/spatial_masked.h5ad" +info: + label: "Spatial masked" + summary: "The spatial data file containing transcription profiles for each capture location, without cell-type proportions for each spot." 
+ slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obsm: + - type: double + name: coordinates + description: XY coordinates for each spot + required: true + uns: + - type: string + name: cell_type_names + description: Cell type names corresponding to columns of `proportions_pred` in output + required: true + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true \ No newline at end of file diff --git a/src/api/task_info.yaml b/src/api/task_info.yaml new file mode 100644 index 0000000..0fa3e16 --- /dev/null +++ b/src/api/task_info.yaml @@ -0,0 +1,23 @@ +name: spatial_decomposition +label: "Spatial decomposition" +summary: "Estimation of cell type proportions per spot in 2D space from spatial transcriptomic data coupled with corresponding single-cell data" +motivation: | + Spatial decomposition (also often referred to as Spatial deconvolution) is applicable to spatial transcriptomics data where the transcription profile of each capture location (spot, voxel, bead, etc.) do not share a bijective relationship with the cells in the tissue, i.e., multiple cells may contribute to the same capture location. The task of spatial decomposition then refers to estimating the composition of cell types/states that are present at each capture location. The cell type/states estimates are presented as proportion values, representing the proportion of the cells at each capture location that belong to a given cell type. +description: | + We distinguish between _reference-based_ decomposition and _de novo_ decomposition, where the former leverage external data (e.g., scRNA-seq or scNuc-seq) to guide the inference process, while the latter only work with the spatial data. We require that all datasets have an associated reference single cell data set, but methods are free to ignore this information. 
+ Due to the lack of real datasets with the necessary ground-truth, this task makes use of a simulated dataset generated by creating cell-aggregates by sampling from a Dirichlet distribution. The ground-truth dataset consists of the spatial expression matrix, XY coordinates of the spots, true cell-type proportions for each spot, and the reference single-cell data (from which cell aggregated were simulated). +authors: + - name: "Giovanni Palla" + roles: [ author, maintainer ] + info: + github: giovp + - name: "Scott Gigante" + roles: [ author ] + info: + github: scottgigante + orcid: "0000-0002-4544-2764" + - name: "Sai Nirmayi Yasa" + roles: [ author ] + info: + github: sainirmayi + orcid: "0009-0003-6319-9803" \ No newline at end of file diff --git a/src/control_methods/random_proportions/config.vsh.yaml b/src/control_methods/random_proportions/config.vsh.yaml new file mode 100644 index 0000000..a0ccc70 --- /dev/null +++ b/src/control_methods/random_proportions/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml + +functionality: + name: random_proportions + info: + label: Random Proportions + summary: "Negative control method that randomly assigns celltype proportions from a Dirichlet distribution." + description: | + A negative control method with random assignment of predicted celltype proportions from a Dirichlet distribution. 
+ preferred_normalization: counts + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: numpy + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/control_methods/random_proportions/script.py b/src/control_methods/random_proportions/script.py new file mode 100644 index 0000000..17af41d --- /dev/null +++ b/src/control_methods/random_proportions/script.py @@ -0,0 +1,42 @@ +import anndata as ad +import numpy as np + +## VIASH START +par = { + 'input_single_cell': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad', + 'input_spatial_masked': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/spatial_masked.h5ad', + 'input_solution': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/solution.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'random_proportions' +} +## VIASH END + +print('Reading input files', flush=True) +input_single_cell = ad.read_h5ad(par['input_single_cell']) +input_spatial_masked = ad.read_h5ad(par['input_spatial_masked']) +input_solution = ad.read_h5ad(par['input_solution']) + +print('Generate predictions', flush=True) +label_distribution = input_single_cell.obs["cell_type"].value_counts() +input_spatial_masked.obsm["proportions_pred"] = np.random.dirichlet(label_distribution, size=input_spatial_masked.shape[0]) + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + obs=input_spatial_masked.obs[[]], + var=input_spatial_masked.var[[]], + uns={ + 'cell_type_names': input_spatial_masked.uns['cell_type_names'], + 'dataset_id': input_spatial_masked.uns['dataset_id'], + 'method_id': meta['functionality_name'] + }, + obsm={ + 'coordinates': input_spatial_masked.obsm['coordinates'], + 'proportions_pred': input_spatial_masked.obsm['proportions_pred'] + }, + layers={ + 'counts': 
input_spatial_masked.layers['counts'] + } +) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/control_methods/true_proportions/config.vsh.yaml b/src/control_methods/true_proportions/config.vsh.yaml new file mode 100644 index 0000000..7979f59 --- /dev/null +++ b/src/control_methods/true_proportions/config.vsh.yaml @@ -0,0 +1,22 @@ +__merge__: ../../api/comp_control_method.yaml + +functionality: + name: true_proportions + info: + label: True Proportions + summary: "Positive control method that assigns celltype proportions from the ground truth." + description: | + A positive control method with perfect assignment of predicted celltype proportions from the ground truth. + preferred_normalization: counts + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/control_methods/true_proportions/script.py b/src/control_methods/true_proportions/script.py new file mode 100644 index 0000000..e4c47e3 --- /dev/null +++ b/src/control_methods/true_proportions/script.py @@ -0,0 +1,40 @@ +import anndata as ad + +## VIASH START +par = { + 'input_single_cell': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/single_cell.h5ad', + 'input_spatial_masked': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/spatial_masked.h5ad', + 'input_solution': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/solution.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'true_proportions' +} +## VIASH END + +print('Reading input files', flush=True) +input_single_cell = ad.read_h5ad(par['input_single_cell']) +input_spatial_masked = ad.read_h5ad(par['input_spatial_masked']) +input_solution = ad.read_h5ad(par['input_solution']) + +print('Generate predictions', flush=True) +input_spatial_masked.obsm["proportions_pred"] = 
input_solution.obsm["proportions_true"] + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + obs=input_spatial_masked.obs[[]], + var=input_spatial_masked.var[[]], + uns={ + 'cell_type_names': input_spatial_masked.uns['cell_type_names'], + 'dataset_id': input_spatial_masked.uns['dataset_id'], + 'method_id': meta['functionality_name'] + }, + obsm={ + 'coordinates': input_spatial_masked.obsm['coordinates'], + 'proportions_pred': input_spatial_masked.obsm['proportions_pred'] + }, + layers={ + 'counts': input_spatial_masked.layers['counts'] + } +) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/dataset_simulator/config.vsh.yaml b/src/dataset_simulator/config.vsh.yaml new file mode 100644 index 0000000..af9b429 --- /dev/null +++ b/src/dataset_simulator/config.vsh.yaml @@ -0,0 +1,200 @@ +functionality: + name: "dataset_simulator" + info: + type: dataset_simulator + type_info: + label: Dataset simulator + summary: Simulate cell aggregates from single-cell data. + description: | + The dataset simulator creates cell-aggregates from the single-cell dataset by sampling from a Dirichlet distribution. The simulated data consists of the the spatial expression matrix, the XY coordinates of the spots, the cell-type proportions in each spot, and the reference single-cell data. + variants: + alpha_1: + alpha: 1 + alpha_5: + alpha: 5 + alpha_0_5: + alpha: 0.5 + arguments: + - name: "--input" + type: file + description: Single-cell reference dataset + direction: input + example: "resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad" + info: + slots: + layers: + - type: integer + name: counts + description: Raw counts. + required: true + obs: + - type: string + name: cell_type + description: Cell type label IDs. + required: true + - type: string + name: batch + description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. 
+ required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: false + - type: integer + name: hvg_score + description: A ranking of the features by hvg. + required: false + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - name: "--alpha" + type: double + description: Alpha value to use for generating synthetic dataset + default: 1.0 + - name: "--n_obs" + type: integer + description: Number of spatial observations to generate. Default value is 100. + default: 100 + - name: "--cell_lb" + type: integer + description: Lower bound for number of cells at each spot. Default value is 10. + default: 10 + - name: "--cell_ub" + type: integer + description: Upper bound for number of cells at each spot. Default value is 30. + default: 30 + - name: "--umi_lb" + type: integer + description: Lower bound for number of cells at each spot. Default value is 1000. + default: 1000 + - name: "--umi_ub" + type: integer + description: Upper bound for number of UMIs at each spot. Default value is 5000. 
+ default: 5000 + - name: "--simulated_data" + type: file + direction: output + description: Simulated dataset + required: false + example: dataset_simulated.h5ad + info: + slots: + layers: + - type: integer + name: counts + description: Raw counts. + required: true + obs: + - type: string + name: cell_type + description: Cell type label IDs. + required: true + - type: string + name: batch + description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: false + - type: integer + name: hvg_score + description: A ranking of the features by hvg. + required: false + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: false + - type: double + name: coordinates + description: XY coordinates for each spot. + required: true + - type: double + name: proportions_true + description: True cell type proportions for each spot. + required: true + uns: + - type: string + name: cell_type_names + description: Cell type names corresponding to values in `cell_type`. + required: true + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. 
+ required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + resources: + - type: python_script + path: script.py + test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/common/cxg_mouse_pancreas_atlas + dest: resources_test/common/cxg_mouse_pancreas_atlas +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: [numpy, scanpy] + - type: nextflow + directives: + label: [midtime, highmem, highcpu] + - type: native diff --git a/src/dataset_simulator/script.py b/src/dataset_simulator/script.py new file mode 100644 index 0000000..901d7de --- /dev/null +++ b/src/dataset_simulator/script.py @@ -0,0 +1,208 @@ +from typing import Sequence +from typing import Union + +import anndata as ad +import numpy as np +import scanpy as sc + +## VIASH START +par = { + "input": "resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad", + "alpha": 1, + "n_obs": 100, + "cell_lb": 10, + "cell_ub": 30, + "umi_lb": 1000, + "umi_ub": 5000, + "simulated_data": "dataset_simulated.h5ad" +} +meta = { + "functionality_name": "dataset_simulator", + "resources_dir": "src/tasks/spatial_decomposition/dataset_simulator", +} +## VIASH END + +CELLTYPE_MIN_CELLS = 25 + +# Reading input dataset +adata = ad.read_h5ad(par['input']) + + +def generate_synthetic_dataset( + adata: ad.AnnData, + alpha: Union[float, Sequence] = 1.0, + n_obs: int = 1000, + cell_lb: int = 10, + cell_ub: int = 30, + umi_lb: int = 1000, + umi_ub: int = 5000, +) -> ad.AnnData: + """Create cell-aggregate samples for ground-truth spatial decomposition task. + + Parameters + ---------- + adata: AnnData + Anndata object. + type_column: str + name of column in `adata.obs` where cell type labels are given + alpha: Union[float,Sequence] + alpha value in dirichlet distribution. 
If single number then all alpha_i values + will be set to this value. Default value is 1. + n_obs: int + number of spatial observations to generate. Default value is 1000. + cell_lb: int + lower bound for number of cells at each spot. Default value is 10. + cell_ub: int + upper bound for number of cells at each spot. Default value is 30. + umi_lb: int + lower bound for number of UMIs at each spot. Default value is 10. + umi_ub: int + upper bound for number of UMIs at each spot. Default value is 30. + + Returns + ------- + AnnData with: + - `adata_merged.X`: simulated counts (aggregate of sc dataset). + - `adata_merged.obsm["proportions_true"]`: true proportion values. + - `adata_merged.obsm["coordinates"]`: coordinates of each spot. + - `adata_merged.obsm["n_cells"]`: number of cells from each type at every location. + + """ + + # remove rare celltypes + adata = filter_celltypes(adata) + + # set random generator seed + rng = np.random.default_rng(42) + + # get single cell expression data + counts = adata.layers['counts'] + # get cell annotations/labels + labels = adata.obs['cell_type'].values + # get unique labels + uni_labs = np.unique(labels) + # count number of labels + n_labs = len(uni_labs) + # get number of genes + n_genes = adata.shape[1] + + # create dict with indices of each label + label_indices = dict() + for label in uni_labs: + label_indices[label] = np.where(labels == label)[0] + + # adjust alpha to vector if single scalar + if not hasattr(alpha, "__len__"): + alpha = np.ones(n_labs) * alpha + else: + assert len(alpha) == n_labs, "alpha must be same size as number of cell types" + + # generate probability of sampling label at each spot + sp_props = rng.dirichlet(alpha, size=n_obs) + # number of cells present at each spot + n_cells = rng.integers(cell_lb, cell_ub, size=n_obs) + + # initialize spatial expression matrix + sp_x = np.zeros((n_obs, n_genes)) + # initialize spatial proportion matrix + sp_p = np.zeros((n_obs, n_labs)) + # initialize spatial 
cell number matrix + sp_c = np.zeros(sp_p.shape) + + # generate expression vector for each spot (s) + for s in range(n_obs): + # number of cells from each label at s + raw_s = rng.multinomial(n_cells[s], pvals=sp_props[s, :]) + # store number of cells from each type at s + sp_c[s, :] = raw_s + # compute proportion of each type at s + prop_s = raw_s / n_cells[s] + # store proportion of each type at s + sp_p[s, :] = prop_s + + # initialize transcript pool at s + pool_s = np.zeros(n_genes) + + # add molecules to transcript pool + for lab, n in enumerate(raw_s): + # get indices of cells from which transcripts should be added + idx_sl = rng.choice(label_indices[uni_labs[lab]], size=n) + # add molecules to pool + pool_s += counts[idx_sl, :].sum(axis=0).A.flatten() + + # number of UMIs at spot s + n_umis = rng.integers(umi_lb, umi_ub) + # compute probability of sampling UMI from gene + prob_pool_s = pool_s / pool_s.sum() + + # sample transcripts from pool + sp_x[s, :] = np.random.multinomial(n=n_umis, pvals=prob_pool_s) + + obs_names = ["spatial_{}".format(x) for x in range(n_obs)] + adata_spatial = ad.AnnData( + sp_x, + obs=dict(obs_names=obs_names), + var=dict(var_names=adata.var_names), + ) + + # fake coordinates + adata_spatial.obsm["coordinates"] = rng.random((adata_spatial.shape[0], 2)) + adata_spatial.obsm["proportions_true"] = sp_p + adata_spatial.obs["n_cells"] = n_cells + adata_spatial.obsm["n_cells"] = sp_c + + adata_merged = ad.concat( + {"sc": adata, "sp": adata_spatial}, + label="modality", + join="outer", + index_unique=None, + merge="unique", + uns_merge="unique" + ) + adata_merged.X[adata_merged.X == np.inf] = adata_merged.X.max() # remove inf + adata_merged.layers["counts"] = adata_merged.X + adata_merged.uns["cell_type_names"] = uni_labs + return adata_merged + + +def filter_celltypes(adata, min_cells=CELLTYPE_MIN_CELLS): + """Filter rare celltypes from an AnnData""" + celltype_counts = adata.obs["cell_type"].value_counts() >= min_cells + keep_cells = 
np.isin(adata.obs["cell_type"], celltype_counts.index[celltype_counts]) + return adata[adata.obs.index[keep_cells]].copy() + + +def filter_genes_cells(adata): + """Remove empty cells and genes.""" + if "var_names_all" not in adata.uns: + # fill in original var names before filtering + adata.uns["var_names_all"] = adata.var.index.to_numpy() + sc.pp.filter_genes(adata, min_cells=1) + sc.pp.filter_cells(adata, min_counts=2) + + +adata.X = adata.layers["counts"] +sc.pp.filter_genes(adata, min_counts=10) +adata_merged = generate_synthetic_dataset(adata, + alpha=par['alpha'], + n_obs=par['n_obs'], + cell_lb=par['cell_lb'], + cell_ub=par['cell_ub'], + umi_lb=par['umi_lb'], + umi_ub=par['umi_ub'] +) +adata_merged.uns["spatial_data_summary"] = f"Dirichlet alpha={par['alpha']}" +filter_genes_cells(adata_merged) +adata_merged.X = None + +# Convert non-string objects to categoricals to avoid +# TypeError: Can't implicitly convert non-string objects to strings +# In this case, the error is raised when there are NA values in .obs columns with dtype object (boolean). +# The resulting anndata object cannot be written to a file. +# This conversion is handled in later versions of anndata (0.10) +for col in adata_merged.obs: + if adata_merged.obs[col].dtype == 'object': + adata_merged.obs[col] = adata_merged.obs[col].astype('category') + +print("Writing output to file") +adata_merged.write_h5ad(par["simulated_data"]) diff --git a/src/methods/cell2location/config.vsh.yaml b/src/methods/cell2location/config.vsh.yaml new file mode 100644 index 0000000..42b99b0 --- /dev/null +++ b/src/methods/cell2location/config.vsh.yaml @@ -0,0 +1,87 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: cell2location + + info: + label: Cell2Location + summary: "Cell2location uses a Bayesian model to resolve cell types in spatial transcriptomic data and create comprehensive cellular maps of diverse tissues." 
+ description: | + Cell2location is a decomposition method based on Negative Binomial regression that is able to account for batch effects in estimating the single-cell gene expression signature used for the spatial decomposition step. + Note that when batch information is unavailable for this task, we can use either a hard-coded reference, or a negative-binomial learned reference without batch labels. The parameter alpha refers to the detection efficiency prior. + preferred_normalization: counts + variants: + cell2location_amortised_detection_alpha_20: + detection_alpha: 20 + amortised: true + cell2location_detection_alpha_1: + detection_alpha: 1 + cell2location_detection_alpha_20: + detection_alpha: 20 + cell2location_detection_alpha_20_nb: + detection_alpha: 20 + hard_coded_reference: false + cell2location_detection_alpha_200: + detection_alpha: 200 + reference: "kleshchevnikov2022cell2location" + documentation_url: https://cell2location.readthedocs.io/en/latest/ + repository_url: https://github.com/BayraktarLab/cell2location + + # Component-specific parameters (optional) + arguments: + - name: "--detection_alpha" + type: double + default: 20.0 + description: Hyperparameter controlling normalisation of within-experiment variation in RNA detection. + - name: "--n_cells_per_location" + type: integer + default: 20 + description: The expected average cell abundance. It is a tissue-dependent hyper-prior which can be estimated from histology images + - name: "--hard_coded_reference" + type: boolean + default: true + description: Whether to use hard-coded reference or negative binomial regression model to account for batch effects. Hard-coded reference used by default. + - name: "--amortised" + type: boolean + default: false + description: Whether to use amortised inference. + - name: "--num_samples" + type: integer + default: 1000 + description: Number of samples to use for summarising posterior distribution. 
+ - name: "--sc_batch_size" + type: integer + default: 2500 + description: Batch size used to train regression model for estimation of reference single-cell gene expression signature. + - name: "--st_batch_size" + type: integer + description: Batch size used to train cell2location model for spatial mapping. + - name: "--max_epochs_sc" + type: integer + default: 250 + description: Maximum number of epochs to train regression model for estimation of reference single-cell gene expression signature. + - name: "--max_epochs_st" + type: integer + default: 30000 + description: Maximum number of epochs to train cell2location model for spatial mapping. + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: + - scvi-tools==1.0.4 + - cell2location + - jax==0.4.23 + - jaxlib==0.4.23 + - scipy<1.13 # The scipy.linalg functions tri, triu & tril are deprecated and will be removed in SciPy 1.13. 
+ + - type: native + - type: nextflow + directives: + label: [hightime, midmem, midcpu] diff --git a/src/methods/cell2location/script.py b/src/methods/cell2location/script.py new file mode 100644 index 0000000..3d47991 --- /dev/null +++ b/src/methods/cell2location/script.py @@ -0,0 +1,152 @@ +import anndata as ad +import numpy as np +from cell2location.cluster_averages.cluster_averages import compute_cluster_averages +from cell2location.models import Cell2location +from cell2location.models import RegressionModel +from torch.nn import ELU + +## VIASH START +par = { + 'input_single_cell': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad', + 'input_spatial_masked': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/spatial_masked.h5ad', + 'output': 'output.h5ad', + 'detection_alpha': 20.0, + 'n_cells_per_location': 20, + 'hard_coded_reference': True, + 'amortised': False, + 'num_samples': 1000, + 'sc_batch_size': 2500, + 'st_batch_size': None, + 'max_epochs_sc': 250, + 'max_epochs_st': 5000 +} +meta = { + 'functionality_name': 'cell2location' +} +## VIASH END + +print('Reading input files', flush=True) +input_single_cell = ad.read_h5ad(par['input_single_cell']) +input_spatial = ad.read_h5ad(par['input_spatial_masked']) + +input_single_cell.X = input_single_cell.layers["counts"] +input_spatial.X = input_spatial.layers["counts"] + +if not par["hard_coded_reference"]: + if "batch" in input_single_cell.obs.columns: + input_single_cell.obs["batch_key"] = input_single_cell.obs["batch"].copy() + else: + input_single_cell.obs["batch_key"] = "all" + # REFERENCE SIGNATURE ESTIMATION FROM scRNA + # prepare anndata for the regression model + RegressionModel.setup_anndata( + adata=input_single_cell, + # 10X reaction / sample / batch + batch_key="batch_key", + # cell type, covariate used for constructing signatures + labels_key="cell_type", + ) + sc_model = RegressionModel(input_single_cell) + 
sc_model.train(max_epochs=par["max_epochs_sc"], batch_size=par["sc_batch_size"]) + # In this section, we export the estimated cell abundance + # (summary of the posterior distribution). + input_single_cell = sc_model.export_posterior( + input_single_cell, + sample_kwargs={"num_samples": par["num_samples"], "batch_size": par["sc_batch_size"]}, + ) + # export estimated expression in each cluster + try: + means_per_cluster = input_single_cell.varm["means_per_cluster_mu_fg"] + except KeyError: + # sometimes varm fails for unknown reason + means_per_cluster = input_single_cell.var + means_per_cluster = means_per_cluster[ + [ + f"means_per_cluster_mu_fg_{i}" + for i in input_single_cell.uns["mod"]["factor_names"] + ] + ].copy() + means_per_cluster.columns = input_single_cell.uns["mod"]["factor_names"] +else: + means_per_cluster = compute_cluster_averages( + input_single_cell, + labels="cell_type", + layer=None, + use_raw=False, + ) + +# SPATIAL MAPPING +# find shared genes and subset both anndata and reference signatures +intersect = np.intersect1d(input_spatial.var_names, means_per_cluster.index) +input_spatial = input_spatial[:, intersect].copy() +means_per_cluster = means_per_cluster.loc[intersect, :].copy() + +# prepare anndata for cell2location model +input_spatial.obs["sample"] = "all" +Cell2location.setup_anndata(adata=input_spatial, batch_key="sample") +cell2location_kwargs = dict( + cell_state_df=means_per_cluster, + # the expected average cell abundance: tissue-dependent hyper-prior which can be estimated from paired histology: + # here = average in the simulated dataset + N_cells_per_location=par["n_cells_per_location"], + # hyperparameter controlling normalisation of within-experiment variation in RNA detection: + detection_alpha=par["detection_alpha"], +) +if par["amortised"]: + cell2location_kwargs["amortised"] = True + cell2location_kwargs["encoder_mode"] = "multiple" + cell2location_kwargs["encoder_kwargs"] = { + "dropout_rate": 0.1, + "n_hidden": { + 
"single": 256, + "n_s_cells_per_location": 10, + "b_s_groups_per_location": 10, + "z_sr_groups_factors": 64, + "w_sf": 256, + "detection_y_s": 20, + }, + "use_batch_norm": False, + "use_layer_norm": True, + "n_layers": 1, + "activation_fn": ELU, + } +# create and train the model +st_model = Cell2location(input_spatial, **cell2location_kwargs) +st_model.train( + max_epochs=par["max_epochs_st"], + # train using full data (batch_size=None) + batch_size=par["st_batch_size"], + # use all data points in training because we need to estimate cell abundance at all locations + train_size=1, +) +# In this section, we export the estimated cell abundance (summary of the posterior distribution). +input_spatial = st_model.export_posterior( + input_spatial, + sample_kwargs={ + "num_samples": par["num_samples"], + "batch_size": par["st_batch_size"], + }, +) + +input_spatial.obsm["proportions_pred"] = input_spatial.obsm["q05_cell_abundance_w_sf"].values +input_spatial.obsm["proportions_pred"] /= input_spatial.obsm["proportions_pred"].sum(axis=1)[:, None] + +output = ad.AnnData( + obs=input_spatial.obs[[]], + var=input_spatial.var[[]], + obsm={ + 'coordinates': input_spatial.obsm['coordinates'], + 'proportions_pred': input_spatial.obsm['proportions_pred'] + }, + layers={ + 'counts': input_spatial.layers['counts'] + }, + uns={ + 'cell_type_names': input_spatial.uns['cell_type_names'], + 'dataset_id': input_spatial.uns['dataset_id'], + 'method_id': meta['functionality_name'] + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/methods/destvi/config.vsh.yaml b/src/methods/destvi/config.vsh.yaml new file mode 100644 index 0000000..a415bd7 --- /dev/null +++ b/src/methods/destvi/config.vsh.yaml @@ -0,0 +1,43 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: destvi + + info: + label: DestVI + summary: "DestVI is a probabilistic method for multi-resolution analysis for spatial 
transcriptomics that explicitly models continuous variation within cell types" + description: | + Deconvolution of Spatial Transcriptomics profiles using Variational Inference (DestVI) is a spatial decomposition method that leverages a conditional generative model of spatial transcriptomics down to the sub-cell-type variation level, which is then used to decompose the cell-type proportions determining the spatial organization of a tissue. + preferred_normalization: counts + reference: "lopez2022destvi" + documentation_url: https://docs.scvi-tools.org/en/stable/user_guide/models/destvi.html + repository_url: https://github.com/scverse/scvi-tools + + arguments: + - name: "--max_epochs_sc" + type: integer + default: 500 + description: Number of epochs to train the Conditional version of single-cell Variational Inference (CondSCVI) model using MAP inference. + - name: "--max_epochs_sp" + type: integer + default: 2000 + description: Number of epochs to train the DestVI model using MAP inference. 
+ + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 + setup: + - type: python + packages: + - scvi-tools>=1.1.0 + - type: docker + run: | + pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + - type: native + - type: nextflow + directives: + label: [hightime, midmem, midcpu, gpu] diff --git a/src/methods/destvi/script.py b/src/methods/destvi/script.py new file mode 100644 index 0000000..4682e74 --- /dev/null +++ b/src/methods/destvi/script.py @@ -0,0 +1,62 @@ +import anndata as ad +from scvi.model import CondSCVI +from scvi.model import DestVI + +## VIASH START +par = { + 'input_single_cell': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad', + 'input_spatial_masked': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/spatial_masked.h5ad', + 'output': 'output.h5ad', + 'max_epochs_sc': 500, + 'max_epochs_sp': 5000 +} +meta = { + 'functionality_name': 'destvi' +} +## VIASH END + +print('Reading input files', flush=True) +input_single_cell = ad.read_h5ad(par['input_single_cell']) +input_spatial = ad.read_h5ad(par['input_spatial_masked']) + +input_single_cell.X = input_single_cell.layers["counts"] +input_spatial.X = input_spatial.layers["counts"] + +CondSCVI.setup_anndata(input_single_cell, labels_key="cell_type") +sc_model = CondSCVI(input_single_cell, weight_obs=False) +sc_model.train( + max_epochs=par['max_epochs_sc'], + early_stopping=True, + train_size=0.9, + validation_size=0.1, + early_stopping_monitor="elbo_validation", +) + +DestVI.setup_anndata(input_spatial) +st_model = DestVI.from_rna_model(input_spatial, sc_model) +st_model.train( + max_epochs=par['max_epochs_sp'], + batch_size=min(int(input_spatial.n_obs / 20 + 3), 128), + plan_kwargs={"min_kl_weight": 3.0, "max_kl_weight": 3}, +) +input_spatial.obsm["proportions_pred"] = st_model.get_proportions().to_numpy() + 
+output = ad.AnnData( + obs=input_spatial.obs[[]], + var=input_spatial.var[[]], + uns={ + 'cell_type_names': input_spatial.uns['cell_type_names'], + 'dataset_id': input_spatial.uns['dataset_id'], + 'method_id': meta['functionality_name'] + }, + obsm={ + 'coordinates': input_spatial.obsm['coordinates'], + 'proportions_pred': input_spatial.obsm['proportions_pred'] + }, + layers={ + 'counts': input_spatial.layers['counts'] + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/methods/nmfreg/config.vsh.yaml b/src/methods/nmfreg/config.vsh.yaml new file mode 100644 index 0000000..97320cb --- /dev/null +++ b/src/methods/nmfreg/config.vsh.yaml @@ -0,0 +1,37 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: nmfreg + info: + label: NMFreg + summary: "NMFreg reconstructs gene expression as a weighted combination of cell type signatures defined by scRNA-seq." + description: | + Non-Negative Matrix Factorization regression (NMFreg) is a decomposition method that reconstructs expression of each spatial location as a weighted combination of cell-type signatures defined by scRNA-seq. It was originally developed for Slide-seq data. This is a re-implementation from https://github.com/tudaga/NMFreg_tutorial. + preferred_normalization: counts + reference: "rodriques2019slide" + documentation_url: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html + repository_url: https://github.com/tudaga/NMFreg_tutorial/tree/master?tab=readme-ov-file + + arguments: + - name: "--n_components" + type: integer + default: 30 + description: Number of components to use for non-negative matrix factorization. 
+ + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: + - numpy + - scipy + - scikit-learn + - type: native + - type: nextflow + directives: + label: [midtime, highmem, midcpu] diff --git a/src/methods/nmfreg/script.py b/src/methods/nmfreg/script.py new file mode 100644 index 0000000..1cc0fd1 --- /dev/null +++ b/src/methods/nmfreg/script.py @@ -0,0 +1,91 @@ +import anndata as ad +import numpy as np +from scipy.optimize import nnls +from scipy.sparse import issparse +from sklearn.decomposition import NMF +from sklearn.preprocessing import StandardScaler + +## VIASH START +par = { + 'input_single_cell': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad', + 'input_spatial_masked': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/spatial_masked.h5ad', + 'output': 'output.h5ad', + 'n_components': 30 +} +meta = { + 'functionality_name': 'nmfreg' +} +## VIASH END + +print('Reading input files', flush=True) +input_single_cell = ad.read_h5ad(par['input_single_cell']) +input_spatial = ad.read_h5ad(par['input_spatial_masked']) + +n_types = input_single_cell.obs["cell_type"].cat.categories.shape[0] + +# Learn from reference +X = input_single_cell.layers['counts'] +X_norm = X / X.sum(1) +X_scaled = StandardScaler(with_mean=False).fit_transform(X_norm) +model = NMF( + n_components=par['n_components'], + init="random", + random_state=42 +) +Ha = model.fit_transform(X_scaled) +Wa = model.components_ + +cluster_df = input_single_cell.obs[["cell_type"]].copy() +cluster_df.loc[:, "factor"] = np.argmax(Ha, axis=1) +cluster_df.loc[:, "code"] = cluster_df.cell_type.values.codes +factor_to_cluster_map = np.array( + [ + np.histogram( + cluster_df.loc[cluster_df.factor == k, "code"], + bins=n_types, + range=(0, n_types), + )[0] + for k in range(par['n_components']) + ] +).T + +factor_to_best_celltype = 
np.argmax(factor_to_cluster_map, axis=0) + +factor_to_best_celltype_matrix = np.zeros((par['n_components'], n_types)) +for i, j in enumerate(factor_to_best_celltype): + factor_to_best_celltype_matrix[i, j] = 1 + +Ha_norm = StandardScaler(with_mean=False).fit_transform(Ha) +sc_deconv = np.dot(Ha_norm**2, factor_to_best_celltype_matrix) +sc_deconv = sc_deconv / sc_deconv.sum(1)[:, np.newaxis] + +# Start run on actual spatial data +X_sp = input_spatial.layers['counts'] +X_sp_norm = X_sp / X_sp.sum(1) +X_sp_scaled = StandardScaler(with_mean=False).fit_transform(X_sp_norm) + +bead_prop_soln = np.array([nnls(Wa.T, X_sp_scaled[b, : ].toarray().reshape(-1))[0] for b in range(X_sp_scaled.shape[0])]) +bead_prop_soln = StandardScaler(with_mean=False).fit_transform(bead_prop_soln) +bead_prop = np.dot(bead_prop_soln, factor_to_best_celltype_matrix) + +prop = bead_prop / bead_prop.sum(1)[:, np.newaxis] +input_spatial.obsm["proportions_pred"] = prop + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + obs=input_spatial.obs[[]], + var=input_spatial.var[[]], + uns={ + 'cell_type_names': input_spatial.uns['cell_type_names'], + 'dataset_id': input_spatial.uns['dataset_id'], + 'method_id': meta['functionality_name'] + }, + obsm={ + 'coordinates': input_spatial.obsm['coordinates'], + 'proportions_pred': input_spatial.obsm['proportions_pred'] + }, + layers={ + 'counts': input_spatial.layers['counts'] + } +) +output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/methods/nnls/config.vsh.yaml b/src/methods/nnls/config.vsh.yaml new file mode 100644 index 0000000..537c714 --- /dev/null +++ b/src/methods/nnls/config.vsh.yaml @@ -0,0 +1,30 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: nnls + info: + label: NNLS + summary: "NNLS is a decomposition method based on Non-Negative Least Square Regression." 
+ description: | + NonNegative Least Squares (NNLS), is a convex optimization problem with convex constraints. It was used by the AutoGeneS method to infer cellular proporrtions by solvong a multi-objective optimization problem. + preferred_normalization: counts + reference: "aliee2021autogenes" + documentation_url: https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.nnls.html + repository_url: https://github.com/scipy/scipy + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: + - numpy + - scipy + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/methods/nnls/script.py b/src/methods/nnls/script.py new file mode 100644 index 0000000..069ba57 --- /dev/null +++ b/src/methods/nnls/script.py @@ -0,0 +1,63 @@ +import anndata as ad +import numpy as np +from scipy.optimize import nnls +from scipy.sparse import issparse + +## VIASH START +par = { + 'input_single_cell': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad', + 'input_spatial_masked': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/spatial_masked.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'nnls' +} +## VIASH END + +print('Reading input files', flush=True) +input_single_cell = ad.read_h5ad(par['input_single_cell']) +input_spatial = ad.read_h5ad(par['input_spatial_masked']) + +# Compute means over each 'cell_type' +labels = input_single_cell.obs['cell_type'].cat.categories +n_var = input_single_cell.shape[1] +means = np.empty((labels.shape[0], n_var)) +for i, lab in enumerate(labels): + adata_lab = input_single_cell[input_single_cell.obs['cell_type'] == lab] + x_lab = adata_lab.layers['counts'] + means[i, :] = x_lab.mean(axis=0).flatten() +adata_means = ad.AnnData(means) +adata_means.obs_names = labels +adata_means.var_names = 
input_single_cell.var_names + +X = adata_means.X.T +y = input_spatial.layers['counts'].T +res = np.zeros((y.shape[1], X.shape[1])) # (voxels, cells) +for i in range(y.shape[1]): + x, _ = nnls(X, y[:, i].toarray().reshape(-1)) + res[i] = x + +# Normalize coefficients to sum to 1 +res[res < 0] = 0 +res = res / res.sum(axis=1, keepdims=1) + +input_spatial.obsm["proportions_pred"] = res + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + obs=input_spatial.obs[[]], + var=input_spatial.var[[]], + uns={ + 'cell_type_names': input_spatial.uns['cell_type_names'], + 'dataset_id': input_spatial.uns['dataset_id'], + 'method_id': meta['functionality_name'] + }, + obsm={ + 'coordinates': input_spatial.obsm['coordinates'], + 'proportions_pred': input_spatial.obsm['proportions_pred'] + }, + layers={ + 'counts': input_spatial.layers['counts'] + } +) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/methods/rctd/config.vsh.yaml b/src/methods/rctd/config.vsh.yaml new file mode 100644 index 0000000..2d2f82c --- /dev/null +++ b/src/methods/rctd/config.vsh.yaml @@ -0,0 +1,39 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: rctd + info: + label: RCTD + summary: "RCTD learns cell type profiles from scRNA-seq to decompose cell type mixtures while correcting for differences across sequencing technologies." + description: | + RCTD (Robust Cell Type Decomposition) is a decomposition method that uses signatures learnt from single-cell data to decompose spatial expression of tissues. It is able to use a platform effect normalization step, which normalizes the scRNA-seq cell type profiles to match the platform effects of the spatial transcriptomics dataset. 
+ preferred_normalization: counts + reference: cable2021robust + documentation_url: https://raw.githack.com/dmcable/spacexr/master/vignettes/spatial-transcriptomics.html + repository_url: https://github.com/dmcable/spacexr + + arguments: + - name: "--fc_cutoff" + type: double + default: 0.5 + description: Minimum log-fold-change (across cell types) for genes to be included in the platform effect normalization step. + - name: "--fc_cutoff_reg" + type: double + default: 0.75 + description: Minimum log-fold-change (across cell types) for genes to be included in the RCTD step. + resources: + - type: r_script + path: script.R + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_r:1.0.4 + setup: + - type: r + cran: [ Matrix, pak ] + - type: r + script: 'pak::pkg_install("dmcable/spacexr")' + - type: native + - type: nextflow + directives: + label: [midtime, highmem, midcpu] diff --git a/src/methods/rctd/script.R b/src/methods/rctd/script.R new file mode 100644 index 0000000..f5878ae --- /dev/null +++ b/src/methods/rctd/script.R @@ -0,0 +1,94 @@ +library(anndata) +library(spacexr) +library(Matrix) + +## VIASH START +par <- list( + input_single_cell = "resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad", + input_spatial_masked = "resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/spatial_masked.h5ad", + output = "output.h5ad", + fc_cutoff = 0.5, + fc_cutoff_reg = 0.75 +) +meta <- list( + functionality_name = "rctd", + cpus = 1 +) +## VIASH END + +cat("Reading input files\n") +input_single_cell <- anndata::read_h5ad(par$input_single_cell) +input_spatial <- anndata::read_h5ad(par$input_spatial) + +# set spatial coordinates for the single cell data +coordinates <- matrix(1, dim(input_single_cell)[1], 2) +rownames(coordinates) <- rownames(input_single_cell) +input_single_cell$obsm <- list(coordinates = coordinates) + +# remove rare cell types to prevent RCTD error +# celltype_counts <- 
table(input_single_cell$obs$cell_type) +# input_single_cell <- input_single_cell[input_single_cell$obs$cell_type %in% names(as.table(celltype_counts[celltype_counts > 25]))] + +# get single cell reference counts +sc_counts <- t(input_single_cell$layers['counts']) +# get single cell reference labels +sc_cell_types <- factor(input_single_cell$obs$cell_type) +names(sc_cell_types) <- rownames(input_single_cell) +# construct reference object (specific for RCTD) +reference <- Reference(sc_counts, sc_cell_types) + +# get spatial data counts +sp_counts <- t(input_spatial$layers['counts']) +# get spatial data coordinates +sp_coords <- as.data.frame(input_spatial$obsm['coordinates']) +colnames(sp_coords) <- c("x", "y") +rownames(sp_coords) <- rownames(input_spatial) +# create spatial object to use in RCTD +puck <- SpatialRNA(sp_coords, sp_counts) + +# create RCTD object from reference and spatialRNA objects +if (!is.null(meta$cpus)) { +max_cores <- meta$cpus +} else { +max_cores <- 1 +} +rctd <- create.RCTD( + puck, + reference, + max_cores = max_cores, + fc_cutoff = par$fc_cutoff, + fc_cutoff_reg = par$fc_cutoff_reg, + test_mode = FALSE, + UMI_min_sigma = 100, + CELL_MIN_INSTANCE = 1 +) + +# run analysis and get results +rctd <- run.RCTD(rctd) +results <- rctd@results +cell_type_names <- rctd@cell_type_info$info[[2]] + +# extract proportions and normalize them (to sum to one) +norm_weights <- sweep(results$weights, 1, rowSums(results$weights), "/") +norm_weights <- as.matrix(norm_weights) +coordinates <- as.matrix(sp_coords) + +cat("Write output AnnData to file\n") +output <- anndata::AnnData( + shape = input_spatial$shape, + obs = input_spatial$obs, + var = input_spatial$var, + uns = list( + cell_type_names = input_spatial$uns['cell_type_names'], + dataset_id = input_spatial$uns[["dataset_id"]], + method_id = meta[["functionality_name"]] + ), + obsm = list( + coordinates = coordinates, + proportions_pred = norm_weights + ), + layers = list( + counts = 
input_spatial$layers['counts'] + ) +) +output$write_h5ad(par[["output"]], compression = "gzip") diff --git a/src/methods/seurat/config.vsh.yaml b/src/methods/seurat/config.vsh.yaml new file mode 100644 index 0000000..c82a1c9 --- /dev/null +++ b/src/methods/seurat/config.vsh.yaml @@ -0,0 +1,39 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: seurat + info: + label: Seurat + summary: "Seurat method that is based on Canonical Correlation Analysis (CCA)." + description: | + This method applies the 'anchor'-based integration workflow introduced in Seurat v3, that enables the probabilistic transfer of annotations from a reference to a query set. First, mutual nearest neighbors (anchors) are identified from the reference scRNA-seq and query spatial datasets. Then, annotations are transfered from the single cell reference data to the sptial data along with prediction scores for each spot. + preferred_normalization: counts + reference: stuart2019comprehensive + documentation_url: https://satijalab.org/seurat/articles/spatial_vignette + repository_url: https://github.com/satijalab/seurat + + arguments: + - name: "--n_pcs" + type: integer + default: 30 + description: Number of principal components. + - name: "--sctransform_n_cells" + type: integer + default: 5000 + description: Number of cells sampled to build NB regression. 
+ + resources: + - type: r_script + path: script.R + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_r:1.0.4 + setup: + - type: r + cran: [Matrix, Seurat] + + - type: native + - type: nextflow + directives: + label: [midtime, highmem, midcpu] diff --git a/src/methods/seurat/script.R b/src/methods/seurat/script.R new file mode 100644 index 0000000..77917dd --- /dev/null +++ b/src/methods/seurat/script.R @@ -0,0 +1,99 @@ +library(anndata) +library(Seurat) + +## VIASH START +par <- list( + input_single_cell = "resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad", + input_spatial_masked = "resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/spatial_masked.h5ad", + output = "output.h5ad", + n_pcs = 30, + sctransform_n_cells = 500 +) +meta <- list( + functionality_name = "seurat" +) +## VIASH END + +cat("Reading input files\n") +input_single_cell <- anndata::read_h5ad(par$input_single_cell) +input_spatial <- anndata::read_h5ad(par$input_spatial) + +cat(">> Converting AnnData to Seurat\n") +anndataToSeurat <- function(adata, assay) { + obj <- SeuratObject::CreateSeuratObject(counts = as(Matrix::t(adata$layers[["counts"]]), "CsparseMatrix"), assay = assay) + obj <- SeuratObject::AddMetaData(object = obj, metadata = adata$obs) + obj +} + +seurat_sc <- anndataToSeurat(input_single_cell, "RNA") +seurat_sp <- anndataToSeurat(input_spatial, "spatial") + +cat(">> Generate predictions\n") + +# Normalize and do dimred for spatial data +seurat_sp <- SCTransform( + seurat_sp, + assay = "spatial", + ncells = min(par$sctransform_n_cells, nrow(seurat_sp)), + verbose = TRUE, + conserve.memory = TRUE +) + +seurat_sp <- RunPCA(seurat_sp, assay = "SCT", verbose = FALSE, n_pcs = par$n_pcs) + +# Normalize and do dimred for single cell data +seurat_sc <- SCTransform( + seurat_sc, + assay = "RNA", + ncells = min(par$sctransform_n_cells, nrow(seurat_sc)), + verbose = TRUE, + conserve.memory = TRUE +) + +seurat_sc <- RunPCA(seurat_sc, 
verbose = FALSE, npcs = par$n_pcs) + +# find anchors (MNN's to compute adjustment vectors)
It is similar in scope and implementation to cell2location but less flexible to incorporate additional covariates such as batch effects and other type of experimental design annotations. + preferred_normalization: counts + reference: andersson2020single + documentation_url: https://docs.scvi-tools.org/en/stable/user_guide/models/stereoscope.html + repository_url: https://github.com/scverse/scvi-tools + + arguments: + - name: "--max_epochs_sc" + type: integer + default: 100 + description: Number of of epochs to train RNAStereoscope model. + - name: "--max_epochs_sp" + type: integer + default: 1000 + description: Number of of epochs to train SpatialStereoscope model. + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 + setup: + - type: python + packages: + - scvi-tools>=1.1.0 + - type: docker + run: | + pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + - type: native + - type: nextflow + directives: + label: [hightime, midmem, midcpu, gpu] diff --git a/src/methods/stereoscope/script.py b/src/methods/stereoscope/script.py new file mode 100644 index 0000000..e69bb5f --- /dev/null +++ b/src/methods/stereoscope/script.py @@ -0,0 +1,61 @@ +import anndata as ad +from scvi.external import RNAStereoscope +from scvi.external import SpatialStereoscope + +## VIASH START +par = { + 'input_single_cell': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad', + 'input_spatial_masked': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/spatial_masked.h5ad', + 'output': 'output.h5ad', + 'max_epochs_sc': 100, + 'max_epochs_sp': 1000 +} +meta = { + 'functionality_name': 'stereoscope' +} +## VIASH END + +print('Reading input files', flush=True) +input_single_cell = ad.read_h5ad(par['input_single_cell']) +input_spatial = ad.read_h5ad(par['input_spatial_masked']) + +input_single_cell.X = 
input_single_cell.layers["counts"] +input_spatial.X = input_spatial.layers["counts"] + +print('Generate predictions', flush=True) + +RNAStereoscope.setup_anndata(input_single_cell, labels_key="cell_type") +sc_model = RNAStereoscope(input_single_cell) +sc_model.train( + max_epochs=par["max_epochs_sc"], + # early_stopping=True, + # early_stopping_monitor="elbo_validation" +) + +SpatialStereoscope.setup_anndata(input_spatial) +st_model = SpatialStereoscope.from_rna_model(input_spatial, sc_model) +st_model.train( + max_epochs=par["max_epochs_sp"], + # early_stopping=True, + # early_stopping_monitor="elbo_validation" +) +input_spatial.obsm["proportions_pred"] = st_model.get_proportions().to_numpy() + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + obs=input_spatial.obs[[]], + var=input_spatial.var[[]], + obsm={ + 'coordinates': input_spatial.obsm['coordinates'], + 'proportions_pred': input_spatial.obsm['proportions_pred'] + }, + layers={ + 'counts': input_spatial.layers['counts'] + }, + uns={ + 'cell_type_names': input_spatial.uns['cell_type_names'], + 'dataset_id': input_spatial.uns['dataset_id'], + 'method_id': meta['functionality_name'] + } +) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/methods/tangram/config.vsh.yaml b/src/methods/tangram/config.vsh.yaml new file mode 100644 index 0000000..e1320e7 --- /dev/null +++ b/src/methods/tangram/config.vsh.yaml @@ -0,0 +1,38 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: tangram + info: + label: Tangram + summary: "Tanagram maps single-cell gene expression data onto spatial gene expression data by fitting gene expression on shared genes" + description: | + Tangram is a method to map gene expression signatures from scRNA-seq data to spatial data. It performs the cell type mapping by learning a similarity matrix between single-cell and spatial locations based on gene expression profiles. 
+ preferred_normalization: counts + reference: biancalani2021deep + documentation_url: https://tangram-sc.readthedocs.io/en/latest/index.html + repository_url: https://github.com/broadinstitute/Tangram + + arguments: + - name: "--num_epochs" + type: integer + default: 1000 + description: Number of epochs to use while mapping single cells to spatial locations. + - name: "--n_markers" + type: integer + default: 100 + description: Number of marker genes to use. + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: tangram-sc + - type: native + - type: nextflow + directives: + label: [midtime,midmem, midcpu] diff --git a/src/methods/tangram/script.py b/src/methods/tangram/script.py new file mode 100644 index 0000000..544664f --- /dev/null +++ b/src/methods/tangram/script.py @@ -0,0 +1,84 @@ +import anndata as ad +import pandas as pd +import scanpy as sc +import tangram as tg +import torch + +## VIASH START +par = { + 'input_single_cell': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad', + 'input_spatial_masked': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/spatial_masked.h5ad', + 'output': 'output.h5ad', + 'num_epochs': 1000, + 'n_markers': 100 +} +meta = { + 'functionality_name': 'tangram' +} +## VIASH END + +print('Reading input files', flush=True) +input_single_cell = ad.read_h5ad(par['input_single_cell']) +input_spatial = ad.read_h5ad(par['input_spatial_masked']) + +print('Generate predictions', flush=True) +# analysis based on github.com/broadinstitute/Tangram/blob/master/tutorial_tangram_with_squidpy.ipynb +# using tangram from PyPi, not github version + +input_single_cell.X = input_single_cell.layers["counts"] +input_spatial.X = input_spatial.layers["counts"] + +# pre-process single cell data +sc.pp.normalize_total(input_single_cell, 1e4) +sc.pp.log1p(input_single_cell) +# identify 
marker genes +sc.tl.rank_genes_groups(input_single_cell, groupby="cell_type", use_raw=False) + +# extract marker genes to data frame +markers_df = pd.DataFrame(input_single_cell.uns["rank_genes_groups"]["names"]).iloc[0:par['n_markers'], :] + +# get union of all marker genes +markers = list(set(markers_df.melt().value.values)) + +# match genes between single cell and spatial data +tg.pp_adatas(input_single_cell, input_spatial, genes=markers) + +# get device +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# map single cells to spatial locations +ad_map = tg.map_cells_to_space( + input_single_cell, + input_spatial, + device=device, + num_epochs=par['num_epochs'], +) + +# transfer labels from mapped cells to spatial location +tg.project_cell_annotations(adata_map=ad_map, adata_sp=input_spatial, annotation="cell_type") + +# normalize scores +pred_props = input_spatial.obsm["tangram_ct_pred"].to_numpy() +input_spatial.obsm["proportions_pred"] = pred_props / pred_props.sum(axis=1)[:, None] + +# remove un-normalized predictions +del input_spatial.obsm["tangram_ct_pred"] + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + obs=input_spatial.obs[[]], + var=input_spatial.var[[]], + obsm={ + 'coordinates': input_spatial.obsm['coordinates'], + 'proportions_pred': input_spatial.obsm['proportions_pred'] + }, + layers={ + 'counts': input_spatial.layers['counts'] + }, + uns={ + 'cell_type_names': input_spatial.uns['cell_type_names'], + 'dataset_id': input_spatial.uns['dataset_id'], + 'method_id': meta['functionality_name'] + } +) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/methods/vanillanmf/config.vsh.yaml b/src/methods/vanillanmf/config.vsh.yaml new file mode 100644 index 0000000..7341cee --- /dev/null +++ b/src/methods/vanillanmf/config.vsh.yaml @@ -0,0 +1,37 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: vanillanmf + info: + label: NMF + summary: "NMF reconstructs gene expression 
as a weighted combination of cell type signatures defined by scRNA-seq." + description: | + NMF is a decomposition method based on Non-negative Matrix Factorization (NMF) that reconstructs expression of each spatial location as a weighted combination of cell-type signatures defined by scRNA-seq. It is a simpler baseline than NMFreg as it only performs the NMF step based on mean expression signatures of cell types, returning the weights loading of the NMF as (normalized) cell type proportions, without the regression step. + preferred_normalization: counts + reference: cichocki2009fast + documentation_url: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html + repository_url: https://github.com/scikit-learn/scikit-learn/blob/92c9b1866/sklearn/decomposition/ + + arguments: + - name: "--max_iter" + type: integer + default: 4000 + description: Maximum number of iterations before timing out. + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: + - numpy + - scipy + - scikit-learn + - type: native + - type: nextflow + directives: + label: [midtime,midmem, midcpu] diff --git a/src/methods/vanillanmf/script.py b/src/methods/vanillanmf/script.py new file mode 100644 index 0000000..ff55079 --- /dev/null +++ b/src/methods/vanillanmf/script.py @@ -0,0 +1,77 @@ +import anndata as ad +import numpy as np +from scipy.sparse import issparse +from sklearn.decomposition import NMF + +## VIASH START +par = { + 'input_single_cell': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad', + 'input_spatial_masked': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/spatial_masked.h5ad', + 'output': 'output.h5ad', + 'max_iter': 4000 +} +meta = { + 'functionality_name': 'vanillanmf' +} +## VIASH END + +print('Reading input files', flush=True) +input_single_cell = 
ad.read_h5ad(par['input_single_cell']) +input_spatial = ad.read_h5ad(par['input_spatial_masked']) + +print('Generate predictions', flush=True) + +n_types = input_single_cell.obs["cell_type"].cat.categories.shape[0] +vanila_nmf_model = NMF( + n_components=n_types, + beta_loss="kullback-leibler", + solver="mu", + max_iter=par['max_iter'], + alpha_W=0.1, + alpha_H=0.1, + init="custom", + random_state=42, +) + +# Make profiles from single-cell expression dataset +# Compute means over each 'cell_type' +labels = input_single_cell.obs['cell_type'].cat.categories +n_var = input_single_cell.shape[1] +means = np.empty((labels.shape[0], n_var)) +for i, lab in enumerate(labels): + adata_lab = input_single_cell[input_single_cell.obs['cell_type'] == lab] + x_lab = adata_lab.layers['counts'] + means[i, :] = x_lab.mean(axis=0).flatten() +adata_means = ad.AnnData(means) +adata_means.obs_names = labels +adata_means.var_names = input_single_cell.var_names + +X = input_spatial.layers['counts'].toarray() + +Wa = vanila_nmf_model.fit_transform( + X.astype(adata_means.X.dtype), + H=adata_means.X, + W=np.ones((input_spatial.shape[0], n_types), dtype=adata_means.X.dtype), +) + +prop = Wa / Wa.sum(1)[:, np.newaxis] +input_spatial.obsm["proportions_pred"] = prop + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + obs=input_spatial.obs[[]], + var=input_spatial.var[[]], + uns={ + 'cell_type_names': input_spatial.uns['cell_type_names'], + 'dataset_id': input_spatial.uns['dataset_id'], + 'method_id': meta['functionality_name'] + }, + obsm={ + 'coordinates': input_spatial.obsm['coordinates'], + 'proportions_pred': input_spatial.obsm['proportions_pred'] + }, + layers={ + 'counts': input_spatial.layers['counts'] + } +) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/r2/config.vsh.yaml b/src/metrics/r2/config.vsh.yaml new file mode 100644 index 0000000..bc5585f --- /dev/null +++ b/src/metrics/r2/config.vsh.yaml @@ -0,0 +1,32 @@ +__merge__: 
../../api/comp_metric.yaml + +functionality: + name: r2 + info: + metrics: + - name: r2 + label: R2 + summary: "R2 represents the proportion of variance in the true proportions which is explained by the predicted proportions." + description: | + R2, or the “coefficient of determination”, reports the fraction of the true proportion values' variance that can be explained by the predicted proportion values. The best score, and upper bound, is 1.0. There is no fixed lower bound for the metric. The uniform/non-weighted average across all cell types/states is used to summarise performance. By default, cases resulting in a score of NaN (perfect predictions) or -Inf (imperfect predictions) are replaced with 1.0 (perfect predictions) or 0.0 (imperfect predictions) respectively. + reference: miles2005rsquared + documentation_url: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html + repository_url: https://github.com/scikit-learn/scikit-learn/tree/5c4aa5d0d90ba66247d675d4c3fc2fdfba3c39ff + min: -inf + max: 1 + maximize: true + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: scikit-learn + - type: native + - type: nextflow + directives: + label: [midtime,midmem, midcpu] diff --git a/src/metrics/r2/script.py b/src/metrics/r2/script.py new file mode 100644 index 0000000..35420e0 --- /dev/null +++ b/src/metrics/r2/script.py @@ -0,0 +1,39 @@ +import anndata as ad +import sklearn.metrics + +## VIASH START +par = { + 'input_method': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/output.h5ad', + 'input_solution': 'resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/solution.h5ad', + 'output': 'score.h5ad' +} +meta = { + 'functionality_name': 'r2' +} +## VIASH END + +print('Reading input files', flush=True) +input_method = ad.read_h5ad(par['input_method']) +input_solution = 
ad.read_h5ad(par['input_solution']) + +print('Compute metrics', flush=True) +prop_true = input_solution.obsm["proportions_true"] +prop_pred = input_method.obsm["proportions_pred"] +r2_score = sklearn.metrics.r2_score( + prop_true, prop_pred, sample_weight=None, multioutput="uniform_average" +) + +uns_metric_ids = [ 'r2' ] +uns_metric_values = [ r2_score ] + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': input_method.uns['dataset_id'], + 'method_id': input_method.uns['method_id'], + 'metric_ids': uns_metric_ids, + 'metric_values': uns_metric_values + } +) +output.write_h5ad(par['output'], compression='gzip') + diff --git a/src/process_dataset/split_dataset/config.vsh.yaml b/src/process_dataset/split_dataset/config.vsh.yaml new file mode 100644 index 0000000..c430b01 --- /dev/null +++ b/src/process_dataset/split_dataset/config.vsh.yaml @@ -0,0 +1,13 @@ +__merge__: ../../api/comp_process_dataset.yaml +functionality: + name: split_dataset + resources: + - type: python_script + path: script.py + - path: /common/helper_functions/subset_anndata.py +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/process_dataset/split_dataset/script.py b/src/process_dataset/split_dataset/script.py new file mode 100644 index 0000000..c1ae956 --- /dev/null +++ b/src/process_dataset/split_dataset/script.py @@ -0,0 +1,46 @@ +import anndata as ad +import sys +import numpy as np + +## VIASH START +par = { + "input": "resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/dataset_simulated.h5ad", + "output_spatial_masked": "spatial_masked.h5ad", + "output_single_cell": "single_cell_ref.h5ad", + "output_solution": "solution.h5ad", +} +meta = { + "functionality_name": "split_dataset", + "resources_dir": "src/process_dataset/split_dataset", + "config": "target/nextflow/process_dataset/split_dataset/.config.vsh.yaml" +} +## 
VIASH END + +sys.path.append(meta['resources_dir']) +from subset_anndata import read_config_slots_info, subset_anndata + +print(">> Load dataset", flush=True) +adata = ad.read_h5ad(par["input"]) + +# TO DO: Non-integer values in the counts layer are detected as un-normalized data by some methods, thereby causing them to fail. +adata.layers['counts'] = adata.layers['counts'].floor() + +print(">> Figuring out which data needs to be copied to which output file", flush=True) +slot_info = read_config_slots_info(meta["config"]) + +print(">> Split dataset by modality", flush=True) +is_sp = adata.obs["modality"] == "sp" +adata_sp = adata[is_sp, :].copy() +adata_sc = adata[~is_sp, :].copy() + +print(">> Create dataset for methods", flush=True) +output_spatial_masked = subset_anndata(adata_sp, slot_info['output_spatial_masked']) +output_single_cell = subset_anndata(adata_sc, slot_info['output_single_cell']) + +print(">> Create solution object for metrics", flush=True) +output_solution = subset_anndata(adata_sp, slot_info['output_solution']) + +print(">> Write to disk", flush=True) +output_spatial_masked.write_h5ad(par["output_spatial_masked"]) +output_single_cell.write_h5ad(par["output_single_cell"]) +output_solution.write_h5ad(par["output_solution"]) diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml new file mode 100644 index 0000000..47f3332 --- /dev/null +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -0,0 +1,49 @@ +functionality: + name: "process_datasets" + namespace: "workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input" + __merge__: "/src/api/file_common_dataset.yaml" + required: true + direction: input + - name: "--alpha" + type: double + required: false + direction: input + - name: Outputs + arguments: + - name: "--output_single_cell" + __merge__: /src/api/file_single_cell.yaml + required: true + direction: output + - name: "--output_spatial_masked" + __merge__: 
/src/api/file_spatial_masked.yaml + required: true + direction: output + - name: "--output_solution" + __merge__: /src/api/file_solution.yaml + required: true + direction: output + - name: "--simulated_data" + type: file + required: false + direction: output + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf + dependencies: + - name: common/check_dataset_schema + repository: openproblems_v2 + - name: dataset_simulator + - name: process_dataset/split_dataset + repositories: + - name: openproblems_v2 + type: github + repo: openproblems-bio/openproblems-v2 + tag: main_build +platforms: + - type: nextflow diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf new file mode 100644 index 0000000..475470a --- /dev/null +++ b/src/workflows/process_datasets/main.nf @@ -0,0 +1,65 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + | check_dataset_schema.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "input") + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.input, + "schema": schemaYaml + ] + }, + toState: { id, output, state -> + // read the output to see if dataset passed the qc + def checks = readYaml(output.output) + state + [ + "dataset": checks["exit_code"] == 0 ? 
state.input : null, + ] + } + ) + + // remove datasets which didn't pass the schema check + | filter { id, state -> + state.dataset != null + } + + | dataset_simulator.run( + runIf: {id, state -> state.alpha}, + fromState: [ + input: "dataset", + alpha: "alpha" + ], + toState: [ dataset: "simulated_data"], + auto: [publish: true] + ) + + | split_dataset.run( + fromState: [ input: "dataset" ], + toState: [ + output_single_cell: "output_single_cell", + output_spatial_masked: "output_spatial_masked", + output_solution: "output_solution" + ] + ) + + // only output the files for which an output file was specified + | setState(["output_single_cell", "output_spatial_masked", "output_solution"]) + + emit: + output_ch +} diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml new file mode 100644 index 0000000..590ca6b --- /dev/null +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -0,0 +1,76 @@ +functionality: + name: "run_benchmark" + namespace: "workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input_single_cell" + __merge__: "/src/api/file_single_cell.yaml" + required: true + direction: input + - name: "--input_spatial_masked" + __merge__: "/src/api/file_spatial_masked.yaml" + required: true + direction: input + - name: "--input_solution" + __merge__: "/src/api/file_solution.yaml" + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: score_uns.yaml + - name: "--output_method_configs" + type: file + required: true + direction: output + default: method_configs.yaml + - name: "--output_metric_configs" + type: file + required: true + direction: output + default: metric_configs.yaml + - name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_uns.yaml + - name: "--output_task_info" + type: 
file + required: true + direction: output + default: task_info.yaml + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: "../../api/task_info.yaml" + dependencies: + - name: common/check_dataset_schema + repository: openproblems_v2 + - name: common/extract_metadata + repository: openproblems_v2 + - name: control_methods/random_proportions + - name: control_methods/true_proportions + - name: methods/cell2location + - name: methods/destvi + - name: methods/nmfreg + - name: methods/nnls + - name: methods/rctd + - name: methods/seurat + - name: methods/stereoscope + - name: methods/tangram + - name: methods/vanillanmf + - name: metrics/r2 + repositories: + - name: openproblems_v2 + type: github + repo: openproblems-bio/openproblems-v2 + tag: main_build +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf new file mode 100644 index 0000000..82a29fe --- /dev/null +++ b/src/workflows/run_benchmark/main.nf @@ -0,0 +1,198 @@ +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // construct list of methods + methods = [ + random_proportions, + true_proportions, + cell2location, + destvi, + nmfreg, + nnls, + rctd, + seurat, + stereoscope, + tangram, + vanillanmf + ] + + // construct list of metrics + metrics = [ + r2 + ] + + + /**************************** + * EXTRACT DATASET METADATA * + ****************************/ + dataset_ch = input_ch + // store join id + | map{ id, state -> + [id, state + ["_meta": [join_id: id]]] + } + + // extract the dataset metadata + | extract_metadata.run( + fromState: [input: "input_solution"], + toState: { id, output, state -> + state + [ + dataset_uns: readYaml(output.output).uns + ] + } + ) + + /*************************** + * RUN METHODS AND METRICS * + ***************************/ + score_ch = 
dataset_ch + + // run all methods + | runEach( + components: methods, + + // use the 'filter' argument to only run a method on the normalisation the component is asking for + filter: { id, state, comp -> + def norm = state.dataset_uns.normalization_id + def pref = comp.config.functionality.info.preferred_normalization + // if the preferred normalisation is none at all, + // we can pass whichever dataset we want + def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref + def method_check = !state.method_ids || state.method_ids.contains(comp.config.functionality.name) + + method_check && norm_check + }, + + // define a new 'id' by appending the method name to the dataset id + id: { id, state, comp -> + id + "." + comp.config.functionality.name + }, + + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: { id, state, comp -> + def new_args = [ + input_single_cell: state.input_single_cell, + input_spatial_masked: state.input_spatial_masked + ] + if (comp.config.functionality.info.type == "control_method") { + new_args.input_solution = state.input_solution + } + new_args + }, + + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + method_id: comp.config.functionality.name, + method_output: output.output + ] + } + ) + + // run all metrics + | runEach( + components: metrics, + id: { id, state, comp -> + id + "." 
+ comp.config.functionality.name + }, + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: { id, state, comp -> + [ + input_solution: state.input_solution, + input_method: state.method_output + ] + }, + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + metric_id: comp.config.functionality.name, + metric_output: output.output + ] + } + ) + + /****************************** + * GENERATE OUTPUT YAML FILES * + ******************************/ + // TODO: can we store everything below in a separate helper function? + + // extract the dataset metadata + dataset_meta_ch = dataset_ch + | joinStates { ids, states -> + // store the dataset metadata in a file + def dataset_uns = states.collect{state -> + def uns = state.dataset_uns.clone() + uns.remove("normalization_id") + uns + } + def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) + def dataset_uns_file = tempFile("dataset_uns.yaml") + dataset_uns_file.write(dataset_uns_yaml_blob) + + ["output", [output_dataset_info: dataset_uns_file]] + } + + output_ch = score_ch + + // extract the scores + | extract_metadata.run( + key: "extract_scores", + fromState: [input: "metric_output"], + toState: { id, output, state -> + state + [ + score_uns: readYaml(output.output).uns + ] + } + ) + + | joinStates { ids, states -> + // store the method configs in a file + def method_configs = methods.collect{it.config} + def method_configs_yaml_blob = toYamlBlob(method_configs) + def method_configs_file = tempFile("method_configs.yaml") + method_configs_file.write(method_configs_yaml_blob) + + // store the metric configs in a file + def metric_configs = metrics.collect{it.config} + def metric_configs_yaml_blob = toYamlBlob(metric_configs) + def metric_configs_file = tempFile("metric_configs.yaml") + metric_configs_file.write(metric_configs_yaml_blob) + + def task_info_file = meta.resources_dir.resolve("task_info.yaml") + 
+ // store the scores in a file + def score_uns = states.collect{it.score_uns} + def score_uns_yaml_blob = toYamlBlob(score_uns) + def score_uns_file = tempFile("score_uns.yaml") + score_uns_file.write(score_uns_yaml_blob) + + def new_state = [ + output_method_configs: method_configs_file, + output_metric_configs: metric_configs_file, + output_task_info: task_info_file, + output_scores: score_uns_file, + _meta: states[0]._meta + ] + + ["output", new_state] + } + + // merge all of the output data + | mix(dataset_meta_ch) + | joinStates{ ids, states -> + def mergedStates = states.inject([:]) { acc, m -> acc + m } + [ids[0], mergedStates] + } + + emit: + output_ch +}