diff --git a/.github/components/dictionary.txt b/.github/components/dictionary.txt index 0c1060a96..414847cbe 100644 --- a/.github/components/dictionary.txt +++ b/.github/components/dictionary.txt @@ -7,9 +7,11 @@ altExp anaplasia anaplastic aneuploid +aneuploidy AnnData anonymize anonymized +ARI astrocytes AUCell authenticator @@ -23,8 +25,10 @@ bioinformatics blastema blastemal Bleijs +Cao Carpentries CellAssign +CELLxGENE chondrocytes chr CLI @@ -44,6 +48,7 @@ Ctrl cxds demultiplexing dendogram +derangements designee discoverable DM @@ -53,6 +58,7 @@ Dockerfile docstring docstrings DOI +dotplots doxxing dropdown ECM @@ -61,7 +67,10 @@ endothelium enforceability Ensembl enure +epithelia +epithelialize erythematosus +erythroid et ewings EWS-FLI1 @@ -75,12 +84,17 @@ GHAs GitHub GitKraken GitKraken's +glial +glioblastoma Goodspeed GPUs GTF +Hafemeister Halbritter +hematopoietic heterotypic histological +histologies homotypic HPC IAM @@ -95,6 +109,8 @@ Jaccard Jitter JSON Jupyter +karyotyping +leiden LGBTQ licensor licensor's @@ -105,6 +121,7 @@ linter linter's linters lockfile +Looney Louvain LSfR macOS @@ -112,22 +129,32 @@ macrophage md merchantability mesenchymal +mesenchyme metacell metacells +metanephric +microenvironment +microRNA Miniconda Miniforge misidentification +monocyte +monocytes MSC multifactor myeloid natively Nextflow +nephroblastoma +nephron nonconsensual octicons onboarded +oncotarget openscpca OpenScPCA OpenScPCA's +overclustered Panglao PanglaoDB PDX @@ -139,7 +166,9 @@ Posit's PowerShell PR pre +preprint PRs +pseudobulk README redistribution redistributions @@ -147,6 +176,9 @@ renv repo reproducibility reproducibly +ribosomal +RNAseq +rOpenScPCA RStudio RStudio's Rtools @@ -166,7 +198,9 @@ SSO stroma stromal Stumptown +subdiagnosis sublicensable +subtypes symlink symlinked synched @@ -175,12 +209,14 @@ Tirode trainings transferrable transphobic +Treg triaged TSV UCell UMAP uncomment unhide +uteric vCPU vCPUs Visser @@ -193,3 +229,4 @@ Xcode YAML Zappia Zenodo +Zhang diff --git a/00_run_workflow.R b/00_run_workflow.R deleted file mode 100644 index b312ea351..000000000 --- a/00_run_workflow.R +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env Rscript - -# USAGE: -# Rscript 00_run_workflow.R -# -# USAGE in CI: -# Rscript 00_run_workflow.R --testing - -# Set up options ---------------------------------------------------------------- - -library(optparse) - -option_list <- list( - make_option( - opt_str = c("--testing"), - type = "logical", - default = FALSE, - action = "store_true", - help = "Use this flag when running on test data" - ) -) - -opts <- parse_args(OptionParser(option_list = option_list)) -running_ci <- opts$testing - -# Run the Label transfer from two fetal references ------------------------------ - -# get list of samples in the library -------------------------------------------- -root_dir <- rprojroot::find_root(rprojroot::is_git_root) -project_id <- "SCPCP000006" -sample_metadata_file <- file.path(root_dir, "data", "current", project_id, "single_cell_metadata.tsv") -metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE) - -# set path to this module-------------------------------------------------------- -module_base <- file.path(root_dir, "analyses", "cell-type-wilms-tumor-06") - -# Download and create the fetal kidney reference (Stewart et al) ---------- -system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "download-and-create-fetal-kidney-ref.R"))) - -# We build the gene position file reference for infercnv ------------------------ -system(command = glue::glue("Rscript ", file.path(module_base, "scripts", "06a_build-geneposition.R"))) - -# Characterize the two fetal references ----------------------------------------- - -# Characterize the fetal full reference (Cao et al.) -# To be done, next PR -notebook_template_dir <- file.path(module_base, "notebook_template") -notebook_output_dir <- file.path(module_base, "notebook") -# Characterize the fetal kidney reference (Stewart et al.) -rmarkdown::render(input = file.path(notebook_template_dir, "00b_characterize_fetal_kidney_reference_Stewart.Rmd"), - output_format = "html_document", - output_file = "00b_characterization_fetal_kidney_reference_Stewart.html", - output_dir = file.path(notebook_output_dir, "00-reference")) - - -# Run the workflow for (all) samples in the project ----------------------------- -for (sample_id in metadata$scpca_sample_id) { - - # create a directory to save the pre-processed and labeled `Seurat` objects - dir.create(file.path(module_base, "results", sample_id), showWarnings = FALSE) - # create a directory to save the notebooks - dir.create(file.path(module_base, "notebook", sample_id), showWarnings = FALSE) - - # Pre-process the data - `Seurat` workflow - rmarkdown::render(input = file.path(notebook_template_dir, "01_seurat-processing.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("01_seurat_processing_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) - - if (!running_ci) { - # Label transfer from the Cao reference using Azimuth - rmarkdown::render(input = file.path(notebook_template_dir, "02a_label-transfer_fetal_full_reference_Cao.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("02a_fetal_all_reference_Cao_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) - - # Label transfer from the Stewart reference using Seurat - rmarkdown::render(input = file.path(notebook_template_dir, "02b_label-transfer_fetal_kidney_reference_Stewart.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("02b_fetal_kidney_reference_Stewart_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) - - # Cluster exploration - rmarkdown::render(input = file.path(notebook_template_dir, "03_clustering_exploration.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("03_clustering_exploration_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) - - } -} - -if (!running_ci) { - for(thr in c(0.5, 0.75, 0.85, 0.95)){ - # Run notebook template to explore label transfer and clustering for all samples at once - rmarkdown::render(input = file.path(notebook_output_dir, "04_annotation_Across_Samples_exploration.Rmd"), - params = list(mapping_score_thr = thr), - output_format = "html_document", - output_file = glue::glue("04_annotation_Across_Samples_exploration_mappingscore_threshold_",thr,".html"), - output_dir = notebook_output_dir) - } - # Run infercnv and copykat for a selection of samples - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "explore-cnv-methods.R"))) - -} - - - - - diff --git a/analyses/cell-type-wilms-tumor-06/00_run_workflow.R b/analyses/cell-type-wilms-tumor-06/00_run_workflow.R index bba9a9fde..68b65fde7 100644 --- a/analyses/cell-type-wilms-tumor-06/00_run_workflow.R +++ b/analyses/cell-type-wilms-tumor-06/00_run_workflow.R @@ -35,7 +35,7 @@ metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE) module_base <- file.path(root_dir, "analyses", "cell-type-wilms-tumor-06") # Download and create the fetal kidney reference (Stewart et al) ---------- -system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "download-and-create-fetal-kidney-ref.R"))) +system(command = glue::glue("Rscript ", file.path(module_base, "scripts", "download-and-create-fetal-kidney-ref.R"))) # We build the gene position file reference for infercnv ------------------------ system(command = glue::glue("Rscript ", file.path(module_base, "scripts", "06a_build-geneposition.R"))) @@ -47,67 +47,99 @@ system(command = glue::glue("Rscript ", file.path(module_base, "scripts", "06a_b notebook_template_dir <- file.path(module_base, "notebook_template") notebook_output_dir <- file.path(module_base, "notebook") # Characterize the fetal kidney reference (Stewart et al.) -rmarkdown::render(input = file.path(notebook_template_dir, "00b_characterize_fetal_kidney_reference_Stewart.Rmd"), - output_format = "html_document", - output_file = "00b_characterization_fetal_kidney_reference_Stewart.html", - output_dir = file.path(notebook_output_dir, "00-reference")) +rmarkdown::render( + input = file.path(notebook_template_dir, "00b_characterize_fetal_kidney_reference_Stewart.Rmd"), + output_format = "html_document", + output_file = "00b_characterization_fetal_kidney_reference_Stewart.html", + output_dir = file.path(notebook_output_dir, "00-reference") +) +sample_ids <- metadata |> + dplyr::filter(seq_unit != "spot") |> + dplyr::pull(scpca_sample_id) |> + unique() # Run the workflow for (all) samples in the project ----------------------------- -for (sample_id in metadata$scpca_sample_id) { - +for (sample_id in sample_ids) { # create a directory to save the pre-processed and labeled `Seurat` objects dir.create(file.path(module_base, "results", sample_id), showWarnings = FALSE) # create a directory to save the notebooks dir.create(file.path(module_base, "notebook", sample_id), showWarnings = FALSE) # Pre-process the data - `Seurat` workflow - rmarkdown::render(input = file.path(notebook_template_dir, "01_seurat-processing.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("01_seurat_processing_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) + rmarkdown::render( + input = file.path(notebook_template_dir, "01_seurat-processing.Rmd"), + params = list(scpca_project_id = project_id, sample_id = sample_id), + output_format = "html_document", + output_file = paste0("01_seurat_processing_", sample_id, ".html"), + output_dir = file.path(notebook_output_dir, sample_id) + ) if (!running_ci) { # Label transfer from the Cao reference using Azimuth - rmarkdown::render(input = file.path(notebook_template_dir, "02a_label-transfer_fetal_full_reference_Cao.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("02a_fetal_all_reference_Cao_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) + rmarkdown::render( + input = file.path(notebook_template_dir, "02a_label-transfer_fetal_full_reference_Cao.Rmd"), + params = list(scpca_project_id = project_id, sample_id = sample_id), + output_format = "html_document", + output_file = paste0("02a_fetal_all_reference_Cao_", sample_id, ".html"), + output_dir = file.path(notebook_output_dir, sample_id) + ) # Label transfer from the Stewart reference using Seurat - rmarkdown::render(input = file.path(notebook_template_dir, "02b_label-transfer_fetal_kidney_reference_Stewart.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("02b_fetal_kidney_reference_Stewart_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) - - # Cluster exploration - rmarkdown::render(input = file.path(notebook_template_dir, "03_clustering_exploration.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("03_clustering_exploration_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) + rmarkdown::render( + input = file.path(notebook_template_dir, "02b_label-transfer_fetal_kidney_reference_Stewart.Rmd"), + params = list(scpca_project_id = project_id, sample_id = sample_id), + output_format = "html_document", + output_file = paste0("02b_fetal_kidney_reference_Stewart_", sample_id, ".html"), + output_dir = file.path(notebook_output_dir, sample_id) + ) + # Cluster exploration + rmarkdown::render( + input = file.path(notebook_template_dir, "03_clustering_exploration.Rmd"), + params = list(scpca_project_id = project_id, sample_id = sample_id), + output_format = "html_document", + output_file = paste0("03_clustering_exploration_", sample_id, ".html"), + output_dir = file.path(notebook_output_dir, sample_id) + ) } } if (!running_ci) { - for(thr in c(0.5, 0.75, 0.85, 0.95)){ - # Run notebook template to explore label transfer and clustering for all samples at once - rmarkdown::render(input = file.path(notebook_output_dir, "04_annotation_Across_Samples_exploration.Rmd"), - params = list(predicted.score_thr = thr), - output_format = "html_document", - output_file = glue::glue("04_annotation_Across_Samples_exploration_predicted.score_threshold_",thr,".html"), - output_dir = notebook_output_dir) + for (thr in c(0.5, 0.75, 0.85, 0.95)) { + # Run notebook template to explore label transfer and clustering for all samples at once + rmarkdown::render( + input = file.path(notebook_output_dir, "04_annotation_Across_Samples_exploration.Rmd"), + output_format = "html_document", + output_file = "04_annotation_Across_Samples_exploration.html", + output_dir = notebook_output_dir + ) } # Run infercnv and copykat for a selection of samples - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "explore-cnv-methods.R"))) - -} - - - - + system(command = glue::glue("Rscript ", file.path(module_base, "scripts", "explore-cnv-methods.R"))) + + # Run infercnv for all samples with HMM i3 and using "both" as the reference + for (sample_id in sample_ids) { + # We run SCPCS000190 with "none" instead of "both" since it does not have + # enough "normal" cells to use as a reference + if (sample_id == "SCPCS000190") { + reference <- "none" + } else { + reference <- "both" + } + + # don't repeat inference on selection of samples it's already been run on + output_file <- file.path(module_base, "results", sample_id, glue::glue("06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds")) + if (!file.exists(output_file)) { + system(command = glue::glue("Rscript {file.path(module_base, 'scripts', '06_infercnv.R')} --sample_id {sample_id} --reference {reference} --HMM i3")) + } + } + # Render notebook to make draft annotations + rmarkdown::render( + input = file.path(notebook_output_dir, "07_combined_annotation_across_samples_exploration.Rmd "), + output_format = "html_document", + output_file = "07_combined_annotation_across_samples_exploration.html ", + output_dir = notebook_output_dir + ) +} diff --git a/analyses/cell-type-wilms-tumor-06/README.md b/analyses/cell-type-wilms-tumor-06/README.md index f205cc81b..18f966a1c 100644 --- a/analyses/cell-type-wilms-tumor-06/README.md +++ b/analyses/cell-type-wilms-tumor-06/README.md @@ -21,39 +21,36 @@ The analysis is/will be divided as the following: - [x] Script: label transfer from the fetal kidney atlas reference using runAzimuth - [x] Script: run copykat and inferCNV - [x] Notebook: explore results from steps 2 to 4 for about 5 to 10 samples -- [ ] Script: compile scripts 2 to 4 in a RMardown file with required adjustements and render it across all samples +- [x] Script: Run inferCNV for all samples - [x] Notebook: explore results from step 6, integrate all samples together and annotate the dataset using (i) metadatafile, (ii) CNV information, (iii) label transfer information ## Usage -From Rstudio, run the Rmd reports or render the R scripts (see below R studio session set up). -You can also simply have a look at the html reports in the notebook folder. -Here, no need to run anything, we try to guide you through the analysis. Have a look at the code using the unhide code button on the top right of each chunk! -## Input files - -### single nuclei data - -We work with the _processed.rds SingleCellExperiment objects. -From the module directory, make sure that the conda environment is set-up: +1. Ensure the conda environment is activated. ```shell conda activate openscpca ``` -log into AWS CLI: +2. log into AWS CLI: ```shell # replace `openscpca` with your AWS CLI profile name if it differs export AWS_PROFILE=openscpca aws sso login ``` +Of note, this requires AWS CLI setup to run as intended: https://openscpca.readthedocs.io/en/latest/technical-setup/environment-setup/configure-aws-cli/ + -use download-data.py to download the data as the following: +3. use download-data.py to download the data as the following: ```shell ../../download-data.py --projects SCPCP000006 ``` This is saving the data in OpenScPCA-analysis/data/current/SCPCP000006 -Of note, this requires AWS CLI setup to run as intended: https://openscpca.readthedocs.io/en/latest/technical-setup/environment-setup/configure-aws-cli/ +4. Run the module: +```shell +Rscript 00_run_workflow.R +``` ### sample metadata @@ -70,17 +67,17 @@ Some differenices are expected, some marker genes or pathways are associated wit for each of the steps, we have two types of `output`: -- the `notebook` saved in the `notebook` directory, with a subfolder for each sample. +- the `notebook` saved in the `notebook` directory, with a subfolder for each sample. -- the created objects saved in `results` directory, with a subfolder for each sample. +- the created objects saved in `results` directory, with a subfolder for each sample. # Analysis ## Marker sets -We first build a resource for later validation of the annotated cell types. -We gather from the litterature marker genes and specific genomic alterations that could help us characterizing the Wilms tumor ecosystem, including cancer and non-cancer cells. +We first build a resource for later validation of the annotated cell types. +We gather from the litterature marker genes and specific genomic alterations that could help us characterizing the Wilms tumor ecosystem, including cancer and non-cancer cells. ### The table CellType_metadata.csv contains the following column and information: @@ -135,11 +132,7 @@ We gather from the litterature marker genes and specific genomic alterations tha |1q|gain|malignant|NA|10.1016/S0002-9440(10)63982-X|NA|Associated_with_relapse| -## Clustering and label transfer from fetal references - -R Script to be rendered : `00_run_workflow.R` - -### Introduction +## workflow description The `00_run_workflow.R` contains the following steps: @@ -152,21 +145,21 @@ The `00_run_workflow.R` contains the following steps: - loop for each samples: - `Seurat workflow`, normalization and clustering: `01_seurat-processing.Rmd` in `notebook_template` - + - `Azimuth` label transfer from the fetal full reference (Cao et al.): `02a_label-transfer_fetal_full_reference_Cao.Rmd` in `notebook_template` - + - `Azimuth` label transfer from the fetal kidney reference (Stewart et al.): `02b_label-transfer_fetal_kidney_reference_Stewart.Rmd` in `notebook_template` - + - Exploration of clustering, label transfers, marker genes and pathways: `03_clustering_exploration.Rmd` in `notebook_template` - - CNV inference using [`infercnv`](https://github.com/broadinstitute/inferCNV/wiki) with endothelial and immune cells as reference from either the same patient or a pool of upfront resection Wilms tumor samples: `06_infercnv.R` in `script` - + - CNV inference using [`infercnv`](https://github.com/broadinstitute/inferCNV/wiki) with endothelial and immune cells as reference from either the same patient or a pool of upfront resection Wilms tumor samples: `06_infercnv.R` in `script` + -While we only selected the `infercnv` method with endothelium and immune cells as normal reference for the main workflow across samples, our analysis includes an exploration of cnv inference methods based on `copykat` and `infercnv` on a subselection of samples: +While we only selected the `infercnv` method with endothelium and immune cells as normal reference for the main workflow across samples, our analysis includes an exploration of cnv inference methods based on `copykat` and `infercnv` on a subselection of samples: the `script` `explore-cnv-methods.R` calls the independent scripts `05_copyKAT.R` and `06_infercnv.R` for the samples - "SCPCS000179", - "SCPCS000184", - - "SCPCS000194", + - "SCPCS000194", - "SCPCS000205", - "SCPCS000208". @@ -179,9 +172,9 @@ In addition, we explored the results for all samples in one notebook twice durin For each sample and each of the step, an html report is generated and accessible in the directory `notebook`. -### Justification +### Justification -The use of the right reference is crucial. +The use of the right reference is crucial. It is recommended that the cell types in the reference is representative to the cell types to be annotated in the query. Wilms tumors can contain up to three histologies that resemble fetal kidney: blastema, stroma, and epithelia [1-2]. @@ -191,26 +184,27 @@ We thus decided to test and compare two fetal (kidney) references that could be ##### Human fetal kidney atlas Stewart et al. -We first wanted to try the human fetal kidney atlas to transfer label into the Wilms tumor samples using azimuth. +We first wanted to try the human fetal kidney atlas to transfer label into the Wilms tumor samples using azimuth. You can find more about the human kidney atlas here: https://www.kidneycellatlas.org/ [3] ##### Human Azimuth fetal reference from Cao et al. -Azimuth also provide a human fetal atlas as a reference [4]. +Azimuth also provide a human fetal atlas as a reference [4]. -The data can be found on Zenodo: +The data can be found on Zenodo: https://zenodo.org/records/4738021#.YJIW4C2ZNQI -The reference contain cells from 15 organs including kidney from fetal samples. +The reference contain cells from 15 organs including kidney from fetal samples. Here we will use `Azimuth` to transfer labels from the reference. ### Input and outputs -We start with the `_process.Rds` data to run `01_seurat-processing.Rmd`. +We start with the `_process.Rds` data to run `01_seurat-processing.Rmd`. The output of `01_seurat-processing.Rmd` is saved in `results` in a subfolder for each sample and is the input of the second step `02a_label-transfer_fetal_full_reference_Cao.Rmd`. The output of `02a_label-transfer_fetal_full_reference_Cao.Rmd` is then the input of `02b_label-transfer_fetal_kidney_reference_Stewart.Rmd`. Following the same approach, the output of `02b_label-transfer_fetal_kidney_reference_Stewart.Rmd` is the input of `03_clustering_exploration.Rmd` and `06_infercnv.R`. -The outputs of `06_infercnv.R` `06_infercnv_HMM-i3_{sample_id}_{reference-type}.rds` is finally the input of `07_annotation_Across_Samples_exploration.Rmd`. +The outputs of `06_infercnv.R` `06_infercnv_HMM-i3_{sample_id}_{reference-type}.rds` is finally the input of `07_combined_annotation_across_samples_exploration.Rmd`, which produces a TSV with annotations in `results/SCPCP000006-annotations.tsv `. + All inputs/outputs generated and used in the main workflow are saved in the `results/{sample_id}` folder. Results in subfolders such as `results/{sample_id}/05_copyKAT` or `results/{sample_id}/06_infercnv` have been obtained for a subselection of samples in the exploratory analysis, and are thus kept separated from the results of the main workflow. @@ -223,16 +217,12 @@ At the end of the workflow, we have a `Seurat`object that contains: ## Software requirements -To perform the analysis, run the RMarkdown script in R (version 4.4.1). -The main packages used are: -- Seurat version 5 -- Azimuth version 5 -- inferCNV -- SCpubr for visualization -- DT for table visualization -- DElegate for differential expression analysis +### renv -The dependencies required are saved in `components/dependencies.R` +This module uses `renv`. +If you are using RStudio Server within the container, the `renv` project will not be activated by default. +You can install packages within the container and use `renv::snapshot()` to update the lockfile without activating the project without a problem in our testing. +The `renv` lockfile is used to install R packages in the Docker image. ### Docker @@ -266,24 +256,20 @@ If you are on a Mac with an M series chip, you will not be able to use RStudio S You must build an ARM image locally to be able to use RStudio Server within the container. #### A note for Halbritter lab internal development -This work has been developed on a system that uses podman instead of docker. The steps to run the docker/podman images are slightly different and we saved in run-podman-internal.sh our internal approach to run the container. Please, refer to the Docker section to build and run the container instead. +This work has been developed on a system that uses podman instead of docker. The steps to run the docker/podman images are slightly different and we saved in run-podman-internal.sh our internal approach to run the container. Please, refer to the Docker section to build and run the container instead. -### renv - -This module uses `renv`. -If you are using RStudio Server within the container, the `renv` project will not be activated by default. -You can install packages within the container and use `renv::snapshot()` to update the lockfile without activating the project without a problem in our testing. -The `renv` lockfile is used to install R packages in the Docker image. ## Computational resources -## References +This module may require up to 16 cores to run certain exploratory steps, but the main workflow can be run on a laptop. + +## References -- [1] https://www.ncbi.nlm.nih.gov/books/NBK373356/ +- [1] https://www.ncbi.nlm.nih.gov/books/NBK373356/ -- [2] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9915828/ +- [2] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9915828/ -- [3] https://www.science.org/doi/10.1126/science.aat5031 +- [3] https://www.science.org/doi/10.1126/science.aat5031 - [4] https://www.science.org/doi/10.1126/science.aba7721 diff --git a/analyses/cell-type-wilms-tumor-06/notebook/07_combined_annotation_across_samples_exploration.Rmd b/analyses/cell-type-wilms-tumor-06/notebook/07_combined_annotation_across_samples_exploration.Rmd new file mode 100644 index 000000000..ba95cbb32 --- /dev/null +++ b/analyses/cell-type-wilms-tumor-06/notebook/07_combined_annotation_across_samples_exploration.Rmd @@ -0,0 +1,553 @@ +--- +title: "Combined annotation exploration for SCPCP000006" +author: "Maud PLASCHKA" +date: "`r Sys.Date()`" +params: + predicted.celltype.threshold: 0.85 + cnv_threshold: 0 +output: + html_document: + toc: yes + toc_float: yes + code_folding: hide + highlight: pygments + df_print: paged +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = TRUE, + message = FALSE, + warnings = FALSE, + knitr.digits.signif = FALSE +) +``` + + +## Introduction + + +The aim is to combine label transfer and CNV inference to annotate Wilms tumor samples in SCPCP000006. +The proposed annotation will be based on the combination of: + +- the label transfer from the fetal kidney reference (Stewart et al.), in particular the `fetal_kidney_predicted.compartment` and `fetal_kidney:predicted.cell_type`, as well as the `prediction.score` for each compartment, + +- the predicted CNV calculated using intra-sample endothelial and immune cells (`--reference both`) as normal reference + + + +In a second time, we will explore and validate the chosen annotation. + +We will use some of the [markers genes](https://github.com/AlexsLemonade/OpenScPCA-analysis/tree/main/analyses/cell-type-wilms-tumor-06#the-table-celltype_metadatacsv-contains-the-following-column-and-information) to validate visually the annotations. + +The analysis can be summarized as the following: + +_Where `cnv.thr` and `pred.thr` need to be discussed_ + + +first level annotation | second level annotation | selection of the cells | marker genes for validation | cnv validation +-- | -- | -- | -- | -- +normal | endothelial | compartment == "endothelium" & predicted.score > pred.thr & cnv_score < cnv.thr | WVF | no cnv +normal | immune | compartment == "immune" & predicted.score > pred.thr & cnv_score < cnv.thr | PTPRC, CD163, CD68 | no cnv +normal | kidney | cell_type %in% c("kidney cell", "kidney epithelial", "podocyte") & predicted.score > pred.thr & cnv_score < cnv.thr | CDH1, PODXL, LTL | no cnv +normal | stroma | compartment == "stroma" & predicted.score > pred.thr & cnv_score < cnv.thr | VIM | no cnv +cancer | stroma | compartment == "stroma" & cnv_score > cnv.thr | VIM | proportion_cnv_chr -1 -4 -11 -16 -17 -18 +cancer | blastema | compartment == "fetal_nephron" & cell_type == "mesenchymal cell" & cnv_score > cnv.thr | CITED1 | proportion_cnv_chr -1 -4 -11 -16 -17 -18 +cancer | epithelial | compartment == "fetal_nephron" & cell_type != "mesenchymal cell" & cnv_score > cnv.thr | CDH1 | proportion_cnv_chr -1 -4 -11 -16 -17 -18 +unknown | - | the rest of the cells | - | proportion_cnv_chr -1 -4 -11 -16 -17 -18 + + + +### Packages + + +```{r packages, message=FALSE, warning=FALSE} +library(Seurat) +library(tidyverse) +library(patchwork) +library(DT) +``` + + +### Base directories + +```{r base paths, eval=TRUE} +# The base path for the OpenScPCA repository, found by its (hidden) .git directory +repository_base <- rprojroot::find_root(rprojroot::is_git_root) + +# The current data directory, found within the repository base directory +data_dir <- file.path(repository_base, "data", "current", "SCPCP000006") + +# The path to this module +module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06") + +result_dir <- file.path(module_base, "results") +``` + +### Output files + +```{r} +# cell type annotation table +annotations_tsv <- file.path(result_dir, "SCPCP000006-annotations.tsv") +``` + +### Input files + +In this notebook, we are working with all of the samples in SCPCP000006. + +The sample metadata can be found in `sample_metadata_file` in the `data` folder. + +We extracted from the pre-processed and labeled `Seurat` object from `results/{sample_id}/06_infercnv_HMM-i3_SCPCS000169_reference-both.rds`. +the following information per cell: + +- the predicted compartment in `fetal_kidney_predicted.compartment` and `fetal_kidney_predicted.compartment.score` + +- the predicted compartment in `fetal_kidney_predicted.cell_type` and `fetal_kidney_predicted.cell_type.score` + + +- the sample identifier in `sample_id` + + +- the proportion of estimated CNV per chromosome, {i} in 1 to 22 `proportion_cnv_chr{i}` + + +- the `counts`of [markers genes](https://github.com/AlexsLemonade/OpenScPCA-analysis/tree/main/analyses/cell-type-wilms-tumor-06#the-table-celltype_metadatacsv-contains-the-following-column-and-information) + + + +```{r path_to_query} +sample_metadata_file <- file.path(repository_base, "data", "current", "SCPCP000006", "single_cell_metadata.tsv") + +metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE) +sample_ids <- metadata |> + dplyr::filter(seq_unit != "spot") |> + dplyr::pull(scpca_sample_id) |> + unique() + +# Create a data frames of all annotations +cell_type_df <- sample_ids |> + purrr::map( + # For each sample_id, do the following: + \(sample_id) { + # This sample was run with "none" for reference + if (sample_id == "SCPCS000190") { + reference <- "none" + } else { + reference <- "both" + } + input_file <- file.path( + result_dir, + sample_id, + glue::glue("06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds") + ) + + # Read in the Seurat object + srat <- readRDS(input_file) + + # Create and return a data frame from the Seurat object with relevant annotations + # this data frame will have four columns: barcode, sample_id, compartment, organ + data.frame( + # label transfer from the fetal kidney reference + cell_type = srat$fetal_kidney_predicted.cell_type, + compartment = srat$fetal_kidney_predicted.compartment, + + # predicted.scores from the label transfer from the fetal kidney reference + cell_type.score = srat$fetal_kidney_predicted.cell_type.score, + compartment.score = srat$fetal_kidney_predicted.compartment.score, + + # cell embedding + umap = srat@reductions$umap@cell.embeddings, + + # marker genes + PTPRC = FetchData(object = srat, vars = "ENSG00000081237", layer = "counts"), + VWF = FetchData(object = srat, vars = "ENSG00000110799", layer = "counts"), + VIM = FetchData(object = srat, vars = "ENSG00000026025", layer = "counts"), + CITED1 = FetchData(object = srat, vars = "ENSG00000125931", layer = "counts"), + CDH1 = FetchData(object = srat, vars = "ENSG00000039068", layer = "counts"), + PODXL = FetchData(object = srat, vars = "ENSG00000128567", layer = "counts"), + COL6A3 = FetchData(object = srat, vars = "ENSG00000163359", layer = "counts"), + SIX2 = FetchData(object = srat, vars = "ENSG00000170577", layer = "counts"), + NCAM1 = FetchData(object = srat, vars = "ENSG00000149294", layer = "counts"), + THY1 = FetchData(object = srat, vars = "ENSG00000154096", layer = "counts"), + + + + + + # proportion of cnv per chromosome + proportion_cnv_chr1 = srat$proportion_cnv_chr1, + proportion_cnv_chr2 = srat$proportion_cnv_chr2, + proportion_cnv_chr3 = srat$proportion_cnv_chr3, + proportion_cnv_chr4 = srat$proportion_cnv_chr4, + proportion_cnv_chr5 = srat$proportion_cnv_chr5, + proportion_cnv_chr6 = srat$proportion_cnv_chr6, + proportion_cnv_chr7 = srat$proportion_cnv_chr7, + proportion_cnv_chr8 = srat$proportion_cnv_chr8, + proportion_cnv_chr9 = srat$proportion_cnv_chr9, + proportion_cnv_chr10 = srat$proportion_cnv_chr10, + proportion_cnv_chr11 = srat$proportion_cnv_chr11, + proportion_cnv_chr12 = srat$proportion_cnv_chr12, + proportion_cnv_chr13 = srat$proportion_cnv_chr13, + proportion_cnv_chr14 = srat$proportion_cnv_chr14, + proportion_cnv_chr15 = srat$proportion_cnv_chr15, + proportion_cnv_chr16 = srat$proportion_cnv_chr16, + proportion_cnv_chr17 = srat$proportion_cnv_chr17, + proportion_cnv_chr18 = srat$proportion_cnv_chr18, + proportion_cnv_chr19 = srat$proportion_cnv_chr19, + proportion_cnv_chr20 = srat$proportion_cnv_chr20, + proportion_cnv_chr21 = srat$proportion_cnv_chr21, + proportion_cnv_chr22 = srat$proportion_cnv_chr22, + + # cnv global estimation per chromosome + has_cnv_chr1 = srat$has_cnv_chr1, + has_cnv_chr2 = srat$has_cnv_chr2, + has_cnv_chr3 = srat$has_cnv_chr3, + has_cnv_chr4 = srat$has_cnv_chr4, + has_cnv_chr5 = srat$has_cnv_chr5, + has_cnv_chr6 = srat$has_cnv_chr6, + has_cnv_chr7 = srat$has_cnv_chr7, + has_cnv_chr8 = srat$has_cnv_chr8, + has_cnv_chr9 = srat$has_cnv_chr9, + has_cnv_chr10 = srat$has_cnv_chr10, + has_cnv_chr11 = srat$has_cnv_chr11, + has_cnv_chr12 = srat$has_cnv_chr12, + has_cnv_chr13 = srat$has_cnv_chr13, + has_cnv_chr14 = srat$has_cnv_chr14, + has_cnv_chr15 = srat$has_cnv_chr15, + has_cnv_chr16 = srat$has_cnv_chr16, + has_cnv_chr17 = srat$has_cnv_chr17, + has_cnv_chr18 = srat$has_cnv_chr18, + has_cnv_chr19 = srat$has_cnv_chr19, + has_cnv_chr20 = srat$has_cnv_chr20, + has_cnv_chr21 = srat$has_cnv_chr21, + has_cnv_chr22 = srat$has_cnv_chr22 + ) |> + tibble::rownames_to_column("barcode") |> + dplyr::mutate(sample_id = sample_id) + } + ) |> + # now combine all dataframes to make one big one + dplyr::bind_rows() +``` + +### Output file + +The report will be saved in the `notebook` directory. + + +## Functions + +#### do_Feature_mean + +`do_Feature_mean` shows heatmap of mean expression of a feature grouped by a metadata. + +- `df` is the name of the table containing metadata and feature expression (counts) per cells +- `group.by` is the name of the metadata to group the cells +- `feature` is the name of the gene to average the expression + +```{r fig.width=10, fig.height=4, out.width='100%'} +do_Feature_mean <- function(df, group.by, feature) { + df <- df %>% + group_by(sample_id, !!sym(group.by)) %>% + summarise(m = mean(!!sym(feature))) + + + p <- ggplot(df, aes(x = sample_id, y = !!sym(group.by), fill = m)) + + geom_tile() + + scale_fill_viridis_c() + + theme_bw() + + theme(text = element_text(size = 20)) + + theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) + + guides(fill = guide_colourbar(title = paste0(feature))) + + return(p) +} +``` + + +## Analysis + +### Global cnv score + +As done in `06_cnv_infercnv_exploration.Rmd`, we calculate single CNV score and assess its potential in identifying cells with CNV versus normal cells without CNV. + +We simply checked for each chromosome if the cell `has_cnv_chr`. +Would the cell have more than `cnv_threshold` chromosome with CNV, the global `has_cnv_score` will be TRUE. +Else, the cell will have a `has_cnv_score` set to FALSE. + + + + +```{r fig.width=10, fig.height=10, out.width='100%', results='asis'} +cell_type_df <- cell_type_df |> + mutate(has_cnv_score = rowSums(cell_type_df[, grepl("has_cnv_chr", colnames(cell_type_df))])) |> + mutate(has_cnv_score = case_when( + has_cnv_score > params$cnv_threshold ~ "CNV", + has_cnv_score <= params$cnv_threshold ~ "no CNV" + )) + + +table(cell_type_df$has_cnv_score) +``` + +### First level annotation + +At first, we like to indicate in the `first.level_annotation` if a cell is normal, cancer or unknown. + +- _normal_ cells can be observe in all four compartments (`endothelium`, `immune`, `stroma` or `fetal nephron`) and do not have cnv. +We only allow a bit of flexibility in terms of cnv profile for immune and endothelium cells that have a high predicted score. +Indeed, we know that false positive cnv can be observed in a cell type specific manner. + +The threshold used for the `predicted.score` is defined as a parameter of this notebook as `r params$predicted.celltype.threshold`. +The threshold used for the identification of cnv is also defined in the params of the notebook as `r params$cnv_threshold`. + +- _cancer_ cells are either from the `stroma` or `fetal nephron` compartments and must have at least few cnv. + + + +```{r fig.width=10, fig.height=10, out.width='100%', results='asis'} +# Define normal cells +# We first pick up the immune and endothelial cells annotated via the label transfer compartments under the condition that the predicted score is above the threshold +cell_type_df <- cell_type_df |> + mutate(first.level_annotation = case_when( + # assign normal/cancer based on condition + compartment %in% c("fetal_nephron", "stroma") & has_cnv_score == "no CNV" ~ "normal", + compartment %in% c("endothelium", "immune") & cell_type.score > params$predicted.celltype.threshold ~ "normal", + compartment %in% c("fetal_nephron", "stroma") & has_cnv_score == "CNV" ~ "cancer", + .default = "unknown" + )) +``` + +Using this basic strategy, we identified `r table(cell_type_df$first.level_annotation)["cancer"]` _cancer_ cells, `r table(cell_type_df$first.level_annotation)["normal"]` _normal_ cells. +Only `r table(cell_type_df$first.level_annotation)["unknown"]`cells remain _unknown_ + + + +```{r fig.width=20, fig.height=20, out.width='100%', results='asis'} +ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = first.level_annotation), shape = 19, size = 1) + + geom_point() + + facet_wrap(facets = ~sample_id, ncol = 5) + + theme_bw() + + theme(text = element_text(size = 22)) +``` + + +### Second level annotation + +#### Normal cells + +- Normal cells from the `fetal nephron` compartment must be normal kidney cells. + +- Normal cells from the `stroma` compartment must be normal stroma cells. + +- Immune and endothelial cells have been already identified by label transfer. + +#### Cancer cells + +Wilms tumor cancer cells can be: + +- _cancer stroma_: We define as _cancer stroma_ all cancer cells from the stroma compartment. + + +- _blastema_,: we defined as _bastema_ every cancer cell that has a `fetal_kidney_predicted.cell_type == mesenchymal cell`. +We know that these _mesenchymal_ cells are cells from the cap mesenchyme that are not expected to be in a mature kidney. +These blastema cells should express higher _CITED1_. + +- _cancer epithelium_: we defined as _cancer epithelium_ all cancer cells that are neither stroma nor blastemal cells. +We expect these cells to express epithelial markers. +Their predicted cell type should correspond to more mature kidney epithelial subunits. + +```{r fig.width=10, fig.height=10, out.width='100%', results='asis'} +cell_type_df <- cell_type_df |> + mutate(second.level_annotation = case_when( + # assign normal cells based on condition + cell_type_df$compartment %in% c("fetal_nephron") & + cell_type_df$has_cnv_score == "no CNV" ~ "kidney", + cell_type_df$compartment %in% c("stroma") & + cell_type_df$has_cnv_score == "no CNV" ~ "normal stroma", + cell_type_df$compartment %in% c("endothelium") & + (cell_type_df$compartment.score > params$predicted.celltype.threshold | + cell_type_df$has_cnv_score == "no CNV") ~ "endothelium", + cell_type_df$compartment %in% c("immune") & + (cell_type_df$compartment.score > params$predicted.celltype.threshold | + cell_type_df$has_cnv_score == "no CNV") ~ "immune", + + # assign cancer cells based on condition + cell_type_df$compartment %in% c("stroma") & + cell_type_df$has_cnv_score == "CNV" ~ "cancer stroma", + cell_type_df$compartment %in% c("fetal_nephron") & + cell_type_df$has_cnv_score == "CNV" & + cell_type_df$cell_type == "mesenchymal cell" ~ "blastema", + cell_type_df$compartment %in% c("fetal_nephron") & + cell_type_df$has_cnv_score == "CNV" & + cell_type_df$cell_type != "mesenchymal cell" ~ "cancer epithelium", + .default = "unknown" + )) +``` + + +```{r fig.width=20, fig.height=20, out.width='100%', results='asis'} +ggplot(cell_type_df[cell_type_df$first.level_annotation == "normal", ], aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation), shape = 19, size = 1) + + geom_point() + + facet_wrap(facets = ~sample_id, ncol = 5) + + theme_bw() + + theme(text = element_text(size = 22)) +``` + + + + + + +```{r fig.width=20, fig.height=20, out.width='100%', results='asis'} +ggplot(cell_type_df[cell_type_df$first.level_annotation == "cancer", ], aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation), shape = 19, size = 0.1) + + geom_point() + + facet_wrap(facets = ~sample_id, ncol = 5) + + theme_bw() + + theme(text = element_text(size = 22)) +``` + +#### Cancer and normal cells + +```{r fig.width=20, fig.height=20, out.width='100%', results='asis'} +ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation), shape = 19, size = 1) + + geom_point() + + facet_wrap(facets = ~sample_id, ncol = 5) + + theme_bw() + + theme(text = element_text(size = 22)) +``` + +### Validation cancer versus normal based on the cnv profile + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +for (i in 1:22) { + print(do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = glue::glue("proportion_cnv_chr", i))) +} +``` + +### Validation of second level annotation using marker genes + +#### Immune, _PTPRC_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000081237") +``` + +#### Endothelium, _VWF_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000110799") +``` + +#### Stroma, _Vimentin_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000026025") +``` + +#### Stroma, _COL6A3_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000163359") +``` + + +#### Stroma, _THY1_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000154096") +``` + +#### Blastema, _CITED1_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000125931") +``` + + +#### Blastema, _NCAM1_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000149294") +``` + +#### stemness marker (blastema and primitive epithelium), _SIX2_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000170577") +``` + +#### Epithelium, _CDH1_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000039068") +``` + + +#### Epithelium, _PODXL_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000128567") +``` + + +## Create annotation table for export + +This section creates the cell type annotation table for export. + +```{r} +annotations_table <- cell_type_df |> + select( + cell_barcode = barcode, + scpca_sample_id = sample_id, + tumor_cell_classification = first.level_annotation, + cell_type_assignment = second.level_annotation + ) |> + mutate( + # change cancer --> tumor, but keep the other labels + tumor_cell_classification = ifelse( + tumor_cell_classification == "cancer", "tumor", tumor_cell_classification + ), + cell_type_assignment = str_replace_all( + cell_type_assignment, + "cancer ", + "tumor " + ) + ) + +write_tsv(annotations_table, annotations_tsv) +``` + + +Confirm how many samples we have annotations for: +```{r} +length(unique(annotations_table$scpca_sample_id)) +``` + +## Conclusion + +- Combining label transfer and CNV inference we have produced draft annotations for all 40 Wilms tumor samples in SCPCP000006 + +- The heatmaps of cnv proportion and marker genes support our annotations, but signals with some marker genes are very low. +Also, there is no universal marker for each entity of Wilms tumor that cover all tumor cells from all patient. +This makes the validation of the annotations quite difficult. + +- However, we could try to take the problem from the other side, and used the current annotation to perform differential expression analysis and try to find marker genes that are consistent across patient and Wilms tumor histologies. + +- In each histology (i.e. epithelial and stroma), the distinction between cancer and non cancer cell is difficult (as expected). +In this analysis, we suggested to rely on the cnv score to assess the normality of the cell. +Here again, we could try to run differential expression analysis and compare epithelial (resp. stroma) cancer versus non-cancer cells across patient, aiming to find a share transcriptional program allowing the classification cancer versus normal. + +- In our annotation, we haven't taken into account the favorable/anaplastic status of the sample. +However, as anaplasia can occur in every (but do not has to) wilms tumor histology, I am not sure how to integrate the information into the annotation. + +- This notebook could be finally rendered using different parameters, i.e. threshold for the cnv score and predicted score to use. + +## Session Info + +```{r session info} +# record the versions of the packages used in this analysis and other environment information +sessionInfo() +``` + + diff --git a/analyses/cell-type-wilms-tumor-06/notebook/07_combined_annotation_across_samples_exploration.html b/analyses/cell-type-wilms-tumor-06/notebook/07_combined_annotation_across_samples_exploration.html new file mode 100644 index 000000000..cac8877f3 --- /dev/null +++ b/analyses/cell-type-wilms-tumor-06/notebook/07_combined_annotation_across_samples_exploration.html @@ -0,0 +1,3642 @@ + + + + + + + + + + + + + + + +Combined annotation exploration for SCPCP000006 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+
+ +
+ + + + + + + +
+

Introduction

+

The aim is to combine label transfer and CNV inference to annotate +Wilms tumor samples in SCPCP000006. The proposed annotation will be +based on the combination of:

+
    +
  • the label transfer from the fetal kidney reference (Stewart et +al.), in particular the fetal_kidney_predicted.compartment +and fetal_kidney:predicted.cell_type, as well as the +prediction.score for each compartment,

  • +
  • the predicted CNV calculated using intra-sample endothelial and +immune cells (--reference both) as normal +reference

  • +
+

In a second time, we will explore and validate the chosen +annotation.

+

We will use some of the markers +genes to validate visually the annotations.

+

The analysis can be summarized as the following:

+

Where cnv.thr and pred.thr need to be +discussed

+ +++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
first level annotationsecond level annotationselection of the cellsmarker genes for validationcnv validation
normalendothelialcompartment == “endothelium” & predicted.score > pred.thr +& cnv_score < cnv.thrWVFno cnv
normalimmunecompartment == “immune” & predicted.score > pred.thr & +cnv_score < cnv.thrPTPRC, CD163, CD68no cnv
normalkidneycell_type %in% c(“kidney cell”, “kidney epithelial”, “podocyte”) +& predicted.score > pred.thr & cnv_score < cnv.thrCDH1, PODXL, LTLno cnv
normalstromacompartment == “stroma” & predicted.score > pred.thr & +cnv_score < cnv.thrVIMno cnv
cancerstromacompartment == “stroma” & cnv_score > cnv.thrVIMproportion_cnv_chr -1 -4 -11 -16 -17 -18
cancerblastemacompartment == “fetal_nephron” & cell_type == “mesenchymal cell” +& cnv_score > cnv.thrCITED1proportion_cnv_chr -1 -4 -11 -16 -17 -18
cancerepithelialcompartment == “fetal_nephron” & cell_type != “mesenchymal cell” +& cnv_score > cnv.thrCDH1proportion_cnv_chr -1 -4 -11 -16 -17 -18
unknown-the rest of the cells-proportion_cnv_chr -1 -4 -11 -16 -17 -18
+
+

Packages

+
library(Seurat)
+library(tidyverse)
+library(patchwork)
+library(DT)
+
+
+

Base directories

+
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
+repository_base <- rprojroot::find_root(rprojroot::is_git_root)
+
+# The current data directory, found within the repository base directory
+data_dir <- file.path(repository_base, "data", "current", "SCPCP000006")
+
+# The path to this module
+module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")
+
+result_dir <- file.path(module_base, "results")
+
+
+

Output files

+
# cell type annotation table
+annotations_tsv <- file.path(result_dir, "SCPCP000006-annotations.tsv")
+
+
+

Input files

+

In this notebook, we are working with all of the samples in +SCPCP000006.

+

The sample metadata can be found in sample_metadata_file +in the data folder.

+

We extracted from the pre-processed and labeled Seurat +object from +results/{sample_id}/06_infercnv_HMM-i3_SCPCS000169_reference-both.rds. +the following information per cell:

+
    +
  • the predicted compartment in +fetal_kidney_predicted.compartment and +fetal_kidney_predicted.compartment.score

  • +
  • the predicted compartment in +fetal_kidney_predicted.cell_type and +fetal_kidney_predicted.cell_type.score

  • +
  • the sample identifier in sample_id

  • +
  • the proportion of estimated CNV per chromosome, {i} in 1 to 22 +proportion_cnv_chr{i}

  • +
  • the countsof markers +genes

  • +
+
sample_metadata_file <- file.path(repository_base, "data", "current", "SCPCP000006", "single_cell_metadata.tsv")
+
+metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE)
+sample_ids <- metadata |>
+  dplyr::filter(seq_unit != "spot") |>
+  dplyr::pull(scpca_sample_id) |>
+  unique()
+
+# Create a data frames of all annotations
+cell_type_df <- sample_ids |>
+  purrr::map(
+    # For each sample_id, do the following:
+    \(sample_id) {
+      # This sample was run with "none" for reference
+      if (sample_id == "SCPCS000190") {
+        reference <- "none"
+      } else {
+        reference <- "both"
+      }
+      input_file <- file.path(
+        result_dir,
+        sample_id,
+        glue::glue("06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds")
+      )
+
+      # Read in the Seurat object
+      srat <- readRDS(input_file)
+
+      # Create and return a data frame from the Seurat object with relevant annotations
+      # this data frame will have four columns: barcode, sample_id, compartment, organ
+      data.frame(
+        # label transfer from the fetal kidney reference
+        cell_type = srat$fetal_kidney_predicted.cell_type,
+        compartment = srat$fetal_kidney_predicted.compartment,
+
+        # predicted.scores from the label transfer from the fetal kidney reference
+        cell_type.score = srat$fetal_kidney_predicted.cell_type.score,
+        compartment.score = srat$fetal_kidney_predicted.compartment.score,
+
+        # cell embedding
+        umap = srat@reductions$umap@cell.embeddings,
+
+        # marker genes
+        PTPRC = FetchData(object = srat, vars = "ENSG00000081237", layer = "counts"),
+        VWF = FetchData(object = srat, vars = "ENSG00000110799", layer = "counts"),
+        VIM = FetchData(object = srat, vars = "ENSG00000026025", layer = "counts"),
+        CITED1 = FetchData(object = srat, vars = "ENSG00000125931", layer = "counts"),
+        CDH1 = FetchData(object = srat, vars = "ENSG00000039068", layer = "counts"),
+        PODXL = FetchData(object = srat, vars = "ENSG00000128567", layer = "counts"),
+        COL6A3 = FetchData(object = srat, vars = "ENSG00000163359", layer = "counts"),
+        SIX2 = FetchData(object = srat, vars = "ENSG00000170577", layer = "counts"),
+        NCAM1 = FetchData(object = srat, vars = "ENSG00000149294", layer = "counts"),
+        THY1 = FetchData(object = srat, vars = "ENSG00000154096", layer = "counts"),
+
+        # proportion of cnv per chromosome
+        proportion_cnv_chr1 = srat$proportion_cnv_chr1,
+        proportion_cnv_chr2 = srat$proportion_cnv_chr2,
+        proportion_cnv_chr3 = srat$proportion_cnv_chr3,
+        proportion_cnv_chr4 = srat$proportion_cnv_chr4,
+        proportion_cnv_chr5 = srat$proportion_cnv_chr5,
+        proportion_cnv_chr6 = srat$proportion_cnv_chr6,
+        proportion_cnv_chr7 = srat$proportion_cnv_chr7,
+        proportion_cnv_chr8 = srat$proportion_cnv_chr8,
+        proportion_cnv_chr9 = srat$proportion_cnv_chr9,
+        proportion_cnv_chr10 = srat$proportion_cnv_chr10,
+        proportion_cnv_chr11 = srat$proportion_cnv_chr11,
+        proportion_cnv_chr12 = srat$proportion_cnv_chr12,
+        proportion_cnv_chr13 = srat$proportion_cnv_chr13,
+        proportion_cnv_chr14 = srat$proportion_cnv_chr14,
+        proportion_cnv_chr15 = srat$proportion_cnv_chr15,
+        proportion_cnv_chr16 = srat$proportion_cnv_chr16,
+        proportion_cnv_chr17 = srat$proportion_cnv_chr17,
+        proportion_cnv_chr18 = srat$proportion_cnv_chr18,
+        proportion_cnv_chr19 = srat$proportion_cnv_chr19,
+        proportion_cnv_chr20 = srat$proportion_cnv_chr20,
+        proportion_cnv_chr21 = srat$proportion_cnv_chr21,
+        proportion_cnv_chr22 = srat$proportion_cnv_chr22,
+
+        # cnv global estimation per chromosome
+        has_cnv_chr1 = srat$has_cnv_chr1,
+        has_cnv_chr2 = srat$has_cnv_chr2,
+        has_cnv_chr3 = srat$has_cnv_chr3,
+        has_cnv_chr4 = srat$has_cnv_chr4,
+        has_cnv_chr5 = srat$has_cnv_chr5,
+        has_cnv_chr6 = srat$has_cnv_chr6,
+        has_cnv_chr7 = srat$has_cnv_chr7,
+        has_cnv_chr8 = srat$has_cnv_chr8,
+        has_cnv_chr9 = srat$has_cnv_chr9,
+        has_cnv_chr10 = srat$has_cnv_chr10,
+        has_cnv_chr11 = srat$has_cnv_chr11,
+        has_cnv_chr12 = srat$has_cnv_chr12,
+        has_cnv_chr13 = srat$has_cnv_chr13,
+        has_cnv_chr14 = srat$has_cnv_chr14,
+        has_cnv_chr15 = srat$has_cnv_chr15,
+        has_cnv_chr16 = srat$has_cnv_chr16,
+        has_cnv_chr17 = srat$has_cnv_chr17,
+        has_cnv_chr18 = srat$has_cnv_chr18,
+        has_cnv_chr19 = srat$has_cnv_chr19,
+        has_cnv_chr20 = srat$has_cnv_chr20,
+        has_cnv_chr21 = srat$has_cnv_chr21,
+        has_cnv_chr22 = srat$has_cnv_chr22
+      ) |>
+        tibble::rownames_to_column("barcode") |>
+        dplyr::mutate(sample_id = sample_id)
+    }
+  ) |>
+  # now combine all dataframes to make one big one
+  dplyr::bind_rows()
+
+
+

Output file

+

The report will be saved in the notebook directory.

+
+
+
+

Functions

+
+

do_Feature_mean

+

do_Feature_mean shows heatmap of mean expression of a +feature grouped by a metadata.

+
    +
  • df is the name of the table containing metadata and +feature expression (counts) per cells
  • +
  • group.by is the name of the metadata to group the +cells
  • +
  • feature is the name of the gene to average the +expression
  • +
+
do_Feature_mean <- function(df, group.by, feature) {
+  df <- df %>%
+    group_by(sample_id, !!sym(group.by)) %>%
+    summarise(m = mean(!!sym(feature)))
+
+
+  p <- ggplot(df, aes(x = sample_id, y = !!sym(group.by), fill = m)) +
+    geom_tile() +
+    scale_fill_viridis_c() +
+    theme_bw() +
+    theme(text = element_text(size = 20)) +
+    theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) +
+    guides(fill = guide_colourbar(title = paste0(feature)))
+
+  return(p)
+}
+
+
+
+

Analysis

+
+

Global cnv score

+

As done in 06_cnv_infercnv_exploration.Rmd, we calculate +single CNV score and assess its potential in identifying cells with CNV +versus normal cells without CNV.

+

We simply checked for each chromosome if the cell +has_cnv_chr. Would the cell have more than +cnv_threshold chromosome with CNV, the global +has_cnv_score will be TRUE. Else, the cell will have a +has_cnv_score set to FALSE.

+
cell_type_df <- cell_type_df |>
+  mutate(has_cnv_score = rowSums(cell_type_df[, grepl("has_cnv_chr", colnames(cell_type_df))])) |>
+  mutate(has_cnv_score = case_when(
+    has_cnv_score > params$cnv_threshold ~ "CNV",
+    has_cnv_score <= params$cnv_threshold ~ "no CNV"
+  ))
+
+
+table(cell_type_df$has_cnv_score)
+

CNV no CNV 190793 9545

+
+
+

First level annotation

+

At first, we like to indicate in the +first.level_annotation if a cell is normal, cancer or +unknown.

+
    +
  • normal cells can be observe in all four compartments +(endothelium, immune, stroma or +fetal nephron) and do not have cnv. We only allow a bit of +flexibility in terms of cnv profile for immune and endothelium cells +that have a high predicted score. Indeed, we know that false positive +cnv can be observed in a cell type specific manner.
  • +
+

The threshold used for the predicted.score is defined as +a parameter of this notebook as 0.85. The threshold used for the +identification of cnv is also defined in the params of the notebook as +0.

+
    +
  • cancer cells are either from the stroma or +fetal nephron compartments and must have at least few +cnv.
  • +
+
# Define normal cells
+# We first pick up the immune and endothelial cells annotated via the label transfer compartments under the condition that the predicted score is above the threshold
+cell_type_df <- cell_type_df |>
+  mutate(first.level_annotation = case_when(
+    # assign normal/cancer based on condition
+    compartment %in% c("fetal_nephron", "stroma") & has_cnv_score == "no CNV" ~ "normal",
+    compartment %in% c("endothelium", "immune") & cell_type.score > params$predicted.celltype.threshold ~ "normal",
+    compartment %in% c("fetal_nephron", "stroma") & has_cnv_score == "CNV" ~ "cancer",
+    .default = "unknown"
+  ))
+

Using this basic strategy, we identified 190303 cancer +cells, 8088 normal cells. Only 1947cells remain +unknown

+
ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = first.level_annotation), shape = 19, size = 1) +
+  geom_point() +
+  facet_wrap(facets = ~sample_id, ncol = 5) +
+  theme_bw() +
+  theme(text = element_text(size = 22))
+

+
+
+

Second level annotation

+
+

Normal cells

+
    +
  • Normal cells from the fetal nephron compartment must +be normal kidney cells.

  • +
  • Normal cells from the stroma compartment must be +normal stroma cells.

  • +
  • Immune and endothelial cells have been already identified by +label transfer.

  • +
+
+
+

Cancer cells

+

Wilms tumor cancer cells can be:

+
    +
  • cancer stroma: We define as cancer stroma all +cancer cells from the stroma compartment.

  • +
  • blastema,: we defined as bastema every cancer +cell that has a +fetal_kidney_predicted.cell_type == mesenchymal cell. We +know that these mesenchymal cells are cells from the cap +mesenchyme that are not expected to be in a mature kidney. These +blastema cells should express higher CITED1.

  • +
  • cancer epithelium: we defined as cancer +epithelium all cancer cells that are neither stroma nor blastemal +cells. We expect these cells to express epithelial markers. Their +predicted cell type should correspond to more mature kidney epithelial +subunits.

  • +
+
cell_type_df <- cell_type_df |>
+  mutate(second.level_annotation = case_when(
+    # assign normal cells based on condition
+    cell_type_df$compartment %in% c("fetal_nephron") &
+      cell_type_df$has_cnv_score == "no CNV" ~ "kidney",
+    cell_type_df$compartment %in% c("stroma") &
+      cell_type_df$has_cnv_score == "no CNV" ~ "normal stroma",
+    cell_type_df$compartment %in% c("endothelium") &
+      (cell_type_df$compartment.score > params$predicted.celltype.threshold |
+        cell_type_df$has_cnv_score == "no CNV") ~ "endothelium",
+    cell_type_df$compartment %in% c("immune") &
+      (cell_type_df$compartment.score > params$predicted.celltype.threshold |
+        cell_type_df$has_cnv_score == "no CNV") ~ "immune",
+
+    # assign cancer cells based on condition
+    cell_type_df$compartment %in% c("stroma") &
+      cell_type_df$has_cnv_score == "CNV" ~ "cancer stroma",
+    cell_type_df$compartment %in% c("fetal_nephron") &
+      cell_type_df$has_cnv_score == "CNV" &
+      cell_type_df$cell_type == "mesenchymal cell" ~ "blastema",
+    cell_type_df$compartment %in% c("fetal_nephron") &
+      cell_type_df$has_cnv_score == "CNV" &
+      cell_type_df$cell_type != "mesenchymal cell" ~ "cancer epithelium",
+    .default = "unknown"
+  ))
+
ggplot(cell_type_df[cell_type_df$first.level_annotation == "normal", ], aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation), shape = 19, size = 1) +
+  geom_point() +
+  facet_wrap(facets = ~sample_id, ncol = 5) +
+  theme_bw() +
+  theme(text = element_text(size = 22))
+

+
ggplot(cell_type_df[cell_type_df$first.level_annotation == "cancer", ], aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation), shape = 19, size = 0.1) +
+  geom_point() +
+  facet_wrap(facets = ~sample_id, ncol = 5) +
+  theme_bw() +
+  theme(text = element_text(size = 22))
+

+
+
+

Cancer and normal cells

+
ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation), shape = 19, size = 1) +
+  geom_point() +
+  facet_wrap(facets = ~sample_id, ncol = 5) +
+  theme_bw() +
+  theme(text = element_text(size = 22))
+

+
+
+
+

Validation cancer versus normal based on the cnv profile

+
for (i in 1:22) {
+  print(do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = glue::glue("proportion_cnv_chr", i)))
+}
+

+
+
+

Validation of second level annotation using marker genes

+
+

Immune, PTPRC expression

+
do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000081237")
+

+
+
+

Endothelium, VWF expression

+
do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000110799")
+

+
+
+

Stroma, Vimentin expression

+
do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000026025")
+

+
+
+

Stroma, COL6A3 expression

+
do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000163359")
+

+
+
+

Stroma, THY1 expression

+
do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000154096")
+

+
+
+

Blastema, CITED1 expression

+
do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000125931")
+

+
+
+

Blastema, NCAM1 expression

+
do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000149294")
+

+
+
+

stemness marker (blastema and primitive epithelium), SIX2 +expression

+
do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000170577")
+

+
+
+

Epithelium, CDH1 expression

+
do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000039068")
+

+
+
+

Epithelium, PODXL expression

+
do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000128567")
+

+
+
+
+
+

Create annotation table for export

+

This section creates the cell type annotation table for export.

+
annotations_table <- cell_type_df |>
+  select(
+    cell_barcode = barcode,
+    scpca_sample_id = sample_id,
+    tumor_cell_classification = first.level_annotation,
+    cell_type_assignment = second.level_annotation
+  ) |>
+  mutate(
+    # change cancer --> tumor, but keep the other labels
+    tumor_cell_classification = ifelse(
+      tumor_cell_classification == "cancer", "tumor", tumor_cell_classification
+    ),
+    cell_type_assignment = str_replace_all(
+      cell_type_assignment,
+      "cancer ",
+      "tumor "
+    )
+  )
+
+write_tsv(annotations_table, annotations_tsv)
+

Confirm how many samples we have annotations for:

+
length(unique(annotations_table$scpca_sample_id))
+
## [1] 40
+
+
+

Conclusion

+
    +
  • Combining label transfer and CNV inference we have produced draft +annotations for all 40 Wilms tumor samples in SCPCP000006

  • +
  • The heatmaps of cnv proportion and marker genes support our +annotations, but signals with some marker genes are very low. Also, +there is no universal marker for each entity of Wilms tumor that cover +all tumor cells from all patient. This makes the validation of the +annotations quite difficult.

  • +
  • However, we could try to take the problem from the other side, +and used the current annotation to perform differential expression +analysis and try to find marker genes that are consistent across patient +and Wilms tumor histologies.

  • +
  • In each histology (i.e. epithelial and stroma), the distinction +between cancer and non cancer cell is difficult (as expected). In this +analysis, we suggested to rely on the cnv score to assess the normality +of the cell. Here again, we could try to run differential expression +analysis and compare epithelial (resp. stroma) cancer versus non-cancer +cells across patient, aiming to find a share transcriptional program +allowing the classification cancer versus normal.

  • +
  • In our annotation, we haven’t taken into account the +favorable/anaplastic status of the sample. However, as anaplasia can +occur in every (but do not has to) wilms tumor histology, I am not sure +how to integrate the information into the annotation.

  • +
  • This notebook could be finally rendered using different +parameters, i.e. threshold for the cnv score and predicted score to +use.

  • +
+
+
+

Session Info

+
# record the versions of the packages used in this analysis and other environment information
+sessionInfo()
+
## R version 4.4.1 (2024-06-14)
+## Platform: aarch64-apple-darwin20
+## Running under: macOS 15.1
+## 
+## Matrix products: default
+## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
+## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
+## 
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+## 
+## time zone: America/New_York
+## tzcode source: internal
+## 
+## attached base packages:
+## [1] stats     graphics  grDevices utils     datasets  methods   base     
+## 
+## other attached packages:
+##  [1] DT_0.33            patchwork_1.2.0    lubridate_1.9.3    forcats_1.0.0     
+##  [5] stringr_1.5.1      dplyr_1.1.4        purrr_1.0.2        readr_2.1.5       
+##  [9] tidyr_1.3.1        tibble_3.2.1       ggplot2_3.5.1      tidyverse_2.0.0   
+## [13] Seurat_5.1.0       SeuratObject_5.0.2 sp_2.1-4          
+## 
+## loaded via a namespace (and not attached):
+##   [1] RColorBrewer_1.1-3     rstudioapi_0.16.0      jsonlite_1.8.8        
+##   [4] magrittr_2.0.3         spatstat.utils_3.1-0   farver_2.1.2          
+##   [7] rmarkdown_2.28         vctrs_0.6.5            ROCR_1.0-11           
+##  [10] spatstat.explore_3.3-2 htmltools_0.5.8.1      sass_0.4.9            
+##  [13] sctransform_0.4.1      parallelly_1.38.0      KernSmooth_2.23-24    
+##  [16] bslib_0.8.0            htmlwidgets_1.6.4      ica_1.0-3             
+##  [19] plyr_1.8.9             plotly_4.10.4          zoo_1.8-12            
+##  [22] cachem_1.1.0           igraph_2.0.3           mime_0.12             
+##  [25] lifecycle_1.0.4        pkgconfig_2.0.3        Matrix_1.7-0          
+##  [28] R6_2.5.1               fastmap_1.2.0          fitdistrplus_1.2-1    
+##  [31] future_1.34.0          shiny_1.9.1            digest_0.6.37         
+##  [34] colorspace_2.1-1       rprojroot_2.0.4        tensor_1.5            
+##  [37] RSpectra_0.16-2        irlba_2.3.5.1          labeling_0.4.3        
+##  [40] progressr_0.14.0       fansi_1.0.6            spatstat.sparse_3.1-0 
+##  [43] timechange_0.3.0       httr_1.4.7             polyclip_1.10-7       
+##  [46] abind_1.4-5            compiler_4.4.1         bit64_4.0.5           
+##  [49] withr_3.0.1            fastDummies_1.7.4      highr_0.11            
+##  [52] MASS_7.3-61            tools_4.4.1            lmtest_0.9-40         
+##  [55] httpuv_1.6.15          future.apply_1.11.2    goftest_1.2-3         
+##  [58] glue_1.7.0             nlme_3.1-166           promises_1.3.0        
+##  [61] grid_4.4.1             Rtsne_0.17             cluster_2.1.6         
+##  [64] reshape2_1.4.4         generics_0.1.3         gtable_0.3.5          
+##  [67] spatstat.data_3.1-2    tzdb_0.4.0             data.table_1.16.0     
+##  [70] hms_1.1.3              utf8_1.2.4             spatstat.geom_3.3-2   
+##  [73] RcppAnnoy_0.0.22       ggrepel_0.9.5          RANN_2.6.2            
+##  [76] pillar_1.9.0           vroom_1.6.5            spam_2.10-0           
+##  [79] RcppHNSW_0.6.0         later_1.3.2            splines_4.4.1         
+##  [82] lattice_0.22-6         bit_4.0.5              survival_3.7-0        
+##  [85] deldir_2.0-4           tidyselect_1.2.1       miniUI_0.1.1.1        
+##  [88] pbapply_1.7-2          knitr_1.48             gridExtra_2.3         
+##  [91] scattermore_1.2        xfun_0.47              matrixStats_1.3.0     
+##  [94] stringi_1.8.4          lazyeval_0.2.2         yaml_2.3.10           
+##  [97] evaluate_0.24.0        codetools_0.2-20       cli_3.6.3             
+## [100] uwot_0.2.2             xtable_1.8-4           reticulate_1.38.0     
+## [103] munsell_0.5.1          jquerylib_0.1.4        Rcpp_1.0.13           
+## [106] globals_0.16.3         spatstat.random_3.3-1  png_0.1-8             
+## [109] spatstat.univar_3.0-0  parallel_4.4.1         dotCall64_1.1-1       
+## [112] listenv_0.9.1          viridisLite_0.4.2      scales_1.3.0          
+## [115] ggridges_0.5.6         crayon_1.5.3           leiden_0.4.3.1        
+## [118] rlang_1.1.4            cowplot_1.1.3
+
+ + + +
+
+ +
+ + + + + + + + + + + + + + + + + diff --git a/analyses/cell-type-wilms-tumor-06/notebook/README.md b/analyses/cell-type-wilms-tumor-06/notebook/README.md index ee4b03d58..28c4442f9 100644 --- a/analyses/cell-type-wilms-tumor-06/notebook/README.md +++ b/analyses/cell-type-wilms-tumor-06/notebook/README.md @@ -1,29 +1,29 @@ # Notebook directory instructions -The notebook directory holds subdirectory for each of the sample in the Wilms tumor dataset SCPCP000006 and the fetal kidney reference that we used for label transfer. +The notebook directory holds subdirectory for each of the sample in the Wilms tumor dataset SCPCP000006 and the fetal kidney reference that we used for label transfer. ## Azimuth compatible fetal kidney reference To perform label transfer using Azimuth and the fetal kidney atlas, a reference is built via [`scripts/download-and-create-fetal-kidney-ref.R`](../scripts/download-and-create-fetal-kidney-ref.R) using the fetal_full.Rds object download from: "https://datasets.cellxgene.cziscience.com/40ebb8e4-1a25-4a33-b8ff-02d1156e4e9b.rds" -As part of the `00b_characterize_fetal_kidney_reference_Stewart.Rmd` notebook template, we characterized the fetal kidney reference and generated lists of marker genes for the compartment and cell types composing the reference. +As part of the `00b_characterize_fetal_kidney_reference_Stewart.Rmd` notebook template, we characterized the fetal kidney reference and generated lists of marker genes for the compartment and cell types composing the reference. ## Analysis per sample -Each of the sample of the Wilms tumor dataset SCPCP000006 as been pre-processed and characterized as the following. +Each of the sample of the Wilms tumor dataset SCPCP000006 as been pre-processed and characterized as the following. Reports for each of the steps are found in the notebook/{sample_id} directory: - [x] `01_seurat_processing_{sample-id}.html` is the output of the [`01_seurat-processing.Rmd`](../notebook_template/01_seurat-processing.Rmd) notebook template. In brief, the `_processed.rds` `sce object` is converted to `Seurat` and normalized using `SCTransform`. -Dimensionality reduction (`RunPCA` and `RunUMAP`) and clustering (`FindNeighbors` and `FindClusters`) are performed before saving the `Seurat` object. +Dimensionality reduction (`RunPCA` and `RunUMAP`) and clustering (`FindNeighbors` and `FindClusters`) are performed before saving the `Seurat` object. - [x] `02a_fetal_full_label-transfer_{sample-id}.html` is the output of the [`02a_label-transfer_fetal_full_reference_Cao.Rmd`](../notebook_template/02a_label-transfer_fetal_full_reference_Cao.Rmd) notebook template. -In brief, we used `Azimuth` to transfer labels from the `Azimuth` fetal full reference (Cao et al.) +In brief, we used `Azimuth` to transfer labels from the `Azimuth` fetal full reference (Cao et al.) - [x] `02b_fetal_kidney_label-transfer_{sample-id}.html` is the output of the [`02b_label-transfer_fetal_kidney_reference_Stewart.Rmd`](../notebook_template/02b_label-transfer_fetal_kidney_reference_Stewart.Rmd) notebook template. -In brief, we used `Azimuth` to transfer labels from the fetal kidney reference (Stewart et al.) +In brief, we used `Azimuth` to transfer labels from the fetal kidney reference (Stewart et al.) - [x] `03_clustering_exploration_{sample-id}.html` is the output of the [`03_clustering_exploration.Rmd`](../notebook_template/03_clustering_exploration.Rmd) notebook template. In brief, we explore the clustering results, we look into some marker genes, pathways enrichment and label transfer. @@ -33,10 +33,10 @@ In brief, we explore the clustering results, we look into some marker genes, pat The next step in analysis is to identify tumor vs. normal cells. -- [x] `04_annotation_Across_Samples_exploration.html` is the output of the [`04_annotation_Across_Samples_exploration.Rmd`](../notebook/04_annotation_Across_Samples_exploration.Rmd) notebook. +- [x] `04_annotation_Across_Samples_exploration.html` is the output of the [`04_annotation_Across_Samples_exploration.Rmd`](../notebook/04_annotation_Across_Samples_exploration.Rmd) notebook. In brief, we explored the label transfer results across all samples in the Wilms tumor dataset SCPCP000006 in order to identify a few samples that we can begin next analysis steps with. -One way to evaluate the label transfer is to look at the `predicted.score` for each label being transfered, which more or less correspond to the certainty for a label transfer to be _TRUE_. More informations on the cell-level metric `predicted.score` can be found in the [mapping QC](https://azimuth.hubmapconsortium.org/#Mapping%20QC) section of `Azimuth` documentation. +One way to evaluate the label transfer is to look at the `predicted.score` for each label being transfered, which more or less correspond to the certainty for a label transfer to be _TRUE_. More informations on the cell-level metric `predicted.score` can be found in the [mapping QC](https://azimuth.hubmapconsortium.org/#Mapping%20QC) section of `Azimuth` documentation. We render the notebook with different thresholds for the `predicted.score` and evaluate the impact of filtering out cells with a `predicted.score` below 0.5, 0.75, 0.85 and 0.95. @@ -46,10 +46,13 @@ Of important notes: - We would rather use the `predicted.score` threshold to select normal cells for which we have a high confidency, i.e. immune and endothelial cells, but not to filter out all cells below the threshold. -- While a `predicted.score` of 0.5 is much too low (almost all cells having a higher `predicted.score`) and 0.95 is too high (so few cells pass the threshold), 0.75 and 0.85 looked both appropriate for our purpose. +- While a `predicted.score` of 0.5 is much too low (almost all cells having a higher `predicted.score`) and 0.95 is too high (so few cells pass the threshold), 0.75 and 0.85 looked both appropriate for our purpose. --> We decided to go with the most stringent threshold of 0.85 as we want to be sure of our selection of normal cells (i.e. endothelial and immune cells) that we will use to run `infercnv`. +- [x] `07_combined_annotation_across_samples_exploration.html` is the output of the [`07_combined_annotation_across_samples_exploration.Rmd`](../notebook/07_combined_annotation_across_samples_exploration.Rmd) notebook. +This notebook performs a draft annotation of samples using information from CNV inference and label transfer. + ## Exploratory analysis We selected in [`04_annotation_Across_Samples_exploration.Rmd`](../notebook/04_annotation_Across_Samples_exploration.Rmd) 5 samples to test for aneuploidy and CNV inference: @@ -61,12 +64,12 @@ We selected in [`04_annotation_Across_Samples_exploration.Rmd`](../notebook/04_a - [x] `05_copykat_exploration_{sample_id}.html` is the output of the [`05_copykat_exploration.Rmd`](../notebook_template/05_copykat_exploration.Rmd) notebook template. -In brief, we wanted to test `copykat` results obtained with or without normal cells as reference, using either an euclidean or statistical (spearman) method for CNV heatmap clustering. +In brief, we wanted to test `copykat` results obtained with or without normal cells as reference, using either an euclidean or statistical (spearman) method for CNV heatmap clustering. This impact the final decision made by `copykat` for each cell to be either aneuploid or diploid, and it is thus crucial to explore the results using the different methods. For each of the selected samples, we explore the results in the template `notebook` [`05_copykat_exploration.Rmd`](../notebook_template/05_copykat_exploration.Rmd), which creates a notebook `05_cnv_copykat_exploration_{sample_id}.html` for each sample. These `notebooks` are inspired by the plots written for the Ewing Sarcoma analysis in [`03-copykat.Rmd`](https://github.com/AlexsLemonade/OpenScPCA-analysis/blob/main/analyses/cell-type-ewings/exploratory_analysis/03-copykat.Rmd). - [x] `06_infercnv_exploration_{sample_id}.html` is the output of the [`06_infercnv_exploration.Rmd`](../notebook_template/06_infercnv_exploration.Rmd) notebook template. -In brief, we wanted to test `infercnv` results obtained with or without endothelium and/or immune cells as reference. -We also explore the potential of using a HMM model to assign CNV scores for each cells and discriminate normal from cancer cells. +In brief, we wanted to test `infercnv` results obtained with or without endothelium and/or immune cells as reference. +We also explore the potential of using a HMM model to assign CNV scores for each cells and discriminate normal from cancer cells. diff --git a/analyses/cell-type-wilms-tumor-06/notebook_template/06_cnv_infercnv_exploration.Rmd b/analyses/cell-type-wilms-tumor-06/notebook_template/06_cnv_infercnv_exploration.Rmd index 56dd86f98..bf97851a8 100644 --- a/analyses/cell-type-wilms-tumor-06/notebook_template/06_cnv_infercnv_exploration.Rmd +++ b/analyses/cell-type-wilms-tumor-06/notebook_template/06_cnv_infercnv_exploration.Rmd @@ -16,12 +16,11 @@ output: --- ```{r setup, include=FALSE} - -knitr::opts_chunk$set(echo = TRUE, - message=FALSE, - warnings=FALSE - ) - +knitr::opts_chunk$set( + echo = TRUE, + message = FALSE, + warnings = FALSE +) ``` @@ -29,14 +28,12 @@ knitr::opts_chunk$set(echo = TRUE, ```{r} - subdiagnosis <- readr::read_tsv( file.path("..", "..", "..", "data", "current", params$scpca_project_id, "single_cell_metadata.tsv"), show_col_types = FALSE - ) |> +) |> dplyr::filter(scpca_sample_id == params$sample_id) |> dplyr::pull(subdiagnosis) - ``` This notebook explores using [`infercnv`](https://github.com/broadinstitute/inferCNV/wiki) to estimate tumor and normal cells in `r params$sample_id` from SCPCP000006. @@ -67,7 +64,6 @@ In this notebook, we want to compare the heatmaps of CNV profiles, and evaluate ```{r} - library(infercnv) library(SCpubr) library(ggplot2) @@ -93,23 +89,21 @@ For a Seurat object `object`, the function `Do_CNV_heatmap` load the `infercnv` - `reference_value` indicates the reference used for `infercnv`, either endothelial, immune, both or pool. ```{r} - -Do_CNV_heatmap <- function(object, infercnv_obj, group.by, reference_value){ - - out <- SCpubr::do_CopyNumberVariantPlot(sample = object, - infercnv_object = infercnv_obj, - using_metacells = FALSE, - chromosome_locations = SCpubr::human_chr_locations, - return_object = FALSE, - group.by = group.by - ) - out <- out + +Do_CNV_heatmap <- function(object, infercnv_obj, group.by, reference_value) { + out <- SCpubr::do_CopyNumberVariantPlot( + sample = object, + infercnv_object = infercnv_obj, + using_metacells = FALSE, + chromosome_locations = SCpubr::human_chr_locations, + return_object = FALSE, + group.by = group.by + ) + out <- out + ggtitle(glue::glue("Copy Number Variant Plot, ", reference_value)) + ylab(label = "") - + return(out) } - ``` #### Calculate a global CNV score per cell to check the general distribution @@ -123,11 +117,11 @@ The function `Do_CNV_score` returns the `Seurat` object with an additional metad ```{r} +Do_CNV_score <- function(seurat_oject, infercnv_obj, reference_value) { + score <- apply(infercnv_obj@expr.data, 2, function(x) { + sum(x < 0.95 | x > 1.05) / length(x) + }) -Do_CNV_score <- function(seurat_oject,infercnv_obj,reference_value){ - - score <- apply(infercnv_obj@expr.data,2,function(x){ sum(x < 0.95 | x > 1.05)/length(x) }) - seurat_obj <- AddMetaData(seurat_oject, score, col.name = glue::glue("CNV-score_", reference_value)) return(seurat_obj) } @@ -145,41 +139,38 @@ For a Seurat object `object`and a metadata `metadata`, the function `visualize_m - `group.by` is the metadata used for grouping the violin plots ```{r } -visualize_metadata <- function(object, meta, group.by){ - - if(is.numeric(object@meta.data[,meta])){ - d <- SCpubr::do_FeaturePlot(object, - features = meta, - pt.size = 0.2, - legend.width = 0.5, - legend.length = 5, - legend.position = "right") + ggtitle(meta) - b <- SCpubr::do_ViolinPlot(object, - features = meta, - ncol = 1, - group.by = group.by, - legend.position = "none") - - return(d + b + plot_layout(ncol = 2, widths = c(2,4))) +visualize_metadata <- function(object, meta, group.by) { + if (is.numeric(object@meta.data[, meta])) { + d <- SCpubr::do_FeaturePlot(object, + features = meta, + pt.size = 0.2, + legend.width = 0.5, + legend.length = 5, + legend.position = "right" + ) + ggtitle(meta) + b <- SCpubr::do_ViolinPlot(object, + features = meta, + ncol = 1, + group.by = group.by, + legend.position = "none" + ) + + return(d + b + plot_layout(ncol = 2, widths = c(2, 4))) + } else { + d <- SCpubr::do_DimPlot(object, reduction = "umap", group.by = group.by, label = TRUE, repel = TRUE) + ggtitle(paste0(meta, " - umap")) + theme(text = element_text(size = 18)) + b <- SCpubr::do_BarPlot( + sample = object, + group.by = meta, + split.by = group.by, + position = "fill", + font.size = 10, + legend.ncol = 3 + ) + + ggtitle("% cells") + + xlab(print(group.by)) + + theme(text = element_text(size = 18)) + return(d + b + plot_layout(ncol = 2, widths = c(2, 4))) } - - - else{ - - - d <- SCpubr::do_DimPlot(object, reduction="umap", group.by = group.by, label = TRUE, repel = TRUE) + ggtitle(paste0(meta," - umap")) + theme(text=element_text(size=18)) - b <- SCpubr::do_BarPlot(sample = object, - group.by = meta, - split.by = group.by, - position = "fill", - font.size = 10, - legend.ncol = 3) + - ggtitle("% cells")+ - xlab(print(group.by)) + - theme(text=element_text(size=18)) - return(d + b + plot_layout(ncol = 2, widths = c(2,4))) - } - } ``` @@ -196,22 +187,23 @@ For a Seurat object `object`and a features `features`, the function `visualize_f - `group.by` is the metadata used for grouping the violin plots ```{r } -visualize_feature <- function(object, feature, group.by ){ - - d <- SCpubr::do_FeaturePlot(object, - features = feature, - pt.size = 0.2, - legend.width = 0.5, - legend.length = 5, - legend.position = "right") + ggtitle(as.character(feature)) - b <- SCpubr::do_ViolinPlot(object, - features = feature, - ncol = 1, - group.by = group.by, - legend.position = "none", - assay = "SCT") + ylab(as.character(feature)) - - return(d + b + plot_layout(ncol = 2, widths = c(2,4))) +visualize_feature <- function(object, feature, group.by) { + d <- SCpubr::do_FeaturePlot(object, + features = feature, + pt.size = 0.2, + legend.width = 0.5, + legend.length = 5, + legend.position = "right" + ) + ggtitle(as.character(feature)) + b <- SCpubr::do_ViolinPlot(object, + features = feature, + ncol = 1, + group.by = group.by, + legend.position = "none", + assay = "SCT" + ) + ylab(as.character(feature)) + + return(d + b + plot_layout(ncol = 2, widths = c(2, 4))) } ``` @@ -227,18 +219,16 @@ For a Seurat object `object`and a features `features`, the function `visualize_f - `group.by` is the metadata used for grouping the violin plots ```{r } -visualize_density <- function(object, features, group.by ){ - - d <- SCpubr::do_RidgePlot(object, - feature = features, - group.by = group.by, - legend.position = "none", - assay = "SCT") + ylab(features) - - return(d) +visualize_density <- function(object, features, group.by) { + d <- SCpubr::do_RidgePlot(object, + feature = features, + group.by = group.by, + legend.position = "none", + assay = "SCT" + ) + ylab(features) + + return(d) } - - ``` #### Wrapper function to explore `infercnv` HMM cnv prediction @@ -278,43 +268,40 @@ Else, the cell will have a `has_cnv_score` set to FALSE. ```{r} - -wrapper_explore_hmm <- function(infercnv_obj, cnv_threshold = 1){ - +wrapper_explore_hmm <- function(infercnv_obj, cnv_threshold = 1) { p <- list() #### Plot UMAP of known Wilms tumor related alteration p[["compartment"]] <- visualize_metadata(infercnv_obj, "fetal_kidney_predicted.compartment", "seurat_clusters") - + #### Feature plot and repartition of the CNV per chromosome - for(i in 1:22){ - tmp1 <- visualize_feature(infercnv_obj, feature=glue::glue("proportion_cnv_chr", i), group.by = "fetal_kidney_predicted.compartment") - tmp2 <- visualize_density(infercnv_obj, features=glue::glue("proportion_cnv_chr", i), group.by = "fetal_kidney_predicted.compartment") - p[[glue::glue("proportion_cnv_chr", i)]] <- tmp1 + tmp2 + plot_layout(ncol = 3, widths = c(1,2,2)) + for (i in 1:22) { + tmp1 <- visualize_feature(infercnv_obj, feature = glue::glue("proportion_cnv_chr", i), group.by = "fetal_kidney_predicted.compartment") + tmp2 <- visualize_density(infercnv_obj, features = glue::glue("proportion_cnv_chr", i), group.by = "fetal_kidney_predicted.compartment") + p[[glue::glue("proportion_cnv_chr", i)]] <- tmp1 + tmp2 + plot_layout(ncol = 3, widths = c(1, 2, 2)) } - + #### DotPlot - p[["Dotplot"]] <- DotPlot(infercnv_obj, features = colnames(infercnv_obj@meta.data)[grepl("has_cnv_chr", colnames(infercnv_obj@meta.data))], group.by = "fetal_kidney_predicted.compartment", assay = "SCT") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - + p[["Dotplot"]] <- DotPlot(infercnv_obj, features = colnames(infercnv_obj@meta.data)[grepl("has_cnv_chr", colnames(infercnv_obj@meta.data))], group.by = "fetal_kidney_predicted.compartment", assay = "SCT") + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + #### CNV score - meta <- infercnv_obj@meta.data - meta$has_cnv_score <- rowSums(meta[,grepl("has_cnv_chr", colnames(meta))]) - meta$has_cnv_score[meta$has_cnv_score > cnv_threshold] <- "CNV" + meta <- infercnv_obj@meta.data |> + mutate(has_cnv_score = rowSums(meta[, grepl("has_cnv_chr", colnames(meta))])) |> + mutate(has_cnv_score = case_when( + has_cnv_score > cnv_threshold ~ "CNV", + has_cnv_score <= cnv_threshold ~ "no CNV" + )) - # we know that some immune cells might have false positive CNV - meta$has_cnv_score[meta$has_cnv_score %in% c(0:cnv_threshold)] <- "no CNV" - infercnv_obj <- AddMetaData(infercnv_obj, meta$has_cnv_score, col.name = "has_cnv_score") + + infercnv_obj <- AddMetaData(infercnv_obj, meta$has_cnv_score, col.name = "has_cnv_score") p[["infercnv_obj"]] <- infercnv_obj p[["cnv_seurat"]] <- visualize_metadata(infercnv_obj, "has_cnv_score", "seurat_clusters") p[["cnv_compartment"]] <- visualize_metadata(infercnv_obj, "has_cnv_score", "fetal_kidney_predicted.compartment") -return(p) - - + return(p) } - ``` ### Base directories @@ -333,9 +320,7 @@ module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06" The input for this notebook are the results of `06_inferCNV.R` ```{r} - result_dir <- file.path(module_base, "results", params$sample_id, "06_infercnv") - ``` #### `Seurat` object @@ -343,9 +328,7 @@ result_dir <- file.path(module_base, "results", params$sample_id, "06_infercnv") We load the `Seurat` object generated in `06_infercnv.R` ```{r} - srat <- readRDS(file.path(module_base, "results", params$sample_id, glue::glue("02b-fetal_kidney_label-transfer_", params$sample_id, ".Rds"))) - ``` ## Analysis @@ -381,11 +364,10 @@ We then used the `infercnv object` to look at mean CNV value across compartments ```{r fig.width=20, fig.height=5, out.width='100%', out.height='100%'} infercnv_obj <- list() -for(reference_value in c("reference-none", "reference-immune", "reference-endothelium", "reference-both")){ - infercnv_obj[[reference_value]] <- readRDS(file.path(result_dir, glue::glue(reference_value, "_HMM-no") , glue::glue("06_infercnv_", params$sample_id, "_", reference_value, "_HMM-no.rds"))) +for (reference_value in c("reference-none", "reference-immune", "reference-endothelium", "reference-both")) { + infercnv_obj[[reference_value]] <- readRDS(file.path(result_dir, glue::glue(reference_value, "_HMM-no"), glue::glue("06_infercnv_", params$sample_id, "_", reference_value, "_HMM-no.rds"))) print(Do_CNV_heatmap(object = srat, infercnv_obj = infercnv_obj[[reference_value]], group.by = "fetal_kidney_predicted.compartment", reference_value = reference_value)) } - ``` These heatmaps emphasize the importance of the selection of normal cells prior the inference of CNV. The normal reference should contain as much cell types as possible, in order to minimize false positive CNV. @@ -409,14 +391,12 @@ We would expect: Ideally, the CNV score distribution should show two peaks, one for the normal cells with no CNV, one for cancer cells with more genomic rearrangements. ```{r fig.width=20, fig.height=5, out.width='100%', out.height='100%', warning=FALSE} - -for(reference_value in c("reference-none", "reference-immune", "reference-endothelium", "reference-both")){ - srat <- Do_CNV_score(srat,infercnv_obj = infercnv_obj[[reference_value]], reference_value) - p1 <- visualize_feature(srat, feature = glue::glue("CNV-score_", reference_value) , group.by = "fetal_kidney_predicted.compartment" ) - p2 <- visualize_density(srat, features=glue::glue("CNV-score_", reference_value), group.by = "fetal_kidney_predicted.compartment") - print(p1 + p2 + plot_layout(ncol = 3, widths = c(1,2,2))) +for (reference_value in c("reference-none", "reference-immune", "reference-endothelium", "reference-both")) { + srat <- Do_CNV_score(srat, infercnv_obj = infercnv_obj[[reference_value]], reference_value) + p1 <- visualize_feature(srat, feature = glue::glue("CNV-score_", reference_value), group.by = "fetal_kidney_predicted.compartment") + p2 <- visualize_density(srat, features = glue::glue("CNV-score_", reference_value), group.by = "fetal_kidney_predicted.compartment") + print(p1 + p2 + plot_layout(ncol = 3, widths = c(1, 2, 2))) } - ``` This unique CNV score does not look promising. @@ -433,11 +413,8 @@ We then explore infercnv results generated with immune and endothelial cells as We load the `Seurat` object generated in `06_infercnv.R` ```{r} - #### Load the `Seurat` object srat_i3 <- readRDS(file.path(module_base, "results", params$sample_id, glue::glue("06_infercnv_HMM-i3_", params$sample_id, "_reference-both.rds"))) - - ``` and explore the CNV results using the `wrapper_explore_hmm` fucntion. @@ -457,9 +434,7 @@ We then explore `infercnv` results generated with immune and endothelial cells a We load the `Seurat` object generated in `06_infercnv.R` ```{r} - srat_i6 <- readRDS(file.path(module_base, "results", params$sample_id, glue::glue("06_infercnv_HMM-i6_", params$sample_id, "_reference-both.rds"))) - ``` @@ -492,10 +467,10 @@ seurat_full <- readRDS(file.path(module_base, "results", params$sample_id, glue: keep_cells <- colnames(srat) seurat <- subset(seurat_full, cells = keep_cells) -meta <- seurat@meta.data[colnames(srat),] +meta <- seurat@meta.data[colnames(srat), ] # merge the new info into the srat metadata -srat_pull <- AddMetaData(srat, meta[,grepl("_cnv_chr", colnames(meta))], col.name = colnames(meta[grepl("_cnv_chr", colnames(meta))])) +srat_pull <- AddMetaData(srat, meta[, grepl("_cnv_chr", colnames(meta))], col.name = colnames(meta[grepl("_cnv_chr", colnames(meta))])) ``` @@ -518,7 +493,6 @@ We compare here the binary CNV scores calculated with the three HMM prediction m ```{r fig.width=20, fig.height=8, out.width='100%'} - p1 <- SCpubr::do_DimPlot(pull$infercnv_obj, "has_cnv_score", plot.title = "inter-patient reference, HMM-i3", reduction = "umap") p2 <- SCpubr::do_DimPlot(p$infercnv_obj, "has_cnv_score", plot.title = "intra-patient reference, HMM-i3", reduction = "umap") @@ -526,7 +500,6 @@ p2 <- SCpubr::do_DimPlot(p$infercnv_obj, "has_cnv_score", plot.title = "intra-pa p3 <- SCpubr::do_DimPlot(q$infercnv_obj, "has_cnv_score", plot.title = "intra-patient reference, HMM-i6", reduction = "umap") p1 + p2 + p3 + plot_layout(ncol = 3) - ``` diff --git a/analyses/cell-type-wilms-tumor-06/results/README.md b/analyses/cell-type-wilms-tumor-06/results/README.md index fb6943603..153343d80 100644 --- a/analyses/cell-type-wilms-tumor-06/results/README.md +++ b/analyses/cell-type-wilms-tumor-06/results/README.md @@ -12,35 +12,35 @@ To perform label transfer using Azimuth and the fetal kidney atlas, a reference The output can be found in the S3 bucket name `researcher-008971640512-us-east-2` named `references/ref.Rds` and `references/idx.annoy`. -As part of the `00b_characterize_fetal_kidney_reference_Stewart.Rmd` notebook template, we characterized the fetal kidney reference and generated lists of marker genes for the compartment and cell types composing the reference. +As part of the `00b_characterize_fetal_kidney_reference_Stewart.Rmd` notebook template, we characterized the fetal kidney reference and generated lists of marker genes for the compartment and cell types composing the reference. The tables of marker genes can be found in the S3 bucket name `researcher-008971640512-us-east-2`: - `references/00a_marker_cell-type_fetal_kidney_Stewart.csv` -- `references/00a_marker_compartment_fetal_kidney_Stewart.csv` - +- `references/00a_marker_compartment_fetal_kidney_Stewart.csv` + # Clustering and label transfer from fetal references -The S3 bucket name `researcher-008971640512-us-east-2` contains one folder for each of the samples of the Wilms tumor dataset SCPCP000006. +The S3 bucket name `researcher-008971640512-us-east-2` contains one folder for each of the samples of the Wilms tumor dataset SCPCP000006. - `01-Seurat_{sample-id}.Rds` is the output of the [`01_seurat-processing.Rmd`](../notebook_template/01_seurat-processing.Rmd) notebook template. In brief, the `_processed.rds` `sce object` is converted to `Seurat` and normalized using `SCTransform`. -Dimensionality reduction (`RunPCA` and `RunUMAP`) and clustering (`FindNeighbors` and `FindClusters`) are performed before saving the `Seurat` object. +Dimensionality reduction (`RunPCA` and `RunUMAP`) and clustering (`FindNeighbors` and `FindClusters`) are performed before saving the `Seurat` object. - `02a-fetal_full_label-transfer_{sample-id}.Rds` is the output of the [`02a_label-transfer_fetal_full_reference_Cao.Rmd`](../notebook_template/02a_label-transfer_fetal_full_reference_Cao.Rmd) notebook template. -In brief, we used `Azimuth` to transfer labels from the `Azimuth` fetal full reference (Cao et al.) +In brief, we used `Azimuth` to transfer labels from the `Azimuth` fetal full reference (Cao et al.) - `02b-fetal_kidney_label-transfer_{sample-id}.Rds` is the output of the [`02b_label-transfer_fetal_kidney_reference_Stewart.Rmd`](../notebook_template/02b_label-transfer_fetal_kidney_reference_Stewart.Rmd) notebook template. -In brief, we used `Azimuth` to transfer labels from the fetal kidney reference (Stewart et al.) +In brief, we used `Azimuth` to transfer labels from the fetal kidney reference (Stewart et al.) -Of note, as `01-Seurat_{sample-id}.Rds` is the input of `02a_label-transfer_fetal_full_reference_Cao.Rmd` and `02a-fetal_full_label-transfer_{sample-id}.Rds` the input of `02b-fetal_kidney_label-transfer_{sample-id}.Rds`, the latest `02b-fetal_kidney_label-transfer_{sample-id}.Rds` contains all metadata generated by the 3 notebook templates. +Of note, as `01-Seurat_{sample-id}.Rds` is the input of `02a_label-transfer_fetal_full_reference_Cao.Rmd` and `02a-fetal_full_label-transfer_{sample-id}.Rds` the input of `02b-fetal_kidney_label-transfer_{sample-id}.Rds`, the latest `02b-fetal_kidney_label-transfer_{sample-id}.Rds` contains all metadata generated by the 3 notebook templates. # Copy Number Alterations ## using `copykat` -We tried to infer aneuploidy in cancer cells using `copykat` and tested different parameters. -Mostly, we tested to run `copykat` with and without normal cells as reference and defined the clustering distance to be either `euclidean` or `spearman`. +We tried to infer aneuploidy in cancer cells using `copykat` and tested different parameters. +Mostly, we tested to run `copykat` with and without normal cells as reference and defined the clustering distance to be either `euclidean` or `spearman`. We selected previously 5 samples to test for these parameters: - sample SCPCS000194 has > 85 % of cells predicted as kidney and 234 + 83 endothelium and immune cells. @@ -49,7 +49,7 @@ We selected previously 5 samples to test for these parameters: - sample SCPCS000205 has > 89 % of cells predicted as kidney and 92 + 76 endothelium and immune cells. - sample SCPCS0000208 has > 95 % of cells predicted as kidney and 18 + 35 endothelium and immune cells. -For each of the samples, we ran `05_copyKAT.R` script, defining the `distance` parameters. +For each of the samples, we ran `05_copyKAT.R` script, defining the `distance` parameters. For each sample and each condition (reference and distance), we saved in `results/{sample_id}/05_copykat/{reference}/{distance}/`: @@ -60,7 +60,7 @@ For each sample and each condition (reference and distance), we saved in `result ## using `infercnv` -We also tried to infer large CNV in cancer cells using `infercnv` and tested the sensibility of the output in regard to the definition of the normal cells. +We also tried to infer large CNV in cancer cells using `infercnv` and tested the sensibility of the output in regard to the definition of the normal cells. We selected previously 5 samples to test for these parameters: - sample SCPCS000194 has > 85 % of cells predicted as kidney and 234 + 83 endothelium and immune cells. @@ -85,9 +85,12 @@ Of note, the final `infercnv` rds object includes the following slots: Based on the above slots, it would be straightforward to extract info of interest and/or move data into other analysis frameworks. -In addition, fot the condition `reference = "both"`, we ran `infercnv` with `HMM = TRUE`. +In addition, fot the condition `reference = "both"`, we ran `infercnv` with `HMM = TRUE`. [HMM CNV prediction methods](https://github.com/broadinstitute/infercnv/wiki/inferCNV-HMM-based-CNV-Prediction-Methods) will allow us to explore the CNV results better, with an easy [merge](https://github.com/broadinstitute/infercnv/wiki/Extracting-features) of `infercnv` result with the `Seurat` object. -However, HMM CNV prediction methods uses a lot of resources, including time (~2h/sample/condition), and often lead to RSession end. -This is why we only ran the HMM model for one `reference` condition. After selection of the best reference to use, we will run it for all samples. +However, HMM CNV prediction methods uses a lot of resources, including time (~2h/sample/condition), and often lead to RSession end. +This is why we only ran the HMM model for one `reference` condition. After selection of the best reference to use, we will run it for all samples. + +## Annotations +Annotations in `SCPCP000006-annotations.tsv` were produced by `notebook/07_combined_annotation_across_samples_exploration.Rmd`. diff --git a/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R b/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R index 0d74b3666..24d58b76b 100644 --- a/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R +++ b/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R @@ -9,6 +9,7 @@ # --ncore 16 # + library(optparse) library(Seurat) library(copykat) @@ -30,25 +31,33 @@ option_list <- list( type = "integer", default = 16, help = "number of cores used to run copyKAT" - ) - , + ), make_option( opt_str = c("-d", "--distance"), type = "character", default = "euclidean", help = "method used to calculate distance in copyKAT" - ) - , + ), make_option( - opt_str = c("-r", "--use_reference"), - type = "character", - default = "ref", - help = "either to run copyKAT with or without reference normal cells" + opt_str = c("-r", "--use_reference"), + type = "character", + default = "ref", + help = "either to run copyKAT with or without reference normal cells" + ), + make_option( + opt_str = "--seed", + type = "integer", + default = 12345, + help = "random seed to set" ) ) opts <- parse_args(OptionParser(option_list = option_list)) +# Note that the version of copyKAT used here overrides random seeds, so while we set one, it isn't used: +# https://github.com/navinlabcode/copykat/blob/d7d6569ae9e30bf774908301af312f626de4cbd5/R/copykat.R#L33 +set.seed(opts$seed) + # paths to data ---------------------------------------------------------------- # The base path for the OpenScPCA repository, found by its (hidden) .git directory @@ -60,31 +69,31 @@ result_dir <- file.path(module_base, "results", opts$sample_id) # Create directories to save the results of copykat with/without reference using opts$distance -dir.create(file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance), recursive = TRUE) +dir.create(file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance), recursive = TRUE) # define scratch directory for tempory saving the output of copykat scratch_dir <- file.path(module_base, "scratch", opts$sample_id) dir.create(scratch_dir, recursive = TRUE) # path for copykat rds output -name_file <- glue::glue("05_copykat_",opts$sample_id,"_",opts$use_reference,"_distance-", opts$distance, ".rds") -name_full <- file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance, name_file) +name_file <- glue::glue("05_copykat_", opts$sample_id, "_", opts$use_reference, "_distance-", opts$distance, ".rds") +name_full <- file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance, name_file) # path to scratch and final heatmap file to copy over -jpeg_file <- glue::glue(opts$sample_id,"_copykat_heatmap.jpeg") +jpeg_file <- glue::glue(opts$sample_id, "_copykat_heatmap.jpeg") scratch_jpeg <- file.path(scratch_dir, jpeg_file) -output_jpeg_ref <- file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance, glue::glue("05_copykat_",opts$sample_id,"_", opts$use_reference,"_distance-", opts$distance, "_copykat_heatmap.png")) +output_jpeg_ref <- file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance, glue::glue("05_copykat_", opts$sample_id, "_", opts$use_reference, "_distance-", opts$distance, "_copykat_heatmap.png")) # path to scratch and final .txt prediction file to copy over -prediction_file <- glue::glue(opts$sample_id,"_copykat_prediction.txt") +prediction_file <- glue::glue(opts$sample_id, "_copykat_prediction.txt") scratch_prediction <- file.path(scratch_dir, prediction_file) -output_prediction_ref <- file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance, glue::glue("05_copykat_",opts$sample_id,"_",opts$use_reference ,"_distance-", opts$distance, "_copykat_prediction.txt")) +output_prediction_ref <- file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance, glue::glue("05_copykat_", opts$sample_id, "_", opts$use_reference, "_distance-", opts$distance, "_copykat_prediction.txt")) # path to scratch and final .txt CNA file to copy over -CNA_file <- glue::glue(opts$sample_id,"_copykat_CNA_results.txt") +CNA_file <- glue::glue(opts$sample_id, "_copykat_CNA_results.txt") scratch_CNA <- file.path(scratch_dir, CNA_file) -output_CNA_ref <- file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance, glue::glue("05_copykat_",opts$sample_id,"_", opts$use_reference, "_distance-", opts$distance, "_copykat_CNA_results.txt")) +output_CNA_ref <- file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance, glue::glue("05_copykat_", opts$sample_id, "_", opts$use_reference, "_distance-", opts$distance, "_copykat_CNA_results.txt")) # change working directory of the script to the scratch directory @@ -94,7 +103,7 @@ setwd(scratch_dir) # Read in data ----------------------------------------------------------------- srat <- readRDS( - file.path(result_dir, paste0("02b-fetal_kidney_label-transfer_", opts$sample_id, ".Rds")) + file.path(result_dir, paste0("02b-fetal_kidney_label-transfer_", opts$sample_id, ".Rds")) ) # Extract raw counts ----------------------------------------------------------- @@ -106,16 +115,18 @@ normal_cell <- WhichCells(object = srat, expression = fetal_kidney_predicted.com # Run copyKAT without reference ------------------------------------------------ -copykat.ref <- copykat(rawmat=exp.rawdata, - sam.name=opts$sample_id, - distance=opts$distance, - norm.cell.names=ifelse(opts$use_reference == "ref", normal_cell, ""), - genome="hg20", - n.cores= opts$n_core, - id.type = "E", - plot.genes = FALSE, - output.seg = FALSE, - KS.cut = 0.05) +copykat.ref <- copykat( + rawmat = exp.rawdata, + sam.name = opts$sample_id, + distance = opts$distance, + norm.cell.names = ifelse(opts$use_reference == "ref", normal_cell, ""), + genome = "hg20", + n.cores = opts$n_core, + id.type = "E", + plot.genes = FALSE, + output.seg = FALSE, + KS.cut = 0.05 +) # Save copykat output reference ---------------------------------------- @@ -126,5 +137,3 @@ writePNG(img, target = output_jpeg_ref) fs::file_move(scratch_prediction, output_prediction_ref, overwrite = TRUE) fs::file_move(scratch_CNA, output_CNA_ref, overwrite = TRUE) - - diff --git a/analyses/cell-type-wilms-tumor-06/scripts/06_infercnv.R b/analyses/cell-type-wilms-tumor-06/scripts/06_infercnv.R index f48b0a0b4..7e5f67732 100644 --- a/analyses/cell-type-wilms-tumor-06/scripts/06_infercnv.R +++ b/analyses/cell-type-wilms-tumor-06/scripts/06_infercnv.R @@ -38,11 +38,18 @@ option_list <- list( type = "character", default = "i3", help = "If running an additional HMM model to call CNV, either no, or i3 or i6" + ), + make_option( + opt_str = c("--seed"), + type = "character", + default = "12345", + help = "Random seed to set" ) ) opts <- parse_args(OptionParser(option_list = option_list)) +set.seed(opts$seed) # paths to data ---------------------------------------------------------------- # The base path for the OpenScPCA repository, found by its (hidden) .git directory @@ -55,27 +62,31 @@ module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06" result_dir <- file.path(module_base, "results", opts$sample_id) # path to the pull of normal cells (inter-patient) -srat_normal_file <- file.path(module_base, "results","references", '06b_normal-cell-reference.rds') +srat_normal_file <- file.path(module_base, "results", "references", "06b_normal-cell-reference.rds") # path to output infercnv object -output_dir <- file.path(result_dir, "06_infercnv", glue::glue("reference-",opts$reference, "_HMM-", opts$HMM )) -output_rds <- file.path(output_dir, glue::glue("06_infercnv_",opts$sample_id,"_reference-", opts$reference, "_HMM-", opts$HMM, ".rds")) +output_dir <- file.path(result_dir, "06_infercnv", glue::glue("reference-", opts$reference, "_HMM-", opts$HMM)) +output_rds <- file.path(output_dir, glue::glue("06_infercnv_", opts$sample_id, "_reference-", opts$reference, "_HMM-", opts$HMM, ".rds")) # path to heatmap png png_file <- glue::glue("infercnv.png") scratch_png <- file.path(output_dir, png_file) -output_png <- file.path(output_dir, glue::glue("06_infercnv_",opts$sample_id,"_reference-", opts$reference, "_HMM-", opts$HMM, "_heatmap.png")) +output_png <- file.path(output_dir, glue::glue("06_infercnv_", opts$sample_id, "_reference-", opts$reference, "_HMM-", opts$HMM, "_heatmap.png")) # path to updated seurat object -output_srat <- file.path(result_dir, glue::glue("06_infercnv_", "HMM-", opts$HMM, "_", opts$sample_id, "_reference-", opts$reference, ".rds")) +output_srat <- file.path(result_dir, glue::glue("06_infercnv_", "HMM-", opts$HMM, "_", opts$sample_id, "_reference-", opts$reference, ".rds")) # Define functions ------------------------------------------------------------- # read_infercnv_mat will read outputs saved automatically by of infercnv in file_path read_infercnv_mat <- function(file_path) { - obs_table <- readr::read_delim(file = file_path, delim = ' ', quote = '"', skip = 1, - col_names = FALSE, progress = FALSE, show_col_types = FALSE) + obs_table <- readr::read_delim( + file = file_path, delim = " ", quote = '"', skip = 1, + col_names = FALSE, progress = FALSE, show_col_types = FALSE + ) mat <- t(as.matrix(obs_table[, -1])) - cell_names <- readr::read_delim(file = file_path, delim = ' ', quote = '"', n_max = 1, - col_names = FALSE, progress = FALSE, show_col_types = FALSE) + cell_names <- readr::read_delim( + file = file_path, delim = " ", quote = '"', n_max = 1, + col_names = FALSE, progress = FALSE, show_col_types = FALSE + ) cell_names <- as.character(cell_names[1, ]) rownames(mat) <- cell_names colnames(mat) <- as.character(dplyr::pull(obs_table, 1)) @@ -85,28 +96,38 @@ read_infercnv_mat <- function(file_path) { # Read in data ----------------------------------------------------------------- srat <- readRDS( - file.path(result_dir, paste0("02b-fetal_kidney_label-transfer_", opts$sample_id, ".Rds")) + file.path(result_dir, paste0("02b-fetal_kidney_label-transfer_", opts$sample_id, ".Rds")) ) - stopifnot("Incorrect reference provided" = opts$reference %in% c("none", "immune", "endothelium", "both", "pull")) -if(opts$reference == "both"){ - normal_cells <- c("endothelium", "immune") -} else if(opts$reference == "none"){ + +if (opts$reference %in% c("both", "endothelium", "immune")) { + if (opts$reference == "both") { + normal_cells <- c("endothelium", "immune") + + # keep only the labels actually present in the annotations to avoid infercnv error + normal_cells <- normal_cells[normal_cells %in% unique(srat@meta.data$fetal_kidney_predicted.compartment)] + } else { + normal_cells <- opts$reference + } + # the total count of normal cells needs to be greater than 1 + total_normal <- sum(srat@meta.data$fetal_kidney_predicted.compartment %in% normal_cells) + stopifnot("There must be more than 1 normal cell to use a reference." = length(normal_cells) > 1) +} else if (opts$reference == "none") { normal_cells <- NULL -} else if(opts$reference == "pull"){ +} else if (opts$reference == "pull") { srat_normal <- readRDS(srat_normal_file) # we merge the spike-in cells into the `Seurat` object srat <- merge(srat, srat_normal) - srat <- JoinLayers(srat) # else GetAssayData won't work + srat <- JoinLayers(srat) # else GetAssayData won't work # we rename the `fetal_kidney_predicted.compartment` for these cells as "spike" to_rename <- srat@meta.data$fetal_kidney_predicted.compartment names(to_rename) <- rownames(srat@meta.data) to_rename[grepl("spike", names(to_rename))] <- "spike" srat@meta.data$fetal_kidney_predicted.compartment <- to_rename normal_cells <- "spike" -} else{ +} else { normal_cells <- opts$reference } @@ -119,10 +140,10 @@ rownames(annot_df) <- colnames(counts) # We only run the CNV HMM prediction model if the reference is "both" -HMM_logical = TRUE +HMM_logical <- TRUE HMM_type <- opts$HMM -if(opts$HMM == "no"){ +if (opts$HMM == "no") { HMM_logical <- FALSE HMM_type <- NULL } @@ -135,35 +156,42 @@ gene_order_file <- file.path(module_base, "results", "references", "gencode_v19_ # Run infercnv ------------------------------------------------------------------ # create inferCNV object and run method -options(future.globals.maxSize= 89128960000000) +options(future.globals.maxSize = 89128960000000) +options(scipen = 100) # recommended for infercnv analysis_mode = "subclusters" + infercnv_obj <- infercnv::CreateInfercnvObject( raw_counts_matrix = as.matrix(counts), annotations_file = annot_df, ref_group_names = normal_cells, - gene_order_file = gene_order_file) + gene_order_file = gene_order_file, + # ensure all cells are included + min_max_counts_per_cell = c(-Inf, +Inf) +) infercnv_obj <- infercnv::run( infercnv_obj, - cutoff=0.1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics - out_dir=output_dir, - analysis_mode='subclusters', - cluster_by_groups=T, - denoise=TRUE, - HMM=HMM_logical, + cutoff = 0.1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics + out_dir = output_dir, + analysis_mode = "subclusters", + cluster_by_groups = T, + denoise = TRUE, + HMM = HMM_logical, HMM_type = HMM_type, save_rds = TRUE, save_final_rds = TRUE ) -if(HMM_logical){ -# Add `infercnv` data to the `Seurat` object -srat = infercnv::add_to_seurat(infercnv_output_path=output_dir, - seurat_obj=srat, - top_n=10) +if (HMM_logical) { + # Add `infercnv` data to the `Seurat` object + srat <- infercnv::add_to_seurat( + infercnv_output_path = output_dir, + seurat_obj = srat, + top_n = 10 + ) -# save `Seurat` object -saveRDS(srat, output_srat) + # save `Seurat` object + saveRDS(srat, output_srat) } @@ -177,4 +205,3 @@ files.in.dir <- list.files(output_dir, full.names = T) files.to.keep <- c(output_png, output_rds) files.to.remove <- list(files.in.dir[!(files.in.dir %in% files.to.keep)]) do.call(unlink, files.to.remove) - diff --git a/analyses/cell-type-wilms-tumor-14/results/README.md b/analyses/cell-type-wilms-tumor-14/results/README.md index dfd3eea95..5d6adfc76 100644 --- a/analyses/cell-type-wilms-tumor-14/results/README.md +++ b/analyses/cell-type-wilms-tumor-14/results/README.md @@ -31,4 +31,8 @@ No result files. * `results/01_anchor_transfer_seurat/SCT`: Results generated by normalization method `SCTransform`. * Anchor transfer results for all 10 samples are available on S3 `s3://researcher-009160072044-us-east-2/cell-type-wilms-tumor-14/results/01_anchor_transfer_seurat/summary_results.csv`. + * Rough tumor cell classification is inferred based on anchor transfer results: + - Cells annotated as "Cap mesenchyme" were labeled as tumor cells: CM should have gone since born [(Ref)](https://pmc.ncbi.nlm.nih.gov/articles/PMC4675508/). + - Cells annotated as "Immune" or "Endothelium" were labeled as normal cells: Wilms tumor shows a tri-phasic type, meaning blastemal, stromal, and epithelial cells can be tumor cells ([Ref1](https://pmc.ncbi.nlm.nih.gov/articles/PMC6076422/) and [Ref2](https://www.frontiersin.org/journals/oncology/articles/10.3389/fonc.2023.1137346/full)). In contrast, "Immune" or "Endothelium" cells are regarded as normal cells. + - Other cells were labeled as "unknown". diff --git a/analyses/cell-type-wilms-tumor-14/scripts/summary_results.R b/analyses/cell-type-wilms-tumor-14/scripts/summary_results.R index 36416e16e..ecfe68100 100644 --- a/analyses/cell-type-wilms-tumor-14/scripts/summary_results.R +++ b/analyses/cell-type-wilms-tumor-14/scripts/summary_results.R @@ -78,7 +78,6 @@ concat_sample <- function(anchor_assay, } - ################### Creating summary table ################### @@ -89,7 +88,29 @@ arg_df <- tidyr::expand_grid( dfs <- purrr::pmap(arg_df, \(anchor_assay, level) concat_sample(anchor_assay, level, meta) ) +# add rough tumor cell classification, CM = tumor cells, immune/endothelium = normal +normal_compartments <- c("immune", "endothelium") final_table <- dfs %>% - purrr::reduce(left_join) + purrr::reduce(left_join) %>% + mutate( tumor_cell_classification_RNA = case_when( + cell_type_assignment_RNA_compartment %in% normal_compartments ~ "normal", + cell_type_assignment_RNA_compartment == "fetal_nephron" & cell_type_assignment_RNA_celltype == "Cap mesenchyme" ~ "tumor", + .default = "unknown" + ) ) +if ("SCT" %in% assays) { + final_table <- final_table %>% + mutate( tumor_cell_classification_SCT = case_when( + cell_type_assignment_SCT_compartment %in% normal_compartments ~ "normal", + cell_type_assignment_SCT_compartment == "fetal_nephron" & cell_type_assignment_SCT_celltype == "Cap mesenchyme" ~ "tumor", + .default = "unknown" + ) ) +} +final_table <- final_table %>% + # use logNormalize as "default" final result, combine immune, endo, and stroma compartments + mutate( cell_type_assignment = if_else(cell_type_assignment_RNA_compartment != "fetal_nephron", + cell_type_assignment_RNA_compartment, + cell_type_assignment_RNA_celltype ) ) %>% + mutate( tumor_cell_classification = tumor_cell_classification_RNA ) + write.csv(final_table, file = outfile) diff --git a/analyses/hello-python/Dockerfile b/analyses/hello-python/Dockerfile index 80cb68374..792c5c906 100644 --- a/analyses/hello-python/Dockerfile +++ b/analyses/hello-python/Dockerfile @@ -1,5 +1,5 @@ # Dockerfile for the hello-python analysis -FROM condaforge/miniforge3:24.3.0-0 +FROM condaforge/miniforge3:24.9.0-0 # Labels following the Open Containers Initiative (OCI) recommendations # For more information, see https://specs.opencontainers.org/image-spec/annotations/?v=v1.0.1