diff --git a/.github/components/dictionary.txt b/.github/components/dictionary.txt index 0c1060a96..414847cbe 100644 --- a/.github/components/dictionary.txt +++ b/.github/components/dictionary.txt @@ -7,9 +7,11 @@ altExp anaplasia anaplastic aneuploid +aneuploidy AnnData anonymize anonymized +ARI astrocytes AUCell authenticator @@ -23,8 +25,10 @@ bioinformatics blastema blastemal Bleijs +Cao Carpentries CellAssign +CELLxGENE chondrocytes chr CLI @@ -44,6 +48,7 @@ Ctrl cxds demultiplexing dendogram +derangements designee discoverable DM @@ -53,6 +58,7 @@ Dockerfile docstring docstrings DOI +dotplots doxxing dropdown ECM @@ -61,7 +67,10 @@ endothelium enforceability Ensembl enure +epithelia +epithelialize erythematosus +erythroid et ewings EWS-FLI1 @@ -75,12 +84,17 @@ GHAs GitHub GitKraken GitKraken's +glial +glioblastoma Goodspeed GPUs GTF +Hafemeister Halbritter +hematopoietic heterotypic histological +histologies homotypic HPC IAM @@ -95,6 +109,8 @@ Jaccard Jitter JSON Jupyter +karyotyping +leiden LGBTQ licensor licensor's @@ -105,6 +121,7 @@ linter linter's linters lockfile +Looney Louvain LSfR macOS @@ -112,22 +129,32 @@ macrophage md merchantability mesenchymal +mesenchyme metacell metacells +metanephric +microenvironment +microRNA Miniconda Miniforge misidentification +monocyte +monocytes MSC multifactor myeloid natively Nextflow +nephroblastoma +nephron nonconsensual octicons onboarded +oncotarget openscpca OpenScPCA OpenScPCA's +overclustered Panglao PanglaoDB PDX @@ -139,7 +166,9 @@ Posit's PowerShell PR pre +preprint PRs +pseudobulk README redistribution redistributions @@ -147,6 +176,9 @@ renv repo reproducibility reproducibly +ribosomal +RNAseq +rOpenScPCA RStudio RStudio's Rtools @@ -166,7 +198,9 @@ SSO stroma stromal Stumptown +subdiagnosis sublicensable +subtypes symlink symlinked synched @@ -175,12 +209,14 @@ Tirode trainings transferrable transphobic +Treg triaged TSV UCell UMAP uncomment unhide +uteric vCPU vCPUs Visser @@ -193,3 +229,4 @@ Xcode YAML Zappia Zenodo +Zhang diff --git a/00_run_workflow.R b/00_run_workflow.R deleted file mode 100644 index b312ea351..000000000 --- a/00_run_workflow.R +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env Rscript - -# USAGE: -# Rscript 00_run_workflow.R -# -# USAGE in CI: -# Rscript 00_run_workflow.R --testing - -# Set up options ---------------------------------------------------------------- - -library(optparse) - -option_list <- list( - make_option( - opt_str = c("--testing"), - type = "logical", - default = FALSE, - action = "store_true", - help = "Use this flag when running on test data" - ) -) - -opts <- parse_args(OptionParser(option_list = option_list)) -running_ci <- opts$testing - -# Run the Label transfer from two fetal references ------------------------------ - -# get list of samples in the library -------------------------------------------- -root_dir <- rprojroot::find_root(rprojroot::is_git_root) -project_id <- "SCPCP000006" -sample_metadata_file <- file.path(root_dir, "data", "current", project_id, "single_cell_metadata.tsv") -metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE) - -# set path to this module-------------------------------------------------------- -module_base <- file.path(root_dir, "analyses", "cell-type-wilms-tumor-06") - -# Download and create the fetal kidney reference (Stewart et al) ---------- -system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "download-and-create-fetal-kidney-ref.R"))) - -# We build the gene position file reference for infercnv ------------------------ -system(command = glue::glue("Rscript ", file.path(module_base, "scripts", "06a_build-geneposition.R"))) - -# Characterize the two fetal references ----------------------------------------- - -# Characterize the fetal full reference (Cao et al.) -# To be done, next PR -notebook_template_dir <- file.path(module_base, "notebook_template") -notebook_output_dir <- file.path(module_base, "notebook") -# Characterize the fetal kidney reference (Stewart et al.) -rmarkdown::render(input = file.path(notebook_template_dir, "00b_characterize_fetal_kidney_reference_Stewart.Rmd"), - output_format = "html_document", - output_file = "00b_characterization_fetal_kidney_reference_Stewart.html", - output_dir = file.path(notebook_output_dir, "00-reference")) - - -# Run the workflow for (all) samples in the project ----------------------------- -for (sample_id in metadata$scpca_sample_id) { - - # create a directory to save the pre-processed and labeled `Seurat` objects - dir.create(file.path(module_base, "results", sample_id), showWarnings = FALSE) - # create a directory to save the notebooks - dir.create(file.path(module_base, "notebook", sample_id), showWarnings = FALSE) - - # Pre-process the data - `Seurat` workflow - rmarkdown::render(input = file.path(notebook_template_dir, "01_seurat-processing.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("01_seurat_processing_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) - - if (!running_ci) { - # Label transfer from the Cao reference using Azimuth - rmarkdown::render(input = file.path(notebook_template_dir, "02a_label-transfer_fetal_full_reference_Cao.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("02a_fetal_all_reference_Cao_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) - - # Label transfer from the Stewart reference using Seurat - rmarkdown::render(input = file.path(notebook_template_dir, "02b_label-transfer_fetal_kidney_reference_Stewart.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("02b_fetal_kidney_reference_Stewart_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) - - # Cluster exploration - rmarkdown::render(input = file.path(notebook_template_dir, "03_clustering_exploration.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("03_clustering_exploration_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) - - } -} - -if (!running_ci) { - for(thr in c(0.5, 0.75, 0.85, 0.95)){ - # Run notebook template to explore label transfer and clustering for all samples at once - rmarkdown::render(input = file.path(notebook_output_dir, "04_annotation_Across_Samples_exploration.Rmd"), - params = list(mapping_score_thr = thr), - output_format = "html_document", - output_file = glue::glue("04_annotation_Across_Samples_exploration_mappingscore_threshold_",thr,".html"), - output_dir = notebook_output_dir) - } - # Run infercnv and copykat for a selection of samples - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "explore-cnv-methods.R"))) - -} - - - - - diff --git a/analyses/cell-type-wilms-tumor-06/00_run_workflow.R b/analyses/cell-type-wilms-tumor-06/00_run_workflow.R index bba9a9fde..68b65fde7 100644 --- a/analyses/cell-type-wilms-tumor-06/00_run_workflow.R +++ b/analyses/cell-type-wilms-tumor-06/00_run_workflow.R @@ -35,7 +35,7 @@ metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE) module_base <- file.path(root_dir, "analyses", "cell-type-wilms-tumor-06") # Download and create the fetal kidney reference (Stewart et al) ---------- -system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "download-and-create-fetal-kidney-ref.R"))) +system(command = glue::glue("Rscript ", file.path(module_base, "scripts", "download-and-create-fetal-kidney-ref.R"))) # We build the gene position file reference for infercnv ------------------------ system(command = glue::glue("Rscript ", file.path(module_base, "scripts", "06a_build-geneposition.R"))) @@ -47,67 +47,99 @@ system(command = glue::glue("Rscript ", file.path(module_base, "scripts", "06a_b notebook_template_dir <- file.path(module_base, "notebook_template") notebook_output_dir <- file.path(module_base, "notebook") # Characterize the fetal kidney reference (Stewart et al.) -rmarkdown::render(input = file.path(notebook_template_dir, "00b_characterize_fetal_kidney_reference_Stewart.Rmd"), - output_format = "html_document", - output_file = "00b_characterization_fetal_kidney_reference_Stewart.html", - output_dir = file.path(notebook_output_dir, "00-reference")) +rmarkdown::render( + input = file.path(notebook_template_dir, "00b_characterize_fetal_kidney_reference_Stewart.Rmd"), + output_format = "html_document", + output_file = "00b_characterization_fetal_kidney_reference_Stewart.html", + output_dir = file.path(notebook_output_dir, "00-reference") +) +sample_ids <- metadata |> + dplyr::filter(seq_unit != "spot") |> + dplyr::pull(scpca_sample_id) |> + unique() # Run the workflow for (all) samples in the project ----------------------------- -for (sample_id in metadata$scpca_sample_id) { - +for (sample_id in sample_ids) { # create a directory to save the pre-processed and labeled `Seurat` objects dir.create(file.path(module_base, "results", sample_id), showWarnings = FALSE) # create a directory to save the notebooks dir.create(file.path(module_base, "notebook", sample_id), showWarnings = FALSE) # Pre-process the data - `Seurat` workflow - rmarkdown::render(input = file.path(notebook_template_dir, "01_seurat-processing.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("01_seurat_processing_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) + rmarkdown::render( + input = file.path(notebook_template_dir, "01_seurat-processing.Rmd"), + params = list(scpca_project_id = project_id, sample_id = sample_id), + output_format = "html_document", + output_file = paste0("01_seurat_processing_", sample_id, ".html"), + output_dir = file.path(notebook_output_dir, sample_id) + ) if (!running_ci) { # Label transfer from the Cao reference using Azimuth - rmarkdown::render(input = file.path(notebook_template_dir, "02a_label-transfer_fetal_full_reference_Cao.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("02a_fetal_all_reference_Cao_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) + rmarkdown::render( + input = file.path(notebook_template_dir, "02a_label-transfer_fetal_full_reference_Cao.Rmd"), + params = list(scpca_project_id = project_id, sample_id = sample_id), + output_format = "html_document", + output_file = paste0("02a_fetal_all_reference_Cao_", sample_id, ".html"), + output_dir = file.path(notebook_output_dir, sample_id) + ) # Label transfer from the Stewart reference using Seurat - rmarkdown::render(input = file.path(notebook_template_dir, "02b_label-transfer_fetal_kidney_reference_Stewart.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("02b_fetal_kidney_reference_Stewart_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) - - # Cluster exploration - rmarkdown::render(input = file.path(notebook_template_dir, "03_clustering_exploration.Rmd"), - params = list(scpca_project_id = project_id, sample_id = sample_id), - output_format = "html_document", - output_file = paste0("03_clustering_exploration_", sample_id, ".html"), - output_dir = file.path(notebook_output_dir, sample_id)) + rmarkdown::render( + input = file.path(notebook_template_dir, "02b_label-transfer_fetal_kidney_reference_Stewart.Rmd"), + params = list(scpca_project_id = project_id, sample_id = sample_id), + output_format = "html_document", + output_file = paste0("02b_fetal_kidney_reference_Stewart_", sample_id, ".html"), + output_dir = file.path(notebook_output_dir, sample_id) + ) + # Cluster exploration + rmarkdown::render( + input = file.path(notebook_template_dir, "03_clustering_exploration.Rmd"), + params = list(scpca_project_id = project_id, sample_id = sample_id), + output_format = "html_document", + output_file = paste0("03_clustering_exploration_", sample_id, ".html"), + output_dir = file.path(notebook_output_dir, sample_id) + ) } } if (!running_ci) { - for(thr in c(0.5, 0.75, 0.85, 0.95)){ - # Run notebook template to explore label transfer and clustering for all samples at once - rmarkdown::render(input = file.path(notebook_output_dir, "04_annotation_Across_Samples_exploration.Rmd"), - params = list(predicted.score_thr = thr), - output_format = "html_document", - output_file = glue::glue("04_annotation_Across_Samples_exploration_predicted.score_threshold_",thr,".html"), - output_dir = notebook_output_dir) + for (thr in c(0.5, 0.75, 0.85, 0.95)) { + # Run notebook template to explore label transfer and clustering for all samples at once + rmarkdown::render( + input = file.path(notebook_output_dir, "04_annotation_Across_Samples_exploration.Rmd"), + output_format = "html_document", + output_file = "04_annotation_Across_Samples_exploration.html", + output_dir = notebook_output_dir + ) } # Run infercnv and copykat for a selection of samples - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "explore-cnv-methods.R"))) - -} - - - - + system(command = glue::glue("Rscript ", file.path(module_base, "scripts", "explore-cnv-methods.R"))) + + # Run infercnv for all samples with HMM i3 and using "both" as the reference + for (sample_id in sample_ids) { + # We run SCPCS000190 with "none" instead of "both" since it does not have + # enough "normal" cells to use as a reference + if (sample_id == "SCPCS000190") { + reference <- "none" + } else { + reference <- "both" + } + + # don't repeat inference on selection of samples it's already been run on + output_file <- file.path(module_base, "results", sample_id, glue::glue("06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds")) + if (!file.exists(output_file)) { + system(command = glue::glue("Rscript {file.path(module_base, 'scripts', '06_infercnv.R')} --sample_id {sample_id} --reference {reference} --HMM i3")) + } + } + # Render notebook to make draft annotations + rmarkdown::render( + input = file.path(notebook_output_dir, "07_combined_annotation_across_samples_exploration.Rmd "), + output_format = "html_document", + output_file = "07_combined_annotation_across_samples_exploration.html ", + output_dir = notebook_output_dir + ) +} diff --git a/analyses/cell-type-wilms-tumor-06/README.md b/analyses/cell-type-wilms-tumor-06/README.md index f205cc81b..18f966a1c 100644 --- a/analyses/cell-type-wilms-tumor-06/README.md +++ b/analyses/cell-type-wilms-tumor-06/README.md @@ -21,39 +21,36 @@ The analysis is/will be divided as the following: - [x] Script: label transfer from the fetal kidney atlas reference using runAzimuth - [x] Script: run copykat and inferCNV - [x] Notebook: explore results from steps 2 to 4 for about 5 to 10 samples -- [ ] Script: compile scripts 2 to 4 in a RMardown file with required adjustements and render it across all samples +- [x] Script: Run inferCNV for all samples - [x] Notebook: explore results from step 6, integrate all samples together and annotate the dataset using (i) metadatafile, (ii) CNV information, (iii) label transfer information ## Usage -From Rstudio, run the Rmd reports or render the R scripts (see below R studio session set up). -You can also simply have a look at the html reports in the notebook folder. -Here, no need to run anything, we try to guide you through the analysis. Have a look at the code using the unhide code button on the top right of each chunk! -## Input files - -### single nuclei data - -We work with the _processed.rds SingleCellExperiment objects. -From the module directory, make sure that the conda environment is set-up: +1. Ensure the conda environment is activated. ```shell conda activate openscpca ``` -log into AWS CLI: +2. log into AWS CLI: ```shell # replace `openscpca` with your AWS CLI profile name if it differs export AWS_PROFILE=openscpca aws sso login ``` +Of note, this requires AWS CLI setup to run as intended: https://openscpca.readthedocs.io/en/latest/technical-setup/environment-setup/configure-aws-cli/ + -use download-data.py to download the data as the following: +3. use download-data.py to download the data as the following: ```shell ../../download-data.py --projects SCPCP000006 ``` This is saving the data in OpenScPCA-analysis/data/current/SCPCP000006 -Of note, this requires AWS CLI setup to run as intended: https://openscpca.readthedocs.io/en/latest/technical-setup/environment-setup/configure-aws-cli/ +4. Run the module: +```shell +Rscript 00_run_workflow.R +``` ### sample metadata @@ -70,17 +67,17 @@ Some differenices are expected, some marker genes or pathways are associated wit for each of the steps, we have two types of `output`: -- the `notebook` saved in the `notebook` directory, with a subfolder for each sample. +- the `notebook` saved in the `notebook` directory, with a subfolder for each sample. -- the created objects saved in `results` directory, with a subfolder for each sample. +- the created objects saved in `results` directory, with a subfolder for each sample. # Analysis ## Marker sets -We first build a resource for later validation of the annotated cell types. -We gather from the litterature marker genes and specific genomic alterations that could help us characterizing the Wilms tumor ecosystem, including cancer and non-cancer cells. +We first build a resource for later validation of the annotated cell types. +We gather from the litterature marker genes and specific genomic alterations that could help us characterizing the Wilms tumor ecosystem, including cancer and non-cancer cells. ### The table CellType_metadata.csv contains the following column and information: @@ -135,11 +132,7 @@ We gather from the litterature marker genes and specific genomic alterations tha |1q|gain|malignant|NA|10.1016/S0002-9440(10)63982-X|NA|Associated_with_relapse| -## Clustering and label transfer from fetal references - -R Script to be rendered : `00_run_workflow.R` - -### Introduction +## workflow description The `00_run_workflow.R` contains the following steps: @@ -152,21 +145,21 @@ The `00_run_workflow.R` contains the following steps: - loop for each samples: - `Seurat workflow`, normalization and clustering: `01_seurat-processing.Rmd` in `notebook_template` - + - `Azimuth` label transfer from the fetal full reference (Cao et al.): `02a_label-transfer_fetal_full_reference_Cao.Rmd` in `notebook_template` - + - `Azimuth` label transfer from the fetal kidney reference (Stewart et al.): `02b_label-transfer_fetal_kidney_reference_Stewart.Rmd` in `notebook_template` - + - Exploration of clustering, label transfers, marker genes and pathways: `03_clustering_exploration.Rmd` in `notebook_template` - - CNV inference using [`infercnv`](https://github.com/broadinstitute/inferCNV/wiki) with endothelial and immune cells as reference from either the same patient or a pool of upfront resection Wilms tumor samples: `06_infercnv.R` in `script` - + - CNV inference using [`infercnv`](https://github.com/broadinstitute/inferCNV/wiki) with endothelial and immune cells as reference from either the same patient or a pool of upfront resection Wilms tumor samples: `06_infercnv.R` in `script` + -While we only selected the `infercnv` method with endothelium and immune cells as normal reference for the main workflow across samples, our analysis includes an exploration of cnv inference methods based on `copykat` and `infercnv` on a subselection of samples: +While we only selected the `infercnv` method with endothelium and immune cells as normal reference for the main workflow across samples, our analysis includes an exploration of cnv inference methods based on `copykat` and `infercnv` on a subselection of samples: the `script` `explore-cnv-methods.R` calls the independent scripts `05_copyKAT.R` and `06_infercnv.R` for the samples - "SCPCS000179", - "SCPCS000184", - - "SCPCS000194", + - "SCPCS000194", - "SCPCS000205", - "SCPCS000208". @@ -179,9 +172,9 @@ In addition, we explored the results for all samples in one notebook twice durin For each sample and each of the step, an html report is generated and accessible in the directory `notebook`. -### Justification +### Justification -The use of the right reference is crucial. +The use of the right reference is crucial. It is recommended that the cell types in the reference is representative to the cell types to be annotated in the query. Wilms tumors can contain up to three histologies that resemble fetal kidney: blastema, stroma, and epithelia [1-2]. @@ -191,26 +184,27 @@ We thus decided to test and compare two fetal (kidney) references that could be ##### Human fetal kidney atlas Stewart et al. -We first wanted to try the human fetal kidney atlas to transfer label into the Wilms tumor samples using azimuth. +We first wanted to try the human fetal kidney atlas to transfer label into the Wilms tumor samples using azimuth. You can find more about the human kidney atlas here: https://www.kidneycellatlas.org/ [3] ##### Human Azimuth fetal reference from Cao et al. -Azimuth also provide a human fetal atlas as a reference [4]. +Azimuth also provide a human fetal atlas as a reference [4]. -The data can be found on Zenodo: +The data can be found on Zenodo: https://zenodo.org/records/4738021#.YJIW4C2ZNQI -The reference contain cells from 15 organs including kidney from fetal samples. +The reference contain cells from 15 organs including kidney from fetal samples. Here we will use `Azimuth` to transfer labels from the reference. ### Input and outputs -We start with the `_process.Rds` data to run `01_seurat-processing.Rmd`. +We start with the `_process.Rds` data to run `01_seurat-processing.Rmd`. The output of `01_seurat-processing.Rmd` is saved in `results` in a subfolder for each sample and is the input of the second step `02a_label-transfer_fetal_full_reference_Cao.Rmd`. The output of `02a_label-transfer_fetal_full_reference_Cao.Rmd` is then the input of `02b_label-transfer_fetal_kidney_reference_Stewart.Rmd`. Following the same approach, the output of `02b_label-transfer_fetal_kidney_reference_Stewart.Rmd` is the input of `03_clustering_exploration.Rmd` and `06_infercnv.R`. -The outputs of `06_infercnv.R` `06_infercnv_HMM-i3_{sample_id}_{reference-type}.rds` is finally the input of `07_annotation_Across_Samples_exploration.Rmd`. +The outputs of `06_infercnv.R` `06_infercnv_HMM-i3_{sample_id}_{reference-type}.rds` is finally the input of `07_combined_annotation_across_samples_exploration.Rmd`, which produces a TSV with annotations in `results/SCPCP000006-annotations.tsv `. + All inputs/outputs generated and used in the main workflow are saved in the `results/{sample_id}` folder. Results in subfolders such as `results/{sample_id}/05_copyKAT` or `results/{sample_id}/06_infercnv` have been obtained for a subselection of samples in the exploratory analysis, and are thus kept separated from the results of the main workflow. @@ -223,16 +217,12 @@ At the end of the workflow, we have a `Seurat`object that contains: ## Software requirements -To perform the analysis, run the RMarkdown script in R (version 4.4.1). -The main packages used are: -- Seurat version 5 -- Azimuth version 5 -- inferCNV -- SCpubr for visualization -- DT for table visualization -- DElegate for differential expression analysis +### renv -The dependencies required are saved in `components/dependencies.R` +This module uses `renv`. +If you are using RStudio Server within the container, the `renv` project will not be activated by default. +You can install packages within the container and use `renv::snapshot()` to update the lockfile without activating the project without a problem in our testing. +The `renv` lockfile is used to install R packages in the Docker image. ### Docker @@ -266,24 +256,20 @@ If you are on a Mac with an M series chip, you will not be able to use RStudio S You must build an ARM image locally to be able to use RStudio Server within the container. #### A note for Halbritter lab internal development -This work has been developed on a system that uses podman instead of docker. The steps to run the docker/podman images are slightly different and we saved in run-podman-internal.sh our internal approach to run the container. Please, refer to the Docker section to build and run the container instead. +This work has been developed on a system that uses podman instead of docker. The steps to run the docker/podman images are slightly different and we saved in run-podman-internal.sh our internal approach to run the container. Please, refer to the Docker section to build and run the container instead. -### renv - -This module uses `renv`. -If you are using RStudio Server within the container, the `renv` project will not be activated by default. -You can install packages within the container and use `renv::snapshot()` to update the lockfile without activating the project without a problem in our testing. -The `renv` lockfile is used to install R packages in the Docker image. ## Computational resources -## References +This module may require up to 16 cores to run certain exploratory steps, but the main workflow can be run on a laptop. + +## References -- [1] https://www.ncbi.nlm.nih.gov/books/NBK373356/ +- [1] https://www.ncbi.nlm.nih.gov/books/NBK373356/ -- [2] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9915828/ +- [2] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9915828/ -- [3] https://www.science.org/doi/10.1126/science.aat5031 +- [3] https://www.science.org/doi/10.1126/science.aat5031 - [4] https://www.science.org/doi/10.1126/science.aba7721 diff --git a/analyses/cell-type-wilms-tumor-06/notebook/07_combined_annotation_across_samples_exploration.Rmd b/analyses/cell-type-wilms-tumor-06/notebook/07_combined_annotation_across_samples_exploration.Rmd new file mode 100644 index 000000000..ba95cbb32 --- /dev/null +++ b/analyses/cell-type-wilms-tumor-06/notebook/07_combined_annotation_across_samples_exploration.Rmd @@ -0,0 +1,553 @@ +--- +title: "Combined annotation exploration for SCPCP000006" +author: "Maud PLASCHKA" +date: "`r Sys.Date()`" +params: + predicted.celltype.threshold: 0.85 + cnv_threshold: 0 +output: + html_document: + toc: yes + toc_float: yes + code_folding: hide + highlight: pygments + df_print: paged +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = TRUE, + message = FALSE, + warnings = FALSE, + knitr.digits.signif = FALSE +) +``` + + +## Introduction + + +The aim is to combine label transfer and CNV inference to annotate Wilms tumor samples in SCPCP000006. +The proposed annotation will be based on the combination of: + +- the label transfer from the fetal kidney reference (Stewart et al.), in particular the `fetal_kidney_predicted.compartment` and `fetal_kidney:predicted.cell_type`, as well as the `prediction.score` for each compartment, + +- the predicted CNV calculated using intra-sample endothelial and immune cells (`--reference both`) as normal reference + + + +In a second time, we will explore and validate the chosen annotation. + +We will use some of the [markers genes](https://github.com/AlexsLemonade/OpenScPCA-analysis/tree/main/analyses/cell-type-wilms-tumor-06#the-table-celltype_metadatacsv-contains-the-following-column-and-information) to validate visually the annotations. + +The analysis can be summarized as the following: + +_Where `cnv.thr` and `pred.thr` need to be discussed_ + + +first level annotation | second level annotation | selection of the cells | marker genes for validation | cnv validation +-- | -- | -- | -- | -- +normal | endothelial | compartment == "endothelium" & predicted.score > pred.thr & cnv_score < cnv.thr | WVF | no cnv +normal | immune | compartment == "immune" & predicted.score > pred.thr & cnv_score < cnv.thr | PTPRC, CD163, CD68 | no cnv +normal | kidney | cell_type %in% c("kidney cell", "kidney epithelial", "podocyte") & predicted.score > pred.thr & cnv_score < cnv.thr | CDH1, PODXL, LTL | no cnv +normal | stroma | compartment == "stroma" & predicted.score > pred.thr & cnv_score < cnv.thr | VIM | no cnv +cancer | stroma | compartment == "stroma" & cnv_score > cnv.thr | VIM | proportion_cnv_chr -1 -4 -11 -16 -17 -18 +cancer | blastema | compartment == "fetal_nephron" & cell_type == "mesenchymal cell" & cnv_score > cnv.thr | CITED1 | proportion_cnv_chr -1 -4 -11 -16 -17 -18 +cancer | epithelial | compartment == "fetal_nephron" & cell_type != "mesenchymal cell" & cnv_score > cnv.thr | CDH1 | proportion_cnv_chr -1 -4 -11 -16 -17 -18 +unknown | - | the rest of the cells | - | proportion_cnv_chr -1 -4 -11 -16 -17 -18 + + + +### Packages + + +```{r packages, message=FALSE, warning=FALSE} +library(Seurat) +library(tidyverse) +library(patchwork) +library(DT) +``` + + +### Base directories + +```{r base paths, eval=TRUE} +# The base path for the OpenScPCA repository, found by its (hidden) .git directory +repository_base <- rprojroot::find_root(rprojroot::is_git_root) + +# The current data directory, found within the repository base directory +data_dir <- file.path(repository_base, "data", "current", "SCPCP000006") + +# The path to this module +module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06") + +result_dir <- file.path(module_base, "results") +``` + +### Output files + +```{r} +# cell type annotation table +annotations_tsv <- file.path(result_dir, "SCPCP000006-annotations.tsv") +``` + +### Input files + +In this notebook, we are working with all of the samples in SCPCP000006. + +The sample metadata can be found in `sample_metadata_file` in the `data` folder. + +We extracted from the pre-processed and labeled `Seurat` object from `results/{sample_id}/06_infercnv_HMM-i3_SCPCS000169_reference-both.rds`. +the following information per cell: + +- the predicted compartment in `fetal_kidney_predicted.compartment` and `fetal_kidney_predicted.compartment.score` + +- the predicted compartment in `fetal_kidney_predicted.cell_type` and `fetal_kidney_predicted.cell_type.score` + + +- the sample identifier in `sample_id` + + +- the proportion of estimated CNV per chromosome, {i} in 1 to 22 `proportion_cnv_chr{i}` + + +- the `counts`of [markers genes](https://github.com/AlexsLemonade/OpenScPCA-analysis/tree/main/analyses/cell-type-wilms-tumor-06#the-table-celltype_metadatacsv-contains-the-following-column-and-information) + + + +```{r path_to_query} +sample_metadata_file <- file.path(repository_base, "data", "current", "SCPCP000006", "single_cell_metadata.tsv") + +metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE) +sample_ids <- metadata |> + dplyr::filter(seq_unit != "spot") |> + dplyr::pull(scpca_sample_id) |> + unique() + +# Create a data frames of all annotations +cell_type_df <- sample_ids |> + purrr::map( + # For each sample_id, do the following: + \(sample_id) { + # This sample was run with "none" for reference + if (sample_id == "SCPCS000190") { + reference <- "none" + } else { + reference <- "both" + } + input_file <- file.path( + result_dir, + sample_id, + glue::glue("06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds") + ) + + # Read in the Seurat object + srat <- readRDS(input_file) + + # Create and return a data frame from the Seurat object with relevant annotations + # this data frame will have four columns: barcode, sample_id, compartment, organ + data.frame( + # label transfer from the fetal kidney reference + cell_type = srat$fetal_kidney_predicted.cell_type, + compartment = srat$fetal_kidney_predicted.compartment, + + # predicted.scores from the label transfer from the fetal kidney reference + cell_type.score = srat$fetal_kidney_predicted.cell_type.score, + compartment.score = srat$fetal_kidney_predicted.compartment.score, + + # cell embedding + umap = srat@reductions$umap@cell.embeddings, + + # marker genes + PTPRC = FetchData(object = srat, vars = "ENSG00000081237", layer = "counts"), + VWF = FetchData(object = srat, vars = "ENSG00000110799", layer = "counts"), + VIM = FetchData(object = srat, vars = "ENSG00000026025", layer = "counts"), + CITED1 = FetchData(object = srat, vars = "ENSG00000125931", layer = "counts"), + CDH1 = FetchData(object = srat, vars = "ENSG00000039068", layer = "counts"), + PODXL = FetchData(object = srat, vars = "ENSG00000128567", layer = "counts"), + COL6A3 = FetchData(object = srat, vars = "ENSG00000163359", layer = "counts"), + SIX2 = FetchData(object = srat, vars = "ENSG00000170577", layer = "counts"), + NCAM1 = FetchData(object = srat, vars = "ENSG00000149294", layer = "counts"), + THY1 = FetchData(object = srat, vars = "ENSG00000154096", layer = "counts"), + + + + + + # proportion of cnv per chromosome + proportion_cnv_chr1 = srat$proportion_cnv_chr1, + proportion_cnv_chr2 = srat$proportion_cnv_chr2, + proportion_cnv_chr3 = srat$proportion_cnv_chr3, + proportion_cnv_chr4 = srat$proportion_cnv_chr4, + proportion_cnv_chr5 = srat$proportion_cnv_chr5, + proportion_cnv_chr6 = srat$proportion_cnv_chr6, + proportion_cnv_chr7 = srat$proportion_cnv_chr7, + proportion_cnv_chr8 = srat$proportion_cnv_chr8, + proportion_cnv_chr9 = srat$proportion_cnv_chr9, + proportion_cnv_chr10 = srat$proportion_cnv_chr10, + proportion_cnv_chr11 = srat$proportion_cnv_chr11, + proportion_cnv_chr12 = srat$proportion_cnv_chr12, + proportion_cnv_chr13 = srat$proportion_cnv_chr13, + proportion_cnv_chr14 = srat$proportion_cnv_chr14, + proportion_cnv_chr15 = srat$proportion_cnv_chr15, + proportion_cnv_chr16 = srat$proportion_cnv_chr16, + proportion_cnv_chr17 = srat$proportion_cnv_chr17, + proportion_cnv_chr18 = srat$proportion_cnv_chr18, + proportion_cnv_chr19 = srat$proportion_cnv_chr19, + proportion_cnv_chr20 = srat$proportion_cnv_chr20, + proportion_cnv_chr21 = srat$proportion_cnv_chr21, + proportion_cnv_chr22 = srat$proportion_cnv_chr22, + + # cnv global estimation per chromosome + has_cnv_chr1 = srat$has_cnv_chr1, + has_cnv_chr2 = srat$has_cnv_chr2, + has_cnv_chr3 = srat$has_cnv_chr3, + has_cnv_chr4 = srat$has_cnv_chr4, + has_cnv_chr5 = srat$has_cnv_chr5, + has_cnv_chr6 = srat$has_cnv_chr6, + has_cnv_chr7 = srat$has_cnv_chr7, + has_cnv_chr8 = srat$has_cnv_chr8, + has_cnv_chr9 = srat$has_cnv_chr9, + has_cnv_chr10 = srat$has_cnv_chr10, + has_cnv_chr11 = srat$has_cnv_chr11, + has_cnv_chr12 = srat$has_cnv_chr12, + has_cnv_chr13 = srat$has_cnv_chr13, + has_cnv_chr14 = srat$has_cnv_chr14, + has_cnv_chr15 = srat$has_cnv_chr15, + has_cnv_chr16 = srat$has_cnv_chr16, + has_cnv_chr17 = srat$has_cnv_chr17, + has_cnv_chr18 = srat$has_cnv_chr18, + has_cnv_chr19 = srat$has_cnv_chr19, + has_cnv_chr20 = srat$has_cnv_chr20, + has_cnv_chr21 = srat$has_cnv_chr21, + has_cnv_chr22 = srat$has_cnv_chr22 + ) |> + tibble::rownames_to_column("barcode") |> + dplyr::mutate(sample_id = sample_id) + } + ) |> + # now combine all dataframes to make one big one + dplyr::bind_rows() +``` + +### Output file + +The report will be saved in the `notebook` directory. + + +## Functions + +#### do_Feature_mean + +`do_Feature_mean` shows heatmap of mean expression of a feature grouped by a metadata. + +- `df` is the name of the table containing metadata and feature expression (counts) per cells +- `group.by` is the name of the metadata to group the cells +- `feature` is the name of the gene to average the expression + +```{r fig.width=10, fig.height=4, out.width='100%'} +do_Feature_mean <- function(df, group.by, feature) { + df <- df %>% + group_by(sample_id, !!sym(group.by)) %>% + summarise(m = mean(!!sym(feature))) + + + p <- ggplot(df, aes(x = sample_id, y = !!sym(group.by), fill = m)) + + geom_tile() + + scale_fill_viridis_c() + + theme_bw() + + theme(text = element_text(size = 20)) + + theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) + + guides(fill = guide_colourbar(title = paste0(feature))) + + return(p) +} +``` + + +## Analysis + +### Global cnv score + +As done in `06_cnv_infercnv_exploration.Rmd`, we calculate single CNV score and assess its potential in identifying cells with CNV versus normal cells without CNV. + +We simply checked for each chromosome if the cell `has_cnv_chr`. +Would the cell have more than `cnv_threshold` chromosome with CNV, the global `has_cnv_score` will be TRUE. +Else, the cell will have a `has_cnv_score` set to FALSE. + + + + +```{r fig.width=10, fig.height=10, out.width='100%', results='asis'} +cell_type_df <- cell_type_df |> + mutate(has_cnv_score = rowSums(cell_type_df[, grepl("has_cnv_chr", colnames(cell_type_df))])) |> + mutate(has_cnv_score = case_when( + has_cnv_score > params$cnv_threshold ~ "CNV", + has_cnv_score <= params$cnv_threshold ~ "no CNV" + )) + + +table(cell_type_df$has_cnv_score) +``` + +### First level annotation + +At first, we like to indicate in the `first.level_annotation` if a cell is normal, cancer or unknown. + +- _normal_ cells can be observe in all four compartments (`endothelium`, `immune`, `stroma` or `fetal nephron`) and do not have cnv. +We only allow a bit of flexibility in terms of cnv profile for immune and endothelium cells that have a high predicted score. +Indeed, we know that false positive cnv can be observed in a cell type specific manner. + +The threshold used for the `predicted.score` is defined as a parameter of this notebook as `r params$predicted.celltype.threshold`. +The threshold used for the identification of cnv is also defined in the params of the notebook as `r params$cnv_threshold`. + +- _cancer_ cells are either from the `stroma` or `fetal nephron` compartments and must have at least few cnv. + + + +```{r fig.width=10, fig.height=10, out.width='100%', results='asis'} +# Define normal cells +# We first pick up the immune and endothelial cells annotated via the label transfer compartments under the condition that the predicted score is above the threshold +cell_type_df <- cell_type_df |> + mutate(first.level_annotation = case_when( + # assign normal/cancer based on condition + compartment %in% c("fetal_nephron", "stroma") & has_cnv_score == "no CNV" ~ "normal", + compartment %in% c("endothelium", "immune") & cell_type.score > params$predicted.celltype.threshold ~ "normal", + compartment %in% c("fetal_nephron", "stroma") & has_cnv_score == "CNV" ~ "cancer", + .default = "unknown" + )) +``` + +Using this basic strategy, we identified `r table(cell_type_df$first.level_annotation)["cancer"]` _cancer_ cells, `r table(cell_type_df$first.level_annotation)["normal"]` _normal_ cells. +Only `r table(cell_type_df$first.level_annotation)["unknown"]`cells remain _unknown_ + + + +```{r fig.width=20, fig.height=20, out.width='100%', results='asis'} +ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = first.level_annotation), shape = 19, size = 1) + + geom_point() + + facet_wrap(facets = ~sample_id, ncol = 5) + + theme_bw() + + theme(text = element_text(size = 22)) +``` + + +### Second level annotation + +#### Normal cells + +- Normal cells from the `fetal nephron` compartment must be normal kidney cells. + +- Normal cells from the `stroma` compartment must be normal stroma cells. + +- Immune and endothelial cells have been already identified by label transfer. + +#### Cancer cells + +Wilms tumor cancer cells can be: + +- _cancer stroma_: We define as _cancer stroma_ all cancer cells from the stroma compartment. + + +- _blastema_,: we defined as _bastema_ every cancer cell that has a `fetal_kidney_predicted.cell_type == mesenchymal cell`. +We know that these _mesenchymal_ cells are cells from the cap mesenchyme that are not expected to be in a mature kidney. +These blastema cells should express higher _CITED1_. + +- _cancer epithelium_: we defined as _cancer epithelium_ all cancer cells that are neither stroma nor blastemal cells. +We expect these cells to express epithelial markers. +Their predicted cell type should correspond to more mature kidney epithelial subunits. + +```{r fig.width=10, fig.height=10, out.width='100%', results='asis'} +cell_type_df <- cell_type_df |> + mutate(second.level_annotation = case_when( + # assign normal cells based on condition + cell_type_df$compartment %in% c("fetal_nephron") & + cell_type_df$has_cnv_score == "no CNV" ~ "kidney", + cell_type_df$compartment %in% c("stroma") & + cell_type_df$has_cnv_score == "no CNV" ~ "normal stroma", + cell_type_df$compartment %in% c("endothelium") & + (cell_type_df$compartment.score > params$predicted.celltype.threshold | + cell_type_df$has_cnv_score == "no CNV") ~ "endothelium", + cell_type_df$compartment %in% c("immune") & + (cell_type_df$compartment.score > params$predicted.celltype.threshold | + cell_type_df$has_cnv_score == "no CNV") ~ "immune", + + # assign cancer cells based on condition + cell_type_df$compartment %in% c("stroma") & + cell_type_df$has_cnv_score == "CNV" ~ "cancer stroma", + cell_type_df$compartment %in% c("fetal_nephron") & + cell_type_df$has_cnv_score == "CNV" & + cell_type_df$cell_type == "mesenchymal cell" ~ "blastema", + cell_type_df$compartment %in% c("fetal_nephron") & + cell_type_df$has_cnv_score == "CNV" & + cell_type_df$cell_type != "mesenchymal cell" ~ "cancer epithelium", + .default = "unknown" + )) +``` + + +```{r fig.width=20, fig.height=20, out.width='100%', results='asis'} +ggplot(cell_type_df[cell_type_df$first.level_annotation == "normal", ], aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation), shape = 19, size = 1) + + geom_point() + + facet_wrap(facets = ~sample_id, ncol = 5) + + theme_bw() + + theme(text = element_text(size = 22)) +``` + + + + + + +```{r fig.width=20, fig.height=20, out.width='100%', results='asis'} +ggplot(cell_type_df[cell_type_df$first.level_annotation == "cancer", ], aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation), shape = 19, size = 0.1) + + geom_point() + + facet_wrap(facets = ~sample_id, ncol = 5) + + theme_bw() + + theme(text = element_text(size = 22)) +``` + +#### Cancer and normal cells + +```{r fig.width=20, fig.height=20, out.width='100%', results='asis'} +ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation), shape = 19, size = 1) + + geom_point() + + facet_wrap(facets = ~sample_id, ncol = 5) + + theme_bw() + + theme(text = element_text(size = 22)) +``` + +### Validation cancer versus normal based on the cnv profile + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +for (i in 1:22) { + print(do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = glue::glue("proportion_cnv_chr", i))) +} +``` + +### Validation of second level annotation using marker genes + +#### Immune, _PTPRC_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000081237") +``` + +#### Endothelium, _VWF_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000110799") +``` + +#### Stroma, _Vimentin_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000026025") +``` + +#### Stroma, _COL6A3_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000163359") +``` + + +#### Stroma, _THY1_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000154096") +``` + +#### Blastema, _CITED1_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000125931") +``` + + +#### Blastema, _NCAM1_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000149294") +``` + +#### stemness marker (blastema and primitive epithelium), _SIX2_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000170577") +``` + +#### Epithelium, _CDH1_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000039068") +``` + + +#### Epithelium, _PODXL_ expression + +```{r fig.width=20, fig.height=5, out.width='100%', results='asis'} +do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000128567") +``` + + +## Create annotation table for export + +This section creates the cell type annotation table for export. + +```{r} +annotations_table <- cell_type_df |> + select( + cell_barcode = barcode, + scpca_sample_id = sample_id, + tumor_cell_classification = first.level_annotation, + cell_type_assignment = second.level_annotation + ) |> + mutate( + # change cancer --> tumor, but keep the other labels + tumor_cell_classification = ifelse( + tumor_cell_classification == "cancer", "tumor", tumor_cell_classification + ), + cell_type_assignment = str_replace_all( + cell_type_assignment, + "cancer ", + "tumor " + ) + ) + +write_tsv(annotations_table, annotations_tsv) +``` + + +Confirm how many samples we have annotations for: +```{r} +length(unique(annotations_table$scpca_sample_id)) +``` + +## Conclusion + +- Combining label transfer and CNV inference we have produced draft annotations for all 40 Wilms tumor samples in SCPCP000006 + +- The heatmaps of cnv proportion and marker genes support our annotations, but signals with some marker genes are very low. +Also, there is no universal marker for each entity of Wilms tumor that cover all tumor cells from all patient. +This makes the validation of the annotations quite difficult. + +- However, we could try to take the problem from the other side, and used the current annotation to perform differential expression analysis and try to find marker genes that are consistent across patient and Wilms tumor histologies. + +- In each histology (i.e. epithelial and stroma), the distinction between cancer and non cancer cell is difficult (as expected). +In this analysis, we suggested to rely on the cnv score to assess the normality of the cell. +Here again, we could try to run differential expression analysis and compare epithelial (resp. stroma) cancer versus non-cancer cells across patient, aiming to find a share transcriptional program allowing the classification cancer versus normal. + +- In our annotation, we haven't taken into account the favorable/anaplastic status of the sample. +However, as anaplasia can occur in every (but do not has to) wilms tumor histology, I am not sure how to integrate the information into the annotation. + +- This notebook could be finally rendered using different parameters, i.e. threshold for the cnv score and predicted score to use. + +## Session Info + +```{r session info} +# record the versions of the packages used in this analysis and other environment information +sessionInfo() +``` + + diff --git a/analyses/cell-type-wilms-tumor-06/notebook/07_combined_annotation_across_samples_exploration.html b/analyses/cell-type-wilms-tumor-06/notebook/07_combined_annotation_across_samples_exploration.html new file mode 100644 index 000000000..cac8877f3 --- /dev/null +++ b/analyses/cell-type-wilms-tumor-06/notebook/07_combined_annotation_across_samples_exploration.html @@ -0,0 +1,3642 @@ + + + + +
+ + + + + + + + + + +The aim is to combine label transfer and CNV inference to annotate +Wilms tumor samples in SCPCP000006. The proposed annotation will be +based on the combination of:
+the label transfer from the fetal kidney reference (Stewart et
+al.), in particular the fetal_kidney_predicted.compartment
+and fetal_kidney:predicted.cell_type
, as well as the
+prediction.score
for each compartment,
the predicted CNV calculated using intra-sample endothelial and
+immune cells (--reference both
) as normal
+reference
In a second time, we will explore and validate the chosen +annotation.
+We will use some of the markers +genes to validate visually the annotations.
+The analysis can be summarized as the following:
+Where cnv.thr
and pred.thr
need to be
+discussed
first level annotation | +second level annotation | +selection of the cells | +marker genes for validation | +cnv validation | +
---|---|---|---|---|
normal | +endothelial | +compartment == “endothelium” & predicted.score > pred.thr +& cnv_score < cnv.thr | +WVF | +no cnv | +
normal | +immune | +compartment == “immune” & predicted.score > pred.thr & +cnv_score < cnv.thr | +PTPRC, CD163, CD68 | +no cnv | +
normal | +kidney | +cell_type %in% c(“kidney cell”, “kidney epithelial”, “podocyte”) +& predicted.score > pred.thr & cnv_score < cnv.thr | +CDH1, PODXL, LTL | +no cnv | +
normal | +stroma | +compartment == “stroma” & predicted.score > pred.thr & +cnv_score < cnv.thr | +VIM | +no cnv | +
cancer | +stroma | +compartment == “stroma” & cnv_score > cnv.thr | +VIM | +proportion_cnv_chr -1 -4 -11 -16 -17 -18 | +
cancer | +blastema | +compartment == “fetal_nephron” & cell_type == “mesenchymal cell” +& cnv_score > cnv.thr | +CITED1 | +proportion_cnv_chr -1 -4 -11 -16 -17 -18 | +
cancer | +epithelial | +compartment == “fetal_nephron” & cell_type != “mesenchymal cell” +& cnv_score > cnv.thr | +CDH1 | +proportion_cnv_chr -1 -4 -11 -16 -17 -18 | +
unknown | +- | +the rest of the cells | +- | +proportion_cnv_chr -1 -4 -11 -16 -17 -18 | +
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
+repository_base <- rprojroot::find_root(rprojroot::is_git_root)
+
+# The current data directory, found within the repository base directory
+data_dir <- file.path(repository_base, "data", "current", "SCPCP000006")
+
+# The path to this module
+module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")
+
+result_dir <- file.path(module_base, "results")
In this notebook, we are working with all of the samples in +SCPCP000006.
+The sample metadata can be found in sample_metadata_file
+in the data
folder.
We extracted from the pre-processed and labeled Seurat
+object from
+results/{sample_id}/06_infercnv_HMM-i3_SCPCS000169_reference-both.rds
.
+the following information per cell:
the predicted compartment in
+fetal_kidney_predicted.compartment
and
+fetal_kidney_predicted.compartment.score
the predicted compartment in
+fetal_kidney_predicted.cell_type
and
+fetal_kidney_predicted.cell_type.score
the sample identifier in sample_id
the proportion of estimated CNV per chromosome, {i} in 1 to 22
+proportion_cnv_chr{i}
the counts
of markers
+genes
sample_metadata_file <- file.path(repository_base, "data", "current", "SCPCP000006", "single_cell_metadata.tsv")
+
+metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE)
+sample_ids <- metadata |>
+ dplyr::filter(seq_unit != "spot") |>
+ dplyr::pull(scpca_sample_id) |>
+ unique()
+
+# Create a data frames of all annotations
+cell_type_df <- sample_ids |>
+ purrr::map(
+ # For each sample_id, do the following:
+ \(sample_id) {
+ # This sample was run with "none" for reference
+ if (sample_id == "SCPCS000190") {
+ reference <- "none"
+ } else {
+ reference <- "both"
+ }
+ input_file <- file.path(
+ result_dir,
+ sample_id,
+ glue::glue("06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds")
+ )
+
+ # Read in the Seurat object
+ srat <- readRDS(input_file)
+
+ # Create and return a data frame from the Seurat object with relevant annotations
+ # this data frame will have four columns: barcode, sample_id, compartment, organ
+ data.frame(
+ # label transfer from the fetal kidney reference
+ cell_type = srat$fetal_kidney_predicted.cell_type,
+ compartment = srat$fetal_kidney_predicted.compartment,
+
+ # predicted.scores from the label transfer from the fetal kidney reference
+ cell_type.score = srat$fetal_kidney_predicted.cell_type.score,
+ compartment.score = srat$fetal_kidney_predicted.compartment.score,
+
+ # cell embedding
+ umap = srat@reductions$umap@cell.embeddings,
+
+ # marker genes
+ PTPRC = FetchData(object = srat, vars = "ENSG00000081237", layer = "counts"),
+ VWF = FetchData(object = srat, vars = "ENSG00000110799", layer = "counts"),
+ VIM = FetchData(object = srat, vars = "ENSG00000026025", layer = "counts"),
+ CITED1 = FetchData(object = srat, vars = "ENSG00000125931", layer = "counts"),
+ CDH1 = FetchData(object = srat, vars = "ENSG00000039068", layer = "counts"),
+ PODXL = FetchData(object = srat, vars = "ENSG00000128567", layer = "counts"),
+ COL6A3 = FetchData(object = srat, vars = "ENSG00000163359", layer = "counts"),
+ SIX2 = FetchData(object = srat, vars = "ENSG00000170577", layer = "counts"),
+ NCAM1 = FetchData(object = srat, vars = "ENSG00000149294", layer = "counts"),
+ THY1 = FetchData(object = srat, vars = "ENSG00000154096", layer = "counts"),
+
+ # proportion of cnv per chromosome
+ proportion_cnv_chr1 = srat$proportion_cnv_chr1,
+ proportion_cnv_chr2 = srat$proportion_cnv_chr2,
+ proportion_cnv_chr3 = srat$proportion_cnv_chr3,
+ proportion_cnv_chr4 = srat$proportion_cnv_chr4,
+ proportion_cnv_chr5 = srat$proportion_cnv_chr5,
+ proportion_cnv_chr6 = srat$proportion_cnv_chr6,
+ proportion_cnv_chr7 = srat$proportion_cnv_chr7,
+ proportion_cnv_chr8 = srat$proportion_cnv_chr8,
+ proportion_cnv_chr9 = srat$proportion_cnv_chr9,
+ proportion_cnv_chr10 = srat$proportion_cnv_chr10,
+ proportion_cnv_chr11 = srat$proportion_cnv_chr11,
+ proportion_cnv_chr12 = srat$proportion_cnv_chr12,
+ proportion_cnv_chr13 = srat$proportion_cnv_chr13,
+ proportion_cnv_chr14 = srat$proportion_cnv_chr14,
+ proportion_cnv_chr15 = srat$proportion_cnv_chr15,
+ proportion_cnv_chr16 = srat$proportion_cnv_chr16,
+ proportion_cnv_chr17 = srat$proportion_cnv_chr17,
+ proportion_cnv_chr18 = srat$proportion_cnv_chr18,
+ proportion_cnv_chr19 = srat$proportion_cnv_chr19,
+ proportion_cnv_chr20 = srat$proportion_cnv_chr20,
+ proportion_cnv_chr21 = srat$proportion_cnv_chr21,
+ proportion_cnv_chr22 = srat$proportion_cnv_chr22,
+
+ # cnv global estimation per chromosome
+ has_cnv_chr1 = srat$has_cnv_chr1,
+ has_cnv_chr2 = srat$has_cnv_chr2,
+ has_cnv_chr3 = srat$has_cnv_chr3,
+ has_cnv_chr4 = srat$has_cnv_chr4,
+ has_cnv_chr5 = srat$has_cnv_chr5,
+ has_cnv_chr6 = srat$has_cnv_chr6,
+ has_cnv_chr7 = srat$has_cnv_chr7,
+ has_cnv_chr8 = srat$has_cnv_chr8,
+ has_cnv_chr9 = srat$has_cnv_chr9,
+ has_cnv_chr10 = srat$has_cnv_chr10,
+ has_cnv_chr11 = srat$has_cnv_chr11,
+ has_cnv_chr12 = srat$has_cnv_chr12,
+ has_cnv_chr13 = srat$has_cnv_chr13,
+ has_cnv_chr14 = srat$has_cnv_chr14,
+ has_cnv_chr15 = srat$has_cnv_chr15,
+ has_cnv_chr16 = srat$has_cnv_chr16,
+ has_cnv_chr17 = srat$has_cnv_chr17,
+ has_cnv_chr18 = srat$has_cnv_chr18,
+ has_cnv_chr19 = srat$has_cnv_chr19,
+ has_cnv_chr20 = srat$has_cnv_chr20,
+ has_cnv_chr21 = srat$has_cnv_chr21,
+ has_cnv_chr22 = srat$has_cnv_chr22
+ ) |>
+ tibble::rownames_to_column("barcode") |>
+ dplyr::mutate(sample_id = sample_id)
+ }
+ ) |>
+ # now combine all dataframes to make one big one
+ dplyr::bind_rows()
The report will be saved in the notebook
directory.
do_Feature_mean
shows heatmap of mean expression of a
+feature grouped by a metadata.
df
is the name of the table containing metadata and
+feature expression (counts) per cellsgroup.by
is the name of the metadata to group the
+cellsfeature
is the name of the gene to average the
+expressiondo_Feature_mean <- function(df, group.by, feature) {
+ df <- df %>%
+ group_by(sample_id, !!sym(group.by)) %>%
+ summarise(m = mean(!!sym(feature)))
+
+
+ p <- ggplot(df, aes(x = sample_id, y = !!sym(group.by), fill = m)) +
+ geom_tile() +
+ scale_fill_viridis_c() +
+ theme_bw() +
+ theme(text = element_text(size = 20)) +
+ theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) +
+ guides(fill = guide_colourbar(title = paste0(feature)))
+
+ return(p)
+}
As done in 06_cnv_infercnv_exploration.Rmd
, we calculate
+single CNV score and assess its potential in identifying cells with CNV
+versus normal cells without CNV.
We simply checked for each chromosome if the cell
+has_cnv_chr
. Would the cell have more than
+cnv_threshold
chromosome with CNV, the global
+has_cnv_score
will be TRUE. Else, the cell will have a
+has_cnv_score
set to FALSE.
cell_type_df <- cell_type_df |>
+ mutate(has_cnv_score = rowSums(cell_type_df[, grepl("has_cnv_chr", colnames(cell_type_df))])) |>
+ mutate(has_cnv_score = case_when(
+ has_cnv_score > params$cnv_threshold ~ "CNV",
+ has_cnv_score <= params$cnv_threshold ~ "no CNV"
+ ))
+
+
+table(cell_type_df$has_cnv_score)
CNV no CNV 190793 9545
+At first, we like to indicate in the
+first.level_annotation
if a cell is normal, cancer or
+unknown.
endothelium
, immune
, stroma
or
+fetal nephron
) and do not have cnv. We only allow a bit of
+flexibility in terms of cnv profile for immune and endothelium cells
+that have a high predicted score. Indeed, we know that false positive
+cnv can be observed in a cell type specific manner.The threshold used for the predicted.score
is defined as
+a parameter of this notebook as 0.85. The threshold used for the
+identification of cnv is also defined in the params of the notebook as
+0.
stroma
or
+fetal nephron
compartments and must have at least few
+cnv.# Define normal cells
+# We first pick up the immune and endothelial cells annotated via the label transfer compartments under the condition that the predicted score is above the threshold
+cell_type_df <- cell_type_df |>
+ mutate(first.level_annotation = case_when(
+ # assign normal/cancer based on condition
+ compartment %in% c("fetal_nephron", "stroma") & has_cnv_score == "no CNV" ~ "normal",
+ compartment %in% c("endothelium", "immune") & cell_type.score > params$predicted.celltype.threshold ~ "normal",
+ compartment %in% c("fetal_nephron", "stroma") & has_cnv_score == "CNV" ~ "cancer",
+ .default = "unknown"
+ ))
Using this basic strategy, we identified 190303 cancer +cells, 8088 normal cells. Only 1947cells remain +unknown
+ + +Normal cells from the fetal nephron
compartment must
+be normal kidney cells.
Normal cells from the stroma
compartment must be
+normal stroma cells.
Immune and endothelial cells have been already identified by +label transfer.
Wilms tumor cancer cells can be:
+cancer stroma: We define as cancer stroma all +cancer cells from the stroma compartment.
blastema,: we defined as bastema every cancer
+cell that has a
+fetal_kidney_predicted.cell_type == mesenchymal cell
. We
+know that these mesenchymal cells are cells from the cap
+mesenchyme that are not expected to be in a mature kidney. These
+blastema cells should express higher CITED1.
cancer epithelium: we defined as cancer +epithelium all cancer cells that are neither stroma nor blastemal +cells. We expect these cells to express epithelial markers. Their +predicted cell type should correspond to more mature kidney epithelial +subunits.
cell_type_df <- cell_type_df |>
+ mutate(second.level_annotation = case_when(
+ # assign normal cells based on condition
+ cell_type_df$compartment %in% c("fetal_nephron") &
+ cell_type_df$has_cnv_score == "no CNV" ~ "kidney",
+ cell_type_df$compartment %in% c("stroma") &
+ cell_type_df$has_cnv_score == "no CNV" ~ "normal stroma",
+ cell_type_df$compartment %in% c("endothelium") &
+ (cell_type_df$compartment.score > params$predicted.celltype.threshold |
+ cell_type_df$has_cnv_score == "no CNV") ~ "endothelium",
+ cell_type_df$compartment %in% c("immune") &
+ (cell_type_df$compartment.score > params$predicted.celltype.threshold |
+ cell_type_df$has_cnv_score == "no CNV") ~ "immune",
+
+ # assign cancer cells based on condition
+ cell_type_df$compartment %in% c("stroma") &
+ cell_type_df$has_cnv_score == "CNV" ~ "cancer stroma",
+ cell_type_df$compartment %in% c("fetal_nephron") &
+ cell_type_df$has_cnv_score == "CNV" &
+ cell_type_df$cell_type == "mesenchymal cell" ~ "blastema",
+ cell_type_df$compartment %in% c("fetal_nephron") &
+ cell_type_df$has_cnv_score == "CNV" &
+ cell_type_df$cell_type != "mesenchymal cell" ~ "cancer epithelium",
+ .default = "unknown"
+ ))
This section creates the cell type annotation table for export.
+annotations_table <- cell_type_df |>
+ select(
+ cell_barcode = barcode,
+ scpca_sample_id = sample_id,
+ tumor_cell_classification = first.level_annotation,
+ cell_type_assignment = second.level_annotation
+ ) |>
+ mutate(
+ # change cancer --> tumor, but keep the other labels
+ tumor_cell_classification = ifelse(
+ tumor_cell_classification == "cancer", "tumor", tumor_cell_classification
+ ),
+ cell_type_assignment = str_replace_all(
+ cell_type_assignment,
+ "cancer ",
+ "tumor "
+ )
+ )
+
+write_tsv(annotations_table, annotations_tsv)
Confirm how many samples we have annotations for:
+ +## [1] 40
+Combining label transfer and CNV inference we have produced draft +annotations for all 40 Wilms tumor samples in SCPCP000006
The heatmaps of cnv proportion and marker genes support our +annotations, but signals with some marker genes are very low. Also, +there is no universal marker for each entity of Wilms tumor that cover +all tumor cells from all patient. This makes the validation of the +annotations quite difficult.
However, we could try to take the problem from the other side, +and used the current annotation to perform differential expression +analysis and try to find marker genes that are consistent across patient +and Wilms tumor histologies.
In each histology (i.e. epithelial and stroma), the distinction +between cancer and non cancer cell is difficult (as expected). In this +analysis, we suggested to rely on the cnv score to assess the normality +of the cell. Here again, we could try to run differential expression +analysis and compare epithelial (resp. stroma) cancer versus non-cancer +cells across patient, aiming to find a share transcriptional program +allowing the classification cancer versus normal.
In our annotation, we haven’t taken into account the +favorable/anaplastic status of the sample. However, as anaplasia can +occur in every (but do not has to) wilms tumor histology, I am not sure +how to integrate the information into the annotation.
This notebook could be finally rendered using different +parameters, i.e. threshold for the cnv score and predicted score to +use.
# record the versions of the packages used in this analysis and other environment information
+sessionInfo()
## R version 4.4.1 (2024-06-14)
+## Platform: aarch64-apple-darwin20
+## Running under: macOS 15.1
+##
+## Matrix products: default
+## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
+## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
+##
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+##
+## time zone: America/New_York
+## tzcode source: internal
+##
+## attached base packages:
+## [1] stats graphics grDevices utils datasets methods base
+##
+## other attached packages:
+## [1] DT_0.33 patchwork_1.2.0 lubridate_1.9.3 forcats_1.0.0
+## [5] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
+## [9] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
+## [13] Seurat_5.1.0 SeuratObject_5.0.2 sp_2.1-4
+##
+## loaded via a namespace (and not attached):
+## [1] RColorBrewer_1.1-3 rstudioapi_0.16.0 jsonlite_1.8.8
+## [4] magrittr_2.0.3 spatstat.utils_3.1-0 farver_2.1.2
+## [7] rmarkdown_2.28 vctrs_0.6.5 ROCR_1.0-11
+## [10] spatstat.explore_3.3-2 htmltools_0.5.8.1 sass_0.4.9
+## [13] sctransform_0.4.1 parallelly_1.38.0 KernSmooth_2.23-24
+## [16] bslib_0.8.0 htmlwidgets_1.6.4 ica_1.0-3
+## [19] plyr_1.8.9 plotly_4.10.4 zoo_1.8-12
+## [22] cachem_1.1.0 igraph_2.0.3 mime_0.12
+## [25] lifecycle_1.0.4 pkgconfig_2.0.3 Matrix_1.7-0
+## [28] R6_2.5.1 fastmap_1.2.0 fitdistrplus_1.2-1
+## [31] future_1.34.0 shiny_1.9.1 digest_0.6.37
+## [34] colorspace_2.1-1 rprojroot_2.0.4 tensor_1.5
+## [37] RSpectra_0.16-2 irlba_2.3.5.1 labeling_0.4.3
+## [40] progressr_0.14.0 fansi_1.0.6 spatstat.sparse_3.1-0
+## [43] timechange_0.3.0 httr_1.4.7 polyclip_1.10-7
+## [46] abind_1.4-5 compiler_4.4.1 bit64_4.0.5
+## [49] withr_3.0.1 fastDummies_1.7.4 highr_0.11
+## [52] MASS_7.3-61 tools_4.4.1 lmtest_0.9-40
+## [55] httpuv_1.6.15 future.apply_1.11.2 goftest_1.2-3
+## [58] glue_1.7.0 nlme_3.1-166 promises_1.3.0
+## [61] grid_4.4.1 Rtsne_0.17 cluster_2.1.6
+## [64] reshape2_1.4.4 generics_0.1.3 gtable_0.3.5
+## [67] spatstat.data_3.1-2 tzdb_0.4.0 data.table_1.16.0
+## [70] hms_1.1.3 utf8_1.2.4 spatstat.geom_3.3-2
+## [73] RcppAnnoy_0.0.22 ggrepel_0.9.5 RANN_2.6.2
+## [76] pillar_1.9.0 vroom_1.6.5 spam_2.10-0
+## [79] RcppHNSW_0.6.0 later_1.3.2 splines_4.4.1
+## [82] lattice_0.22-6 bit_4.0.5 survival_3.7-0
+## [85] deldir_2.0-4 tidyselect_1.2.1 miniUI_0.1.1.1
+## [88] pbapply_1.7-2 knitr_1.48 gridExtra_2.3
+## [91] scattermore_1.2 xfun_0.47 matrixStats_1.3.0
+## [94] stringi_1.8.4 lazyeval_0.2.2 yaml_2.3.10
+## [97] evaluate_0.24.0 codetools_0.2-20 cli_3.6.3
+## [100] uwot_0.2.2 xtable_1.8-4 reticulate_1.38.0
+## [103] munsell_0.5.1 jquerylib_0.1.4 Rcpp_1.0.13
+## [106] globals_0.16.3 spatstat.random_3.3-1 png_0.1-8
+## [109] spatstat.univar_3.0-0 parallel_4.4.1 dotCall64_1.1-1
+## [112] listenv_0.9.1 viridisLite_0.4.2 scales_1.3.0
+## [115] ggridges_0.5.6 crayon_1.5.3 leiden_0.4.3.1
+## [118] rlang_1.1.4 cowplot_1.1.3
+