AlexsLemonade · jashapiro · Oct 24, 2024 · Oct 7, 2024 · Oct 15, 2024 · Oct 22, 2024
@@ -0,0 +1,356 @@
+---
+title: "EDA of DSRCT Samples"
+author: Danh Truong
+date: "`r Sys.Date()`"
+output:
+  html_document:
+    toc: true
+    toc_depth: 3
+---
+
+
+## Introduction
+
+This notebook looks explores the data from the DSRCT sample set, `SCPCP000013`. 
+We then see if we can use expression of DSRCT-specific genes  to manually classify tumor and normal cells.
+The main goal of this notebook is only to identify tumor cells, identification and labeling of the other cells is a separate question that we do not answer here. 
+
+- First we look at expression of each of the DSRCT-specific genes  across all cells. 
+- Then we use a z-transform prior to summing expression of all DSRCT-specific genes  
+Cells with a z-score for any DSRCT-specific genes  > 0 are classified as tumor cells. 
+- We anticipate that normal cells will not express DSRCT-specific genes.
+
+## Setup
+```{r packages}
+suppressPackageStartupMessages({
+  # load required packages
+  library(SingleCellExperiment)
+  library(ggplot2)
+})
+
+# Set default ggplot theme
+theme_set(
+  theme_bw()
+)
+```
+
+
+```{r base paths}
+# The base path for the OpenScPCA repository, found by its (hidden) .git directory
+repository_base <- rprojroot::find_root(rprojroot::is_git_root)
+
+# The current data directory, found within the repository base directory
+data_dir <- file.path(repository_base, "data", "2024-07-08")
-data_dir <- file.path(repository_base, "data", "2024-07-08")
+data_dir <- file.path(repository_base, "data", "current")
-data_dir <- file.path(repository_base, "data", "2024-07-08")
+data_dir <- file.path(repository_base, "data", "current")
+#sample_dir <- file.path(data_dir, "SCPCP000013", params$sample_id)
+
+# The path to this module
+module_base <- file.path(repository_base, "analyses", "cell-type-DSRCT")
+```
+
+
+```{r}
+# source in helper functions: make_jaccard_matrix() and jaccard() 
+jaccard_functions <- file.path(module_base, "scripts", "utils", "jaccard-functions.R")
+source(jaccard_functions)
+```
-
-```{r}
-# source in helper functions: make_jaccard_matrix() and jaccard() 
-jaccard_functions <- file.path(module_base, "scripts", "utils", "jaccard-functions.R")
-source(jaccard_functions)
-```
+
+```{r}
+# source in helper functions: make_jaccard_matrix() and jaccard() 
+jaccard_functions <- file.path(module_base, "scripts", "utils", "jaccard-functions.R")
+source(jaccard_functions)
+```
-
-```{r}
-# source in helper functions: make_jaccard_matrix() and jaccard() 
-jaccard_functions <- file.path(module_base, "scripts", "utils", "jaccard-functions.R")
-source(jaccard_functions)
-```
+
+```{r}
+# source in helper functions: make_jaccard_matrix() and jaccard() 
+jaccard_functions <- file.path(module_base, "scripts", "utils", "jaccard-functions.R")
+source(jaccard_functions)
+```
+
+```{r}
+metadata_file <- file.path(data_dir, "SCPCP000013", "single_cell_metadata.tsv")
+metadata <- read.csv(metadata_file,  sep = '\t')
+metadata_DSRCT <- dplyr::filter(metadata, diagnosis == 'Desmoplastic small round cell tumor')
+
+```
+
+
+```{r paths}
+sce_file_list <- list()
+for (i in 1:dim(metadata_DSRCT)[1])
+{
+  # Input files
+  sce_file <- paste0(metadata_DSRCT$scpca_library_id[i], '_processed.rds')
+  sce_file_list[[i]] <- file.path(data_dir, "SCPCP000013", metadata_DSRCT$scpca_sample_id[i], sce_file)
+}
+
+marker_genes <- file.path(module_base, "references", "tumor-marker-genes.tsv")
+
+# output tumor/normal classifications
+results_dir <- file.path(module_base, "results", "marker_gene_analysis")
+fs::dir_create(results_dir)
+
+#classifications_filename <- glue::glue("{params$library_id}_tumor_normal_classifications.tsv")
+#output_classifications_file <- file.path(results_dir, classifications_filename)
+```
+
+Read in each data as a separate object `SingleCellExperiment` in the list. 
+```{r}
+sce_list <- lapply(sce_file_list, function(x){
+  # read in processed sce
+  sce <- readr::read_rds(x)
+})
+
+#adding the sample id to the name of each SingleCellExperiment
+names(sce_list) <- metadata_DSRCT$scpca_sample_id
+
+# read in marker genes table 
+marker_genes_df <- readr::read_tsv(marker_genes) |> 
+  # account for genes being from multiple sources
+  dplyr::select(cell_type, ensembl_gene_id, gene_symbol) |> 
+  dplyr::distinct()
+
+marker_genes_df
+
+marker_genes <- marker_genes_df |> 
+  dplyr::filter(cell_type == "tumor") |> 
+  dplyr::pull(ensembl_gene_id)
+```
+
+
+## Analysis content
+
+### Explore marker gene expression 
+
+The first thing we do here is just create a faceted UMAP showing the expression of each marker gene for tumor cells. 
+
+```{r}
+umap_df_list <- lapply(sce_list, function(x) {
+  # get the gene expression counts for all marker genes
+  marker_gene_exp <- logcounts(x[marker_genes, ]) |>
+    as.matrix() |>
+    t() |>
+    as.data.frame() |>
+    tibble::rownames_to_column("barcodes")
+
+  # pull out the UMAP coordinates and make a data frame to use for plotting
+  umap_df <- x |>
+    scuttle::makePerCellDF(use.dimred = "UMAP") |>
+    # replace UMAP.1 with UMAP1
+    dplyr::rename_with(\(x) stringr::str_replace(x, "^UMAP\\.", "UMAP")) |>
+    # add in marker gene expression to dataframe
+    dplyr::left_join(marker_gene_exp, by = "barcodes") |>
+    # combine all genes into a single column for easy faceting
+    tidyr::pivot_longer(cols = starts_with("ENSG"),
+                        names_to = "ensembl_gene_id",
+                        values_to = "gene_expression") |>
+    # join with marker gene df to get gene symbols for plotting
+    dplyr::left_join(marker_genes_df, by = c("ensembl_gene_id")) |>
+    dplyr::select(barcodes,
+                  UMAP1,
+                  UMAP2,
+                  gene_symbol,
+                  ensembl_gene_id,
+                  gene_expression,
+                  cluster)
+})
+
+names(umap_df_list) <- metadata_DSRCT$scpca_sample_id
+```
+
+
+```{r, fig.width=8}
+for(i in 1:length(umap_df_list)){
+  # faceted umap showing a umap panel for each marker gene
+  p <- ggplot(umap_df_list[[i]], aes(x = UMAP1, y = UMAP2, color = gene_expression)) +
+    geom_point(alpha = 0.8, size = 0.1) +
+    facet_wrap(vars(gene_symbol)) +
+    scale_color_viridis_c() +
+    labs(color = "Log-normalized gene expression") +
+    # remove axis numbers and background grid
+    scale_x_continuous(labels = NULL, breaks = NULL) +
+    scale_y_continuous(labels = NULL, breaks = NULL) +
+    theme(
+      aspect.ratio = 1,
+      legend.position = "bottom",
+      axis.title = element_text(size = 9, color = "black"),
+      strip.text = element_text(size = 8),
+      legend.title = element_text(size = 9),
+      legend.text = element_text(size = 8)
+    ) +
+    guides(colour = guide_colorbar(title.position = "bottom", title.hjust = 0.5)) +
+    ggtitle(names(umap_df_list)[i])
+  print(p)
+}
+
+
+```
+
+
+In my experience, ST6GALNAC5 is a strong marker for DSRCT in single-cell data. As can be seen, several of the samples have expression of ST6GALNAC5, but some do not, such as SCPCS000731 and SCPCS000729. In fact, these SCPCS000729 contains  low number of cells. These  samples in particular are the PDX samples and I will be excluding SCPCS000729 from further analyses. 
+
+```{r}
+umap_df_list_excluded <- umap_df_list[!(names(umap_df_list) %in% c('SCPCS000731'))]
+```
+
+
+We can also look at the distributions for each marker gene. 
+I would expect to see some sort of bimodal distribution separating cells that do and do not have expression of the marker gene. What is clear from these plots that ST6GALNAC5, CACNA2D2, PTPRQ, IQCJ-SCHIP1,  show a bi modal distribution among the cells. To some extent, we do see expression of the other markers. 
+
+```{r}
+for(i in 1:length(umap_df_list_excluded)) {
+  p <- ggplot(umap_df_list_excluded[[i]],
+              aes(x = gene_expression, fill = gene_symbol)) +
+    geom_density() +
+    facet_wrap(vars(gene_symbol), scales = 'free_y') +
+    theme(legend.position = "none") + 
+    ggtitle(names(umap_df_list_excluded)[i])
+  print(p)
+}
+```
+
+Although we see some slight semblance of a bimodal distribution for most marker genes, it is hard to see and we cannot directly compare gene expression values across genes. 
+This would make it hard to identify a cut off to categorize cells as expression or not expressing the marker gene. 
+
+Now we will transform each of the gene expression vectors by generating z-scores. 
+Then we might be able to find a cut off we can use across samples for if marker genes are present in a cell or not. 
+
+```{r}
+umap_df_list_excluded_scaled <- lapply(umap_df_list_excluded, function(x){
+  x   |>
+  dplyr::group_by(gene_symbol) |>
+  # get z-scores for each gene
+  dplyr::mutate(transformed_gene_expression = scale(gene_expression)[, 1]) |>
+  dplyr::ungroup()
+})
+
+```
+
+
+Now we can create the same density plot but looking at z-scores. Interestingly, in some samples, like SPCS000489, we see that ST6GALNAC5 has negative values. Presumably, these are tumor cells but there may be a skew due to the number of tumor cells compared to normal. 
+
+```{r}
+
+for(i in 1:length(umap_df_list_excluded_scaled)) {
+  p <- ggplot(umap_df_list_excluded_scaled[[i]],
+              aes(x = transformed_gene_expression, fill = gene_symbol)) +
+    geom_density() +
+    facet_wrap(vars(gene_symbol), scales = 'free_y') +
+    theme(legend.position = "none") + 
+    ggtitle(names(umap_df_list_excluded_scaled)[i])
+  print(p)
+}
+```
+
+It looks like some marker genes have distinct groups of cells with z-score > 0, while other marker genes may not be as informative. 
+
+### Classify tumor cells using marker genes only
+
+Let's try and use the marker gene expression to classify tumor cells. 
+It looks like we could use a cutoff of z-score > 0 to count that cell as a tumor cell. 
+
+We could either count any cell that expresses at least one marker gene > 0 as a tumor cell, or look at the combined expression. 
+Let's start with classifying tumor cells as tumor if any marker gene is present (z-score > 0). 
+This also means the sum of all z-scores > 0. 
+
+Below, we can get the sum of the transformed gene expression of all marker genes and plot in a single UMAP. 
+
+```{r}
+# calculate sum gene expression across all marker genes in list 
+marker_sum_exp <- lapply(umap_df_list_excluded_scaled, function(x){
+  x |>
+  dplyr::group_by(barcodes) |> 
+  dplyr::mutate(sum_exp = sum(transformed_gene_expression, na.rm = T)) |> 
+  dplyr::select(barcodes, UMAP1, UMAP2, sum_exp, cluster) |> 
+  dplyr::distinct()
+})
+
+
+# plot mean gene expression 
+
+for(i in 1:length(marker_sum_exp)) {
+  p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_exp)) +
+  geom_point(size = 0.5, alpha = 0.5) +
+  scale_color_viridis_c() +
+    ggtitle(names(umap_df_list_excluded_scaled)[i])
+  print(p)
+}
+
+```
+Similar to the individual plots, it looks like there is one group of cells on the bottom right that has the highest marker gene expression. 
+We would anticipate that these are most likely to be the tumor cells. 
+
+Now let's classify any cell that has a sum of marker genes > 0 (after z-transformation) as tumor cells. 
+
+```{r}
+# classify tumor cells based on presence of any marker genes 
+marker_sum_exp <- lapply(marker_sum_exp, function(x){
+  x |> 
+  dplyr::mutate(sum_classification = dplyr::if_else(sum_exp > 0, "Tumor", "Normal"))})
+
+
+for(i in 1:length(marker_sum_exp)) {
+  p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_classification)) +
+  geom_point(size = 0.5, alpha = 1) +
+  ggtitle(names(umap_df_list_excluded_scaled)[i])
+  print(p)
+}
+
+```
+
+This gives us a rough idea of cells that may be classified as tumor cells. However, I believe re-doing this analysis using the cluster level data may give us better results. This is highly dependent on whether the clusters were generated with enough granularity.
+
+```{r}
+# calculate sum gene expression across all marker genes in list 
+marker_sum_exp <- lapply(umap_df_list_excluded_scaled, function(x){
+  x |> 
+  dplyr::group_by(cluster) |> #change to cluster
+  dplyr::mutate(sum_exp = sum(transformed_gene_expression, na.rm = T)) |> 
+  dplyr::select(barcodes, UMAP1, UMAP2, sum_exp, cluster) |> 
+  dplyr::distinct()
+})
+
+
+# plot mean gene expression 
+
+for(i in 1:length(marker_sum_exp)) {
+  p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_exp)) +
+  geom_point(size = 0.5, alpha = 0.5) +
+  scale_color_viridis_c() +
+    ggtitle(names(umap_df_list_excluded_scaled)[i])
+  print(p)
+}
+
+```
+It looks like there are group of cells with high marker gene expression. We  anticipate that these are most likely to be the tumor cells. The groups with low or negative values are likely normal cells. 
+
+Now let's classify clusters that has a sum of marker genes > 0 (after z-transformation) as tumor cell clusters. 
+
+
+```{r}
+# classify tumor cells based on presence of any marker genes 
+marker_sum_exp <- lapply(marker_sum_exp, function(x){
+  d <- density(x$sum_exp)
+
+  x |> 
+  dplyr::mutate(sum_classification = dplyr::if_else(sum_exp > 0, "Tumor", "Normal"))})
+
+
+for(i in 1:length(marker_sum_exp)) {
+  p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_classification)) +
+  geom_point(size = 0.5, alpha = 1) +
+  ggtitle(names(umap_df_list_excluded_scaled)[i])
+  print(p)
+}
+
+```
+
+Based on my experiences, the above plots show that we are unable to adequately determine the tumor cells from the normal cells. This can be seen in SCPCS000729, which is most likely all tumor cells, since it is collected from a patient-derived xenograft.
+
+## Conclusions 
+
+- We do see variation in DSRCT-specific gene expression across cells, suggesting that there may be gene drop-out reducing the quality of the data. 
+- `ST6GALNAC5`, `PTRPQ`, and `IQCJ-SCHIP1` are good markers for the DSRCT cells within this data. The other markers are not as highly expressed either due to heterogeneity or data quality. 
+- For the next steps, we may try to use `SingleR` or `CellAssign` to identify the normal cells and then work to identify remaining cells as tumor cells. Normal cells may have more definitive markers than tumor cells. 
+
+## Save outputs 
+
+```{r}
+# get an RDS of the processed data
+saveRDS(sce_file_list, '../results/SCPCP000013_sce_file_list.rds')
-# get an RDS of the processed data
-saveRDS(sce_file_list, '../results/SCPCP000013_sce_file_list.rds')
+# save an RDS of the processed data
+saveRDS(umap_df_list, file.path(module_base, 'results/SCPCP000013_umap_df_list.rds')
-# get an RDS of the processed data
-saveRDS(sce_file_list, '../results/SCPCP000013_sce_file_list.rds')
+# save an RDS of the processed data
+saveRDS(umap_df_list, file.path(module_base, 'results/SCPCP000013_umap_df_list.rds')
+```
+
+
+## Session Info
+
+```{r session info}
+# record the versions of the packages used in this analysis and other environment information
+sessionInfo()
+```
@@ -0,0 +1,9 @@
+cell_type	gene_symbol	ensembl_gene_id	source
+tumor	ST6GALNAC5	ENSG00000117069	internal
+tumor	CACNA2D2	ENSG00000007402	internal
+tumor	IQCJ-SCHIP1	ENSG00000283154	internal
+tumor	PTPRQ	ENSG00000139304	Internal
+tumor	GAL	ENSG00000069482	https://doi.org/10.1002/gcc.22955
+tumor	GALP	ENSG00000197487	https://doi.org/10.1002/gcc.22955
+tumor	GJB2	ENSG00000165474	https://doi.org/10.1002/gcc.22955
+