diff --git a/.github/components/dictionary.txt b/.github/components/dictionary.txt index cc834fc48..c32f83983 100644 --- a/.github/components/dictionary.txt +++ b/.github/components/dictionary.txt @@ -45,6 +45,7 @@ conda config containerd CopyKAT +CPM CRediT Crompton Ctrl @@ -82,9 +83,12 @@ erythroid et ETP ewings +EWS EWS-FLI1 +exocrine fibroblast fibroblasts +FLI formatters Generis GFM @@ -106,6 +110,7 @@ histological histologies homotypic HPC +HSC IAM ICJME ie @@ -123,6 +128,7 @@ Jupyter karyotyping LCA leiden +leptomeningeal LGBTQ licensor licensor's @@ -134,7 +140,7 @@ linter's linters lockfile Looney -Louvain +louvain LSfR macOS macrophage @@ -151,30 +157,40 @@ microRNA Miniconda Miniforge misidentification +modularity monocyte monocytes +mononuclear MSC multifactor +multinucleated myeloid natively Nextflow nephroblastoma nephron +neutrophil NK nonconsensual octicons onboarded oncotarget +ontologies openscpca OpenScPCA OpenScPCA's +osteoblasts +osteoclast +osteoclasts overclustered Panglao PanglaoDB PDX peritubular +perivascular ploidy pluripotent +programmatically PMID PNG podman @@ -215,6 +231,7 @@ ScType SEACells SemVar seq +SingleCellExperiment SingleR snRNA socio @@ -247,6 +264,7 @@ UCell UMAP uncomment unhide +ureteric uteric vCPU vCPUs diff --git a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd index 9a06b85cb..68a70703f 100644 --- a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd +++ b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd @@ -40,31 +40,31 @@ repository_base <- rprojroot::find_root(rprojroot::is_git_root) # The path to this module ref_dir <- file.path(repository_base, "analyses", "cell-type-consensus", "references") -# path to ref file for panglao +# path to ref file for panglao panglao_file <- file.path(ref_dir, "panglao-cell-type-ontologies.tsv") ``` ```{r} -# 
grab obo file +# grab obo file cl_ont <- ontologyIndex::get_ontology("http://purl.obolibrary.org/obo/cl-basic.obo") -# read in panglao file +# read in panglao file panglao_df <- readr::read_tsv(panglao_file) |> - # rename columns to have panglao in them for easy joining later + # rename columns to have panglao in them for easy joining later dplyr::select( panglao_ontology = "ontology_id", panglao_annotation = "human_readable_value" ) # grab singler ref from celldex -blueprint_ref <- celldex::BlueprintEncodeData() +blueprint_ref <- celldex::BlueprintEncodeData() # get ontologies and human readable name into data frame blueprint_df <- data.frame( blueprint_ontology = blueprint_ref$label.ont, blueprint_annotation_main = blueprint_ref$label.main, blueprint_annotation_fine = blueprint_ref$label.fine -) |> +) |> unique() ``` @@ -74,12 +74,12 @@ Below I will calculate the total number of ancestors and the total number of des This will give us an idea of the range of values we expect to see when looking at the PanglaoDB and Blueprint Encode references. 
```{r} -# turn cl_ont into data frame with one row per term +# turn cl_ont into data frame with one row per term cl_df <- data.frame( cl_ontology = cl_ont$id, cl_annotation = cl_ont$name -) |> - dplyr::rowwise() |> +) |> + dplyr::rowwise() |> dplyr::mutate( # list all ancestors and descendants calculate total ancestors = list(ontologyIndex::get_ancestors(cl_ont, cl_ontology)), @@ -93,29 +93,31 @@ The vertical lines in the below plot indicate the value for cell types of varyin ```{r} celltypes_of_interest <- c("eukaryotic cell", "lymphocyte", "leukocyte", "hematopoietic cell", "T cell", "endothelial cell", "smooth muscle cell", "memory T cell") -line_df <- cl_df |> - dplyr::filter(cl_annotation %in% celltypes_of_interest) |> - dplyr::select(cl_annotation, total_descendants, total_ancestors) |> +line_df <- cl_df |> + dplyr::filter(cl_annotation %in% celltypes_of_interest) |> + dplyr::select(cl_annotation, total_descendants, total_ancestors) |> unique() -# group any labels that have the same number of ancestors -ancestor_labels_df <- line_df |> - dplyr::group_by(total_ancestors) |> +# group any labels that have the same number of ancestors +ancestor_labels_df <- line_df |> + dplyr::group_by(total_ancestors) |> dplyr::summarise(cl_annotation = paste(cl_annotation, collapse = ",")) ``` ```{r} -# make density plots showing distribution of ancestors and descendants +# make density plots showing distribution of ancestors and descendants ggplot(cl_df, aes(x = total_ancestors)) + geom_density(fill = "#00274C", alpha = 0.5) + - geom_vline(data = ancestor_labels_df, - mapping = aes(xintercept = total_ancestors), - lty = 2) + + geom_vline( + data = ancestor_labels_df, + mapping = aes(xintercept = total_ancestors), + lty = 2 + ) + geom_text( data = ancestor_labels_df, mapping = aes(x = total_ancestors, y = 0.04, label = cl_annotation), - angle = 90, + angle = 90, vjust = -0.5 ) + labs( @@ -132,13 +134,15 @@ Below we will look at total number of descendants. 
```{r} ggplot(cl_df, aes(x = total_descendants)) + geom_density(fill = "#FFCB05", alpha = 0.5) + - geom_vline(data = line_df, - mapping = aes(xintercept = total_descendants), - lty = 2) + + geom_vline( + data = line_df, + mapping = aes(xintercept = total_descendants), + lty = 2 + ) + geom_text( data = line_df, mapping = aes(x = total_descendants, y = 0.6, label = cl_annotation), - angle = 90, + angle = 90, vjust = -0.5 ) + labs( @@ -152,20 +156,22 @@ It looks like most cell types have very few descendants, so let's zoom into the ```{r} ggplot(cl_df, aes(x = total_descendants)) + geom_density(fill = "#FFCB05", alpha = 0.5) + - geom_vline(data = line_df, - mapping = aes(xintercept = total_descendants), - lty = 2) + + geom_vline( + data = line_df, + mapping = aes(xintercept = total_descendants), + lty = 2 + ) + geom_text( data = line_df, mapping = aes(x = total_descendants, y = 0.6, label = cl_annotation), - angle = 90, + angle = 90, vjust = -0.5 ) + labs( x = "Number of descendants", y = "Density" ) + - xlim(c(0,500)) + xlim(c(0, 500)) ``` Here we see a much larger range of values and that cell types become more general as the number of descendants goes up. 
@@ -194,42 +200,44 @@ cl_graph <- igraph::make_graph(rbind(unlist(parent_terms), rep(names(parent_term ```{r} # get a data frame with all combinations of panglao and blueprint terms -# one row for each combination -all_ref_df <- expand.grid(panglao_df$panglao_ontology, - blueprint_df$blueprint_ontology) |> +# one row for each combination +all_ref_df <- expand.grid( + panglao_df$panglao_ontology, + blueprint_df$blueprint_ontology +) |> dplyr::rename( panglao_ontology = "Var1", blueprint_ontology = "Var2" - ) |> + ) |> # add in the human readable values for each ontology term - dplyr::left_join(blueprint_df, by = "blueprint_ontology") |> - dplyr::left_join(panglao_df, by = "panglao_ontology") |> - tidyr::drop_na() |> - dplyr::rowwise() |> + dplyr::left_join(blueprint_df, by = "blueprint_ontology") |> + dplyr::left_join(panglao_df, by = "panglao_ontology") |> + tidyr::drop_na() |> + dplyr::rowwise() |> dplyr::mutate( # least common shared ancestor lca = list(rownames(ontoProc::findCommonAncestors(blueprint_ontology, panglao_ontology, g = cl_graph))) ) -lca_df <- all_ref_df |> +lca_df <- all_ref_df |> dplyr::mutate( - total_lca = length(lca), # max is three terms - lca = paste0(lca, collapse = ",") # make it easier to split the df + total_lca = length(lca), # max is three terms + lca = paste0(lca, collapse = ",") # make it easier to split the df ) |> - # split each lca term into its own column - tidyr::separate(lca, into = c("lca_1", "lca_2", "lca_3"), sep = ",") |> + # split each lca term into its own column + tidyr::separate(lca, into = c("lca_1", "lca_2", "lca_3"), sep = ",") |> tidyr::pivot_longer( cols = dplyr::starts_with("lca"), names_to = "lca_number", values_to = "lca" - ) |> - tidyr::drop_na() |> - dplyr::select(-lca_number) |> - # account for any cases where the ontology IDs are exact matches + ) |> + tidyr::drop_na() |> + dplyr::select(-lca_number) |> + # account for any cases where the ontology IDs are exact matches # r complains about doing this earlier 
since the lca column holds lists until now - dplyr::mutate(lca = dplyr::if_else(blueprint_ontology == panglao_ontology, blueprint_ontology, lca)) |> - # join in information for each of the lca terms including name, number of ancestors and descendants - dplyr::left_join(cl_df, by = c("lca" = "cl_ontology")) + dplyr::mutate(lca = dplyr::if_else(blueprint_ontology == panglao_ontology, blueprint_ontology, lca)) |> + # join in information for each of the lca terms including name, number of ancestors and descendants + dplyr::left_join(cl_df, by = c("lca" = "cl_ontology")) ``` @@ -238,13 +246,15 @@ lca_df <- all_ref_df |> ```{r} ggplot(lca_df, aes(x = total_ancestors)) + geom_density() + - geom_vline(data = ancestor_labels_df, - mapping = aes(xintercept = total_ancestors), - lty = 2) + + geom_vline( + data = ancestor_labels_df, + mapping = aes(xintercept = total_ancestors), + lty = 2 + ) + geom_text( data = ancestor_labels_df, mapping = aes(x = total_ancestors, y = 0.6, label = cl_annotation), - angle = 90, + angle = 90, vjust = -0.5 ) + labs( @@ -256,13 +266,15 @@ ggplot(lca_df, aes(x = total_ancestors)) + ```{r} ggplot(lca_df, aes(x = total_descendants)) + geom_density() + - geom_vline(data = line_df, - mapping = aes(xintercept = total_descendants), - lty = 2) + + geom_vline( + data = line_df, + mapping = aes(xintercept = total_descendants), + lty = 2 + ) + geom_text( data = line_df, mapping = aes(x = total_descendants, y = 0.002, label = cl_annotation), - angle = 90, + angle = 90, vjust = -0.5 ) + labs( @@ -271,18 +283,20 @@ ggplot(lca_df, aes(x = total_descendants)) + ) ``` -Let's zoom into the area below 1000, since we already know we would want to exlude anything above that based on this plot. +Let's zoom into the area below 1000, since we already know we would want to exclude anything above that based on this plot. 
```{r} ggplot(lca_df, aes(x = total_descendants)) + geom_density() + - geom_vline(data = line_df, - mapping = aes(xintercept = total_descendants), - lty = 2) + + geom_vline( + data = line_df, + mapping = aes(xintercept = total_descendants), + lty = 2 + ) + geom_text( data = line_df, mapping = aes(x = total_descendants, y = 0.002, label = cl_annotation), - angle = 90, + angle = 90, vjust = -0.5 ) + xlim(c(0, 1000)) + @@ -304,30 +318,30 @@ This is likely to be a good cutoff for deciding which LCA labels to keep. ```{r} peak_idx <- splus2R::peaks(lca_df$total_descendants) -cutoff <- lca_df$total_descendants[peak_idx] |> - min() # find the smallest peak and use that as the cutoff for number of descendants +cutoff <- lca_df$total_descendants[peak_idx] |> + min() # find the smallest peak and use that as the cutoff for number of descendants ``` Below is the list of all consensus cell type labels that we will be keeping if we were to just use this cutoff. ```{r} -celltypes_to_keep <- lca_df |> - dplyr::filter(total_descendants <= cutoff) |> - dplyr::pull(cl_annotation) |> +celltypes_to_keep <- lca_df |> + dplyr::filter(total_descendants <= cutoff) |> + dplyr::pull(cl_annotation) |> unique() celltypes_to_keep ``` -We can also look at all the cell types we are keeping and the total number of descendants to see if there are any that may be we don't want to include because the term is too braod. +We can also look at all the cell types we are keeping and the total number of descendants to see if there are any that may be we don't want to include because the term is too broad. 
```{r} # pull out the cell types and total descendants for cell types to keep plot_celltype_df <- lca_df |> dplyr::filter(cl_annotation %in% celltypes_to_keep) |> - dplyr::select(cl_annotation, total_descendants) |> + dplyr::select(cl_annotation, total_descendants) |> unique() # bar chart showing total number of descendants for each cell type @@ -354,12 +368,12 @@ Below are tables that look specifically at the combinations of cell type annotat #### Blood cell ```{r} -print_df <- lca_df |> +print_df <- lca_df |> dplyr::select(blueprint_ontology, blueprint_annotation_main, blueprint_annotation_fine, panglao_ontology, panglao_annotation, total_lca, lca, cl_annotation) # blood cell -print_df |> - dplyr::filter(cl_annotation == "blood cell") +print_df |> + dplyr::filter(cl_annotation == "blood cell") ``` I think I'm in favor of not having a "blood cell" label, since I'm not sure that it's helpful. @@ -369,7 +383,7 @@ Also, if two different methods label something a platelet and a neutrophil, then ```{r} # bone cell -print_df |> +print_df |> dplyr::filter(cl_annotation == "bone cell") ``` @@ -379,7 +393,7 @@ I think I would also remove bone cell, since hematopoietic stem cells and osteoc ```{r} # myeloid leukocyte cell -print_df |> +print_df |> dplyr::filter(cl_annotation == "myeloid leukocyte") ``` @@ -390,9 +404,9 @@ Noting that after discussion we have decided to keep this one since T and B cell ```{r} # progenitor cell -print_df |> - dplyr::filter(cl_annotation == "progenitor cell") |> - head(n=15) # there's a lot of these so let's only print out some +print_df |> + dplyr::filter(cl_annotation == "progenitor cell") |> + head(n = 15) # there's a lot of these so let's only print out some ``` Same with `progenitor cell`, I do think it could be helpful to know that something may be a progenitor cell, but when you have a cell with the label for HSC and the label for cells like monocytes or osteoblasts, then maybe we are talking about a tumor cell instead. 
@@ -403,16 +417,16 @@ Along those same lines, I think the below terms, `lining cell` and `supporting c #### Lining cell ```{r} -# lining cell -print_df |> +# lining cell +print_df |> dplyr::filter(cl_annotation == "lining cell") ``` #### Supporting cell ```{r} -# supporting cell -print_df |> +# supporting cell +print_df |> dplyr::filter(cl_annotation == "supporting cell") ``` @@ -422,9 +436,9 @@ print_df |> We can also look at what cell type labels we are excluding when using this cut off to see if there are any terms we might actually want to keep instead. ```{r} -lca_df |> - dplyr::filter(total_descendants > cutoff) |> - dplyr::pull(cl_annotation) |> +lca_df |> + dplyr::filter(total_descendants > cutoff) |> + dplyr::pull(cl_annotation) |> unique() ``` @@ -435,7 +449,7 @@ Let's look at those combinations. ```{r} # neuron -print_df |> +print_df |> dplyr::filter(cl_annotation == "neuron") ``` @@ -446,7 +460,7 @@ Even though neuron has ~ 500 descendants, I think we should keep these labels. ```{r} # epithelial cell -print_df |> +print_df |> dplyr::filter(cl_annotation == "epithelial cell") ``` @@ -460,9 +474,9 @@ Maybe in the case where we have multiple LCAs we are already too broad and we sh Here I'm looking at the total number of descendants for all terms that show up because a term has multiple LCAs. 
```{r} -lca_df |> - dplyr::filter(total_lca > 1) |> - dplyr::select(cl_annotation, total_descendants) |> +lca_df |> + dplyr::filter(total_lca > 1) |> + dplyr::select(cl_annotation, total_descendants) |> unique() |> dplyr::arrange(total_descendants) ``` @@ -475,13 +489,13 @@ I'll also look to see what cell types we lose when we add this extra filtering s ```{r} # remove any combinations with more than one lca -filtered_lca_df <- lca_df |> +filtered_lca_df <- lca_df |> dplyr::filter(total_lca < 2) -# get a list of cell types to keep based on cutoff -updated_celltypes <- filtered_lca_df |> - dplyr::filter(total_descendants <= cutoff) |> - dplyr::pull(cl_annotation) |> +# get a list of cell types to keep based on cutoff +updated_celltypes <- filtered_lca_df |> + dplyr::filter(total_descendants <= cutoff) |> + dplyr::pull(cl_annotation) |> unique() # which cell types are now missing from the list to keep @@ -493,7 +507,7 @@ It looks like I am losing a few terms I already said were not specific and then #### Hematopoietic precursor cell ```{r} -print_df |> +print_df |> dplyr::filter(cl_annotation == "hematopoietic precursor cell") ``` @@ -503,8 +517,8 @@ I think in the context of pediatric cancer having this label would be helpful, s Let's look at what the other LCA is for an example set. 
```{r} -lca_df |> - dplyr::filter(panglao_ontology == "CL:0000037" & blueprint_ontology == "CL:0000050") |> +lca_df |> + dplyr::filter(panglao_ontology == "CL:0000037" & blueprint_ontology == "CL:0000050") |> dplyr::select(blueprint_annotation_main, blueprint_annotation_fine, panglao_annotation, cl_annotation) ``` @@ -514,7 +528,7 @@ Personally, I would keep the term for `hematopoietic precursor cell` because I t #### Perivascular cell ```{r} -print_df |> +print_df |> dplyr::filter(cl_annotation == "perivascular cell") ``` @@ -537,18 +551,20 @@ Then we will look at the values for pairs that have an LCA that pass the total d ```{r} information_content <- ontologySimilarity::descendants_IC(cl_ont) -# get similarity index for each set of terms -si_df <- lca_df |> - dplyr::rowwise() |> +# get similarity index for each set of terms +si_df <- lca_df |> + dplyr::rowwise() |> dplyr::mutate( - similarity_index = ontologySimilarity::get_sim_grid(ontology = cl_ont, - term_sets = list(panglao_ontology, blueprint_ontology)) |> + similarity_index = ontologySimilarity::get_sim_grid( + ontology = cl_ont, + term_sets = list(panglao_ontology, blueprint_ontology) + ) |> ontologySimilarity::get_sim() ) ``` ```{r} -si_df <- si_df |> +si_df <- si_df |> dplyr::mutate( lca_threshold = dplyr::if_else(total_descendants < cutoff, "PASS", "FAIL") ) @@ -570,24 +586,25 @@ Here each LCA term is its own plot and the vertical lines are the similarity ind ```{r} celltypes_to_plot <- c("myeloid leukocyte", "T cell", "cell", "supporting cell", "B cell") -celltypes_to_plot |> +celltypes_to_plot |> purrr::map(\(celltype){ - line_df <- si_df |> - dplyr::filter(cl_annotation == celltype) |> - dplyr::select(cl_annotation, similarity_index) |> + line_df <- si_df |> + dplyr::filter(cl_annotation == celltype) |> + dplyr::select(cl_annotation, similarity_index) |> unique() - + ggplot(si_df, aes(x = similarity_index)) + geom_density() + - geom_vline(data = line_df, - mapping = aes(xintercept = 
similarity_index), - lty = 2) + + geom_vline( + data = line_df, + mapping = aes(xintercept = similarity_index), + lty = 2 + ) + labs( x = "Similarity index", y = "Density", title = celltype ) - }) ``` diff --git a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html index e2cae57ad..caee7a793 100644 --- a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html +++ b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html @@ -9,12 +9,366 @@ - - -Summary of cell type ontologies in reference files +01-reference-exploration + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

<!DOCTYPE html>

+ + + + + + + + +Summary of cell type ontologies in reference files + - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - -

This notebook aims to identify a set of consensus labels between cell -types in the PanglaoDB and Blueprint Encode references.

+

+This notebook aims to identify a set of consensus labels between cell +types in the PanglaoDB and Blueprint Encode references. +

-

Setup

+

+Setup +

suppressPackageStartupMessages({
   # load required packages
   library(ggplot2)
@@ -529,12 +895,16 @@ 

Setup

unique()
-

Full cell ontology

-

Below I will calculate the total number of ancestors and the total +

+Full cell ontology +

+

+Below I will calculate the total number of ancestors and the total number of descendants for each term in the full cell type ontology and then show the distributions for those statistics. This will give us an idea of the range of values we expect to see when looking at the -PanglaoDB and Blueprint Encode references.

+PanglaoDB and Blueprint Encode references. +

# turn cl_ont into data frame with one row per term 
 cl_df <- data.frame(
   cl_ontology = cl_ont$id,
@@ -548,8 +918,10 @@ 

Full cell ontology

descendants = list(ontologyIndex::get_descendants(cl_ont, cl_ontology, exclude_roots = TRUE)), total_descendants = length(descendants) )
-

The vertical lines in the below plot indicate the value for cell -types of varying granularity.

+

+The vertical lines in the below plot indicate the value for cell types +of varying granularity. +

celltypes_of_interest <- c("eukaryotic cell", "lymphocyte", "leukocyte", "hematopoietic cell", "T cell", "endothelial cell", "smooth muscle cell", "memory T cell")
 line_df <- cl_df |> 
   dplyr::filter(cl_annotation %in% celltypes_of_interest) |> 
@@ -576,12 +948,18 @@ 

Full cell ontology

x = "Number of ancestors", y = "Density" )
-

-

Generally it looks like as the cell types get more specific we see a +

+ +

+

+Generally it looks like as the cell types get more specific we see a greater number of ancestors. However, the range of values is small and we see some cell types have the same value and probably not the same -level of granularity.

-

Below we will look at total number of descendants.

+level of granularity. +

+

+Below we will look at total number of descendants. +

ggplot(cl_df, aes(x = total_descendants)) +
   geom_density(fill = "#FFCB05", alpha = 0.5) +
   geom_vline(data = line_df,
@@ -597,9 +975,13 @@ 

Full cell ontology

x = "Number of descendants", y = "Density" )
-

-

It looks like most cell types have very few descendants, so let’s -zoom into the area below 500 to get a better look.

+

+ +

+

+It looks like most cell types have very few descendants, so let’s zoom +into the area below 500 to get a better look. +

ggplot(cl_df, aes(x = total_descendants)) +
   geom_density(fill = "#FFCB05", alpha = 0.5) +
   geom_vline(data = line_df,
@@ -619,30 +1001,42 @@ 

Full cell ontology

## Warning: Removed 14 rows containing non-finite outside the scale range (`stat_density()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_vline()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_text()`).
-

-

Here we see a much larger range of values and that cell types become +

+ +

+

+Here we see a much larger range of values and that cell types become more general as the number of descendants goes up. However, this distribution alone is probably not helpful in determining a cutoff. The next section we will look at this distribution specifically for cell -types present in our references, PanglaoDB and Blueprint encode.

+types present in our references, PanglaoDB and Blueprint encode. +

-

Latest common ancestor (LCA) between PanglaoDB and Blueprint -encode

-

This section will look at identifying the latest common ancestor -(LCA) between all possible combinations of terms from PanglaoDB (used -for assigning cell types with CellAssign) and the +

+Latest common ancestor (LCA) between PanglaoDB and Blueprint encode +

+

+This section will look at identifying the latest common ancestor (LCA) +between all possible combinations of terms from PanglaoDB (used for +assigning cell types with CellAssign) and the BlueprintEncodeData reference from celldex (used for assigning cell types with SingleR). The LCA refers to the latest term in the cell ontology hierarchy that is common -between two terms. I will use the ontoProc::findCommonAncestors() -function to get the LCA for each combination.

-

Note that it is possible to have more than one LCA for a set of -terms. To start, I will keep all LCA terms found.

-

For each LCA, I will again look at the total number of ancestors and +between two terms. I will use the +ontoProc::findCommonAncestors() +function to get the LCA for each combination. +

+

+Note that it is possible to have more than one LCA for a set of terms. +To start, I will keep all LCA terms found. +

+

+For each LCA, I will again look at the total number of ancestors and descendants and see if I can identify an appropriate cutoff. Ultimately, I would like to see if we can use that cutoff to decide if we should -keep the LCA term as the consensus label or use “Unknown”.

+keep the LCA term as the consensus label or use “Unknown”. +

# first set up the graph from cl ont
 parent_terms <- cl_ont$parents
 cl_graph <- igraph::make_graph(rbind(unlist(parent_terms), rep(names(parent_terms), lengths(parent_terms))))
@@ -696,7 +1090,9 @@

Latest common ancestor (LCA) between PanglaoDB and Blueprint
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 7967 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
 ## 18, 19, 20, ...].
-

Distribution of ancestors and descendants

+

+Distribution of ancestors and descendants +

ggplot(lca_df, aes(x = total_ancestors)) +
   geom_density() +
   geom_vline(data = ancestor_labels_df,
@@ -712,7 +1108,9 @@ 

Distribution of ancestors and descendants

x = "Total number of ancestors", y = "Density" )
-

+

+ +

ggplot(lca_df, aes(x = total_descendants)) +
   geom_density() +
   geom_vline(data = line_df,
@@ -728,9 +1126,13 @@ 

Distribution of ancestors and descendants

x = "Total number of descendants", y = "Density" )
-

-

Let’s zoom into the area below 1000, since we already know we would -want to exlude anything above that based on this plot.

+

+ +

+

+Let’s zoom into the area below 1000, since we already know we would want +to exclude anything above that based on this plot. +

ggplot(lca_df, aes(x = total_descendants)) +
   geom_density() +
   geom_vline(data = line_df,
@@ -750,27 +1152,37 @@ 

Distribution of ancestors and descendants

## Warning: Removed 6856 rows containing non-finite outside the scale range (`stat_density()`).
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_vline()`).
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_text()`).
-

-

We can use the vertical lines for cells of interest to help us define -a potential cutoff based on the granularity we would like to see in our +

+ +

+

+We can use the vertical lines for cells of interest to help us define a +potential cutoff based on the granularity we would like to see in our consensus label. We want to be able to label things like T cell, but we don’t want to label anything as lymphocyte as that’s probably not helpful. I don’t see any obvious cutoffs that may be present in the total number of ancestors, but the number of descendants is likely to be informative. I think it might be a good idea to start by drawing a line at the local maxima between the T cell and lymphocyte lines on the -number of descendants graph.

+number of descendants graph. +

-

Defining a cutoff for number of descendants

-

First we will find the value for the first peak shown in the +

+Defining a cutoff for number of descendants +

+

+First we will find the value for the first peak shown in the distribution. This is likely to be a good cutoff for deciding which LCA -labels to keep.

+labels to keep. +

peak_idx <- splus2R::peaks(lca_df$total_descendants)
 cutoff <- lca_df$total_descendants[peak_idx] |> 
   min() # find the smallest peak and use that as the cutoff for number of descendants 
-

Below is the list of all consensus cell type labels that we will be -keeping if we were to just use this cutoff.

+

+Below is the list of all consensus cell type labels that we will be +keeping if we were to just use this cutoff. +

celltypes_to_keep <- lca_df |> 
   dplyr::filter(total_descendants <= cutoff) |> 
   dplyr::pull(cl_annotation) |> 
@@ -796,9 +1208,11 @@ 

Defining a cutoff for number of descendants

## [49] "pericyte" "perivascular cell" "supporting cell" ## [52] "astrocyte" "glial cell" "macroglial cell" ## [55] "neuron associated cell" "mesangial cell"
-

We can also look at all the cell types we are keeping and the total +

+We can also look at all the cell types we are keeping and the total number of descendants to see if there are any that may be we don’t want -to include because the term is too braod.

+to include because the term is too broad. +

# pull out the cell types and total descendants for cell types to keep
 plot_celltype_df <- lca_df |>
   dplyr::filter(cl_annotation %in% celltypes_to_keep) |>
@@ -815,21 +1229,31 @@ 

Defining a cutoff for number of descendants

x = "cell type", y = "Total descendants" )
-

-

There are a few terms that I think might be more broad than we want -like blood cell, bone cell, -supporting cell, and lining cell. I’m on the -fence about keeping myeloid leukocyte and -progenitor cell. I think if we wanted to remove those terms -we could move our cutoff to be the same number of descendants as -T cell, since we do want to keep that.

-

One could also argue to remove stromal cell or -extracellular matrix secreting cell.

-

Below are tables that look specifically at the combinations of cell -type annotations that resulted in some of the terms that I might -consider removing.

+

+ +

+

+There are a few terms that I think might be more broad than we want like +blood cell, bone cell, supporting +cell, and lining cell. I’m on the fence about +keeping myeloid leukocyte and progenitor cell. +I think if we wanted to remove those terms we could move our cutoff to +be the same number of descendants as T cell, since we do +want to keep that. +

+

+One could also argue to remove stromal cell or +extracellular matrix secreting cell. +

+

+Below are tables that look specifically at the combinations of cell type +annotations that resulted in some of the terms that I might consider +removing. +

-

Blood cell

+

+Blood cell +

print_df <- lca_df |> 
   dplyr::select(blueprint_ontology, blueprint_annotation_main, blueprint_annotation_fine, panglao_ontology, panglao_annotation, total_lca, lca, cl_annotation)
 
@@ -850,87 +1274,203 @@ 

Blood cell

-blueprint_ontology -blueprint_annotation_main -blueprint_annotation_fine -panglao_ontology -panglao_annotation -total_lca -lca -cl_annotation + +blueprint_ontology + + +blueprint_annotation_main + + +blueprint_annotation_fine + + +panglao_ontology + + +panglao_annotation + + +total_lca + + +lca + + +cl_annotation + -CL:0000775 -Neutrophils -Neutrophils -CL:0000233 -platelet -2 -CL:0000081 -blood cell - - -CL:0000232 -Erythrocytes -Erythrocytes -CL:0000767 -basophil -2 -CL:0000081 -blood cell - - -CL:0000232 -Erythrocytes -Erythrocytes -CL:0000771 -eosinophil -2 -CL:0000081 -blood cell - - -CL:0000232 -Erythrocytes -Erythrocytes -CL:0000775 -neutrophil -2 -CL:0000081 -blood cell - - -CL:0000232 -Erythrocytes -Erythrocytes -CL:0000233 -platelet -2 -CL:0000081 -blood cell - - -CL:0000771 -Eosinophils -Eosinophils -CL:0000233 -platelet -2 -CL:0000081 -blood cell + +CL:0000775 + + +Neutrophils + + +Neutrophils + + +CL:0000233 + + +platelet + + +2 + + +CL:0000081 + + +blood cell + + + + +CL:0000232 + + +Erythrocytes + + +Erythrocytes + + +CL:0000767 + + +basophil + + +2 + + +CL:0000081 + + +blood cell + + + + +CL:0000232 + + +Erythrocytes + + +Erythrocytes + + +CL:0000771 + + +eosinophil + + +2 + + +CL:0000081 + + +blood cell + + + + +CL:0000232 + + +Erythrocytes + + +Erythrocytes + + +CL:0000775 + + +neutrophil + + +2 + + +CL:0000081 + + +blood cell + + + + +CL:0000232 + + +Erythrocytes + + +Erythrocytes + + +CL:0000233 + + +platelet + + +2 + + +CL:0000081 + + +blood cell + + + + +CL:0000771 + + +Eosinophils + + +Eosinophils + + +CL:0000233 + + +platelet + + +2 + + +CL:0000081 + + +blood cell +
-

I think I’m in favor of not having a “blood cell” label, since I’m -not sure that it’s helpful. Also, if two different methods label -something a platelet and a neutrophil, then perhaps that label is -inaccurate and it’s really a tumor cell.

+

+I think I’m in favor of not having a “blood cell” label, since I’m not +sure that it’s helpful. Also, if two different methods label something a +platelet and a neutrophil, then perhaps that label is inaccurate and +it’s really a tumor cell. +

-

Bone cell

+

+Bone cell +

# bone cell
 print_df |> 
   dplyr::filter(cl_annotation == "bone cell")
@@ -948,45 +1488,97 @@

Bone cell

-blueprint_ontology -blueprint_annotation_main -blueprint_annotation_fine -panglao_ontology -panglao_annotation -total_lca -lca -cl_annotation + +blueprint_ontology + + +blueprint_annotation_main + + +blueprint_annotation_fine + + +panglao_ontology + + +panglao_annotation + + +total_lca + + +lca + + +cl_annotation + -CL:0000557 -HSC -GMP -CL:0000092 -osteoclast -2 -CL:0001035 -bone cell - - -CL:0000557 -HSC -GMP -CL:0000137 -osteocyte -1 -CL:0001035 -bone cell + +CL:0000557 + + +HSC + + +GMP + + +CL:0000092 + + +osteoclast + + +2 + + +CL:0001035 + + +bone cell + + + + +CL:0000557 + + +HSC + + +GMP + + +CL:0000137 + + +osteocyte + + +1 + + +CL:0001035 + + +bone cell +
-

I think I would also remove bone cell, since hematopoietic stem cells -and osteoclasts seem pretty different to me.

+

+I think I would also remove bone cell, since hematopoietic stem cells +and osteoclasts seem pretty different to me. +

-

Myeloid leukocyte

+

+Myeloid leukocyte +

# myeloid leukocyte cell
 print_df |> 
   dplyr::filter(cl_annotation == "myeloid leukocyte")
@@ -1004,739 +1596,1895 @@

Myeloid leukocyte

-blueprint_ontology -blueprint_annotation_main -blueprint_annotation_fine -panglao_ontology -panglao_annotation -total_lca -lca -cl_annotation + +blueprint_ontology + + +blueprint_annotation_main + + +blueprint_annotation_fine + + +panglao_ontology + + +panglao_annotation + + +total_lca + + +lca + + +cl_annotation + -CL:0000775 -Neutrophils -Neutrophils -CL:0000583 -alveolar macrophage -1 -CL:0000766 -myeloid leukocyte - - -CL:0000775 -Neutrophils -Neutrophils -CL:0000235 -macrophage -1 -CL:0000766 -myeloid leukocyte - - -CL:0000775 -Neutrophils -Neutrophils -CL:0000097 -mast cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000775 -Neutrophils -Neutrophils -CL:0000576 -monocyte -1 -CL:0000766 -myeloid leukocyte - - -CL:0000775 -Neutrophils -Neutrophils -CL:0000576 -monocyte -1 -CL:0000766 -myeloid leukocyte - - -CL:0000775 -Neutrophils -Neutrophils -CL:0000092 -osteoclast -1 -CL:0000766 -myeloid leukocyte - - -CL:0000775 -Neutrophils -Neutrophils -CL:0000091 -Kupffer cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000775 -Neutrophils -Neutrophils -CL:0000453 -Langerhans cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000775 -Neutrophils -Neutrophils -CL:0000129 -microglial cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000775 -Neutrophils -Neutrophils -CL:0000889 -myeloid suppressor cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000775 -Neutrophils -Neutrophils -CL:0000576 -monocyte -1 -CL:0000766 -myeloid leukocyte - - -CL:0000775 -Neutrophils -Neutrophils -CL:0000576 -monocyte -1 -CL:0000766 -myeloid leukocyte - - -CL:0000775 -Neutrophils -Neutrophils -CL:0000874 -splenic red pulp macrophage -1 -CL:0000766 -myeloid leukocyte - - -CL:0000576 -Monocytes -Monocytes -CL:0000583 -alveolar macrophage -2 -CL:0000766 -myeloid leukocyte - - -CL:0000576 -Monocytes -Monocytes -CL:0000767 -basophil -1 -CL:0000766 -myeloid leukocyte - - -CL:0000576 -Monocytes -Monocytes -CL:0000771 -eosinophil -1 -CL:0000766 -myeloid leukocyte - - -CL:0000576 -Monocytes -Monocytes 
-CL:0000235 -macrophage -2 -CL:0000766 -myeloid leukocyte - - -CL:0000576 -Monocytes -Monocytes -CL:0000097 -mast cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000576 -Monocytes -Monocytes -CL:0000775 -neutrophil -1 -CL:0000766 -myeloid leukocyte - - -CL:0000576 -Monocytes -Monocytes -CL:0000092 -osteoclast -2 -CL:0000766 -myeloid leukocyte - - -CL:0000576 -Monocytes -Monocytes -CL:0000091 -Kupffer cell -2 -CL:0000766 -myeloid leukocyte - - -CL:0000576 -Monocytes -Monocytes -CL:0000453 -Langerhans cell -2 -CL:0000766 -myeloid leukocyte - - -CL:0000576 -Monocytes -Monocytes -CL:0000129 -microglial cell -2 -CL:0000766 -myeloid leukocyte - - -CL:0000576 -Monocytes -Monocytes -CL:0000889 -myeloid suppressor cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000576 -Monocytes -Monocytes -CL:0000874 -splenic red pulp macrophage -2 -CL:0000766 -myeloid leukocyte - - -CL:0000235 -Macrophages -Macrophages -CL:0000767 -basophil -1 -CL:0000766 -myeloid leukocyte - - -CL:0000235 -Macrophages -Macrophages -CL:0000771 -eosinophil -1 -CL:0000766 -myeloid leukocyte - - -CL:0000235 -Macrophages -Macrophages -CL:0000097 -mast cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000235 -Macrophages -Macrophages -CL:0000576 -monocyte -2 -CL:0000766 -myeloid leukocyte - - -CL:0000235 -Macrophages -Macrophages -CL:0000576 -monocyte -2 -CL:0000766 -myeloid leukocyte - - -CL:0000235 -Macrophages -Macrophages -CL:0000775 -neutrophil -1 -CL:0000766 -myeloid leukocyte - - -CL:0000235 -Macrophages -Macrophages -CL:0000092 -osteoclast -2 -CL:0000766 -myeloid leukocyte - - -CL:0000235 -Macrophages -Macrophages -CL:0000453 -Langerhans cell -3 -CL:0000766 -myeloid leukocyte - - -CL:0000235 -Macrophages -Macrophages -CL:0000889 -myeloid suppressor cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000235 -Macrophages -Macrophages -CL:0000576 -monocyte -2 -CL:0000766 -myeloid leukocyte - - -CL:0000235 -Macrophages -Macrophages -CL:0000576 -monocyte -2 -CL:0000766 -myeloid leukocyte - - -CL:0000863 
-Macrophages -Macrophages M1 -CL:0000767 -basophil -1 -CL:0000766 -myeloid leukocyte - - -CL:0000863 -Macrophages -Macrophages M1 -CL:0000771 -eosinophil -1 -CL:0000766 -myeloid leukocyte - - -CL:0000863 -Macrophages -Macrophages M1 -CL:0000097 -mast cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000863 -Macrophages -Macrophages M1 -CL:0000576 -monocyte -2 -CL:0000766 -myeloid leukocyte - - -CL:0000863 -Macrophages -Macrophages M1 -CL:0000576 -monocyte -2 -CL:0000766 -myeloid leukocyte - - -CL:0000863 -Macrophages -Macrophages M1 -CL:0000775 -neutrophil -1 -CL:0000766 -myeloid leukocyte - - -CL:0000863 -Macrophages -Macrophages M1 -CL:0000092 -osteoclast -2 -CL:0000766 -myeloid leukocyte - - -CL:0000863 -Macrophages -Macrophages M1 -CL:0000453 -Langerhans cell -3 -CL:0000766 -myeloid leukocyte - - -CL:0000863 -Macrophages -Macrophages M1 -CL:0000889 -myeloid suppressor cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000863 -Macrophages -Macrophages M1 -CL:0000576 -monocyte -2 -CL:0000766 -myeloid leukocyte - - -CL:0000863 -Macrophages -Macrophages M1 -CL:0000576 -monocyte -2 -CL:0000766 -myeloid leukocyte - - -CL:0000890 -Macrophages -Macrophages M2 -CL:0000767 -basophil -1 -CL:0000766 -myeloid leukocyte - - -CL:0000890 -Macrophages -Macrophages M2 -CL:0000771 -eosinophil -1 -CL:0000766 -myeloid leukocyte - - -CL:0000890 -Macrophages -Macrophages M2 -CL:0000097 -mast cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000890 -Macrophages -Macrophages M2 -CL:0000576 -monocyte -2 -CL:0000766 -myeloid leukocyte - - -CL:0000890 -Macrophages -Macrophages M2 -CL:0000576 -monocyte -2 -CL:0000766 -myeloid leukocyte - - -CL:0000890 -Macrophages -Macrophages M2 -CL:0000775 -neutrophil -1 -CL:0000766 -myeloid leukocyte - - -CL:0000890 -Macrophages -Macrophages M2 -CL:0000092 -osteoclast -2 -CL:0000766 -myeloid leukocyte - - -CL:0000890 -Macrophages -Macrophages M2 -CL:0000453 -Langerhans cell -3 -CL:0000766 -myeloid leukocyte - - -CL:0000890 -Macrophages -Macrophages M2 
-CL:0000889 -myeloid suppressor cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000890 -Macrophages -Macrophages M2 -CL:0000576 -monocyte -2 -CL:0000766 -myeloid leukocyte - - -CL:0000890 -Macrophages -Macrophages M2 -CL:0000576 -monocyte -2 -CL:0000766 -myeloid leukocyte - - -CL:0000771 -Eosinophils -Eosinophils -CL:0000583 -alveolar macrophage -1 -CL:0000766 -myeloid leukocyte - - -CL:0000771 -Eosinophils -Eosinophils -CL:0000235 -macrophage -1 -CL:0000766 -myeloid leukocyte - - -CL:0000771 -Eosinophils -Eosinophils -CL:0000097 -mast cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000771 -Eosinophils -Eosinophils -CL:0000576 -monocyte -1 -CL:0000766 -myeloid leukocyte - - -CL:0000771 -Eosinophils -Eosinophils -CL:0000576 -monocyte -1 -CL:0000766 -myeloid leukocyte - - -CL:0000771 -Eosinophils -Eosinophils -CL:0000092 -osteoclast -1 -CL:0000766 -myeloid leukocyte - - -CL:0000771 -Eosinophils -Eosinophils -CL:0000091 -Kupffer cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000771 -Eosinophils -Eosinophils -CL:0000453 -Langerhans cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000771 -Eosinophils -Eosinophils -CL:0000129 -microglial cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000771 -Eosinophils -Eosinophils -CL:0000889 -myeloid suppressor cell -1 -CL:0000766 -myeloid leukocyte - - -CL:0000771 -Eosinophils -Eosinophils -CL:0000576 -monocyte -1 -CL:0000766 -myeloid leukocyte - - -CL:0000771 -Eosinophils -Eosinophils -CL:0000576 -monocyte -1 -CL:0000766 -myeloid leukocyte - - -CL:0000771 -Eosinophils -Eosinophils -CL:0000874 -splenic red pulp macrophage -1 -CL:0000766 -myeloid leukocyte + +CL:0000775 + + +Neutrophils + + +Neutrophils + + +CL:0000583 + + +alveolar macrophage + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000775 + + +Neutrophils + + +Neutrophils + + +CL:0000235 + + +macrophage + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000775 + + +Neutrophils + + +Neutrophils + + +CL:0000097 + + +mast cell + + +1 + + +CL:0000766 + + 
+myeloid leukocyte + + + + +CL:0000775 + + +Neutrophils + + +Neutrophils + + +CL:0000576 + + +monocyte + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000775 + + +Neutrophils + + +Neutrophils + + +CL:0000576 + + +monocyte + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000775 + + +Neutrophils + + +Neutrophils + + +CL:0000092 + + +osteoclast + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000775 + + +Neutrophils + + +Neutrophils + + +CL:0000091 + + +Kupffer cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000775 + + +Neutrophils + + +Neutrophils + + +CL:0000453 + + +Langerhans cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000775 + + +Neutrophils + + +Neutrophils + + +CL:0000129 + + +microglial cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000775 + + +Neutrophils + + +Neutrophils + + +CL:0000889 + + +myeloid suppressor cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000775 + + +Neutrophils + + +Neutrophils + + +CL:0000576 + + +monocyte + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000775 + + +Neutrophils + + +Neutrophils + + +CL:0000576 + + +monocyte + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000775 + + +Neutrophils + + +Neutrophils + + +CL:0000874 + + +splenic red pulp macrophage + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000583 + + +alveolar macrophage + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000767 + + +basophil + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000771 + + +eosinophil + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000235 + + +macrophage + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000097 + + +mast cell + + +1 + + +CL:0000766 + + 
+myeloid leukocyte + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000775 + + +neutrophil + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000092 + + +osteoclast + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000091 + + +Kupffer cell + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000453 + + +Langerhans cell + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000129 + + +microglial cell + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000889 + + +myeloid suppressor cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000874 + + +splenic red pulp macrophage + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000235 + + +Macrophages + + +Macrophages + + +CL:0000767 + + +basophil + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000235 + + +Macrophages + + +Macrophages + + +CL:0000771 + + +eosinophil + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000235 + + +Macrophages + + +Macrophages + + +CL:0000097 + + +mast cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000235 + + +Macrophages + + +Macrophages + + +CL:0000576 + + +monocyte + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000235 + + +Macrophages + + +Macrophages + + +CL:0000576 + + +monocyte + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000235 + + +Macrophages + + +Macrophages + + +CL:0000775 + + +neutrophil + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000235 + + +Macrophages + + +Macrophages + + +CL:0000092 + + +osteoclast + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000235 + + +Macrophages + + +Macrophages + + +CL:0000453 + + +Langerhans cell + + +3 + + +CL:0000766 + + 
+myeloid leukocyte + + + + +CL:0000235 + + +Macrophages + + +Macrophages + + +CL:0000889 + + +myeloid suppressor cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000235 + + +Macrophages + + +Macrophages + + +CL:0000576 + + +monocyte + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000235 + + +Macrophages + + +Macrophages + + +CL:0000576 + + +monocyte + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000863 + + +Macrophages + + +Macrophages M1 + + +CL:0000767 + + +basophil + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000863 + + +Macrophages + + +Macrophages M1 + + +CL:0000771 + + +eosinophil + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000863 + + +Macrophages + + +Macrophages M1 + + +CL:0000097 + + +mast cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000863 + + +Macrophages + + +Macrophages M1 + + +CL:0000576 + + +monocyte + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000863 + + +Macrophages + + +Macrophages M1 + + +CL:0000576 + + +monocyte + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000863 + + +Macrophages + + +Macrophages M1 + + +CL:0000775 + + +neutrophil + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000863 + + +Macrophages + + +Macrophages M1 + + +CL:0000092 + + +osteoclast + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000863 + + +Macrophages + + +Macrophages M1 + + +CL:0000453 + + +Langerhans cell + + +3 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000863 + + +Macrophages + + +Macrophages M1 + + +CL:0000889 + + +myeloid suppressor cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000863 + + +Macrophages + + +Macrophages M1 + + +CL:0000576 + + +monocyte + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000863 + + +Macrophages + + +Macrophages M1 + + +CL:0000576 + + +monocyte + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000890 + + +Macrophages + + +Macrophages M2 + + +CL:0000767 + + 
+basophil + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000890 + + +Macrophages + + +Macrophages M2 + + +CL:0000771 + + +eosinophil + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000890 + + +Macrophages + + +Macrophages M2 + + +CL:0000097 + + +mast cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000890 + + +Macrophages + + +Macrophages M2 + + +CL:0000576 + + +monocyte + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000890 + + +Macrophages + + +Macrophages M2 + + +CL:0000576 + + +monocyte + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000890 + + +Macrophages + + +Macrophages M2 + + +CL:0000775 + + +neutrophil + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000890 + + +Macrophages + + +Macrophages M2 + + +CL:0000092 + + +osteoclast + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000890 + + +Macrophages + + +Macrophages M2 + + +CL:0000453 + + +Langerhans cell + + +3 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000890 + + +Macrophages + + +Macrophages M2 + + +CL:0000889 + + +myeloid suppressor cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000890 + + +Macrophages + + +Macrophages M2 + + +CL:0000576 + + +monocyte + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000890 + + +Macrophages + + +Macrophages M2 + + +CL:0000576 + + +monocyte + + +2 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000771 + + +Eosinophils + + +Eosinophils + + +CL:0000583 + + +alveolar macrophage + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000771 + + +Eosinophils + + +Eosinophils + + +CL:0000235 + + +macrophage + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000771 + + +Eosinophils + + +Eosinophils + + +CL:0000097 + + +mast cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000771 + + +Eosinophils + + +Eosinophils + + +CL:0000576 + + +monocyte + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000771 + + +Eosinophils + + 
+Eosinophils + + +CL:0000576 + + +monocyte + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000771 + + +Eosinophils + + +Eosinophils + + +CL:0000092 + + +osteoclast + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000771 + + +Eosinophils + + +Eosinophils + + +CL:0000091 + + +Kupffer cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000771 + + +Eosinophils + + +Eosinophils + + +CL:0000453 + + +Langerhans cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000771 + + +Eosinophils + + +Eosinophils + + +CL:0000129 + + +microglial cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000771 + + +Eosinophils + + +Eosinophils + + +CL:0000889 + + +myeloid suppressor cell + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000771 + + +Eosinophils + + +Eosinophils + + +CL:0000576 + + +monocyte + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000771 + + +Eosinophils + + +Eosinophils + + +CL:0000576 + + +monocyte + + +1 + + +CL:0000766 + + +myeloid leukocyte + + + + +CL:0000771 + + +Eosinophils + + +Eosinophils + + +CL:0000874 + + +splenic red pulp macrophage + + +1 + + +CL:0000766 + + +myeloid leukocyte +
-

I’m torn on this one, because I do think it’s helpful to know if +

+I’m torn on this one, because I do think it’s helpful to know if something is of the myeloid lineage, but if we aren’t keeping lymphocyte then I would argue we shouldn’t keep myeloid leukocyte. Noting that after discussion we have decided to keep this one since T and B cells are much easier to differentiate based on gene expression alone than -cells that are part of the myeloid lineage.

+cells that are part of the myeloid lineage. +

-

Progenitor cell

+

+Progenitor cell +

# progenitor cell
 print_df |> 
   dplyr::filter(cl_annotation == "progenitor cell") |> 
@@ -1755,181 +3503,443 @@ 

Progenitor cell

-blueprint_ontology -blueprint_annotation_main -blueprint_annotation_fine -panglao_ontology -panglao_annotation -total_lca -lca -cl_annotation + +blueprint_ontology + + +blueprint_annotation_main + + +blueprint_annotation_fine + + +panglao_ontology + + +panglao_annotation + + +total_lca + + +lca + + +cl_annotation + -CL:0000576 -Monocytes -Monocytes -CL:0000765 -erythroblast -2 -CL:0011026 -progenitor cell - - -CL:0000576 -Monocytes -Monocytes -CL:0000037 -hematopoietic stem cell -2 -CL:0011026 -progenitor cell - - -CL:0000576 -Monocytes -Monocytes -CL:0000062 -osteoblast -1 -CL:0011026 -progenitor cell - - -CL:0000576 -Monocytes -Monocytes -CL:0000158 -club cell -1 -CL:0011026 -progenitor cell - - -CL:0000576 -Monocytes -Monocytes -CL:0000038 -erythroid progenitor cell -2 -CL:0011026 -progenitor cell - - -CL:0000576 -Monocytes -Monocytes -CL:4042021 -neuronal-restricted precursor -1 -CL:0011026 -progenitor cell - - -CL:0000576 -Monocytes -Monocytes -CL:0002453 -oligodendrocyte precursor cell -1 -CL:0011026 -progenitor cell - - -CL:0000576 -Monocytes -Monocytes -CL:0002351 -progenitor cell of endocrine pancreas -1 -CL:0011026 -progenitor cell - - -CL:0000050 -HSC -MEP -CL:0000765 -erythroblast -2 -CL:0011026 -progenitor cell - - -CL:0000050 -HSC -MEP -CL:0000037 -hematopoietic stem cell -2 -CL:0011026 -progenitor cell - - -CL:0000050 -HSC -MEP -CL:0000576 -monocyte -2 -CL:0011026 -progenitor cell - - -CL:0000050 -HSC -MEP -CL:0000576 -monocyte -2 -CL:0011026 -progenitor cell - - -CL:0000050 -HSC -MEP -CL:0000062 -osteoblast -1 -CL:0011026 -progenitor cell - - -CL:0000050 -HSC -MEP -CL:0000158 -club cell -1 -CL:0011026 -progenitor cell - - -CL:0000050 -HSC -MEP -CL:0000038 -erythroid progenitor cell -3 -CL:0011026 -progenitor cell + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000765 + + +erythroblast + + +2 + + +CL:0011026 + + +progenitor cell + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000037 + + +hematopoietic stem cell + + +2 + + +CL:0011026 
+ + +progenitor cell + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000062 + + +osteoblast + + +1 + + +CL:0011026 + + +progenitor cell + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000158 + + +club cell + + +1 + + +CL:0011026 + + +progenitor cell + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0000038 + + +erythroid progenitor cell + + +2 + + +CL:0011026 + + +progenitor cell + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:4042021 + + +neuronal-restricted precursor + + +1 + + +CL:0011026 + + +progenitor cell + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0002453 + + +oligodendrocyte precursor cell + + +1 + + +CL:0011026 + + +progenitor cell + + + + +CL:0000576 + + +Monocytes + + +Monocytes + + +CL:0002351 + + +progenitor cell of endocrine pancreas + + +1 + + +CL:0011026 + + +progenitor cell + + + + +CL:0000050 + + +HSC + + +MEP + + +CL:0000765 + + +erythroblast + + +2 + + +CL:0011026 + + +progenitor cell + + + + +CL:0000050 + + +HSC + + +MEP + + +CL:0000037 + + +hematopoietic stem cell + + +2 + + +CL:0011026 + + +progenitor cell + + + + +CL:0000050 + + +HSC + + +MEP + + +CL:0000576 + + +monocyte + + +2 + + +CL:0011026 + + +progenitor cell + + + + +CL:0000050 + + +HSC + + +MEP + + +CL:0000576 + + +monocyte + + +2 + + +CL:0011026 + + +progenitor cell + + + + +CL:0000050 + + +HSC + + +MEP + + +CL:0000062 + + +osteoblast + + +1 + + +CL:0011026 + + +progenitor cell + + + + +CL:0000050 + + +HSC + + +MEP + + +CL:0000158 + + +club cell + + +1 + + +CL:0011026 + + +progenitor cell + + + + +CL:0000050 + + +HSC + + +MEP + + +CL:0000038 + + +erythroid progenitor cell + + +3 + + +CL:0011026 + + +progenitor cell +
-

Same with progenitor cell, I do think it could be -helpful to know that something may be a progenitor cell, but when you -have a cell with the label for HSC and the label for cells like -monocytes or osteoblasts, then maybe we are talking about a tumor cell -instead. After discussion, we are going to remove progenitor cells.

-

Along those same lines, I think the below terms, -lining cell and supporting cell, are too broad -even though they have few descendants.

+

+Same with progenitor cell, I do think it could be helpful +to know that something may be a progenitor cell, but when you have a +cell with the label for HSC and the label for cells like monocytes or +osteoblasts, then maybe we are talking about a tumor cell instead. After +discussion, we are going to remove progenitor cells. +

+

+Along those same lines, I think the below terms, lining +cell and supporting cell, are too broad even though +they have few descendants. +

-

Lining cell

+

+Lining cell +

# lining cell 
 print_df |> 
   dplyr::filter(cl_annotation == "lining cell")
@@ -1947,83 +3957,197 @@

Lining cell

-blueprint_ontology -blueprint_annotation_main -blueprint_annotation_fine -panglao_ontology -panglao_annotation -total_lca -lca -cl_annotation + +blueprint_ontology + + +blueprint_annotation_main + + +blueprint_annotation_fine + + +panglao_ontology + + +panglao_annotation + + +total_lca + + +lca + + +cl_annotation + -CL:0000115 -Endothelial cells -Endothelial cells -CL:0000077 -mesothelial cell -2 -CL:0000213 -lining cell - - -CL:0000115 -Endothelial cells -Endothelial cells -CL:0002481 -peritubular myoid cell -2 -CL:0000213 -lining cell - - -CL:0000115 -Endothelial cells -Endothelial cells -CL:0000216 -Sertoli cell -2 -CL:0000213 -lining cell - - -CL:2000008 -Endothelial cells -mv Endothelial cells -CL:0000077 -mesothelial cell -2 -CL:0000213 -lining cell - - -CL:2000008 -Endothelial cells -mv Endothelial cells -CL:0002481 -peritubular myoid cell -2 -CL:0000213 -lining cell - - -CL:2000008 -Endothelial cells -mv Endothelial cells -CL:0000216 -Sertoli cell -2 -CL:0000213 -lining cell + +CL:0000115 + + +Endothelial cells + + +Endothelial cells + + +CL:0000077 + + +mesothelial cell + + +2 + + +CL:0000213 + + +lining cell + + + + +CL:0000115 + + +Endothelial cells + + +Endothelial cells + + +CL:0002481 + + +peritubular myoid cell + + +2 + + +CL:0000213 + + +lining cell + + + + +CL:0000115 + + +Endothelial cells + + +Endothelial cells + + +CL:0000216 + + +Sertoli cell + + +2 + + +CL:0000213 + + +lining cell + + + + +CL:2000008 + + +Endothelial cells + + +mv Endothelial cells + + +CL:0000077 + + +mesothelial cell + + +2 + + +CL:0000213 + + +lining cell + + + + +CL:2000008 + + +Endothelial cells + + +mv Endothelial cells + + +CL:0002481 + + +peritubular myoid cell + + +2 + + +CL:0000213 + + +lining cell + + + + +CL:2000008 + + +Endothelial cells + + +mv Endothelial cells + + +CL:0000216 + + +Sertoli cell + + +2 + + +CL:0000213 + + +lining cell +
-

Supporting cell

+

+Supporting cell +

# supporting cell 
 print_df |> 
   dplyr::filter(cl_annotation == "supporting cell")
@@ -2041,36 +4165,84 @@

Supporting cell

-blueprint_ontology -blueprint_annotation_main -blueprint_annotation_fine -panglao_ontology -panglao_annotation -total_lca -lca -cl_annotation + +blueprint_ontology + + +blueprint_annotation_main + + +blueprint_annotation_fine + + +panglao_ontology + + +panglao_annotation + + +total_lca + + +lca + + +cl_annotation + -CL:0000669 -Pericytes -Pericytes -CL:0000216 -Sertoli cell -2 -CL:0000630 -supporting cell - - -CL:0000650 -Mesangial cells -Mesangial cells -CL:0000216 -Sertoli cell -2 -CL:0000630 -supporting cell + +CL:0000669 + + +Pericytes + + +Pericytes + + +CL:0000216 + + +Sertoli cell + + +2 + + +CL:0000630 + + +supporting cell + + + + +CL:0000650 + + +Mesangial cells + + +Mesangial cells + + +CL:0000216 + + +Sertoli cell + + +2 + + +CL:0000630 + + +supporting cell + @@ -2078,10 +4250,14 @@

Supporting cell

-

Discarded cell types

-

We can also look at what cell type labels we are excluding when using +

+Discarded cell types +

+

+We can also look at what cell type labels we are excluding when using this cut off to see if there are any terms we might actually want to -keep instead.

+keep instead. +

lca_df |> 
   dplyr::filter(total_descendants > cutoff) |> 
   dplyr::pull(cl_annotation) |> 
@@ -2093,10 +4269,14 @@ 

Discarded cell types

## [13] "secretory cell" "connective tissue cell" "electrically responsive cell" ## [16] "contractile cell" "epithelial cell" "neuron" ## [19] "neural cell"
-

The only terms in this list that I would be concerned about losing -are “neuron” and epithelial cells. Let’s look at those combinations.

+

+The only terms in this list that I would be concerned about losing are +“neuron” and epithelial cells. Let’s look at those combinations. +

-

Neuron

+

+Neuron +

# neuron
 print_df |> 
   dplyr::filter(cl_annotation == "neuron")
@@ -2114,226 +4294,566 @@

Neuron

-blueprint_ontology -blueprint_annotation_main -blueprint_annotation_fine -panglao_ontology -panglao_annotation -total_lca -lca -cl_annotation + +blueprint_ontology + + +blueprint_annotation_main + + +blueprint_annotation_fine + + +panglao_ontology + + +panglao_annotation + + +total_lca + + +lca + + +cl_annotation + -CL:0000540 -Neurons -Neurons -CL:0000109 -adrenergic neuron -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0000108 -cholinergic neuron -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0000166 -chromaffin cell -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0000700 -dopaminergic neuron -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0007011 -enteric neuron -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:1001509 -glycinergic neuron -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0000099 -interneuron -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0000100 -motor neuron -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0000165 -neuroendocrine cell -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0000540 -neuron -0 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0008025 -noradrenergic neuron -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0000210 -photoreceptor cell -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0000740 -retinal ganglion cell -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0000850 -serotonergic neuron -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:4023169 -trigeminal neuron -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0000695 -Cajal-Retzius cell -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0000617 -GABAergic neuron -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0000679 -glutamatergic neuron -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons -Neurons -CL:0000121 -Purkinje cell -1 -CL:0000540 -neuron - - -CL:0000540 -Neurons 
-Neurons -CL:0000598 -pyramidal neuron -1 -CL:0000540 -neuron + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000109 + + +adrenergic neuron + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000108 + + +cholinergic neuron + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000166 + + +chromaffin cell + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000700 + + +dopaminergic neuron + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0007011 + + +enteric neuron + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:1001509 + + +glycinergic neuron + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000099 + + +interneuron + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000100 + + +motor neuron + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000165 + + +neuroendocrine cell + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000540 + + +neuron + + +0 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0008025 + + +noradrenergic neuron + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000210 + + +photoreceptor cell + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000740 + + +retinal ganglion cell + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000850 + + +serotonergic neuron + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:4023169 + + +trigeminal neuron + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000695 + + +Cajal-Retzius cell + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons 
+ + +Neurons + + +CL:0000617 + + +GABAergic neuron + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000679 + + +glutamatergic neuron + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000121 + + +Purkinje cell + + +1 + + +CL:0000540 + + +neuron + + + + +CL:0000540 + + +Neurons + + +Neurons + + +CL:0000598 + + +pyramidal neuron + + +1 + + +CL:0000540 + + +neuron +
-

It looks like there are a lot of types of neurons in the PanglaoDB +

+It looks like there are a lot of types of neurons in the PanglaoDB reference and only “neuron” as a term in Blueprint. Even though neuron -has ~ 500 descendants, I think we should keep these labels.

+has ~ 500 descendants, I think we should keep these labels. +

-

Epithelial cell

+

+Epithelial cell +

# epithelial cell
 print_df |> 
   dplyr::filter(cl_annotation == "epithelial cell")
@@ -2351,1015 +4871,2605 @@

Epithelial cell

-blueprint_ontology -blueprint_annotation_main -blueprint_annotation_fine -panglao_ontology -panglao_annotation -total_lca -lca -cl_annotation + +blueprint_ontology + + +blueprint_annotation_main + + +blueprint_annotation_fine + + +panglao_ontology + + +panglao_annotation + + +total_lca + + +lca + + +cl_annotation + -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000622 -acinar cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:1000488 -cholangiocyte -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000166 -chromaffin cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000584 -enterocyte -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000164 -enteroendocrine cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000065 -ependymal cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000066 -epithelial cell -0 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000160 -goblet cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000501 -granulosa cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000182 -hepatocyte -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0005006 -ionocyte -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000312 -keratinocyte -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000077 -mesothelial cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000185 -myoepithelial cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000165 -neuroendocrine cell -1 -CL:0000066 -epithelial cell - - 
-CL:0000066 -Epithelial cells -Epithelial cells -CL:0002167 -olfactory epithelial cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000510 -paneth cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000162 -parietal cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002481 -peritubular myoid cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000652 -pinealocyte -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000653 -podocyte -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000209 -taste receptor cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000731 -urothelial cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002368 -respiratory epithelial cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002370 -respiratory goblet cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000171 -pancreatic A cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000169 -type B pancreatic cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000706 -choroid plexus epithelial cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000158 -club cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002250 -intestinal crypt stem cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000173 -pancreatic D cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002305 -epithelial cell of distal tubule -1 -CL:0000066 -epithelial cell - - 
-CL:0000066 -Epithelial cells -Epithelial cells -CL:0002079 -pancreatic ductal cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000504 -enterochromaffin-like cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0005019 -pancreatic epsilon cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002258 -thyroid follicular cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002179 -foveolar cell of stomach -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000696 -PP cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000155 -peptic cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002292 -type I cell of carotid body -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0005010 -renal intercalated cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:1000909 -kidney loop of Henle epithelial cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002326 -luminal epithelial cell of mammary gland -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002327 -mammary gland epithelial cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000242 -Merkel cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000682 -M cell of gut -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002199 -oxyphil cell of parathyroid gland -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000446 -chief cell of parathyroid gland -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells 
-CL:0005009 -renal principal cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002306 -epithelial cell of proximal tubule -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002062 -pulmonary alveolar type 1 cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002063 -pulmonary alveolar type 2 cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:1001596 -salivary gland glandular cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002140 -acinar cell of sebaceous gland -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0000216 -Sertoli cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002562 -hair germinal matrix cell -1 -CL:0000066 -epithelial cell - - -CL:0000066 -Epithelial cells -Epithelial cells -CL:0002204 -brush cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000622 -acinar cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:1000488 -cholangiocyte -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000584 -enterocyte -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000164 -enteroendocrine cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000066 -epithelial cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000160 -goblet cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000501 -granulosa cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000182 -hepatocyte -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0005006 -ionocyte -1 -CL:0000066 -epithelial cell - - 
-CL:0000312 -Keratinocytes -Keratinocytes -CL:0000185 -myoepithelial cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000510 -paneth cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000162 -parietal cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000653 -podocyte -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000209 -taste receptor cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000731 -urothelial cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0002368 -respiratory epithelial cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0002370 -respiratory goblet cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000171 -pancreatic A cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000169 -type B pancreatic cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000158 -club cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0002250 -intestinal crypt stem cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000173 -pancreatic D cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0002305 -epithelial cell of distal tubule -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0002079 -pancreatic ductal cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000504 -enterochromaffin-like cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0005019 -pancreatic epsilon cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0002258 -thyroid follicular cell -1 -CL:0000066 -epithelial 
cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0002179 -foveolar cell of stomach -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000696 -PP cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000155 -peptic cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0005010 -renal intercalated cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:1000909 -kidney loop of Henle epithelial cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0002326 -luminal epithelial cell of mammary gland -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0002327 -mammary gland epithelial cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000682 -M cell of gut -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0002199 -oxyphil cell of parathyroid gland -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0000446 -chief cell of parathyroid gland -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0005009 -renal principal cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0002306 -epithelial cell of proximal tubule -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:1001596 -salivary gland glandular cell -1 -CL:0000066 -epithelial cell - - -CL:0000312 -Keratinocytes -Keratinocytes -CL:0002204 -brush cell -1 -CL:0000066 -epithelial cell + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000622 + + +acinar cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:1000488 + + +cholangiocyte + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000166 + + 
+chromaffin cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000584 + + +enterocyte + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000164 + + +enteroendocrine cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000065 + + +ependymal cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000066 + + +epithelial cell + + +0 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000160 + + +goblet cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000501 + + +granulosa cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000182 + + +hepatocyte + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0005006 + + +ionocyte + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000312 + + +keratinocyte + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000077 + + +mesothelial cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000185 + + +myoepithelial cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000165 + + +neuroendocrine cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002167 + + +olfactory epithelial cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + 
+Epithelial cells + + +CL:0000510 + + +paneth cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000162 + + +parietal cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002481 + + +peritubular myoid cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000652 + + +pinealocyte + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000653 + + +podocyte + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000209 + + +taste receptor cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000731 + + +urothelial cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002368 + + +respiratory epithelial cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002370 + + +respiratory goblet cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000171 + + +pancreatic A cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000169 + + +type B pancreatic cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000706 + + +choroid plexus epithelial cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000158 + + +club cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002250 + + +intestinal crypt stem cell + + +1 + + 
+CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000173 + + +pancreatic D cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002305 + + +epithelial cell of distal tubule + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002079 + + +pancreatic ductal cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000504 + + +enterochromaffin-like cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0005019 + + +pancreatic epsilon cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002258 + + +thyroid follicular cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002179 + + +foveolar cell of stomach + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000696 + + +PP cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000155 + + +peptic cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002292 + + +type I cell of carotid body + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0005010 + + +renal intercalated cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:1000909 + + +kidney loop of Henle epithelial cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002326 + + +luminal epithelial cell of mammary gland + + +1 + 
+ +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002327 + + +mammary gland epithelial cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000242 + + +Merkel cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000682 + + +M cell of gut + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002199 + + +oxyphil cell of parathyroid gland + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000446 + + +chief cell of parathyroid gland + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0005009 + + +renal principal cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002306 + + +epithelial cell of proximal tubule + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002062 + + +pulmonary alveolar type 1 cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002063 + + +pulmonary alveolar type 2 cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:1001596 + + +salivary gland glandular cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002140 + + +acinar cell of sebaceous gland + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0000216 + + +Sertoli cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002562 + + +hair germinal 
matrix cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000066 + + +Epithelial cells + + +Epithelial cells + + +CL:0002204 + + +brush cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000622 + + +acinar cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:1000488 + + +cholangiocyte + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000584 + + +enterocyte + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000164 + + +enteroendocrine cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000066 + + +epithelial cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000160 + + +goblet cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000501 + + +granulosa cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000182 + + +hepatocyte + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0005006 + + +ionocyte + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000185 + + +myoepithelial cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000510 + + +paneth cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000162 + + +parietal cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000653 + + +podocyte + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + 
+Keratinocytes + + +Keratinocytes + + +CL:0000209 + + +taste receptor cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000731 + + +urothelial cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0002368 + + +respiratory epithelial cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0002370 + + +respiratory goblet cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000171 + + +pancreatic A cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000169 + + +type B pancreatic cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000158 + + +club cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0002250 + + +intestinal crypt stem cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000173 + + +pancreatic D cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0002305 + + +epithelial cell of distal tubule + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0002079 + + +pancreatic ductal cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000504 + + +enterochromaffin-like cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0005019 + + +pancreatic epsilon cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0002258 + + +thyroid follicular cell + + +1 + + +CL:0000066 + + 
+epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0002179 + + +foveolar cell of stomach + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000696 + + +PP cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000155 + + +peptic cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0005010 + + +renal intercalated cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:1000909 + + +kidney loop of Henle epithelial cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0002326 + + +luminal epithelial cell of mammary gland + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0002327 + + +mammary gland epithelial cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000682 + + +M cell of gut + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0002199 + + +oxyphil cell of parathyroid gland + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0000446 + + +chief cell of parathyroid gland + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0005009 + + +renal principal cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:0002306 + + +epithelial cell of proximal tubule + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + + +Keratinocytes + + +CL:1001596 + + +salivary gland glandular cell + + +1 + + +CL:0000066 + + +epithelial cell + + + + +CL:0000312 + + +Keratinocytes + 
+ +Keratinocytes + + +CL:0002204 + + +brush cell + + +1 + + +CL:0000066 + + +epithelial cell +
-

The PanglaoDB cell types seem to be more specific than the ones -present in Blueprint Encode, similar to the observation with neurons. We -should keep epithelial cell in the cases where the Blueprint Encode -annotation is Epithelial cells but not when it is -Keratinocytes.

+

+The PanglaoDB cell types seem to be more specific than the ones present +in Blueprint Encode, similar to the observation with neurons. We should +keep epithelial cell in the cases where the Blueprint Encode annotation +is Epithelial cells but not when it is +Keratinocytes. +

-

Removing anything with more than 1 LCA

-

One thing I noticed when looking at the labels that have less than -the cutoff is that most of them are from scenarios where we have -multiple LCAs. Maybe in the case where we have multiple LCAs we are -already too broad and we should just eliminate those matches from the -beginning. Here I’m looking at the total number of descendants for all -terms that show up because a term has multiple LCAs.

+

+Removing anything with more than 1 LCA +

+

+One thing I noticed when looking at the labels that have less than the +cutoff is that most of them are from scenarios where we have multiple +LCAs. Maybe in the case where we have multiple LCAs we are already too +broad and we should just eliminate those matches from the beginning. +Here I’m looking at the total number of descendants for all terms that +show up because a term has multiple LCAs. +

lca_df |> 
   dplyr::filter(total_lca > 1) |> 
   dplyr::select(cl_annotation, total_descendants) |> 
@@ -3369,119 +7479,222 @@ 

Removing anything with more than 1 LCA

- - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + +
cl_annotationtotal_descendants +cl_annotation + +total_descendants +
bone cell38 +bone cell + +38 +
blood cell41 +blood cell + +41 +
perivascular cell41 +perivascular cell + +41 +
stromal cell53 +stromal cell + +53 +
supporting cell61 +supporting cell + +61 +
hematopoietic precursor cell105 +hematopoietic precursor cell + +105 +
lining cell120 +lining cell + +120 +
myeloid leukocyte165 +myeloid leukocyte + +165 +
progenitor cell165 +progenitor cell + +165 +
mononuclear phagocyte169 +mononuclear phagocyte + +169 +
phagocyte (sensu Vertebrata)175 +phagocyte (sensu Vertebrata) + +175 +
contractile cell177 +contractile cell + +177 +
defensive cell199 +defensive cell + +199 +
professional antigen presenting cell212 +professional antigen presenting cell + +212 +
connective tissue cell223 +connective tissue cell + +223 +
myeloid cell247 +myeloid cell + +247 +
stuff accumulating cell266 +stuff accumulating cell + +266 +
precursor cell271 +precursor cell + +271 +
secretory cell457 +secretory cell + +457 +
mononuclear cell503 +mononuclear cell + +503 +
leukocyte540 +leukocyte + +540 +
electrically responsive cell673 +electrically responsive cell + +673 +
hematopoietic cell684 +hematopoietic cell + +684 +
eukaryotic cell2645 +eukaryotic cell + +2645 +
-

It looks like most of these terms are pretty broad and are either -much higher than the cutoff or right around the cutoff with a few -exceptions. Things like “bone cell” and “supporting cell” have few -descendants, but I would still argue these are very broad terms and not -useful.

-

I’m going to filter out any matches that show two LCA terms first and +

+It looks like most of these terms are pretty broad and are either much +higher than the cutoff or right around the cutoff with a few exceptions. +Things like “bone cell” and “supporting cell” have few descendants, but +I would still argue these are very broad terms and not useful. +

+

+I’m going to filter out any matches that show two LCA terms first and then use the cutoff to define labels we would keep. I’ll also look to see what cell types we lose when we add this extra filtering step to be -sure they are ones that we want to lose.

+sure they are ones that we want to lose. +

# remove any combinations with more than one lca
 filtered_lca_df <- lca_df |> 
   dplyr::filter(total_lca < 2)
@@ -3496,12 +7709,16 @@ 

Removing anything with more than 1 LCA

setdiff(celltypes_to_keep, updated_celltypes)
## [1] "blood cell"                   "hematopoietic precursor cell" "lining cell"                 
 ## [4] "perivascular cell"            "supporting cell"
-

It looks like I am losing a few terms I already said were not -specific and then a few other terms, like “hematopoietic precursor cell” -and “perivascular cell”. I’ll look at both of those to confirm we would -not want them.

+

+It looks like I am losing a few terms I already said were not specific +and then a few other terms, like “hematopoietic precursor cell” and +“perivascular cell”. I’ll look at both of those to confirm we would not +want them. +

-

Hematopoietic precursor cell

+

+Hematopoietic precursor cell +

print_df |> 
   dplyr::filter(cl_annotation == "hematopoietic precursor cell")
@@ -3518,135 +7735,331 @@

Hematopoietic precursor cell

-blueprint_ontology -blueprint_annotation_main -blueprint_annotation_fine -panglao_ontology -panglao_annotation -total_lca -lca -cl_annotation + +blueprint_ontology + + +blueprint_annotation_main + + +blueprint_annotation_fine + + +panglao_ontology + + +panglao_annotation + + +total_lca + + +lca + + +cl_annotation + -CL:0000050 -HSC -MEP -CL:0000037 -hematopoietic stem cell -2 -CL:0008001 -hematopoietic precursor cell - - -CL:0000050 -HSC -MEP -CL:0000038 -erythroid progenitor cell -3 -CL:0008001 -hematopoietic precursor cell - - -CL:0000037 -HSC -HSC -CL:0000038 -erythroid progenitor cell -2 -CL:0008001 -hematopoietic precursor cell - - -CL:0000837 -HSC -MPP -CL:0000037 -hematopoietic stem cell -2 -CL:0008001 -hematopoietic precursor cell - - -CL:0000837 -HSC -MPP -CL:0000038 -erythroid progenitor cell -2 -CL:0008001 -hematopoietic precursor cell - - -CL:0000051 -HSC -CLP -CL:0000037 -hematopoietic stem cell -2 -CL:0008001 -hematopoietic precursor cell - - -CL:0000051 -HSC -CLP -CL:0000038 -erythroid progenitor cell -2 -CL:0008001 -hematopoietic precursor cell - - -CL:0000557 -HSC -GMP -CL:0000037 -hematopoietic stem cell -2 -CL:0008001 -hematopoietic precursor cell - - -CL:0000557 -HSC -GMP -CL:0000038 -erythroid progenitor cell -3 -CL:0008001 -hematopoietic precursor cell - - -CL:0000049 -HSC -CMP -CL:0000037 -hematopoietic stem cell -2 -CL:0008001 -hematopoietic precursor cell - - -CL:0000049 -HSC -CMP -CL:0000038 -erythroid progenitor cell -2 -CL:0008001 -hematopoietic precursor cell + +CL:0000050 + + +HSC + + +MEP + + +CL:0000037 + + +hematopoietic stem cell + + +2 + + +CL:0008001 + + +hematopoietic precursor cell + + + + +CL:0000050 + + +HSC + + +MEP + + +CL:0000038 + + +erythroid progenitor cell + + +3 + + +CL:0008001 + + +hematopoietic precursor cell + + + + +CL:0000037 + + +HSC + + +HSC + + +CL:0000038 + + +erythroid progenitor cell + + +2 + + +CL:0008001 + + +hematopoietic precursor cell + + + + +CL:0000837 + + +HSC + + +MPP + + +CL:0000037 + + 
+hematopoietic stem cell + + +2 + + +CL:0008001 + + +hematopoietic precursor cell + + + + +CL:0000837 + + +HSC + + +MPP + + +CL:0000038 + + +erythroid progenitor cell + + +2 + + +CL:0008001 + + +hematopoietic precursor cell + + + + +CL:0000051 + + +HSC + + +CLP + + +CL:0000037 + + +hematopoietic stem cell + + +2 + + +CL:0008001 + + +hematopoietic precursor cell + + + + +CL:0000051 + + +HSC + + +CLP + + +CL:0000038 + + +erythroid progenitor cell + + +2 + + +CL:0008001 + + +hematopoietic precursor cell + + + + +CL:0000557 + + +HSC + + +GMP + + +CL:0000037 + + +hematopoietic stem cell + + +2 + + +CL:0008001 + + +hematopoietic precursor cell + + + + +CL:0000557 + + +HSC + + +GMP + + +CL:0000038 + + +erythroid progenitor cell + + +3 + + +CL:0008001 + + +hematopoietic precursor cell + + + + +CL:0000049 + + +HSC + + +CMP + + +CL:0000037 + + +hematopoietic stem cell + + +2 + + +CL:0008001 + + +hematopoietic precursor cell + + + + +CL:0000049 + + +HSC + + +CMP + + +CL:0000038 + + +erythroid progenitor cell + + +2 + + +CL:0008001 + + +hematopoietic precursor cell +
-

It looks like here we should be keeping these matches because both +

+It looks like here we should be keeping these matches because both references have these labels as hematopoietic stem and progenitor cells. I think in the context of pediatric cancer having this label would be -helpful, so maybe we shouldn’t remove all terms that have 2 LCAs.

-

Let’s look at what the other LCA is for an example set.

+helpful, so maybe we shouldn’t remove all terms that have 2 LCAs. +

+

+Let’s look at what the other LCA is for an example set. +

lca_df |> 
   dplyr::filter(panglao_ontology == "CL:0000037" & blueprint_ontology == "CL:0000050") |> 
   dplyr::select(blueprint_annotation_main, blueprint_annotation_fine, panglao_annotation, cl_annotation)
@@ -3660,36 +8073,64 @@

Hematopoietic precursor cell

-blueprint_annotation_main -blueprint_annotation_fine -panglao_annotation -cl_annotation + +blueprint_annotation_main + + +blueprint_annotation_fine + + +panglao_annotation + + +cl_annotation + -HSC -MEP -hematopoietic stem cell -hematopoietic precursor cell - - -HSC -MEP -hematopoietic stem cell -progenitor cell + +HSC + + +MEP + + +hematopoietic stem cell + + +hematopoietic precursor cell + + + + +HSC + + +MEP + + +hematopoietic stem cell + + +progenitor cell +
-

It looks like these terms have both -hematopoietic precursor cell and -progenitor cell as LCAs. Personally, I would keep the term -for hematopoietic precursor cell because I think it’s more -informative and specific to the type of progenitor cell.

+

+It looks like these terms have both hematopoietic precursor +cell and progenitor cell as LCAs. Personally, I +would keep the term for hematopoietic precursor cell +because I think it’s more informative and specific to the type of +progenitor cell. +

-

Perivascular cell

+

+Perivascular cell +

print_df |> 
   dplyr::filter(cl_annotation == "perivascular cell")
@@ -3706,125 +8147,282 @@

Perivascular cell

-blueprint_ontology -blueprint_annotation_main -blueprint_annotation_fine -panglao_ontology -panglao_annotation -total_lca -lca -cl_annotation + +blueprint_ontology + + +blueprint_annotation_main + + +blueprint_annotation_fine + + +panglao_ontology + + +panglao_annotation + + +total_lca + + +lca + + +cl_annotation + -CL:0000669 -Pericytes -Pericytes -CL:0000359 -vascular associated smooth muscle cell -3 -CL:4033054 -perivascular cell - - -CL:0000669 -Pericytes -Pericytes -CL:0000359 -vascular associated smooth muscle cell -3 -CL:4033054 -perivascular cell - - -CL:0000669 -Pericytes -Pericytes -CL:0000359 -vascular associated smooth muscle cell -3 -CL:4033054 -perivascular cell - - -CL:0000669 -Pericytes -Pericytes -CL:0000359 -vascular associated smooth muscle cell -3 -CL:4033054 -perivascular cell - - -CL:0000650 -Mesangial cells -Mesangial cells -CL:0000359 -vascular associated smooth muscle cell -3 -CL:4033054 -perivascular cell - - -CL:0000650 -Mesangial cells -Mesangial cells -CL:0000359 -vascular associated smooth muscle cell -3 -CL:4033054 -perivascular cell - - -CL:0000650 -Mesangial cells -Mesangial cells -CL:0000359 -vascular associated smooth muscle cell -3 -CL:4033054 -perivascular cell - - -CL:0000650 -Mesangial cells -Mesangial cells -CL:0000359 -vascular associated smooth muscle cell -3 -CL:4033054 -perivascular cell + +CL:0000669 + + +Pericytes + + +Pericytes + + +CL:0000359 + + +vascular associated smooth muscle cell + + +3 + + +CL:4033054 + + +perivascular cell + + + + +CL:0000669 + + +Pericytes + + +Pericytes + + +CL:0000359 + + +vascular associated smooth muscle cell + + +3 + + +CL:4033054 + + +perivascular cell + + + + +CL:0000669 + + +Pericytes + + +Pericytes + + +CL:0000359 + + +vascular associated smooth muscle cell + + +3 + + +CL:4033054 + + +perivascular cell + + + + +CL:0000669 + + +Pericytes + + +Pericytes + + +CL:0000359 + + +vascular associated smooth muscle cell + + +3 + + +CL:4033054 + + +perivascular cell + + + + +CL:0000650 + + 
+Mesangial cells + + +Mesangial cells + + +CL:0000359 + + +vascular associated smooth muscle cell + + +3 + + +CL:4033054 + + +perivascular cell + + + + +CL:0000650 + + +Mesangial cells + + +Mesangial cells + + +CL:0000359 + + +vascular associated smooth muscle cell + + +3 + + +CL:4033054 + + +perivascular cell + + + + +CL:0000650 + + +Mesangial cells + + +Mesangial cells + + +CL:0000359 + + +vascular associated smooth muscle cell + + +3 + + +CL:4033054 + + +perivascular cell + + + + +CL:0000650 + + +Mesangial cells + + +Mesangial cells + + +CL:0000359 + + +vascular associated smooth muscle cell + + +3 + + +CL:4033054 + + +perivascular cell +
-

I would remove perivascular cell, since the cell type +

+I would remove perivascular cell, since the cell type labels from PanglaoDB and Blueprint are pretty different from each -other.

+other. +

-

Similarity index

-

An alternative approach would be to calculate the similarity +

+Similarity index +

+

+An alternative approach would be to calculate the +similarity index between each set of terms and define a cutoff for which set of terms are similar. This is a value on a 0-1 scale where 0 indicates no -similarity and 1 indicates the terms are equal.

-

Although this could provide a metric that we could use to define -similar cell types, we would still have to identify the label to use -which would most likely be the LCA. Even if the similarity index is -close to 1, if the LCA term is not informative then I don’t know that we -would want to use that.

-

However, we could use this to finalize the actual pairs of terms that -we trust. For example, if the LCA for a pair is T cell we -can look at the similarity index to confirm that specific pair of terms -has high similarity.

-

Below I’ll calculate the similarity index for each set of terms and -plot the distribution. Then we will look at the values for pairs that -have an LCA that pass the total descendants threshold we set to see if -those pairs have a higher similarity index.

+similarity and 1 indicates the terms are equal. +

+

+Although this could provide a metric that we could use to define similar +cell types, we would still have to identify the label to use which would +most likely be the LCA. Even if the similarity index is close to 1, if +the LCA term is not informative then I don’t know that we would want to +use that. +

+

+However, we could use this to finalize the actual pairs of terms that we +trust. For example, if the LCA for a pair is T cell we can +look at the similarity index to confirm that specific pair of terms has +high similarity. +

+

+Below I’ll calculate the similarity index for each set of terms and plot +the distribution. Then we will look at the values for pairs that have an +LCA that passes the total descendants threshold we set to see if those +pairs have a higher similarity index. +

information_content <- ontologySimilarity::descendants_IC(cl_ont)
 
 # get similarity index for each set of terms 
@@ -3846,15 +8444,21 @@ 

Similarity index

x = "Similarity index", y = "Density" )
-

-

This looks as I expected with most of the pairs that pass the total +

+ +

+

+This looks as I expected with most of the pairs that pass the total descendants cutoff having a higher similarity index than those that do not pass. There is still some overlap though so perhaps even if a set of terms shares an LCA that passes the threshold, the actual terms being -compared may be further apart than we would like.

-

Now let’s look at the similarity index for various LCA terms. Here -each LCA term is its own plot and the vertical lines are the similarity -index for each pair of terms that results in that LCA.

+compared may be further apart than we would like. +

+

+Now let’s look at the similarity index for various LCA terms. Here each +LCA term is its own plot and the vertical lines are the similarity index +for each pair of terms that results in that LCA. +

celltypes_to_plot <- c("myeloid leukocyte", "T cell", "cell", "supporting cell", "B cell")
 
 celltypes_to_plot |> 
@@ -3877,59 +8481,84 @@ 

Similarity index

})
## [[1]]
-

+

+ +

## 
 ## [[2]]
-

+

+ +

## 
 ## [[3]]
-

+

+ +

## 
 ## [[4]]
-

+

+ +

## 
 ## [[5]]
-

-

It looks like terms that are more granular like T and B cell have -higher similarity index values than terms that are less granular which -is what we would expect. However, within terms like myeloid leukocyte -and even T cell we do see a range of values. We could dig deeper into -which pairs are resulting in which similarity index values if we wanted -to, but I think that might be a future direction if we feel like the -similarity index is something that could be useful.

+

+ +

+

+It looks like terms that are more granular like T and B cell have higher +similarity index values than terms that are less granular which is what +we would expect. However, within terms like myeloid leukocyte and even T +cell we do see a range of values. We could dig deeper into which pairs +are resulting in which similarity index values if we wanted to, but I +think that might be a future direction if we feel like the similarity +index is something that could be useful. +

-

Conclusions

-

Based on these findings, I think it might be best to create a -reference that has all possible pairs of labels between PanglaoDB and -Blueprint Encode and the resulting consensus label for those pairs. To -do this we could come up with a whitelist of LCA terms that we would be -comfortable including and all other cell types would be unknowns. I -would use the following criteria to come up with my whitelist:

+

+Conclusions +

+

+Based on these findings, I think it might be best to create a reference +that has all possible pairs of labels between PanglaoDB and Blueprint +Encode and the resulting consensus label for those pairs. To do this we +could come up with a whitelist of LCA terms that we would be comfortable +including and all other cell types would be unknowns. I would use the +following criteria to come up with my whitelist: +

    -
  • Pairs should not have more than 1 LCA, with the exception of the -matches that have the label hematopoietic precursor cell.
  • -
  • The LCA should have equal to or less than 170 total -descendants.
  • -
  • We should include the term for neuron and -epithelial cell even though they do not pass the threshold -for number of descendants. However, epithelial cell should -only be included if the Blueprint Encode name is -Epithelial cells and not -Keratinocytes.
  • -
  • Terms that are too broad should be removed. This includes: -lining cell, blood cell, -progenitor cell, bone cell, and -supporting cell
  • +
  • +Pairs should not have more than 1 LCA, with the exception of the matches +that have the label hematopoietic precursor cell. +
  • +
  • +The LCA should have 170 or fewer total descendants. +
  • +
  • +We should include the term for neuron and epithelial +cell even though they do not pass the threshold for number of +descendants. However, epithelial cell should only be +included if the Blueprint Encode name is Epithelial cells +and not Keratinocytes. +
  • +
  • +Terms that are too broad should be removed. This includes: lining +cell, blood cell, progenitor cell, +bone cell, and supporting cell +
-

Alternatively, rather than eliminate terms that are too broad we -could look at the similarity index for individual matches and decide on -a case by case basis if those should be allowed. Although I still think -having a term that is too broad, even if it’s a good match, is not super -informative.

+

+Alternatively, rather than eliminate terms that are too broad we could +look at the similarity index for individual matches and decide on a case +by case basis if those should be allowed. Although I still think having +a term that is too broad, even if it’s a good match, is not super +informative. +

-

Session info

+

+Session info +

sessionInfo()
## R version 4.4.2 (2024-10-31)
 ## Platform: aarch64-apple-darwin20
@@ -3984,6 +8613,48 @@ 

Session info

## [117] htmltools_0.5.8.1 lifecycle_1.0.4 httr_1.4.7 mime_0.12 ## [121] bit64_4.5.2
+ + + + + + + + + + @@ -4018,11 +8689,6 @@

Session info

- diff --git a/analyses/cell-type-consensus/references/README.md b/analyses/cell-type-consensus/references/README.md index 4d642c58e..ec7e2495f 100644 --- a/analyses/cell-type-consensus/references/README.md +++ b/analyses/cell-type-consensus/references/README.md @@ -1,32 +1,32 @@ # References -This folder contains all reference files used for generating consensus cell types. +This folder contains all reference files used for generating consensus cell types. -1. `panglao-cell-type-ontologies.tsv`: This file contains a table with all possible cell types in the reference used when running `CellAssign`. -The table includes the following columns: +1. `panglao-cell-type-ontologies.tsv`: This file contains a table with all possible cell types in the reference used when running `CellAssign`. +The table includes the following columns: | | | -| --- | --- | -| `ontology_id` | [cell type (CL) ontology identifier term](https://www.ebi.ac.uk/ols4/ontologies/cl) | +| --- | --- | +| `ontology_id` | [cell type (CL) ontology identifier term](https://www.ebi.ac.uk/ols4/ontologies/cl) | | `human_readable_value` | Label associated with the cell type ontology term | -| `panglao_cell_type` | Original name for the cell type as set by `PanglaoDB` | +| `panglao_cell_type` | Original name for the cell type as set by `PanglaoDB` | -To generate this file follow these steps: +To generate this file follow these steps: -- Download the original reference file with `00-download-panglao-ref.sh`. -- Programmatically assign ontology lables with `01-prepare-cell-type-ontologies.sh`. -- Assign any ontology values manually by finding the most representive [cell type ontology (CL) identifier term](https://www.ebi.ac.uk/ols4/ontologies/cl). +- Download the original reference file with `00-download-panglao-ref.sh`. +- Programmatically assign ontology labels with `01-prepare-cell-type-ontologies.sh`. 
+- Assign any ontology values manually by finding the most representative [cell type ontology (CL) identifier term](https://www.ebi.ac.uk/ols4/ontologies/cl). -There were a few terms that were assigned manually that did not have an obvious or exact match in the cell type ontology that should be noted: +There were a few terms that were assigned manually that did not have an obvious or exact match in the cell type ontology that should be noted: -- `Kidney progenitor cells` were assigned the [`CL:0000324` for metanephric mesenchyme stem cell](https://www.ebi.ac.uk/ols4/ontologies/cl/classes/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FCL_0000324). -This term refers specifically to the stem cells that ultimately comprise the nephron, but does not account for the part of the kidney that is derived from ureteric bud cells. -- `Meningeal cells` were assigned to [`CL:0000708` for leptomeningeal cell](https://www.ebi.ac.uk/ols4/ontologies/cl/classes/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FCL_0000708). -This was the closest term that covered general cell types found in the meninges. -- `Pancreatic progenitor cells` were assigned to [`CL:0002351` for progenitor cell of endocrine pancreas](https://www.ebi.ac.uk/ols4/ontologies/cl/classes/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FCL_0002351). -This term only covers progenitor cells for the endocrine pancreas and does not cover the exocrine pancreas. -There were no terms that encompassed both other than `progenitor cell`. +- `Kidney progenitor cells` were assigned the [`CL:0000324` for metanephric mesenchyme stem cell](https://www.ebi.ac.uk/ols4/ontologies/cl/classes/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FCL_0000324). +This term refers specifically to the stem cells that ultimately comprise the nephron, but does not account for the part of the kidney that is derived from ureteric bud cells. 
+- `Meningeal cells` were assigned to [`CL:0000708` for leptomeningeal cell](https://www.ebi.ac.uk/ols4/ontologies/cl/classes/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FCL_0000708). +This was the closest term that covered general cell types found in the meninges. +- `Pancreatic progenitor cells` were assigned to [`CL:0002351` for progenitor cell of endocrine pancreas](https://www.ebi.ac.uk/ols4/ontologies/cl/classes/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FCL_0002351). +This term only covers progenitor cells for the endocrine pancreas and does not cover the exocrine pancreas. +There were no terms that encompassed both other than `progenitor cell`. - `Osteoclast precursor cells` were assigned to [`CL:0000576` for monocyte](https://www.ebi.ac.uk/ols4/ontologies/cl/classes/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FCL_0000576). -Monocytes differentiate into mononuclear osteoclasts which are then activated and become multinucleated osteoclasts. -Because monocytes are the "precursor" to the differentiated osteoclast, we chose to use this term. -- `NA` was used for `Undefined placental cells` and `Transient cells` as no clear cell type from the cell ontology was identified. +Monocytes differentiate into mononuclear osteoclasts which are then activated and become multinucleated osteoclasts. +Because monocytes are the "precursor" to the differentiated osteoclast, we chose to use this term. +- `NA` was used for `Undefined placental cells` and `Transient cells` as no clear cell type from the cell ontology was identified. 
diff --git a/analyses/hello-clusters/01_perform-evaluate-clustering.Rmd b/analyses/hello-clusters/01_perform-evaluate-clustering.Rmd index d7f2fbb55..80268a302 100644 --- a/analyses/hello-clusters/01_perform-evaluate-clustering.Rmd +++ b/analyses/hello-clusters/01_perform-evaluate-clustering.Rmd @@ -2,8 +2,8 @@ title: "Performing graph-based clustering with rOpenScPCA" date: "`r Sys.Date()`" author: "Data Lab" -output: - html_notebook: +output: + html_notebook: toc: yes toc_float: yes df_print: paged @@ -76,7 +76,6 @@ set.seed(2024) ## Read in and prepare data To begin, we'll read in the `SingleCellExperiment` (SCE) object. -We'll also establish a corresponding processed Seurat object from its raw counts that we'll use for some examples. ```{r read data} # Read the SCE file @@ -94,7 +93,7 @@ pca_matrix <- reducedDim(sce, "PCA") ## Perform clustering -This section will show how to perform clustering with the function `rOpenScPCA::calculate_clusters()`. +This section will show how to perform clustering with the function `rOpenScPCA::calculate_clusters()`. This function takes a PCA matrix with rownames representing unique cell ids (e.g., barcodes) as its primary argument. By default it will calculate clusters using the following parameters: @@ -152,7 +151,7 @@ cluster_results_df <- rOpenScPCA::calculate_clusters( ## Calculate QC metrics on clusters -This section demonstrates how to use several functions for evaluating cluster quality and reliability. +This section demonstrates how to use several functions for evaluating cluster quality and reliability. It's important to note that a full evaluation of clustering results would compare these metrics across a set of clustering results, with the aim of identifying an optimal parameterization. 
All functions presented in this section take the following required arguments: @@ -236,7 +235,7 @@ ggplot(purity_results) + ### Cluster stability -Another approach to exploring cluster quality is how stable the clusters themselves are using bootstrapping. +Another approach to exploring cluster quality is how stable the clusters themselves are using bootstrapping. Given a set of original clusters, we can compare the bootstrapped cluster identities to original ones using the Adjusted Rand Index (ARI), which measures the similarity of two data clusterings. ARI ranges from -1 to 1, where: @@ -276,7 +275,7 @@ ggplot(stability_results) + #### Using non-default clustering parameters -When calculating bootstrap clusters, `rOpenScPCA::calculate_stability()` uses `rOpenScPCA::calculate_clusters()` with default parameters. +When calculating bootstrap clusters, `rOpenScPCA::calculate_stability()` uses `rOpenScPCA::calculate_clusters()` with default parameters. If your original clusters were not calculated with these defaults, you should pass those customized values into this function as well to ensure a fair comparison between your original clusters and the bootstrap clusters. @@ -331,7 +330,6 @@ If you are analyzing your data with a Seurat pipeline that includes calculating To demonstrate this, we'll convert our SCE object to a Seurat using the function `rOpenScPCA::sce_to_seurat()`. Then, we'll use a simple Seurat pipeline to obtain clusters. - ```{r sce to seurat, message = FALSE} # Convert the SCE to a Seurat object using rOpenScPCA @@ -380,8 +378,8 @@ We do not recommend using `rOpenScPCA::calculate_stability()` on Seurat clusters ### Evaluating ScPCA clusters -ScPCA cell metadata already contains a column called `cluster` with results from an automated clustering. -These clusters were calculated using `bluster`, the same tool that `rOpenScPCA` uses. +ScPCA cell metadata already contains a column called `cluster` with results from an automated clustering. 
+These clusters were calculated using `bluster`, the same tool that `rOpenScPCA` uses. The specifications used for this clustering are stored in the SCE object's metadata, as follows; note that all other clustering parameters were left at their default values. * `metadata(sce)$cluster_algorithm`: The clustering algorithm used @@ -446,7 +444,7 @@ scpca_stability_df <- rOpenScPCA::calculate_stability( ``` -## Saving clustering results +## Saving clustering results Results can either be directly exported as a TSV file (e.g., with `readr::write_tsv()`), or you can add the results into your SCE or Seurat object. The subsequent examples will demonstrate saving the cluster assignments stored in `cluster_results_df$cluster` to an SCE and a Seurat object. @@ -456,7 +454,7 @@ Objects from the ScPCA Portal already contain a column called `cluster` with res These automatic clusters were not evaluated, and their parameters were not optimized for any given library. To avoid ambiguity between the existing and new clustering results, we'll name the new column `ropenscpca_cluster`. -### Saving results to an SCE object +### Saving results to an SCE object We can add columns to an SCE object's `colData` table by directly creating a column in the object with `$`. Before we do so, we'll confirm that the clusters are in the same order as the SCE object by comparing cell ids: @@ -473,7 +471,7 @@ all.equal( sce$ropenscpca_cluster <- cluster_results_df$cluster ``` -### Saving results to a Seurat object +### Saving results to a Seurat object We can add columns to an Seurat object's cell metadata table by directly creating a column in the object with `$` (note that you can also use the Seurat function `AddMetaData()`). 
diff --git a/analyses/hello-clusters/01_perform-evaluate-clustering.nb.html b/analyses/hello-clusters/01_perform-evaluate-clustering.nb.html index 702a46da2..bc5d1fd95 100644 --- a/analyses/hello-clusters/01_perform-evaluate-clustering.nb.html +++ b/analyses/hello-clusters/01_perform-evaluate-clustering.nb.html @@ -11,7 +11,7 @@ - + Performing graph-based clustering with rOpenScPCA @@ -2901,7 +2901,7 @@

Performing graph-based clustering with rOpenScPCA

Data Lab

-

2024-12-17

+

2024-12-20

@@ -3005,8 +3005,7 @@

Set the random seed

Read in and prepare data

To begin, we’ll read in the SingleCellExperiment (SCE) -object. We’ll also establish a corresponding processed Seurat object -from its raw counts that we’ll use for some examples.

+object.

@@ -3062,7 +3061,7 @@

Clustering with default parameters

@@ -3175,7 +3174,7 @@

Silhouette width

@@ -3191,7 +3190,7 @@

Silhouette width

labs(x = "Cluster", y = "Silhouette width")
-

+

@@ -3228,7 +3227,7 @@

Neighborhood purity

@@ -3244,7 +3243,7 @@

Neighborhood purity

labs(x = "Cluster", y = "Neighborhood purity")
-

+

@@ -3286,7 +3285,7 @@

Cluster stability

@@ -3302,7 +3301,7 @@

Cluster stability

labs(x = "Adjusted rand index across bootstrap replicates") -

+

@@ -3360,7 +3359,7 @@

Working with objects directly

@@ -3385,8 +3384,7 @@

Evaluating Seurat clusters

them.

To demonstrate this, we’ll convert our SCE object to a Seurat using the function rOpenScPCA::sce_to_seurat(). Then, we’ll use a -simple Seurat pipeline to obtain clusters. -

+simple Seurat pipeline to obtain clusters.

@@ -3406,24 +3404,31 @@

Evaluating Seurat clusters

FindNeighbors() |> FindClusters() - + +
Warning in irlba(A = t(x = object), nv = npcs, ...): You're computing too large
+a percentage of total singular values, use a standard svd instead.
+ + +
Warning: Number of dimensions changing from 10 to 50
+ +
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
 
-Number of nodes: 2623
-Number of edges: 78853
+Number of nodes: 100
+Number of edges: 4142
 
 Running Louvain algorithm...
-Maximum modularity in 10 random starts: 0.8478
-Number of communities: 13
+Maximum modularity in 10 random starts: 0.2147
+Number of communities: 2
 Elapsed time: 0 seconds
seurat_obj
- +
An object of class Seurat 
-145743 features across 2623 samples within 3 assays 
-Active assay: SCT (25105 features, 3000 variable features)
+126242 features across 100 samples within 3 assays 
+Active assay: SCT (5604 features, 3000 variable features)
  3 layers present: counts, data, scale.data
  2 other assays present: RNA, spliced
  2 dimensional reductions calculated: pca, umap
@@ -3446,7 +3451,7 @@

Evaluating Seurat clusters

@@ -3538,7 +3543,7 @@

Evaluating ScPCA clusters

@@ -3771,7 +3776,7 @@

Session Info

-
---
title: "Performing graph-based clustering with rOpenScPCA"
date: "`r Sys.Date()`"
author: "Data Lab"
output: 
  html_notebook: 
    toc: yes
    toc_float: yes
    df_print: paged
---

## Introduction

This notebook provides examples of how to use functions in `rOpenScPCA` that:

* Perform clustering
* Calculate QC metrics on clusters, including:
  * Silhouette width
  * Neighborhood purity
  * Cluster stability, as measured with the Adjusted Rand Index
* Calculate QC metrics on clusters obtained with other tools, such as `Seurat`
* Save clustering results to an SCE or `Seurat`

While this notebook demonstrates how to use individual functions that calculate helpful metrics for evaluating clustering results, a full evaluation would compare these metrics across different clusterings from different parameterizations.

This notebook will use the sample `SCPCS000001` from project `SCPCP000001`, which is assumed present in the `OpenScPCA-analysis/data/current/SCPCP000001` directory, for all examples.
Please [see this documentation](https://openscpca.readthedocs.io/en/latest/getting-started/accessing-resources/getting-access-to-data/) for more information about obtaining ScPCA data.

## Setup

### Packages


```{r packages}
library(rOpenScPCA)

suppressPackageStartupMessages({
  library(SingleCellExperiment)
  library(Seurat)
  library(dplyr)
  library(ggplot2)
})

# Use the black-and-white ggplot2 theme for all plots in this notebook
ggplot2::theme_set(ggplot2::theme_bw())
```


### Paths

```{r base paths}
# Locate the root of the OpenScPCA repository (the enclosing git repository)
repository_base <- rprojroot::find_root(criterion = rprojroot::is_git_root)

# The current data directory lives inside the repository root
data_dir <- file.path(repository_base, "data", "current")

# Directory of this analysis module
module_base <- file.path(repository_base, "analyses", "hello-clusters")
```

```{r input file path}
# Processed SCE file for sample SCPCS000001, found within the data directory
input_sce_file <- file.path(
  data_dir,
  "SCPCP000001",
  "SCPCS000001",
  "SCPCL000001_processed.rds"
)
```


### Set the random seed

Because clustering involves random sampling, it is important to set the random seed at the top of your analysis script or notebook to ensure reproducibility.

```{r set seed}
# Fix the random seed so that clustering results are reproducible
set.seed(seed = 2024)
```

## Read in and prepare data

To begin, we'll read in the `SingleCellExperiment` (SCE) object.
We'll also establish a corresponding processed Seurat object from its raw counts that we'll use for some examples.

```{r read data}
# Load the SCE object from its RDS file
sce <- readRDS(file = input_sce_file)
```

For the initial cluster calculations and evaluations, we will use the PCA matrix extracted from the SCE object.
It's also possible to use an SCE object or a Seurat object directly, which we will demonstrate later.


```{r extract pca data}
# Pull the reduced dimension named "PCA" out of the SCE object
pca_matrix <- reducedDim(sce, type = "PCA")
```

## Perform clustering

This section will show how to perform clustering with the function `rOpenScPCA::calculate_clusters()`. 

This function takes a PCA matrix with rownames representing unique cell ids (e.g., barcodes) as its primary argument.
By default it will calculate clusters using the following parameters:

* Louvain algorithm
* Jaccard weighting
* 10 nearest neighbors
* A resolution parameter of 1

This function will return a table with the following columns:

* `cell_id`: Unique cell identifiers, obtained from the PCA matrix's row names
* `cluster`: A factor column with the cluster identities
* There will be one column for each clustering parameter used


### Clustering with default parameters

```{r cluster sce}
# Cluster the PCA matrix, leaving every clustering parameter at its default
cluster_results_df <- pca_matrix |>
  rOpenScPCA::calculate_clusters()

# Show the top rows of the clustering table
cluster_results_df |>
  head()
```

### Clustering with non-default parameters

Parameters used for clustering can be customized with these arguments:

* The `algorithm` can be one of:
  * `louvain`, `walktrap`, or `leiden`
* The `weighting` can be one of:
  * `jaccard`, `rank`, or `number`
* The nearest neighbors parameter can be customized with the `nn` argument
* The resolution parameter can be customized with the `resolution` argument
  * This parameter is only used by Louvain and Leiden algorithms
* If the Leiden algorithm is used, its default objective function parameter will be `CPM`, but you can also set  `objective_function = "modularity"` instead.
* You can provide additional parameters as a list to the `cluster_args` argument.
  * Please refer to the [`igraph` documentation](https://igraph.org/r/html/latest) to learn more about what additional parameters can be provided to each clustering algorithm.
  * Note that `cluster_args` only accepts single-length arguments (no vectors or lists).

For example:

```{r cluster sce nondefault}
# Calculate clusters with non-default parameters.
# The result is stored under its own name so that `cluster_results_df` keeps
# the default-parameter clusters. The evaluation examples that follow reuse
# `cluster_results_df`, and `rOpenScPCA::calculate_stability()` is called there
# with default settings, which is only a fair comparison if the clusters it
# receives were also built with the defaults.
leiden_cluster_results_df <- rOpenScPCA::calculate_clusters(
  pca_matrix,
  algorithm = "leiden",
  nn = 15,
  objective_function = "modularity"
)
```


## Calculate QC metrics on clusters

This section demonstrates how to use several functions for evaluating cluster quality and reliability. 
It's important to note that a full evaluation of clustering results would compare these metrics across a set of clustering results, with the aim of identifying an optimal parameterization.

All functions presented in this section take the following required arguments:

* A PCA matrix with row names representing unique cell ids (e.g., barcodes)
* A data frame with, at least, columns representing unique cell ids and cluster assignments
  * By default, these columns should be named `cell_id` and `cluster`, respectively, matching the output of `rOpenScPCA::calculate_clusters()`
  * You can override these defaults using the arguments `cell_id_col` and `cluster_col`

### Silhouette width

Silhouette width is a common metric that measures how well separated clusters are by, for each cell, comparing the average distance to all cells in the same cluster, and all cells in other clusters.
This value ranges from -1 to 1.
Cells in well-separated clusters should have high silhouette values closer to 1.
You can read more about silhouette width from the [_Orchestrating Single Cell Analysis with Bioconductor_ book](https://bioconductor.org/books/3.19/OSCA.advanced/clustering-redux.html#silhouette-width).

We'll use the function `rOpenScPCA::calculate_silhouette()` to calculate the silhouette width.

This function will return the inputted data frame with two additional columns:

* `silhouette_width`: The calculated silhouette width for the cell
* `silhouette_other`: The closest cluster to the cell besides the cluster to which it belongs, as used in the silhouette width calculation


```{r silhouette}
# Compute the silhouette width of every cell from the PCA matrix and clusters
silhouette_results <- pca_matrix |>
  rOpenScPCA::calculate_silhouette(cluster_results_df)

# Show the top rows of the table returned by the calculation
silhouette_results |>
  head()
```


We can visualize these results by plotting silhouette width across clusters as violin plots, for example:

```{r violin silhouette}
# Violin plot of per-cell silhouette width, with one violin per cluster
ggplot(silhouette_results, aes(x = cluster, y = silhouette_width)) +
  geom_violin(fill = "darkmagenta") +
  labs(x = "Cluster", y = "Silhouette width")
```

### Neighborhood purity

Neighborhood purity is defined, for each cell, as the proportion of neighboring cells that are assigned to the same cluster.
This value ranges from 0 to 1.
Cells in well-separated clusters should have high purity values closer to 1, since there should be minimal overlap between member and neighboring cells.
You can read more about neighborhood purity from the [_Orchestrating Single Cell Analysis with Bioconductor_ book](https://bioconductor.org/books/3.19/OSCA.advanced/clustering-redux.html#cluster-purity).

We'll use the function `rOpenScPCA::calculate_purity()` to calculate the neighborhood purity.

This function will return the inputted data frame with two additional columns:

* `purity`: The neighborhood purity for the cell
* `maximum_neighbor`: The cluster with the highest proportion of observations neighboring the cell


```{r purity}
# Compute the neighborhood purity of every cell from the PCA matrix and clusters
purity_results <- pca_matrix |>
  rOpenScPCA::calculate_purity(cluster_results_df)

# Show the top rows of the table returned by the calculation
purity_results |>
  head()
```


We can visualize these results by plotting purity across clusters as violin plots, for example:

```{r violin purity}
# Violin plot of per-cell neighborhood purity, with one violin per cluster
ggplot(purity_results, aes(x = cluster, y = purity)) +
  geom_violin(fill = "darkolivegreen3") +
  labs(x = "Cluster", y = "Neighborhood purity")
```

### Cluster stability

Another approach to exploring cluster quality is to use bootstrapping to assess how stable the clusters themselves are.
Given a set of original clusters, we can compare the bootstrapped cluster identities to original ones using the Adjusted Rand Index (ARI), which measures the similarity of two data clusterings.
ARI ranges from -1 to 1, where:

* A value of 1 indicates they are completely overlapping
* A value of -1 indicates they are completely distinct
* A value of 0 indicates a random relationship

We expect that highly stable clusterings have ARI values closer to 1 across a set of bootstrap replicates.

You can read more about the Adjusted Rand Index from the [_Orchestrating Single Cell Analysis with Bioconductor_ book](https://bioconductor.org/books/release/OSCA.advanced/clustering-redux.html#adjusted-rand-index).

We'll use the function `rOpenScPCA::calculate_stability()` to calculate the cluster stability.
By default, this function performs 20 bootstrap replicates, but this can be customized using the argument `replicates`.

This function will return a data frame with columns `replicate`, `ari`, and additional columns for the clustering parameters used when calculating bootstrap clusters.

```{r stability, warning=FALSE}
# Evaluate cluster stability; no clustering parameters are passed, so the
# function's defaults are used
stability_results <- pca_matrix |>
  rOpenScPCA::calculate_stability(cluster_results_df)

# Display the full table of results
stability_results
```

We can visualize these results by plotting stability as a density plot, for example:

```{r ari density}
# Density plot of the ARI values in the stability results
ggplot(stability_results, aes(x = ari)) +
  geom_density(color = "grey30", fill = "lightslateblue") +
  labs(x = "Adjusted rand index across bootstrap replicates")
```


#### Using non-default clustering parameters

When calculating bootstrap clusters, `rOpenScPCA::calculate_stability()` uses `rOpenScPCA::calculate_clusters()` with default parameters. 
If your original clusters were not calculated with these defaults, you should pass those customized values into this function as well to ensure a fair comparison between your original clusters and the bootstrap clusters.


```{r stability custom parameters}
# Build clusters with the Leiden algorithm and customized parameters
cluster_df_leiden <- pca_matrix |>
  rOpenScPCA::calculate_clusters(
    algorithm = "leiden",
    resolution = 0.5,
    nn = 15
  )

# Evaluate stability, repeating the same customized parameters so that the
# bootstrap clusters are calculated the same way as the original clusters
stability_results_leiden <- pca_matrix |>
  rOpenScPCA::calculate_stability(
    cluster_df_leiden,
    algorithm = "leiden",
    resolution = 0.5,
    nn = 15
  )
```


## Working with objects directly

As presented above, `rOpenScPCA` clustering functions take a PCA matrix with row names representing unique cell ids as their first argument.

Instead of a matrix, you can alternatively pass in an SCE or Seurat object that contains a matrix.

We show an example of this below with an SCE object and `rOpenScPCA::calculate_clusters()`, but this also works for any of the evaluation functions and has the same syntax for Seurat objects.

```{r run on sce}
# Pass the SCE object itself to the clustering function, with default parameters
cluster_results_df <- sce |>
  rOpenScPCA::calculate_clusters()

# Display the resulting table
cluster_results_df
```


`rOpenScPCA` assumes that the PCA matrix is named `PCA` in SCE objects, and `pca` in Seurat objects.
If the PCA matrix you want to use in the object has a different name, you can provide the argument `pc_name`.


## Calculating QC metrics on existing clusters

If you already have clustering results calculated with other tools, you can still use the `rOpenScPCA` functions to evaluate your clusters.

In this section, we'll present examples of how you can calculate the silhouette width, neighborhood purity, and cluster stability from existing cluster assignments within objects.

### Evaluating Seurat clusters

If you are analyzing your data with a Seurat pipeline that includes calculating clusters, you can use `rOpenScPCA` to evaluate them.

To demonstrate this, we'll convert our SCE object to a Seurat object using the function `rOpenScPCA::sce_to_seurat()`.
Then, we'll use a simple Seurat pipeline to obtain clusters.
<!-- TODO: We will want to reference this module for further documentation on this function: https://github.com/AlexsLemonade/OpenScPCA-analysis/issues/945 -->

```{r sce to seurat, message = FALSE}
# Convert the SCE to a Seurat object using rOpenScPCA
seurat_obj <- rOpenScPCA::sce_to_seurat(sce)

# Calculate clusters with Seurat using a standard Seurat pipeline, for example.
# Every step is called without extra arguments, so each one runs with its
# default settings, and the output of each step is piped into the next.
seurat_obj <- seurat_obj |>
  SCTransform() |>
  RunPCA() |>
  FindNeighbors() |>
  FindClusters()

# Display the resulting Seurat object
seurat_obj
```


To calculate QC metrics on these clusters, we'll need to create a data frame with columns `cell_id` and `cluster`:

```{r prepare seurat input}
# Assemble the input table: cell ids alongside their Seurat cluster labels
seurat_cluster_df <- data.frame(
  cell_id = colnames(seurat_obj),
  cluster = seurat_obj$seurat_clusters
)

# Show the top rows of the table
seurat_cluster_df |>
  head()
```

Now, we can run `rOpenScPCA::calculate_silhouette()` and `rOpenScPCA::calculate_purity()` using this data frame and the Seurat object:

```{r seurat silhouette}
# Silhouette width for the Seurat clusters, passing the Seurat object directly
seurat_silhouette_df <- seurat_obj |>
  rOpenScPCA::calculate_silhouette(seurat_cluster_df)
```

```{r seurat purity}
# Neighborhood purity for the Seurat clusters, passing the Seurat object directly
seurat_purity_df <- seurat_obj |>
  rOpenScPCA::calculate_purity(seurat_cluster_df)
```

We do not recommend using `rOpenScPCA::calculate_stability()` on Seurat clusters due to differences in the underlying clustering approach between Seurat and the `bluster` package which `rOpenScPCA` uses.

### Evaluating ScPCA clusters

ScPCA cell metadata already contains a column called `cluster` with results from an automated clustering. 
These clusters were calculated using `bluster`, the same tool that `rOpenScPCA` uses. 
The specifications used for this clustering are stored in the SCE object's metadata, as follows; note that all other clustering parameters were left at their default values.

* `metadata(sce)$cluster_algorithm`: The clustering algorithm used
* `metadata(sce)$cluster_weighting`: The weighting scheme used
* `metadata(sce)$cluster_nn`: The number of nearest neighbors used

You can see all their values here:


```{r extract cluster params}
# Show the clustering settings recorded in the SCE object's metadata
metadata(sce)[
  c("cluster_algorithm", "cluster_weighting", "cluster_nn")
]
```


In this example, we'll show how to use the cluster evaluation functions on these clusters.

To begin, we'll prepare a data frame with two columns: `cell_id` containing cell barcodes, and `cluster` containing the cluster identities.

```{r prepare scpca data frame}
# Assemble the input table from the SCE object: its column names as cell ids,
# alongside the existing `cluster` column
scpca_cluster_df <- data.frame(
  cell_id = colnames(sce),
  cluster = sce$cluster
)

# Show the top rows of the table
scpca_cluster_df |>
  head()
```

We can run evaluation functions using this data frame and the SCE object.

```{r scpca silhouette}
# Silhouette width for the ScPCA clusters, passing the SCE object directly
scpca_silhouette_df <- sce |>
  rOpenScPCA::calculate_silhouette(scpca_cluster_df)
```

```{r scpca purity}
# Neighborhood purity for the ScPCA clusters, passing the SCE object directly
scpca_purity_df <- sce |>
  rOpenScPCA::calculate_purity(scpca_cluster_df)
```

When running `rOpenScPCA::calculate_stability()`, we'll specify the same parameters originally used to build the clusters by extracting them from the metadata.
We'll need to ensure the provided arguments are lowercase, as well.

Generally speaking, we only recommend evaluating clusters with `rOpenScPCA::calculate_stability()` if you know the original parameters used.


```{r scpca stability}
# Evaluate stability of the ScPCA clusters. The clustering parameters are read
# from the SCE metadata; the algorithm and weighting values are lowercased
# before being passed along.
scpca_stability_df <- sce |>
  rOpenScPCA::calculate_stability(
    scpca_cluster_df,
    algorithm = metadata(sce)$cluster_algorithm |> tolower(),
    weighting = metadata(sce)$cluster_weighting |> tolower(),
    nn = metadata(sce)$cluster_nn
  )
```


## Saving clustering results 

Results can either be directly exported as a TSV file (e.g., with `readr::write_tsv()`), or you can add the results into your SCE or Seurat object.
The subsequent examples will demonstrate saving the cluster assignments stored in `cluster_results_df$cluster` to an SCE and a Seurat object.

_A word of caution!_
Objects from the ScPCA Portal already contain a column called `cluster` with results from an automated clustering.
These automatic clusters were not evaluated, and their parameters were not optimized for any given library.
To avoid ambiguity between the existing and new clustering results, we'll name the new column `ropenscpca_cluster`.

### Saving results to an SCE object 

We can add columns to an SCE object's `colData` table by directly creating a column in the object with `$`.
Before we do so, we'll confirm that the clusters are in the same order as the SCE object by comparing cell ids:

```{r check sce order}
# Stop if the cell ids in the cluster results are not in the same order as the
# columns of the SCE object, since the clusters are added by position.
# `all.equal()` returns a character description of the differences (not
# `FALSE`) when its inputs differ, so it is wrapped in `isTRUE()` and enforced
# with `stopifnot()` rather than only printed.
stopifnot(
  "SCE cell ids and cluster result cell ids do not match" =
    isTRUE(all.equal(colnames(sce), cluster_results_df$cell_id))
)
```

```{r add to sce}
# Store the cluster assignments in a new column of the SCE object
sce$ropenscpca_cluster <- cluster_results_df[["cluster"]]
```

### Saving results to a Seurat object 


We can add columns to a Seurat object's cell metadata table by directly creating a column in the object with `$` (note that you can also use the Seurat function `AddMetaData()`).
Before we do so, we'll confirm that the clusters are in the same order as the Seurat object by comparing cell ids:


```{r check seurat order}
# Stop if the cell ids in the cluster results are not in the same order as the
# columns of the Seurat object, since the clusters are added by position.
# `all.equal()` returns a character description of the differences (not
# `FALSE`) when its inputs differ, so it is wrapped in `isTRUE()` and enforced
# with `stopifnot()` rather than only printed.
stopifnot(
  "Seurat cell ids and cluster result cell ids do not match" =
    isTRUE(all.equal(colnames(seurat_obj), cluster_results_df$cell_id))
)
```

```{r add to seurat}
# Store the cluster assignments in a new column of the Seurat object
seurat_obj$ropenscpca_cluster <- cluster_results_df[["cluster"]]
```


## Session Info

```{r session info}
# Report package versions and other details of the R environment used here
utils::sessionInfo()
```

+
---
title: "Performing graph-based clustering with rOpenScPCA"
date: "`r Sys.Date()`"
author: "Data Lab"
output:
  html_notebook:
    toc: yes
    toc_float: yes
    df_print: paged
---

## Introduction

This notebook provides examples of how to use functions in `rOpenScPCA` that:

* Perform clustering
* Calculate QC metrics on clusters, including:
  * Silhouette width
  * Neighborhood purity
  * Cluster stability, as measured with the Adjusted Rand Index
* Calculate QC metrics on clusters obtained with other tools, such as `Seurat`
* Save clustering results to an SCE or `Seurat`

While this notebook demonstrates how to use individual functions that calculate helpful metrics for evaluating clustering results, a full evaluation would compare these metrics across different clusterings from different parameterizations.

This notebook will use the sample `SCPCS000001` from project `SCPCP000001`, which is assumed present in the `OpenScPCA-analysis/data/current/SCPCP000001` directory, for all examples.
Please [see this documentation](https://openscpca.readthedocs.io/en/latest/getting-started/accessing-resources/getting-access-to-data/) for more information about obtaining ScPCA data.

## Setup

### Packages


```{r packages}
library(rOpenScPCA)

suppressPackageStartupMessages({
  library(SingleCellExperiment)
  library(Seurat)
  library(dplyr)
  library(ggplot2)
})

# Use the black-and-white ggplot2 theme for all plots in this notebook
ggplot2::theme_set(ggplot2::theme_bw())
```


### Paths

```{r base paths}
# Locate the root of the OpenScPCA repository (the enclosing git repository)
repository_base <- rprojroot::find_root(criterion = rprojroot::is_git_root)

# The current data directory lives inside the repository root
data_dir <- file.path(repository_base, "data", "current")

# Directory of this analysis module
module_base <- file.path(repository_base, "analyses", "hello-clusters")
```

```{r input file path}
# Processed SCE file for sample SCPCS000001, found within the data directory
input_sce_file <- file.path(
  data_dir,
  "SCPCP000001",
  "SCPCS000001",
  "SCPCL000001_processed.rds"
)
```


### Set the random seed

Because clustering involves random sampling, it is important to set the random seed at the top of your analysis script or notebook to ensure reproducibility.

```{r set seed}
# Fix the random seed so that clustering results are reproducible
set.seed(seed = 2024)
```

## Read in and prepare data

To begin, we'll read in the `SingleCellExperiment` (SCE) object.

```{r read data}
# Load the SCE object from its RDS file
sce <- readRDS(file = input_sce_file)
```

For the initial cluster calculations and evaluations, we will use the PCA matrix extracted from the SCE object.
It's also possible to use an SCE object or a Seurat object directly, which we will demonstrate later.


```{r extract pca data}
# Pull the reduced dimension named "PCA" out of the SCE object
pca_matrix <- reducedDim(sce, type = "PCA")
```

## Perform clustering

This section will show how to perform clustering with the function `rOpenScPCA::calculate_clusters()`.

This function takes a PCA matrix with rownames representing unique cell ids (e.g., barcodes) as its primary argument.
By default it will calculate clusters using the following parameters:

* Louvain algorithm
* Jaccard weighting
* 10 nearest neighbors
* A resolution parameter of 1

This function will return a table with the following columns:

* `cell_id`: Unique cell identifiers, obtained from the PCA matrix's row names
* `cluster`: A factor column with the cluster identities
* There will be one column for each clustering parameter used


### Clustering with default parameters

```{r cluster sce}
# Cluster the PCA matrix, leaving every clustering parameter at its default
cluster_results_df <- pca_matrix |>
  rOpenScPCA::calculate_clusters()

# Show the top rows of the clustering table
cluster_results_df |>
  head()
```

### Clustering with non-default parameters

Parameters used for clustering can be customized with these arguments:

* The `algorithm` can be one of:
  * `louvain`, `walktrap`, or `leiden`
* The `weighting` can be one of:
  * `jaccard`, `rank`, or `number`
* The nearest neighbors parameter can be customized with the `nn` argument
* The resolution parameter can be customized with the `resolution` argument
  * This parameter is only used by Louvain and Leiden algorithms
* If the Leiden algorithm is used, its default objective function parameter will be `CPM`, but you can also set  `objective_function = "modularity"` instead.
* You can provide additional parameters as a list to the `cluster_args` argument.
  * Please refer to the [`igraph` documentation](https://igraph.org/r/html/latest) to learn more about what additional parameters can be provided to each clustering algorithm.
  * Note that `cluster_args` only accepts single-length arguments (no vectors or lists).

For example:

```{r cluster sce nondefault}
# Calculate clusters with non-default parameters.
# The result is stored under its own name so that `cluster_results_df` keeps
# the default-parameter clusters. The evaluation examples that follow reuse
# `cluster_results_df`, and `rOpenScPCA::calculate_stability()` is called there
# with default settings, which is only a fair comparison if the clusters it
# receives were also built with the defaults.
leiden_cluster_results_df <- rOpenScPCA::calculate_clusters(
  pca_matrix,
  algorithm = "leiden",
  nn = 15,
  objective_function = "modularity"
)
```


## Calculate QC metrics on clusters

This section demonstrates how to use several functions for evaluating cluster quality and reliability.
It's important to note that a full evaluation of clustering results would compare these metrics across a set of clustering results, with the aim of identifying an optimal parameterization.

All functions presented in this section take the following required arguments:

* A PCA matrix with row names representing unique cell ids (e.g., barcodes)
* A data frame with, at least, columns representing unique cell ids and cluster assignments
  * By default, these columns should be named `cell_id` and `cluster`, respectively, matching the output of `rOpenScPCA::calculate_clusters()`
  * You can override these defaults using the arguments `cell_id_col` and `cluster_col`

### Silhouette width

Silhouette width is a common metric that measures how well separated clusters are by, for each cell, comparing the average distance to all cells in the same cluster, and all cells in other clusters.
This value ranges from -1 to 1.
Cells in well-separated clusters should have high silhouette values closer to 1.
You can read more about silhouette width from the [_Orchestrating Single Cell Analysis with Bioconductor_ book](https://bioconductor.org/books/3.19/OSCA.advanced/clustering-redux.html#silhouette-width).

We'll use the function `rOpenScPCA::calculate_silhouette()` to calculate the silhouette width.

This function will return the inputted data frame with two additional columns:

* `silhouette_width`: The calculated silhouette width for the cell
* `silhouette_other`: The closest cluster to the cell besides the cluster to which it belongs, as used in the silhouette width calculation


```{r silhouette}
# Compute the silhouette width of every cell from the PCA matrix and clusters
silhouette_results <- pca_matrix |>
  rOpenScPCA::calculate_silhouette(cluster_results_df)

# Show the top rows of the table returned by the calculation
silhouette_results |>
  head()
```


We can visualize these results by plotting silhouette width across clusters as violin plots, for example:

```{r violin silhouette}
# Violin plot of per-cell silhouette width, with one violin per cluster
ggplot(silhouette_results, aes(x = cluster, y = silhouette_width)) +
  geom_violin(fill = "darkmagenta") +
  labs(x = "Cluster", y = "Silhouette width")
```

### Neighborhood purity

Neighborhood purity is defined, for each cell, as the proportion of neighboring cells that are assigned to the same cluster.
This value ranges from 0 to 1.
Cells in well-separated clusters should have high purity values closer to 1, since there should be minimal overlap between member and neighboring cells.
You can read more about neighborhood purity from the [_Orchestrating Single Cell Analysis with Bioconductor_ book](https://bioconductor.org/books/3.19/OSCA.advanced/clustering-redux.html#cluster-purity).

We'll use the function `rOpenScPCA::calculate_purity()` to calculate the neighborhood purity.

This function will return the inputted data frame with two additional columns:

* `purity`: The neighborhood purity for the cell
* `maximum_neighbor`: The cluster with the highest proportion of observations neighboring the cell


```{r purity}
# Compute the neighborhood purity of every cell from the PCA matrix and clusters
purity_results <- pca_matrix |>
  rOpenScPCA::calculate_purity(cluster_results_df)

# Show the top rows of the table returned by the calculation
purity_results |>
  head()
```


We can visualize these results by plotting purity across clusters as violin plots, for example:

```{r violin purity}
# Violin plot of per-cell neighborhood purity, with one violin per cluster
ggplot(purity_results, aes(x = cluster, y = purity)) +
  geom_violin(fill = "darkolivegreen3") +
  labs(x = "Cluster", y = "Neighborhood purity")
```

### Cluster stability

Another approach to exploring cluster quality is to use bootstrapping to assess how stable the clusters themselves are.
Given a set of original clusters, we can compare the bootstrapped cluster identities to original ones using the Adjusted Rand Index (ARI), which measures the similarity of two data clusterings.
ARI ranges from -1 to 1, where:

* A value of 1 indicates they are completely overlapping
* A value of -1 indicates they are completely distinct
* A value of 0 indicates a random relationship

We expect that highly stable clusterings have ARI values closer to 1 across a set of bootstrap replicates.

You can read more about the Adjusted Rand Index from the [_Orchestrating Single Cell Analysis with Bioconductor_ book](https://bioconductor.org/books/release/OSCA.advanced/clustering-redux.html#adjusted-rand-index).

We'll use the function `rOpenScPCA::calculate_stability()` to calculate the cluster stability.
By default, this function performs 20 bootstrap replicates, but this can be customized using the argument `replicates`.

This function will return a data frame with columns `replicate`, `ari`, and additional columns for the clustering parameters used when calculating bootstrap clusters.

```{r stability, warning=FALSE}
# Evaluate cluster stability; no clustering parameters are passed, so the
# function's defaults are used
stability_results <- pca_matrix |>
  rOpenScPCA::calculate_stability(cluster_results_df)

# Display the full table of results
stability_results
```

We can visualize these results by plotting stability as a density plot, for example:

```{r ari density}
# Density plot of the ARI values in the stability results
ggplot(stability_results, aes(x = ari)) +
  geom_density(color = "grey30", fill = "lightslateblue") +
  labs(x = "Adjusted rand index across bootstrap replicates")
```


#### Using non-default clustering parameters

When calculating bootstrap clusters, `rOpenScPCA::calculate_stability()` uses `rOpenScPCA::calculate_clusters()` with default parameters.
If your original clusters were not calculated with these defaults, you should pass those customized values into this function as well to ensure a fair comparison between your original clusters and the bootstrap clusters.


```{r stability custom parameters}
# Build clusters with the Leiden algorithm and customized parameters
cluster_df_leiden <- pca_matrix |>
  rOpenScPCA::calculate_clusters(
    algorithm = "leiden",
    resolution = 0.5,
    nn = 15
  )

# Evaluate stability, repeating the same customized parameters so that the
# bootstrap clusters are calculated the same way as the original clusters
stability_results_leiden <- pca_matrix |>
  rOpenScPCA::calculate_stability(
    cluster_df_leiden,
    algorithm = "leiden",
    resolution = 0.5,
    nn = 15
  )
```


## Working with objects directly

As presented above, `rOpenScPCA` clustering functions take a PCA matrix with row names representing unique cell ids as their first argument.

Instead of a matrix, you can alternatively pass in an SCE or Seurat object that contains a matrix.

We show an example of this below with an SCE object and `rOpenScPCA::calculate_clusters()`, but this also works for any of the evaluation functions and has the same syntax for Seurat objects.

```{r run on sce}
# Pass the SCE object itself to the clustering function, with default parameters
cluster_results_df <- sce |>
  rOpenScPCA::calculate_clusters()

# Display the resulting table
cluster_results_df
```


`rOpenScPCA` assumes that the PCA matrix is named `PCA` in SCE objects, and `pca` in Seurat objects.
If the PCA matrix you want to use in the object has a different name, you can provide the argument `pc_name`.


## Calculating QC metrics on existing clusters

If you already have clustering results calculated with other tools, you can still use the `rOpenScPCA` functions to evaluate your clusters.

In this section, we'll present examples of how you can calculate the silhouette width, neighborhood purity, and cluster stability from existing cluster assignments within objects.

### Evaluating Seurat clusters

If you are analyzing your data with a Seurat pipeline that includes calculating clusters, you can use `rOpenScPCA` to evaluate them.

To demonstrate this, we'll convert our SCE object to a Seurat object using the function `rOpenScPCA::sce_to_seurat()`.
Then, we'll use a simple Seurat pipeline to obtain clusters.

```{r sce to seurat, message = FALSE}
# Convert the SCE to a Seurat object using rOpenScPCA
seurat_obj <- rOpenScPCA::sce_to_seurat(sce)

# Calculate clusters with Seurat using a standard Seurat pipeline, for example.
# Every step is called without extra arguments, so each one runs with its
# default settings, and the output of each step is piped into the next.
seurat_obj <- seurat_obj |>
  SCTransform() |>
  RunPCA() |>
  FindNeighbors() |>
  FindClusters()

# Display the resulting Seurat object
seurat_obj
```


To calculate QC metrics on these clusters, we'll need to create a data frame with columns `cell_id` and `cluster`:

```{r prepare seurat input}
# Assemble the input table: cell ids alongside their Seurat cluster labels
seurat_cluster_df <- data.frame(
  cell_id = colnames(seurat_obj),
  cluster = seurat_obj$seurat_clusters
)

# Show the top rows of the table
seurat_cluster_df |>
  head()
```

Now, we can run `rOpenScPCA::calculate_silhouette()` and `rOpenScPCA::calculate_purity()` using this data frame and the Seurat object:

```{r seurat silhouette}
# Silhouette width for the Seurat clusters, passing the Seurat object directly
seurat_silhouette_df <- seurat_obj |>
  rOpenScPCA::calculate_silhouette(seurat_cluster_df)
```

```{r seurat purity}
# Neighborhood purity for the Seurat clusters, passing the Seurat object directly
seurat_purity_df <- seurat_obj |>
  rOpenScPCA::calculate_purity(seurat_cluster_df)
```

We do not recommend using `rOpenScPCA::calculate_stability()` on Seurat clusters due to differences in the underlying clustering approach between Seurat and the `bluster` package which `rOpenScPCA` uses.

### Evaluating ScPCA clusters

ScPCA cell metadata already contains a column called `cluster` with results from an automated clustering.
These clusters were calculated using `bluster`, the same tool that `rOpenScPCA` uses.
The specifications used for this clustering are stored in the following fields of the SCE object's metadata (all other clustering parameters were left at their default values):

* `metadata(sce)$cluster_algorithm`: The clustering algorithm used
* `metadata(sce)$cluster_weighting`: The weighting scheme used
* `metadata(sce)$cluster_nn`: The number of nearest neighbors used

You can see all their values here:


```{r extract cluster params}
# Names of the metadata fields that record the clustering specifications
cluster_param_names <- c(
  "cluster_algorithm",
  "cluster_weighting",
  "cluster_nn"
)

# Show the values stored in those fields
metadata(sce)[cluster_param_names]
```


In this example, we'll show how to use the cluster evaluation functions on these clusters.

To begin, we'll prepare a data frame with two columns: `cell_id` containing cell barcodes, and `cluster` containing the cluster identities.

```{r prepare scpca data frame}
# Create a data frame for input
# `cell_id` holds the SCE object's column names and `cluster` holds the
# values of its existing `cluster` column
scpca_cluster_df <- data.frame(
  cell_id = colnames(sce),
  cluster = sce$cluster
)

# Preview the first few rows
head(scpca_cluster_df)
```

We can run evaluation functions using this data frame and the SCE object.

```{r scpca silhouette}
# Calculate silhouette width for the ScPCA clusters, using the SCE object
# and the data frame of cluster assignments
scpca_silhouette_df <- rOpenScPCA::calculate_silhouette(
  sce,
  scpca_cluster_df
)
```

```{r scpca purity}
# Calculate neighborhood purity for the ScPCA clusters, using the SCE object
# and the data frame of cluster assignments
scpca_purity_df <- rOpenScPCA::calculate_purity(
  sce,
  scpca_cluster_df
)
```

When running `rOpenScPCA::calculate_stability()`, we'll specify the same parameters originally used to build the clusters by extracting them from the metadata.
We'll need to ensure the provided arguments are lowercase, as well.

Generally speaking, we only recommend evaluating clusters with `rOpenScPCA::calculate_stability()` if you know the original parameters used.


```{r scpca stability}
# Retrieve the parameters originally used for ScPCA clustering from the SCE
# metadata, converting the algorithm and weighting values to lowercase
sce_meta <- metadata(sce)
scpca_algorithm <- tolower(sce_meta$cluster_algorithm)
scpca_weighting <- tolower(sce_meta$cluster_weighting)
scpca_nn <- sce_meta$cluster_nn

# Calculate cluster stability with those same parameters
scpca_stability_df <- rOpenScPCA::calculate_stability(
  sce,
  scpca_cluster_df,
  algorithm = scpca_algorithm,
  weighting = scpca_weighting,
  nn = scpca_nn
)
```


## Saving clustering results

Results can either be directly exported as a TSV file (e.g., with `readr::write_tsv()`), or you can add the results into your SCE or Seurat object.
The subsequent examples will demonstrate saving the cluster assignments stored in `cluster_results_df$cluster` to an SCE and a Seurat object.

_A word of caution!_
Objects from the ScPCA Portal already contain a column called `cluster` with results from an automated clustering.
These automatic clusters were not evaluated, and their parameters were not optimized for any given library.
To avoid ambiguity between the existing and new clustering results, we'll name the new column `ropenscpca_cluster`.

### Saving results to an SCE object

We can add columns to an SCE object's `colData` table by directly creating a column in the object with `$`.
Before we do so, we'll confirm that the clusters are in the same order as the SCE object by comparing cell ids:

```{r check sce order}
# Confirm that `cluster_results_df` lists cells in the same order as the SCE
# object before adding the clusters. `stopifnot()` halts the notebook on a
# mismatch, so clusters can't be silently assigned to the wrong cells;
# `all.equal()` alone only returns a description of the differences.
stopifnot(
  "Cell ids in `cluster_results_df` do not match the column names of `sce`" =
    identical(colnames(sce), cluster_results_df$cell_id)
)
```

```{r add to sce}
# Add cluster results to the colData as a new column, `ropenscpca_cluster`,
# so they stay distinct from the existing `cluster` column
sce$ropenscpca_cluster <- cluster_results_df$cluster
```

### Saving results to a Seurat object


We can add columns to a Seurat object's cell metadata table by directly creating a column in the object with `$` (note that you can also use the Seurat function `AddMetaData()`).
Before we do so, we'll confirm that the clusters are in the same order as the Seurat object by comparing cell ids:


```{r check seurat order}
# Confirm that `cluster_results_df` lists cells in the same order as the
# Seurat object before adding the clusters. `stopifnot()` halts the notebook
# on a mismatch, so clusters can't be silently assigned to the wrong cells;
# `all.equal()` alone only returns a description of the differences.
stopifnot(
  "Cell ids in `cluster_results_df` do not match the cells in `seurat_obj`" =
    identical(colnames(seurat_obj), cluster_results_df$cell_id)
)
```

```{r add to seurat}
# Add cluster results to the cell metadata as a new column,
# `ropenscpca_cluster`
seurat_obj$ropenscpca_cluster <- cluster_results_df$cluster
```


## Session Info

```{r session info}
# Record the versions of the packages used in this analysis, along with other
# environment information
sessionInfo()
```
