Merge pull request #496 from cole-trapnell-lab/develop

Develop
cole-trapnell-lab · Apr 1, 2021 · 004c096 · 004c096
2 parents 4c01d89 + 0ef9570
commit 004c096
Show file tree

Hide file tree

Showing 28 changed files with 619 additions and 352 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -8,6 +8,9 @@ before_install:
   - sudo apt-get install -y libudunits2-dev
   - sudo apt-get install -y gdal-bin
   - sudo apt-get install -y libgdal1-dev
+  - sudo apt-get install -y libharfbuzz-dev
+  - sudo apt-get install -y libfribidi-dev
+  - sudo apt-get install -y libgit2-dev
 r:
 - bioc-release
 #r_packages:

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: monocle3
 Title: Clustering, differential expression, and trajectory analysis for single-
     cell RNA-Seq
-Version: 0.2.3.0
+Version: 1.0.0
 Authors@R: 
     person(given = "Hannah",
            family = "Pliner",

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,16 @@
+# monocle3 1.0.0
+
+### Changes
+
+* Add label_principal_points option to plot_cells to assist with identifying root_pr_nodes in order_cells.
+
+### Bug fixes
+
+* Documentation and error reports improvements.
+* Fix fit_models tests for NA/Inf/NaNs.
+* Refine gene loadings calculations in find_gene_modules.
+* Test for convergence failure in top_markers,
+
 # monocle3 0.2.3
 
 ### Changes

diff --git a/R/cluster_genes.R b/R/cluster_genes.R
@@ -67,9 +67,14 @@ find_gene_modules <- function(cds,
                                       "before running cluster_cells"))
 
   preprocess_mat <- cds@preprocess_aux$gene_loadings
-  if (is.null(cds@preprocess_aux$beta) == FALSE){
-    preprocess_mat <- sweep( preprocess_mat, 2, cds@preprocess_aux$beta[,1], '*')
-  }
+# Notes:
+#   o  cds@preprocess_aux$beta is npc x nfactor, which causes
+#      preprocess_mat to have nfactor columns, often one column
+#   o  I do not know how to adjust gene_loadings for batch effects
+#      so this is disabled for now
+#  if (is.null(cds@preprocess_aux$beta) == FALSE){
+#    preprocess_mat = preprocess_mat %*% (-cds@preprocess_aux$beta)
+#  }
   preprocess_mat <- preprocess_mat[intersect(rownames(cds), row.names(preprocess_mat)),]
 
   # uwot::umap uses a random number generator

diff --git a/R/expr_models.R b/R/expr_models.R
@@ -199,7 +199,15 @@ fit_model_helper <- function(x,
 #' @param verbose Logical indicating whether to emit progress messages.
 #' @param ... Additional arguments passed to model fitting functions.
 #'
-#' @return a tibble containing model objects
+#' @return a tibble where the rows are genes and columns are
+#'   * id character vector from `rowData(cds)$id`
+#'   * gene_short_names character vector from `rowData(cds)$gene_short_names`
+#'   * num_cells_expressed int vector from `rowData(cds)$num_cells_expressed`
+#'   * gene_id character vector from row.names(rowData(cds))`
+#'   * model GLM model list returned by speedglm
+#'   * model_summary model summary list returned by `summary(model)`
+#'   * status character vector of model fitting status: OK when model converged, otherwise FAIL
+#'
 #' @export
 fit_models <- function(cds,
                      model_formula_str,
@@ -238,24 +246,28 @@ fit_models <- function(cds,
       err_msg <- paste0(err_msg,'  \'', mf_term, '\': not in cds\n')
       next
     }
-    mf_length  <- length(coldata_df[[mf_term]])
-    mf_num_inf <- sum(is.infinite(coldata_df[[mf_term]]))
-    mf_num_nan <- sum(is.nan(coldata_df[[mf_term]]))
-    mf_num_na  <- sum(is.na(coldata_df[[mf_term]]))
-    if( mf_num_inf > 0 )
-      err_msg <- paste0(err_msg, '  \'', mf_term, '\': ' , mf_num_inf, ' of ', mf_length, ' values are Inf\n')
-    if( mf_num_nan > 0 )
-      err_msg <- paste0(err_msg, '  \'', mf_term, '\': ' , mf_num_nan, ' of ', mf_length, ' values are NaN\n')
-    if( mf_num_na - mf_num_nan > 0 )
-      err_msg <- paste0(err_msg, '  \'', mf_term, '\': ' , mf_num_na - mf_num_nan, ' of ', mf_length, ' values are NA\n')
   }
   if(length(err_msg) > 0)
     stop( '\n-- bad fit_models terms --\n', err_msg )
-  rm( err_msg, mf_terms, mf_term, mf_length, mf_num_inf, mf_num_nan, mf_num_na )
   tryCatch({
     stats::model.frame(model_form, data=coldata_df)
-  }, error = function(e) {
-    stop ("Error in model formula")
+  }, error = function( cnd ) {
+       info_msg <- ''
+       for( mf_term in mf_terms )
+       {
+         mf_length  <- length(coldata_df[[mf_term]])
+         mf_num_inf <- sum(is.infinite(coldata_df[[mf_term]]))
+         mf_num_nan <- sum(is.nan(coldata_df[[mf_term]]))
+         mf_num_na  <- sum(is.na(coldata_df[[mf_term]]))
+         if( mf_num_inf > 0 )
+           info_msg <- paste0(info_msg, '  \'', mf_term, '\': ' , mf_num_inf, ' of ', mf_length, ' values are Inf\n')
+         if( mf_num_nan > 0 )
+           info_msg <- paste0(info_msg, '  \'', mf_term, '\': ' , mf_num_nan, ' of ', mf_length, ' values are NaN\n')
+         if( mf_num_na - mf_num_nan > 0 )
+           info_msg <- paste0(info_msg, '  \'', mf_term, '\': ' , mf_num_na - mf_num_nan, ' of ', mf_length, ' values are NA\n')
+       }
+       rm( mf_term, mf_length, mf_num_inf, mf_num_nan, mf_num_na )
+       stop (paste0( 'Error in model formula: ', conditionMessage( cnd ), '\n', info_msg ) )
   })
 
   disp_func <- NULL

diff --git a/R/find_markers.R b/R/find_markers.R
@@ -21,6 +21,18 @@
 #' @param cores Number of cores to use.
 #' @param verbose Whether to print verbose progress output.
 #'
+#' @return a data.frame where the rows are genes and the columns are
+#' * gene_id vector of gene names
+#' * gene_short_name vector of gene short names
+#' * cell_group character vector of the cell group to which the cell belongs
+#' * marker_score numeric vector of marker scores as the fraction expressing scaled by the specificity. The value ranges from 0 to 1.
+#' * mean_expression numeric vector of mean normalized expression of the gene in the cell group
+#' * fraction_expressing numeric vector of fraction of cells expressing the gene within the cell group
+#' * specificity numeric vector of a measure of how specific the gene's expression is to the cell group based on the Jensen-Shannon divergence. The value ranges from 0 to 1.
+#' * pseudo_R2 numeric vector of pseudo R-squared values, a measure of how well the gene expression model fits the categorical data relative to the null model. The value ranges from 0 to 1.
+#' * marker_test_p_value numeric vector of likelihood ratio p-values
+#' * marker_test_q_value numeric vector of likelihood ratio q-values
+#'
 #' @export
 top_markers <- function(cds,
                         group_cells_by="cluster",
@@ -160,6 +172,13 @@ top_markers <- function(cds,
                                  RhpcBLASctl::blas_set_num_threads(old_blas_num_threads)
                                })
 
+    #
+    # Check for possible convergence failure or other problems. Issue: #383
+    #
+    if('warning' %in% marker_test_res) {
+        warning('test_marker_for_cell_group() caught warning: possible convergence failure.')
+    }
+
     marker_test_res = t(marker_test_res)
     marker_test_res = as.matrix(marker_test_res)
     colnames(marker_test_res) = c("pseudo_R2", "lrtest_p_value")

diff --git a/R/load_cellranger_data.R b/R/load_cellranger_data.R
@@ -30,6 +30,32 @@ get_genome_in_matrix_path <- function(matrix_path, genome=NULL) {
 #' is from version 3.0 and contains non-Gene-Expression data (e.g. Antibodies
 #' or CRISPR features), only the Gene Expression data is returned.
 #'
+#' @details
+#' *  the \emph{pipestance_path} argument takes the name of a Cell Ranger
+#'     output directory, in which it looks for the required data files,
+#'     for example, \emph{pipestance_path=10x_data}
+#' *  for Cell Ranger version 2 data, \emph{load_cellranger_data} expects to
+#'    find the required files \emph{barcodes.tsv}, \emph{genes.tsv}, and
+#'    \emph{matrix.mtx}
+#'    in the directories as
+#'      -  \emph{10x_data/outs/filtered_gene_bc_matrices/<genome>/barcodes.tsv}
+#'      -  \emph{10x_data/outs/filtered_gene_bc_matrices/<genome>/genes.tsv}
+#'      -  \emph{10x_data/outs/filtered_gene_bc_matrices/<genome>/matrix.mtx}
+#'
+#'    where <genome> is the name of a genome. \emph{load_cellranger_data}
+#'    expects to find either a single \emph{genome} directory in
+#'     \emph{10x_data/outs/filtered_gene_bc_matrices} or a \emph{genome}
+#'    directory with the name given with the \emph{genome} argument.
+#' *  for Cell Ranger version 3 data, \emph{load_cellranger_data} expects to
+#'    find the required files \emph{barcodes.tsv.gz}, \emph{features.tsv.gz},
+#'    and \emph{matrix.mtx.gz} in the directories as
+#'      -  \emph{10x_data/outs/filtered_feature_bc_matrix/barcodes.tsv.gz}
+#'      -  \emph{10x_data/outs/filtered_feature_bc_matrix/features.tsv.gz}
+#'      -  \emph{10x_data/outs/filtered_feature_bc_matrix/matrix.mtx.gz}
+#'
+#' * if any of the files is not in the expected directory,
+#'   \emph{load_cellranger_data} will terminate with an error
+#'
 #' @param pipestance_path Path to the output directory produced by Cell Ranger
 #' @param genome The desired genome (e.g., 'hg19' or 'mm10')
 #' @param barcode_filtered Load only the cell-containing barcodes

diff --git a/R/order_cells.R b/R/order_cells.R
@@ -15,8 +15,10 @@
 #' @param reduction_method a string specifying the reduced dimension method to
 #'   use when ordering cells. Currently only "UMAP" is supported.
 #' @param root_pr_nodes NULL or a vector of starting principal points. If
-#'   provided, pseudotime will start (i.e. be zero) at these graph nodes. Both
-#'   \code{root_pr_nodes} and \code{root_cells} cannot be provided.
+#'   provided, pseudotime will start (i.e. be zero) at these graph nodes. You
+#'   can find the principal point names by running plot_cells with
+#'   label_principal_points = TRUE. Both \code{root_pr_nodes} and
+#'   \code{root_cells} cannot be provided.
 #' @param root_cells NULL or a vector of starting cells. If provided,
 #'   pseudotime will start (i.e. be zero) at these cells. Both
 #'   \code{root_pr_nodes} and \code{root_cells} cannot be provided.
@@ -71,16 +73,17 @@ order_cells <- function(cds,
                             msg = paste("All provided root_cells must be",
                                         "present in the cell data set."))
   }
+
   if(is.null(root_cells) & is.null(root_pr_nodes)) {
     assertthat::assert_that(interactive(),
                             msg = paste("When not in interactive mode, either",
-                                        "root_pr_nodes or root_cells must be",
-                                        "provided."))
+                                        "root_pr_nodes or root_cells",
+                                        "must be provided."))
   }
   assertthat::assert_that(!all(c(!is.null(root_cells),
                                  !is.null(root_pr_nodes))),
-                            msg = paste("Please specify either root_pr_nodes",
-                                        "or root_cells, not both."))
+                          msg = paste("Please specify either root_pr_nodes",
+                                      "or root_cells, not both."))
 
   if (is.null(root_pr_nodes) & is.null(root_cells)){
     if (interactive()){

diff --git a/R/plotting.R b/R/plotting.R
@@ -370,6 +370,9 @@ plot_cells_3d <- function(cds,
 #'   ggrastr package.
 #' @param scale_to_range Logical indicating whether to scale expression to
 #'   percent of maximum expression.
+#' @param label_principal_points Logical indicating whether to label roots,
+#'   leaves, and branch points with principal point names. This is useful for
+#'   order_cells and choose_graph_segments in non-interactive mode.
 #'
 #' @return a ggplot2 plot object
 #' @export
@@ -404,7 +407,8 @@ plot_cells <- function(cds,
                        alpha = 1,
                        min_expr=0.1,
                        rasterize=FALSE,
-                       scale_to_range=FALSE) {
+                       scale_to_range=FALSE,
+                       label_principal_points = FALSE) {
   reduction_method <- match.arg(reduction_method)
   assertthat::assert_that(methods::is(cds, "cell_data_set"))
   assertthat::assert_that(!is.null(reducedDims(cds)[[reduction_method]]),
@@ -451,6 +455,19 @@ plot_cells <- function(cds,
     message("No trajectory to plot. Has learn_graph() been called yet?")
     show_trajectory_graph = FALSE
   }
+  if (label_principal_points &&
+      is.null(principal_graph(cds)[[reduction_method]])) {
+    message("Cannot label principal points when no trajectory to plot. Has learn_graph() been called yet?")
+    label_principal_points = FALSE
+  }
+
+  if (label_principal_points) {
+    label_branch_points <- FALSE
+    label_leaves <- FALSE
+    label_roots <- FALSE
+  }
+
+
 
   gene_short_name <- NA
   sample_name <- NA
@@ -624,7 +641,7 @@ plot_cells <- function(cds,
           text_df = data_df %>%
             dplyr::group_by(cell_group) %>%
             dplyr::mutate(cells_in_cluster= dplyr::n()) %>%
-            dplyr::group_by(cell_color, add=TRUE) %>%
+            dplyr::group_by(cell_color, .add=TRUE) %>%
             dplyr::mutate(per=dplyr::n()/cells_in_cluster)
           median_coord_df = text_df %>%
             dplyr::summarize(fraction_of_group = dplyr::n(),
@@ -749,6 +766,26 @@ plot_cells <- function(cds,
                           data=edge_df)
 
 
+    if (label_principal_points) {
+      mst_branch_nodes <- branch_nodes(cds, reduction_method)
+      mst_leaf_nodes <- leaf_nodes(cds, reduction_method)
+      mst_root_nodes <- root_nodes(cds, reduction_method)
+      pps <- c(mst_branch_nodes, mst_leaf_nodes, mst_root_nodes)
+      princ_point_df <- ica_space_df %>%
+        dplyr::slice(match(names(pps), sample_name))
+
+      g <- g +
+        geom_point(aes_string(x="prin_graph_dim_1", y="prin_graph_dim_2"),
+                   shape = 21, stroke=I(trajectory_graph_segment_size),
+                   color="white",
+                   fill="black",
+                   size=I(graph_label_size * 1.5),
+                   na.rm=TRUE, princ_point_df) +
+        ggrepel::geom_text_repel(aes_string(x="prin_graph_dim_1", y="prin_graph_dim_2",
+                             label="sample_name"),
+                  size=I(graph_label_size * 1.5), color="Black", na.rm=TRUE,
+                  princ_point_df)
+    }
     if (label_branch_points){
       mst_branch_nodes <- branch_nodes(cds, reduction_method)
       branch_point_df <- ica_space_df %>%
@@ -1468,6 +1505,12 @@ plot_genes_by_group <- function(cds,
                             msg = paste("group_cells_by must be a column in",
                                         "the colData table."))
   }
+  assertthat::assert_that("gene_short_name" %in% names(rowData(cds)), msg =
+                            paste("This function requires a gene_short_name",
+                                  "column in your rowData. If you do not have",
+                                  "gene symbols, you can use other gene ids",
+                                  "by running",
+                                  "rowData(cds)$gene_short_name <- row.names(rowData(cds))"))
 
   norm_method = match.arg(norm_method)
 
@@ -1477,7 +1520,7 @@ plot_genes_by_group <- function(cds,
     dplyr::pull(rowname)
   if(length(gene_ids) < 1)
     stop(paste('Please make sure markers are included in the gene_short_name",
-               "column of the fData!'))
+               "column of the rowData!'))
 
   if(flip_percentage_mean == FALSE){
     major_axis <- 1

diff --git a/R/preprocess_cds.R b/R/preprocess_cds.R
@@ -24,15 +24,6 @@
 #' @param use_genes NULL or a list of gene IDs. If a list of gene IDs, only
 #'   this subset of genes is used for dimensionality reduction. Default is
 #'   NULL.
-#' @param residual_model_formula_str NULL or a string model formula specifying
-#'   any effects to subtract from the data before dimensionality reduction.
-#'   Uses a linear model to subtract effects. For non-linear effects, use
-#'   alignment_group. Default is NULL.
-#' @param alignment_group String specifying a column of colData to use for
-#'  aligning groups of cells. The column specified must be a factor.
-#'  Alignment can be used to subtract batch effects in a non-linear way.
-#'  For correcting continuous effects, use residual_model_formula_str.
-#'  Default is NULL.
 #' @param pseudo_count NULL or the amount to increase expression values before
 #'   normalization and dimensionality reduction. If NULL (default), a
 #'   pseudo_count of 1 is added for log normalization and 0 is added for size
@@ -50,8 +41,6 @@ preprocess_cds <- function(cds, method = c('PCA', "LSI"),
                            num_dim=50,
                            norm_method = c("log", "size_only", "none"),
                            use_genes = NULL,
-                           residual_model_formula_str=NULL,
-                           alignment_group=NULL,
                            pseudo_count=NULL,
                            scaling = TRUE,
                            verbose=FALSE,
@@ -108,12 +97,13 @@ preprocess_cds <- function(cds, method = c('PCA', "LSI"),
 
     irlba_rotation <- irlba_res$rotation
     row.names(irlba_rotation) <- rownames(FM)
-    cds@preprocess_aux$gene_loadings <- irlba_rotation
+    cds@preprocess_aux$gene_loadings <- irlba_rotation %*% diag(irlba_res$sdev)
     cds@preprocess_aux$prop_var_expl <- irlba_res$sdev^2 / sum(irlba_res$sdev^2)
 
   } else if(method == "LSI") {
 
     preproc_res <- tfidf(FM)
+    num_col <- ncol(preproc_res)
     irlba_res <- irlba::irlba(Matrix::t(preproc_res),
                               nv = min(num_dim,min(dim(FM)) - 1))
 
@@ -122,7 +112,7 @@ preprocess_cds <- function(cds, method = c('PCA', "LSI"),
 
     irlba_rotation = irlba_res$v
     row.names(irlba_rotation) = rownames(FM)
-    cds@preprocess_aux$gene_loadings = irlba_rotation
+    cds@preprocess_aux$gene_loadings = irlba_rotation %*% diag( irlba_res$d/sqrt( max(1, num_col - 1) ) )
 
   }