Merge pull request #373 from cole-trapnell-lab/develop

Develop
cole-trapnell-lab · Jun 11, 2020 · 4137e48 · 4137e48
2 parents 1a02274 + 161c121
commit 4137e48
Show file tree

Hide file tree

Showing 53 changed files with 1,441 additions and 550 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,51 +1,64 @@
 cache: packages
 sudo: required
 warnings_are_errors: true
+os: linux
+dist: xenial
 language: r
 before_install:
   - sudo apt-get install -y libudunits2-dev
   - sudo apt-get install -y gdal-bin
   - sudo apt-get install -y libgdal1-dev
 r:
 - bioc-release
-r_packages:
-  - covr
+#r_packages:
+#  - covr
 r_github_packages:
 - VPetukhov/ggrastr
 - cole-trapnell-lab/leidenbase
-r_binary_packages:
-- assertthat
-- dplyr
-- devtools
-- ggplot2
-- ggrepel
-- igraph
-- irlba
-- lmtest
-- MASS
-- Matrix
-- pbapply
-- pbmcapply
-- pheatmap
-- plotly
-- plyr
-- proxy
-- pryr
-- pscl
-- purrr
-- RANN
-- Rcpp
-- reshape2
-- RhpcBLASctl
-- roxygen2
-- shiny
-- slam
-- spdep
-- speedglm
-- stringr
-- tibble
-- tidyr
-- viridis
+#r_binary_packages:
+r_packages:
+ - covr
+ - assertthat
+ - dplyr
+ - devtools
+ - ggplot2
+ - ggrepel
+ - igraph
+ - irlba
+ - lmtest
+ - MASS
+ - Matrix
+ - pbapply
+ - pbmcapply
+ - pheatmap
+ - plotly
+ - plyr
+ - proxy
+ - pryr
+ - pscl
+ - purrr
+ - RANN
+ - Rcpp
+ - reshape2
+ - RhpcBLASctl
+ - roxygen2
+ - shiny
+ - slam
+ - spdep
+ - speedglm
+ - stringr
+ - tibble
+ - tidyr
+ - viridis
+
+before_script:
+  - export PKG_NAME=$(Rscript -e 'cat(paste0(devtools::as.package(".")$package))')
+  - export PKG_TARBALL=$(Rscript -e 'pkg <- devtools::as.package("."); cat(paste0(pkg$package,"_",pkg$version,".tar.gz"))')
+  - R CMD build --no-build-vignettes .
+  - R CMD INSTALL ${PKG_TARBALL}
+  - rm ${PKG_TARBALL}
+  - echo "Session info:"
+  - Rscript -e "library(${PKG_NAME});devtools::session_info('${PKG_NAME}')"
 
 script:
   - |

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: monocle3
 Title: Clustering, differential expression, and trajectory analysis for single-
     cell RNA-Seq
-Version: 0.2.1
+Version: 0.2.2
 Authors@R: 
     person(given = "Hannah",
            family = "Pliner",
@@ -27,7 +27,7 @@ License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.0.2
+RoxygenNote: 7.1.0
 LinkingTo: 
     Rcpp
 Depends:
@@ -74,7 +74,7 @@ Imports:
     speedglm (>= 0.3-2),
     stringr (>= 1.4.0),
     SummarizedExperiment (>= 1.11.5),
-    uwot (>= 0.1.3),
+    uwot (>= 0.1.8),
     tibble (>= 2.1.1),
     tidyr (>= 0.8.3),
     viridis (>= 0.5.1)

diff --git a/NAMESPACE b/NAMESPACE
@@ -27,6 +27,7 @@ export(graph_test)
 export(learn_graph)
 export(load_a549)
 export(load_cellranger_data)
+export(load_mm_data)
 export(load_mtx_data)
 export(model_predictions)
 export(new_cell_data_set)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,24 @@
+# monocle3 0.2.2
+
+### Changes
+* Added load_mm_data() to load data from matrix market sparse file and gene and cell data files.
+* Added rann.k parameter to learn_graph().
+* Added speedglm.maxiter parameter to top_markers().
+
+### Bug fixes
+* Fixed combine_cds() issues.
+* Fixed learn_graph(use_partition=FALSE) issue.
+* Fixed batchelor::fastMNN(pc.input) deprecation issue.
+* Fixed choose_graph_segments() issue.
+* Fixed missing gaussian family in fit_models().
+* Fixed violin plot by adding a pseudocount.
+* Fixed fit_models() to run detect_genes() if needed.
+* Fixed compare_models() issues.
+* Fixed check for undefined values in fit_models() formula.
+* Fixed plot_cells() plotting order issue.
+* Fixed find_gene_modules() run-to-run variation issue.
+* Fixed rlist package namespace collision.
+* Fixed aggregate_gene_expression() to allow short gene names in gene_group_df.
 
 # monocle3 0.2.0
 

diff --git a/R/alignment.R b/R/alignment.R
@@ -81,15 +81,12 @@ align_cds <- function(cds,
                   "single-cell RNA-sequencing data are corrected by matching",
                   "mutual nearest neighbors.' Nat. Biotechnol., 36(5),",
                   "421-427. doi: 10.1038/nbt.4091"))
-    corrected_PCA = batchelor::fastMNN(as.matrix(preproc_res),
-                                       batch=colData(cds)[,alignment_group],
-                                       k=alignment_k,
-                                       cos.norm=FALSE,
-                                       pc.input = TRUE)
+    corrected_PCA = batchelor::reducedMNN(as.matrix(preproc_res),
+                                          batch=colData(cds)[,alignment_group],
+                                          k=alignment_k)
     preproc_res = corrected_PCA$corrected
     cds <- add_citation(cds, "MNN_correct")
   }
-
   reducedDims(cds)[["Aligned"]] <- as.matrix(preproc_res)
 
   cds

diff --git a/R/cluster_cells.R b/R/cluster_cells.R
@@ -93,7 +93,7 @@ cluster_cells <- function(cds,
   assertthat::assert_that(!is.null(reducedDims(cds)[[reduction_method]]),
                           msg = paste("No dimensionality reduction for",
                                       reduction_method, "calculated.",
-                                      "Please run reduce_dimensions with",
+                                      "Please run reduce_dimension with",
                                       "reduction_method =", reduction_method,
                                       "before running cluster_cells"))
 
@@ -119,11 +119,11 @@ cluster_cells <- function(cds,
                                               partition_qval, verbose)
       partitions <- igraph::components(
         cluster_graph_res$cluster_g)$membership[cluster_result$optim_res$membership]
-      names(partitions) <- row.names(reduced_dim_res)
       partitions <- as.factor(partitions)
     } else {
       partitions <- rep(1, nrow(colData(cds)))
     }
+    names(partitions) <- row.names(reduced_dim_res)
     clusters <- factor(igraph::membership(cluster_result$optim_res))
     cds@clusters[[reduction_method]] <- list(cluster_result = cluster_result,
                                              partitions = partitions,
@@ -144,11 +144,11 @@ cluster_cells <- function(cds,
                                               partition_qval, verbose)
       partitions <- igraph::components(
         cluster_graph_res$cluster_g)$membership[cluster_result$optim_res$membership]
-      names(partitions) <- row.names(reduced_dim_res)
       partitions <- as.factor(partitions)
     } else {
       partitions <- rep(1, nrow(colData(cds)))
     }
+    names(partitions) <- row.names(reduced_dim_res)
     clusters <- factor(igraph::membership(cluster_result$optim_res))
     cds@clusters[[reduction_method]] <- list(cluster_result = cluster_result,
                                              partitions = partitions,

diff --git a/R/cluster_genes.R b/R/cluster_genes.R
@@ -10,15 +10,15 @@
 #' @param umap.fast_sgd Whether to allow UMAP to perform fast stochastic gradient descent. Defaults to TRUE. Setting FALSE will result in slower, but deterministic behavior (if cores=1).
 #' @param umap.nn_method The method used for nearest neighbor network construction during UMAP.
 #' @param k number of kNN used in creating the k nearest neighbor graph for Louvain clustering. The number of kNN is related to the resolution of the clustering result, bigger number of kNN gives low resolution and vice versa. Default to be 20
-#' @param louvain_iter Integer number of iterations used for Louvain clustering. The clustering result gives the largest modularity score will be used as the final clustering result.  Default to be 1. Note that if louvain_iter is large than 1, the `seed` argument will be ignored.
+#' @param leiden_iter Integer number of iterations used for Leiden clustering. The clustering result with the largest modularity score is used as the final clustering result. Defaults to 1.
 #' @param partition_qval Significance threshold used in Louvain community graph partitioning.
 #' @param weight A logical argument to determine whether or not we will use
 #'   Jaccard coefficient for two nearest neighbors (based on the overlapping of
 #'   their kNN) as the weight used for Louvain clustering. Default to be FALSE.
 #' @param resolution Resolution parameter passed to Louvain. Can be a list. If
 #'   so, this method will evaluate modularity at each resolution and use the
 #'   one with the highest value.
-#' @param random_seed  the seed used by the random number generator in louvain-igraph package. This argument will be ignored if louvain_iter is larger than 1.
+#' @param random_seed  the seed used by the random number generator in Leiden.
 #' @param cores number of cores computer should use to execute function
 #' @param verbose Whether or not verbose output is printed.
 #' @param ... Additional arguments passed to UMAP and Louvain analysis.
@@ -35,15 +35,15 @@ find_gene_modules <- function(cds,
                           umap.fast_sgd = FALSE,
                           umap.nn_method = "annoy",
                           k = 20,
-                          louvain_iter = 1,
+                          leiden_iter = 1,
                           partition_qval = 0.05,
                           weight = FALSE,
                           resolution = NULL,
                           random_seed = 0L,
                           cores=1,
                           verbose = F,
                           ...) {
-  method = 'louvain'
+  method = 'leiden'
   assertthat::assert_that(
     tryCatch(expr = ifelse(match.arg(reduction_method) == "",TRUE, TRUE),
              error = function(e) FALSE),
@@ -55,23 +55,27 @@ find_gene_modules <- function(cds,
   assertthat::assert_that(is.character(reduction_method))
   assertthat::assert_that(assertthat::is.count(k))
   assertthat::assert_that(is.logical(weight))
-  assertthat::assert_that(assertthat::is.count(louvain_iter))
+  assertthat::assert_that(assertthat::is.count(leiden_iter))
   ## TO DO what is resolution?
   assertthat::assert_that(is.numeric(partition_qval))
   assertthat::assert_that(is.logical(verbose))
   assertthat::assert_that(!is.null(reducedDims(cds)[[reduction_method]]),
                           msg = paste("No dimensionality reduction for",
                                       reduction_method, "calculated.",
-                                      "Please run reduce_dimensions with",
+                                      "Please run reduce_dimension with",
                                       "reduction_method =", reduction_method,
                                       "before running cluster_cells"))
 
   preprocess_mat <- cds@preprocess_aux$gene_loadings
   if (is.null(cds@preprocess_aux$beta) == FALSE){
-    preprocess_mat = preprocess_mat %*% cds@preprocess_aux$beta
+    preprocess_mat = preprocess_mat %*% (-cds@preprocess_aux$beta)
   }
   preprocess_mat = preprocess_mat[intersect(rownames(cds), row.names(preprocess_mat)),]
 
+  # uwot::umap uses a random number generator
+  if( random_seed != 0L )
+    set.seed( random_seed )
+
   umap_res = uwot::umap(as.matrix(preprocess_mat),
                         n_components = max_components,
                         metric = umap.metric,
@@ -88,14 +92,14 @@ find_gene_modules <- function(cds,
   reduced_dim_res <- umap_res
 
   if(verbose)
-    message("Running louvain clustering algorithm ...")
+    message("Running leiden clustering algorithm ...")
 
   cluster_result <- leiden_clustering(data = reduced_dim_res,
                                     pd = rowData(cds)[
                                       row.names(reduced_dim_res),,drop=FALSE],
                                     k = k,
                                     weight = weight,
-                                    louvain_iter = louvain_iter,
+                                    num_iter = leiden_iter,
                                     resolution_parameter = resolution,
                                     random_seed = random_seed,
                                     verbose = verbose, ...)
@@ -145,7 +149,8 @@ my.aggregate.Matrix = function (x, groupings = NULL, form = NULL, fun = "sum", .
 #'
 #' @param cds The cell_data_set on which this function operates
 #' @param gene_group_df A dataframe in which the first column contains gene ids
-#'   and the second contains groups. If NULL, genes are not grouped.
+#'   or short gene names and the second contains groups. If NULL, genes are not
+#'   grouped.
 #' @param cell_group_df A dataframe in which the first column contains cell ids
 #'   and the second contains groups. If NULL, cells are not grouped.
 #' @param norm_method How to transform gene expression values before
@@ -185,14 +190,26 @@ aggregate_gene_expression <- function(cds,
                                      fData(cds)$gene_short_name |
                                      gene_group_df[,1] %in%
                                      row.names(fData(cds)),,drop=FALSE]
+
+    # Convert gene short names to rownames if necessary. The more
+    # straightforward single call to recode took much longer.
+    # Thanks to Christopher Johnstone who posted this on github.
+    short_name_mask <- gene_group_df[[1]] %in% fData(cds)$gene_short_name
+    if (any(short_name_mask)) {
+      geneids <- as.character(gene_group_df[[1]])
+      geneids[short_name_mask] <- row.names(fData(cds))[match(
+                  geneids[short_name_mask], fData(cds)$gene_short_name)]
+      gene_group_df[[1]] <- geneids
+    }
+
     # gene_group_df = gene_group_df[row.names(fData(cds)),]
 
     # FIXME: this should allow genes to be part of multiple groups. group_by
     # over the second column with a call to colSum should do it.
     agg_mat = as.matrix(my.aggregate.Matrix(agg_mat[gene_group_df[,1],],
                                             as.factor(gene_group_df[,2]),
                                             fun="sum"))
-    if (scale_agg_values){
+	if (scale_agg_values){
       agg_mat <- t(scale(t(agg_mat)))
       agg_mat[agg_mat < min_agg_value] <- min_agg_value
       agg_mat[agg_mat > max_agg_value] <- max_agg_value
@@ -206,13 +223,13 @@ aggregate_gene_expression <- function(cds,
                                   drop=FALSE]
     agg_mat = agg_mat[,cell_group_df[,1]]
     agg_mat = my.aggregate.Matrix(Matrix::t(agg_mat),
-                                             as.factor(cell_group_df[,2]),
+                                  as.factor(cell_group_df[,2]),
                                   fun="mean")
     agg_mat = Matrix::t(agg_mat)
   }
 
   if (exclude.na){
-    agg_mat = agg_mat[row.names(agg_mat) != "NA", colnames(agg_mat) != "NA"]
+    agg_mat <- agg_mat[rownames(agg_mat) != "NA", colnames(agg_mat) != "NA",drop=FALSE]
   }
   return(agg_mat)
 }