Skip to content

Commit

Permalink
Merge pull request #496 from cole-trapnell-lab/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
brgew authored Apr 1, 2021
2 parents 4c01d89 + 0ef9570 commit 004c096
Show file tree
Hide file tree
Showing 28 changed files with 619 additions and 352 deletions.
3 changes: 3 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ before_install:
- sudo apt-get install -y libudunits2-dev
- sudo apt-get install -y gdal-bin
- sudo apt-get install -y libgdal1-dev
- sudo apt-get install -y libharfbuzz-dev
- sudo apt-get install -y libfribidi-dev
- sudo apt-get install -y libgit2-dev
r:
- bioc-release
#r_packages:
Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: monocle3
Title: Clustering, differential expression, and trajectory analysis for single-
cell RNA-Seq
Version: 0.2.3.0
Version: 1.0.0
Authors@R:
person(given = "Hannah",
family = "Pliner",
Expand Down
13 changes: 13 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# monocle3 1.0.0

### Changes

* Add label_principal_points option to plot_cells to assist with identifying root_pr_nodes in order_cells.

### Bug fixes

* Documentation and error reports improvements.
* Fix fit_models tests for NA/Inf/NaNs.
* Refine gene loadings calculations in find_gene_modules.
* Test for convergence failure in top_markers,

# monocle3 0.2.3

### Changes
Expand Down
11 changes: 8 additions & 3 deletions R/cluster_genes.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,14 @@ find_gene_modules <- function(cds,
"before running cluster_cells"))

preprocess_mat <- cds@preprocess_aux$gene_loadings
if (is.null(cds@preprocess_aux$beta) == FALSE){
preprocess_mat <- sweep( preprocess_mat, 2, cds@preprocess_aux$beta[,1], '*')
}
# Notes:
# o cds@preprocess_aux$beta is npc x nfactor, which causes
# preprocess_mat to have nfactor columns, often one column
# o I do not know how to adjust gene_loadings for batch effects
# so this is disabled for now
# if (is.null(cds@preprocess_aux$beta) == FALSE){
# preprocess_mat = preprocess_mat %*% (-cds@preprocess_aux$beta)
# }
preprocess_mat <- preprocess_mat[intersect(rownames(cds), row.names(preprocess_mat)),]

# uwot::umap uses a random number generator
Expand Down
40 changes: 26 additions & 14 deletions R/expr_models.R
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,15 @@ fit_model_helper <- function(x,
#' @param verbose Logical indicating whether to emit progress messages.
#' @param ... Additional arguments passed to model fitting functions.
#'
#' @return a tibble containing model objects
#' @return a tibble where the rows are genes and columns are
#' * id character vector from `rowData(cds)$id`
#' * gene_short_names character vector from `rowData(cds)$gene_short_names`
#' * num_cells_expressed int vector from `rowData(cds)$num_cells_expressed`
#' * gene_id character vector from row.names(rowData(cds))`
#' * model GLM model list returned by speedglm
#' * model_summary model summary list returned by `summary(model)`
#' * status character vector of model fitting status: OK when model converged, otherwise FAIL
#'
#' @export
fit_models <- function(cds,
model_formula_str,
Expand Down Expand Up @@ -238,24 +246,28 @@ fit_models <- function(cds,
err_msg <- paste0(err_msg,' \'', mf_term, '\': not in cds\n')
next
}
mf_length <- length(coldata_df[[mf_term]])
mf_num_inf <- sum(is.infinite(coldata_df[[mf_term]]))
mf_num_nan <- sum(is.nan(coldata_df[[mf_term]]))
mf_num_na <- sum(is.na(coldata_df[[mf_term]]))
if( mf_num_inf > 0 )
err_msg <- paste0(err_msg, ' \'', mf_term, '\': ' , mf_num_inf, ' of ', mf_length, ' values are Inf\n')
if( mf_num_nan > 0 )
err_msg <- paste0(err_msg, ' \'', mf_term, '\': ' , mf_num_nan, ' of ', mf_length, ' values are NaN\n')
if( mf_num_na - mf_num_nan > 0 )
err_msg <- paste0(err_msg, ' \'', mf_term, '\': ' , mf_num_na - mf_num_nan, ' of ', mf_length, ' values are NA\n')
}
if(length(err_msg) > 0)
stop( '\n-- bad fit_models terms --\n', err_msg )
rm( err_msg, mf_terms, mf_term, mf_length, mf_num_inf, mf_num_nan, mf_num_na )
tryCatch({
stats::model.frame(model_form, data=coldata_df)
}, error = function(e) {
stop ("Error in model formula")
}, error = function( cnd ) {
info_msg <- ''
for( mf_term in mf_terms )
{
mf_length <- length(coldata_df[[mf_term]])
mf_num_inf <- sum(is.infinite(coldata_df[[mf_term]]))
mf_num_nan <- sum(is.nan(coldata_df[[mf_term]]))
mf_num_na <- sum(is.na(coldata_df[[mf_term]]))
if( mf_num_inf > 0 )
info_msg <- paste0(info_msg, ' \'', mf_term, '\': ' , mf_num_inf, ' of ', mf_length, ' values are Inf\n')
if( mf_num_nan > 0 )
info_msg <- paste0(info_msg, ' \'', mf_term, '\': ' , mf_num_nan, ' of ', mf_length, ' values are NaN\n')
if( mf_num_na - mf_num_nan > 0 )
info_msg <- paste0(info_msg, ' \'', mf_term, '\': ' , mf_num_na - mf_num_nan, ' of ', mf_length, ' values are NA\n')
}
rm( mf_term, mf_length, mf_num_inf, mf_num_nan, mf_num_na )
stop (paste0( 'Error in model formula: ', conditionMessage( cnd ), '\n', info_msg ) )
})

disp_func <- NULL
Expand Down
19 changes: 19 additions & 0 deletions R/find_markers.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,18 @@
#' @param cores Number of cores to use.
#' @param verbose Whether to print verbose progress output.
#'
#' @return a data.frame where the rows are genes and the columns are
#' * gene_id vector of gene names
#' * gene_short_name vector of gene short names
#' * cell_group character vector of the cell group to which the cell belongs
#' * marker_score numeric vector of marker scores as the fraction expressing scaled by the specificity. The value ranges from 0 to 1.
#' * mean_expression numeric vector of mean normalized expression of the gene in the cell group
#' * fraction_expressing numeric vector of fraction of cells expressing the gene within the cell group
#' * specificity numeric vector of a measure of how specific the gene's expression is to the cell group based on the Jensen-Shannon divergence. The value ranges from 0 to 1.
#' * pseudo_R2 numeric vector of pseudo R-squared values, a measure of how well the gene expression model fits the categorical data relative to the null model. The value ranges from 0 to 1.
#' * marker_test_p_value numeric vector of likelihood ratio p-values
#' * marker_test_q_value numeric vector of likelihood ratio q-values
#'
#' @export
top_markers <- function(cds,
group_cells_by="cluster",
Expand Down Expand Up @@ -160,6 +172,13 @@ top_markers <- function(cds,
RhpcBLASctl::blas_set_num_threads(old_blas_num_threads)
})

#
# Check for possible convergence failure or other problems. Issue: #383
#
if('warning' %in% marker_test_res) {
warning('test_marker_for_cell_group() caught warning: possible convergence failure.')
}

marker_test_res = t(marker_test_res)
marker_test_res = as.matrix(marker_test_res)
colnames(marker_test_res) = c("pseudo_R2", "lrtest_p_value")
Expand Down
26 changes: 26 additions & 0 deletions R/load_cellranger_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,32 @@ get_genome_in_matrix_path <- function(matrix_path, genome=NULL) {
#' is from version 3.0 and contains non-Gene-Expression data (e.g. Antibodies
#' or CRISPR features), only the Gene Expression data is returned.
#'
#' @details
#' * the \emph{pipestance_path} argument takes the name of a Cell Ranger
#' output directory, in which it looks for the required data files,
#' for example, \emph{pipestance_path=10x_data}
#' * for Cell Ranger version 2 data, \emph{load_cellranger_data} expects to
#' find the required files \emph{barcodes.tsv}, \emph{genes.tsv}, and
#' \emph{matrix.mtx}
#' in the directories as
#' - \emph{10x_data/outs/filtered_gene_bc_matrices/<genome>/barcodes.tsv}
#' - \emph{10x_data/outs/filtered_gene_bc_matrices/<genome>/genes.tsv}
#' - \emph{10x_data/outs/filtered_gene_bc_matrices/<genome>/matrix.mtx}
#'
#' where <genome> is the name of a genome. \emph{load_cellranger_data}
#' expects to find either a single \emph{genome} directory in
#' \emph{10x_data/outs/filtered_gene_bc_matrices} or a \emph{genome}
#' directory with the name given with the \emph{genome} argument.
#' * for Cell Ranger version 3 data, \emph{load_cellranger_data} expects to
#' find the required files \emph{barcodes.tsv.gz}, \emph{features.tsv.gz},
#' and \emph{matrix.mtx.gz} in the directories as
#' - \emph{10x_data/outs/filtered_feature_bc_matrix/barcodes.tsv.gz}
#' - \emph{10x_data/outs/filtered_feature_bc_matrix/features.tsv.gz}
#' - \emph{10x_data/outs/filtered_feature_bc_matrix/matrix.mtx.gz}
#'
#' * if any of the files is not in the expected directory,
#' \emph{load_cellranger_data} will terminate with an error
#'
#' @param pipestance_path Path to the output directory produced by Cell Ranger
#' @param genome The desired genome (e.g., 'hg19' or 'mm10')
#' @param barcode_filtered Load only the cell-containing barcodes
Expand Down
15 changes: 9 additions & 6 deletions R/order_cells.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
#' @param reduction_method a string specifying the reduced dimension method to
#' use when ordering cells. Currently only "UMAP" is supported.
#' @param root_pr_nodes NULL or a vector of starting principal points. If
#' provided, pseudotime will start (i.e. be zero) at these graph nodes. Both
#' \code{root_pr_nodes} and \code{root_cells} cannot be provided.
#' provided, pseudotime will start (i.e. be zero) at these graph nodes. You
#' can find the principal point names by running plot_cells with
#' label_principal_points = TRUE. Both \code{root_pr_nodes} and
#' \code{root_cells} cannot be provided.
#' @param root_cells NULL or a vector of starting cells. If provided,
#' pseudotime will start (i.e. be zero) at these cells. Both
#' \code{root_pr_nodes} and \code{root_cells} cannot be provided.
Expand Down Expand Up @@ -71,16 +73,17 @@ order_cells <- function(cds,
msg = paste("All provided root_cells must be",
"present in the cell data set."))
}

if(is.null(root_cells) & is.null(root_pr_nodes)) {
assertthat::assert_that(interactive(),
msg = paste("When not in interactive mode, either",
"root_pr_nodes or root_cells must be",
"provided."))
"root_pr_nodes or root_cells",
"must be provided."))
}
assertthat::assert_that(!all(c(!is.null(root_cells),
!is.null(root_pr_nodes))),
msg = paste("Please specify either root_pr_nodes",
"or root_cells, not both."))
msg = paste("Please specify either root_pr_nodes",
"or root_cells, not both."))

if (is.null(root_pr_nodes) & is.null(root_cells)){
if (interactive()){
Expand Down
49 changes: 46 additions & 3 deletions R/plotting.R
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,9 @@ plot_cells_3d <- function(cds,
#' ggrastr package.
#' @param scale_to_range Logical indicating whether to scale expression to
#' percent of maximum expression.
#' @param label_principal_points Logical indicating whether to label roots,
#' leaves, and branch points with principal point names. This is useful for
#' order_cells and choose_graph_segments in non-interactive mode.
#'
#' @return a ggplot2 plot object
#' @export
Expand Down Expand Up @@ -404,7 +407,8 @@ plot_cells <- function(cds,
alpha = 1,
min_expr=0.1,
rasterize=FALSE,
scale_to_range=FALSE) {
scale_to_range=FALSE,
label_principal_points = FALSE) {
reduction_method <- match.arg(reduction_method)
assertthat::assert_that(methods::is(cds, "cell_data_set"))
assertthat::assert_that(!is.null(reducedDims(cds)[[reduction_method]]),
Expand Down Expand Up @@ -451,6 +455,19 @@ plot_cells <- function(cds,
message("No trajectory to plot. Has learn_graph() been called yet?")
show_trajectory_graph = FALSE
}
if (label_principal_points &&
is.null(principal_graph(cds)[[reduction_method]])) {
message("Cannot label principal points when no trajectory to plot. Has learn_graph() been called yet?")
label_principal_points = FALSE
}

if (label_principal_points) {
label_branch_points <- FALSE
label_leaves <- FALSE
label_roots <- FALSE
}



gene_short_name <- NA
sample_name <- NA
Expand Down Expand Up @@ -624,7 +641,7 @@ plot_cells <- function(cds,
text_df = data_df %>%
dplyr::group_by(cell_group) %>%
dplyr::mutate(cells_in_cluster= dplyr::n()) %>%
dplyr::group_by(cell_color, add=TRUE) %>%
dplyr::group_by(cell_color, .add=TRUE) %>%
dplyr::mutate(per=dplyr::n()/cells_in_cluster)
median_coord_df = text_df %>%
dplyr::summarize(fraction_of_group = dplyr::n(),
Expand Down Expand Up @@ -749,6 +766,26 @@ plot_cells <- function(cds,
data=edge_df)


if (label_principal_points) {
mst_branch_nodes <- branch_nodes(cds, reduction_method)
mst_leaf_nodes <- leaf_nodes(cds, reduction_method)
mst_root_nodes <- root_nodes(cds, reduction_method)
pps <- c(mst_branch_nodes, mst_leaf_nodes, mst_root_nodes)
princ_point_df <- ica_space_df %>%
dplyr::slice(match(names(pps), sample_name))

g <- g +
geom_point(aes_string(x="prin_graph_dim_1", y="prin_graph_dim_2"),
shape = 21, stroke=I(trajectory_graph_segment_size),
color="white",
fill="black",
size=I(graph_label_size * 1.5),
na.rm=TRUE, princ_point_df) +
ggrepel::geom_text_repel(aes_string(x="prin_graph_dim_1", y="prin_graph_dim_2",
label="sample_name"),
size=I(graph_label_size * 1.5), color="Black", na.rm=TRUE,
princ_point_df)
}
if (label_branch_points){
mst_branch_nodes <- branch_nodes(cds, reduction_method)
branch_point_df <- ica_space_df %>%
Expand Down Expand Up @@ -1468,6 +1505,12 @@ plot_genes_by_group <- function(cds,
msg = paste("group_cells_by must be a column in",
"the colData table."))
}
assertthat::assert_that("gene_short_name" %in% names(rowData(cds)), msg =
paste("This function requires a gene_short_name",
"column in your rowData. If you do not have",
"gene symbols, you can use other gene ids",
"by running",
"rowData(cds)$gene_short_name <- row.names(rowData(cds))"))

norm_method = match.arg(norm_method)

Expand All @@ -1477,7 +1520,7 @@ plot_genes_by_group <- function(cds,
dplyr::pull(rowname)
if(length(gene_ids) < 1)
stop(paste('Please make sure markers are included in the gene_short_name",
"column of the fData!'))
"column of the rowData!'))

if(flip_percentage_mean == FALSE){
major_axis <- 1
Expand Down
16 changes: 3 additions & 13 deletions R/preprocess_cds.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,6 @@
#' @param use_genes NULL or a list of gene IDs. If a list of gene IDs, only
#' this subset of genes is used for dimensionality reduction. Default is
#' NULL.
#' @param residual_model_formula_str NULL or a string model formula specifying
#' any effects to subtract from the data before dimensionality reduction.
#' Uses a linear model to subtract effects. For non-linear effects, use
#' alignment_group. Default is NULL.
#' @param alignment_group String specifying a column of colData to use for
#' aligning groups of cells. The column specified must be a factor.
#' Alignment can be used to subtract batch effects in a non-linear way.
#' For correcting continuous effects, use residual_model_formula_str.
#' Default is NULL.
#' @param pseudo_count NULL or the amount to increase expression values before
#' normalization and dimensionality reduction. If NULL (default), a
#' pseudo_count of 1 is added for log normalization and 0 is added for size
Expand All @@ -50,8 +41,6 @@ preprocess_cds <- function(cds, method = c('PCA', "LSI"),
num_dim=50,
norm_method = c("log", "size_only", "none"),
use_genes = NULL,
residual_model_formula_str=NULL,
alignment_group=NULL,
pseudo_count=NULL,
scaling = TRUE,
verbose=FALSE,
Expand Down Expand Up @@ -108,12 +97,13 @@ preprocess_cds <- function(cds, method = c('PCA', "LSI"),

irlba_rotation <- irlba_res$rotation
row.names(irlba_rotation) <- rownames(FM)
cds@preprocess_aux$gene_loadings <- irlba_rotation
cds@preprocess_aux$gene_loadings <- irlba_rotation %*% diag(irlba_res$sdev)
cds@preprocess_aux$prop_var_expl <- irlba_res$sdev^2 / sum(irlba_res$sdev^2)

} else if(method == "LSI") {

preproc_res <- tfidf(FM)
num_col <- ncol(preproc_res)
irlba_res <- irlba::irlba(Matrix::t(preproc_res),
nv = min(num_dim,min(dim(FM)) - 1))

Expand All @@ -122,7 +112,7 @@ preprocess_cds <- function(cds, method = c('PCA', "LSI"),

irlba_rotation = irlba_res$v
row.names(irlba_rotation) = rownames(FM)
cds@preprocess_aux$gene_loadings = irlba_rotation
cds@preprocess_aux$gene_loadings = irlba_rotation %*% diag( irlba_res$d/sqrt( max(1, num_col - 1) ) )

}

Expand Down
Loading

0 comments on commit 004c096

Please sign in to comment.