Skip to content

Commit

Permalink
Allow db = 'none' if no annotationdbi available
Browse files Browse the repository at this point in the history
  • Loading branch information
hpliner committed Feb 20, 2019
1 parent d04a89b commit 3cd521b
Show file tree
Hide file tree
Showing 12 changed files with 174 additions and 57 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: garnett
Type: Package
Title: Automated cell type classification
Version: 0.1.3
Version: 0.1.4
Author: c(
person("Hannah", "Pliner", email = "[email protected]", role = c("aut", "cre")),
person("Cole", "Trapnell", email = "[email protected]", role = c("aut")))
Expand Down
30 changes: 21 additions & 9 deletions R/classify_cells.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,12 @@
#' @param db Bioconductor AnnotationDb-class package for converting gene IDs.
#' For example, for humans use org.Hs.eg.db. See available packages at
#' \href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.
#' If your organism does not have an AnnotationDb-class database available,
#' you can specify "none". Garnett will then not check or convert gene IDs,
#' so your CDS and marker file must use the same gene ID type.
#' @param cds_gene_id_type The type of gene ID used in the CDS. Should be one
#' of the values in \code{columns(db)}. Default is "ENSEMBL".
#' of the values in \code{columns(db)}. Default is "ENSEMBL". Ignored if
#' db = "none".
#' @param rank_prob_ratio Numeric value greater than 1. This is the minimum
#' odds ratio between the probability of the most likely cell type to the
#' second most likely cell type to allow assignment. Default is 1.5. Higher
Expand Down Expand Up @@ -72,14 +76,22 @@ classify_cells <- function(cds,
msg = paste("Must run estimateSizeFactors() on cds",
"before calling classify_cells"))
assertthat::assert_that(is(classifier, "garnett_classifier"))
assertthat::assert_that(is(db, "OrgDb"),
msg = paste0("db must be an 'AnnotationDb' object ",
"see http://bioconductor.org/packages/",
"3.8/data/annotation/ for available"))
assertthat::assert_that(is.character(cds_gene_id_type))
assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
msg = paste("cds_gene_id_type must be one of",
"keytypes(db)"))
if(is(db, "character") && db == "none") {
cds_gene_id_type <- 'custom'
classifier_gene_id_type <- 'custom'
marker_file_gene_id_type <- 'custom'
} else {
assertthat::assert_that(is(db, "OrgDb"),
msg = paste0("db must be an 'AnnotationDb' object ",
"or 'none' see ",
"http://bioconductor.org/packages/",
"3.8/data/annotation/ for available"))
assertthat::assert_that(is.character(cds_gene_id_type))
assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
msg = paste("cds_gene_id_type must be one of",
"keytypes(db)"))
}

assertthat::assert_that(is.numeric(rank_prob_ratio))
assertthat::assert_that(rank_prob_ratio > 1,
msg = "rank_prob_ratio must be greater than 1")
Expand Down
6 changes: 5 additions & 1 deletion R/parser.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,11 @@ Lexer <- R6::R6Class(
#' For example, for human, use \code{\link[org.Hs.eg.db]{org.Hs.eg.db}}. To
#' see available gene ID types, you can run \code{columns(db)}. You will
#' specify which gene ID type you used when calling
#' \code{\link{train_cell_classifier}}.} \item{not expressed:}{In addition to
#' \code{\link{train_cell_classifier}}. If your species does not have an
#' annotation dataset of type \code{\link[AnnotationDbi]{AnnotationDb-class}},
#' you can set \code{db = 'none'}; Garnett will then not convert gene ID
#' types, so the CDS and marker file gene ID types need to be the same.}
#' \item{not expressed:}{In addition to
#' specifying genes that the cell type should express, you can also specify
#' genes that your cell type should not express. Details on specifying genes
#' are the same as for \code{expressed:}.}
Expand Down
47 changes: 30 additions & 17 deletions R/train_cell_classifier.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,15 @@
#' @param db Bioconductor AnnotationDb-class package for converting gene IDs.
#' For example, for humans use org.Hs.eg.db. See available packages at
#' \href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.
#' If your organism does not have an AnnotationDb-class database available,
#' you can specify "none". Garnett will then not check or convert gene IDs,
#' so your CDS and marker file must use the same gene ID type.
#' @param cds_gene_id_type The type of gene ID used in the CDS. Should be one
#' of the values in \code{columns(db)}. Default is "ENSEMBL".
#' of the values in \code{columns(db)}. Default is "ENSEMBL". Ignored if
#' db = "none".
#' @param marker_file_gene_id_type The type of gene ID used in the marker file.
#' Should be one of the values in \code{columns(db)}. Default is "SYMBOL".
#' Ignored if db = "none".
#' @param min_observations An integer. The minimum number of representative
#' cells per cell type required to include the cell type in the predictive
#' model. Default is 8.
Expand All @@ -36,7 +41,7 @@
#' preset lambda values are used.
#' @param classifier_gene_id_type The type of gene ID that will be used in the
#' classifier. If possible for your organism, this should be "ENSEMBL", which
#' is the default.
#' is the default. Ignored if db = "none".
#'
#' @details This function has three major parts: 1) parsing the marker file 2)
#' choosing cell representatives and 3) training the classifier. Details on
Expand Down Expand Up @@ -108,22 +113,29 @@ train_cell_classifier <- function(cds,
"before calling train_cell_classifier"))
assertthat::assert_that(is.character(marker_file))
assertthat::is.readable(marker_file)
assertthat::assert_that(is(db, "OrgDb"),
msg = paste0("db must be an 'AnnotationDb' object ",
"see http://bioconductor.org/packages/",
"3.8/data/annotation/ for available"))
assertthat::assert_that(is.character(cds_gene_id_type))
assertthat::assert_that(is.character(marker_file_gene_id_type))
assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
msg = paste("cds_gene_id_type must be one of",
"keytypes(db)"))
assertthat::assert_that(classifier_gene_id_type %in% AnnotationDbi::keytypes(db),
msg = paste("classifier_gene_id_type must be one of",
"keytypes(db)"))
assertthat::assert_that(marker_file_gene_id_type %in%
if (is(db, "character") && db == "none") {
cds_gene_id_type <- 'custom'
classifier_gene_id_type <- 'custom'
marker_file_gene_id_type <- 'custom'
} else {
assertthat::assert_that(is(db, "OrgDb"),
msg = paste0("db must be an 'AnnotationDb' object ",
"or 'none' see ",
"http://bioconductor.org/packages/",
"3.8/data/annotation/ for available"))
assertthat::assert_that(is.character(cds_gene_id_type))
assertthat::assert_that(is.character(marker_file_gene_id_type))
assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
msg = paste("cds_gene_id_type must be one of",
"keytypes(db)"))
assertthat::assert_that(classifier_gene_id_type %in% AnnotationDbi::keytypes(db),
msg = paste("classifier_gene_id_type must be one of",
"keytypes(db)"))
assertthat::assert_that(marker_file_gene_id_type %in%
AnnotationDbi::keytypes(db),
msg = paste("marker_file_gene_id_type must be one of",
"keytypes(db)"))
msg = paste("marker_file_gene_id_type must be one of",
"keytypes(db)"))
}
assertthat::is.count(num_unknown)
assertthat::is.count(cores)
assertthat::assert_that(is.logical(propogate_markers))
Expand Down Expand Up @@ -205,6 +217,7 @@ train_cell_classifier <- function(cds,
##### Make garnett_classifier #####
classifier <- new_garnett_classifier()
classifier@gene_id_type <- classifier_gene_id_type
if(is(db, "character") && db == "none") classifier@gene_id_type <- "custom"

for(i in name_order) {
# check meta data exists
Expand Down
49 changes: 33 additions & 16 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,15 @@ convert_gene_ids <- function(gene_list,
#'
get_feature_genes <- function(classifier,
node = "root",
convert_ids = TRUE,
convert_ids = FALSE,
db=NULL) {
assertthat::assert_that(is(classifier, "garnett_classifier"))
assertthat::assert_that(is.character(node))
assertthat::assert_that(is.logical(convert_ids))
if (convert_ids) {
if (is.null(db)) stop("If convert_ids = TRUE, db must be provided.")
if (is(db, "character") && db == "none")
stop("Cannot convert IDs if db = 'none'.")
assertthat::assert_that(is(db, "OrgDb"),
msg = paste0("db must be an 'AnnotationDb' object ",
"see http://bioconductor.org/",
Expand Down Expand Up @@ -160,10 +162,15 @@ get_classifier_references <- function(classifier,
#' @param db Bioconductor AnnotationDb-class package for converting gene IDs.
#' For example, for humans use org.Hs.eg.db. See available packages at
#' \href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.
#' If your organism does not have an AnnotationDb-class database available,
#' you can specify "none". Garnett will then not check or convert gene IDs,
#' so your CDS and marker file must use the same gene ID type.
#' @param cds_gene_id_type The type of gene ID used in the CDS. Should be one
#' of the values in \code{columns(db)}. Default is "ENSEMBL".
#' of the values in \code{columns(db)}. Default is "ENSEMBL". Ignored if
#' db = "none".
#' @param marker_file_gene_id_type The type of gene ID used in the marker file.
#' Should be one of the values in \code{columns(db)}. Default is "SYMBOL".
#' Ignored if db = "none".
#' @param propogate_markers Logical. Should markers from child nodes of a cell
#' type be used in finding representatives of the parent type? Should
#' generally be \code{TRUE}.
Expand All @@ -172,7 +179,7 @@ get_classifier_references <- function(classifier,
#' calculation is slower with very large datasets.
#' @param classifier_gene_id_type The type of gene ID that will be used in the
#' classifier. If possible for your organism, this should be "ENSEMBL", which
#' is the default.
#' is the default. Ignored if db = "none".
#'
#' @return Data.frame of marker check results.
#'
Expand Down Expand Up @@ -245,19 +252,29 @@ check_markers <- function(cds,
"before calling check_markers"))
assertthat::assert_that(is.character(marker_file))
assertthat::is.readable(marker_file)
assertthat::assert_that(is(db, "OrgDb"),
msg = paste0("db must be an 'AnnotationDb' object ",
"see http://bioconductor.org/packages/",
"3.8/data/annotation/ for available"))
assertthat::assert_that(is.character(cds_gene_id_type))
assertthat::assert_that(is.character(marker_file_gene_id_type))
assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
msg = paste("cds_gene_id_type must be one of",
"keytypes(db)"))
assertthat::assert_that(marker_file_gene_id_type %in%
AnnotationDbi::keytypes(db),
msg = paste("marker_file_gene_id_type must be one of",
"keytypes(db)"))

if (is(db, "character") && db == "none") {
cds_gene_id_type <- 'custom'
classifier_gene_id_type <- 'custom'
marker_file_gene_id_type <- 'custom'
} else {
assertthat::assert_that(is(db, "OrgDb"),
msg = paste0("db must be an 'AnnotationDb' object ",
"or 'none' see ",
"http://bioconductor.org/packages/",
"3.8/data/annotation/ for available"))
assertthat::assert_that(is.character(cds_gene_id_type))
assertthat::assert_that(is.character(marker_file_gene_id_type))
assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
msg = paste("cds_gene_id_type must be one of",
"keytypes(db)"))
assertthat::assert_that(marker_file_gene_id_type %in%
AnnotationDbi::keytypes(db),
msg = paste("marker_file_gene_id_type must be one",
"of keytypes(db)"))
}


assertthat::assert_that(is.logical(propogate_markers))
assertthat::assert_that(is.logical(use_tf_idf))

Expand Down
6 changes: 5 additions & 1 deletion man/Parser.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 9 additions & 4 deletions man/check_markers.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 6 additions & 2 deletions man/classify_cells.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/get_feature_genes.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 9 additions & 4 deletions man/train_cell_classifier.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions tests/testthat/test-overall.R
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,30 @@ test_that("whole process is the same multi-core", {
expect_equal(sum(pData(test_cds)$cluster_ext_type == "T cells"), 199)
})

# End-to-end check of the db = "none" path: train a classifier and classify
# the bundled test CDS without an annotation database. The *_gene_id_type
# arguments are still passed here, but are documented as ignored when
# db = "none".
data(test_cds)
set.seed(260)  # fixed seed; the expected counts below are pinned to this run

test_classifier <- train_cell_classifier(
  cds = test_cds,
  marker_file = "../pbmc_test.txt",
  db = "none",
  min_observations = 10,
  cds_gene_id_type = "SYMBOL",
  num_unknown = 50,
  cores = 2,
  marker_file_gene_id_type = "SYMBOL"
)

test_cds <- garnett::classify_cells(
  test_cds,
  test_classifier,
  db = "none",
  rank_prob_ratio = 1.5,
  cluster_extend = TRUE,
  cds_gene_id_type = "SYMBOL"
)

test_that("whole process is the same db = 'none'", {
  # Pull both assignment columns once and count cells per label.
  assigned_type <- pData(test_cds)$cell_type
  extended_type <- pData(test_cds)$cluster_ext_type

  # Per-cell assignments
  expect_equal(sum(assigned_type == "B cells"), 211)
  expect_equal(sum(assigned_type == "CD4 T cells"), 117)
  expect_equal(sum(assigned_type == "CD8 T cells"), 62)
  expect_equal(sum(assigned_type == "T cells"), 156)

  # Cluster-extended assignments
  expect_equal(sum(extended_type == "B cells"), 401)
  expect_equal(sum(extended_type == "CD4 T cells"), 200)
  expect_equal(sum(extended_type == "T cells"), 199)
})

Loading

0 comments on commit 3cd521b

Please sign in to comment.