Allow db = 'none' if no annotationdbi available

cole-trapnell-lab · Feb 20, 2019 · 3cd521b · 3cd521b
1 parent d04a89b
commit 3cd521b
Show file tree

Hide file tree

Showing 12 changed files with 174 additions and 57 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: garnett
 Type: Package
 Title: Automated cell type classification
-Version: 0.1.3
+Version: 0.1.4
 Author: c(
     person("Hannah", "Pliner", email = "[email protected]", role = c("aut", "cre")), 
     person("Cole", "Trapnell", email = "[email protected]", role = c("aut")))

diff --git a/R/classify_cells.R b/R/classify_cells.R
@@ -10,8 +10,12 @@
 #' @param db Bioconductor AnnotationDb-class package for converting gene IDs.
 #'  For example, for humans use org.Hs.eg.db. See available packages at
 #'  \href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.
+#'  If your organism does not have an AnnotationDb-class database available,
+#'  you can specify "none". Garnett will then not check or convert gene
+#'  IDs, so your CDS and marker file must use the same gene ID type.
 #' @param cds_gene_id_type The type of gene ID used in the CDS. Should be one
-#'  of the values in \code{columns(db)}. Default is "ENSEMBL".
+#'  of the values in \code{columns(db)}. Default is "ENSEMBL". Ignored if
+#'  db = "none".
 #' @param rank_prob_ratio Numeric value greater than 1. This is the minimum
 #'  odds ratio between the probability of the most likely cell type to the
 #'  second most likely cell type to allow assignment. Default is 1.5. Higher
@@ -72,14 +76,22 @@ classify_cells <- function(cds,
                           msg = paste("Must run estimateSizeFactors() on cds",
                                       "before calling classify_cells"))
   assertthat::assert_that(is(classifier, "garnett_classifier"))
-  assertthat::assert_that(is(db, "OrgDb"),
-                          msg = paste0("db must be an 'AnnotationDb' object ",
-                                       "see http://bioconductor.org/packages/",
-                                       "3.8/data/annotation/ for available"))
-  assertthat::assert_that(is.character(cds_gene_id_type))
-  assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
-                          msg = paste("cds_gene_id_type must be one of",
-                                      "keytypes(db)"))
+  if(is(db, "character") && db == "none") {
+    cds_gene_id_type <- 'custom'
+    classifier_gene_id_type <- 'custom'
+    marker_file_gene_id_type <- 'custom'
+  } else {
+    assertthat::assert_that(is(db, "OrgDb"),
+                            msg = paste0("db must be an 'AnnotationDb' object ",
+                                         "or 'none' see ",
+                                         "http://bioconductor.org/packages/",
+                                         "3.8/data/annotation/ for available"))
+    assertthat::assert_that(is.character(cds_gene_id_type))
+    assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
+                            msg = paste("cds_gene_id_type must be one of",
+                                        "keytypes(db)"))
+  }
+
   assertthat::assert_that(is.numeric(rank_prob_ratio))
   assertthat::assert_that(rank_prob_ratio > 1,
                           msg = "rank_prob_ratio must be greater than 1")

diff --git a/R/parser.R b/R/parser.R
@@ -68,7 +68,11 @@ Lexer <- R6::R6Class(
 #'   For example, for human, use \code{\link[org.Hs.eg.db]{org.Hs.eg.db}}. To
 #'   see available gene ID types, you can run \code{columns(db)}. You will
 #'   specify which gene ID type you used when calling
-#'   \code{\link{train_cell_classifier}}.} \item{not expressed:}{In addition to
+#'   \code{\link{train_cell_classifier}}. If your species does not have an
+#'   annotation dataset of type \code{\link[AnnotationDbi]{AnnotationDb-class}},
+#'   you can set \code{db = 'none'}; Garnett will then not convert gene
+#'   ID types, so CDS and marker file gene ID types need to be the same.}
+#'   \item{not expressed:}{In addition to
 #'   specifying genes that the cell type should express, you can also specify
 #'   genes that your cell type should not express. Details on specifying genes
 #'   are the same as for \code{expressed:}.}

diff --git a/R/train_cell_classifier.R b/R/train_cell_classifier.R
@@ -14,10 +14,15 @@
 #' @param db Bioconductor AnnotationDb-class package for converting gene IDs.
 #'  For example, for humans use org.Hs.eg.db. See available packages at
 #'  \href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.
+#'  If your organism does not have an AnnotationDb-class database available,
+#'  you can specify "none". Garnett will then not check or convert gene
+#'  IDs, so your CDS and marker file must use the same gene ID type.
 #' @param cds_gene_id_type The type of gene ID used in the CDS. Should be one
-#'  of the values in \code{columns(db)}. Default is "ENSEMBL".
+#'  of the values in \code{columns(db)}. Default is "ENSEMBL". Ignored if
+#'  db = "none".
 #' @param marker_file_gene_id_type The type of gene ID used in the marker file.
 #'  Should be one of the values in \code{columns(db)}. Default is "SYMBOL".
+#'  Ignored if db = "none".
 #' @param min_observations An integer. The minimum number of representative
 #'  cells per cell type required to include the cell type in the predictive
 #'  model. Default is 8.
@@ -36,7 +41,7 @@
 #'  preset lambda values are used.
 #' @param classifier_gene_id_type The type of gene ID that will be used in the
 #'  classifier. If possible for your organism, this should be "ENSEMBL", which
-#'  is the default.
+#'  is the default. Ignored if db = "none".
 #'
 #' @details This function has three major parts: 1) parsing the marker file 2)
 #'  choosing cell representatives and 3) training the classifier. Details on
@@ -108,22 +113,29 @@ train_cell_classifier <- function(cds,
                                       "before calling train_cell_classifier"))
   assertthat::assert_that(is.character(marker_file))
   assertthat::is.readable(marker_file)
-  assertthat::assert_that(is(db, "OrgDb"),
-                          msg = paste0("db must be an 'AnnotationDb' object ",
-                                       "see http://bioconductor.org/packages/",
-                                       "3.8/data/annotation/ for available"))
-  assertthat::assert_that(is.character(cds_gene_id_type))
-  assertthat::assert_that(is.character(marker_file_gene_id_type))
-  assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
-                          msg = paste("cds_gene_id_type must be one of",
-                                      "keytypes(db)"))
-  assertthat::assert_that(classifier_gene_id_type %in% AnnotationDbi::keytypes(db),
-                          msg = paste("classifier_gene_id_type must be one of",
-                                      "keytypes(db)"))
-  assertthat::assert_that(marker_file_gene_id_type %in%
+  if (is(db, "character") && db == "none") {
+    cds_gene_id_type <- 'custom'
+    classifier_gene_id_type <- 'custom'
+    marker_file_gene_id_type <- 'custom'
+  } else {
+    assertthat::assert_that(is(db, "OrgDb"),
+                            msg = paste0("db must be an 'AnnotationDb' object ",
+                                         "or 'none' see ",
+                                         "http://bioconductor.org/packages/",
+                                         "3.8/data/annotation/ for available"))
+    assertthat::assert_that(is.character(cds_gene_id_type))
+    assertthat::assert_that(is.character(marker_file_gene_id_type))
+    assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
+                            msg = paste("cds_gene_id_type must be one of",
+                                        "keytypes(db)"))
+    assertthat::assert_that(classifier_gene_id_type %in% AnnotationDbi::keytypes(db),
+                            msg = paste("classifier_gene_id_type must be one of",
+                                        "keytypes(db)"))
+    assertthat::assert_that(marker_file_gene_id_type %in%
                             AnnotationDbi::keytypes(db),
-                          msg = paste("marker_file_gene_id_type must be one of",
-                                      "keytypes(db)"))
+                            msg = paste("marker_file_gene_id_type must be one of",
+                                        "keytypes(db)"))
+  }
   assertthat::is.count(num_unknown)
   assertthat::is.count(cores)
   assertthat::assert_that(is.logical(propogate_markers))
@@ -205,6 +217,7 @@ train_cell_classifier <- function(cds,
   ##### Make garnett_classifier #####
   classifier <- new_garnett_classifier()
   classifier@gene_id_type <- classifier_gene_id_type
+  if(is(db, "character") && db == "none") classifier@gene_id_type <- "custom"
 
   for(i in name_order) {
     # check meta data exists

diff --git a/R/utils.R b/R/utils.R
@@ -77,13 +77,15 @@ convert_gene_ids <- function(gene_list,
 #'
 get_feature_genes <- function(classifier,
                               node = "root",
-                              convert_ids = TRUE,
+                              convert_ids = FALSE,
                               db=NULL) {
   assertthat::assert_that(is(classifier, "garnett_classifier"))
   assertthat::assert_that(is.character(node))
   assertthat::assert_that(is.logical(convert_ids))
   if (convert_ids) {
     if (is.null(db)) stop("If convert_ids = TRUE, db must be provided.")
+    if (is(db, "character") && db == "none")
+      stop("Cannot convert IDs if db = 'none'.")
     assertthat::assert_that(is(db, "OrgDb"),
                             msg = paste0("db must be an 'AnnotationDb' object ",
                                          "see http://bioconductor.org/",
@@ -160,10 +162,15 @@ get_classifier_references <- function(classifier,
 #' @param db Bioconductor AnnotationDb-class package for converting gene IDs.
 #'  For example, for humans use org.Hs.eg.db. See available packages at
 #'  \href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.
+#'  If your organism does not have an AnnotationDb-class database available,
+#'  you can specify "none". Garnett will then not check or convert gene
+#'  IDs, so your CDS and marker file must use the same gene ID type.
 #' @param cds_gene_id_type The type of gene ID used in the CDS. Should be one
-#'  of the values in \code{columns(db)}. Default is "ENSEMBL".
+#'  of the values in \code{columns(db)}. Default is "ENSEMBL". Ignored if
+#'  db = "none".
 #' @param marker_file_gene_id_type The type of gene ID used in the marker file.
 #'  Should be one of the values in \code{columns(db)}. Default is "SYMBOL".
+#'  Ignored if db = "none".
 #' @param propogate_markers Logical. Should markers from child nodes of a cell
 #'  type be used in finding representatives of the parent type? Should
 #'  generally be \code{TRUE}.
@@ -172,7 +179,7 @@ get_classifier_references <- function(classifier,
 #'  calculation is slower with very large datasets.
 #' @param classifier_gene_id_type The type of gene ID that will be used in the
 #'  classifier. If possible for your organism, this should be "ENSEMBL", which
-#'  is the default.
+#'  is the default. Ignored if db = "none".
 #'
 #' @return Data.frame of marker check results.
 #'
@@ -245,19 +252,29 @@ check_markers <- function(cds,
                                       "before calling check_markers"))
   assertthat::assert_that(is.character(marker_file))
   assertthat::is.readable(marker_file)
-  assertthat::assert_that(is(db, "OrgDb"),
-                          msg = paste0("db must be an 'AnnotationDb' object ",
-                                       "see http://bioconductor.org/packages/",
-                                       "3.8/data/annotation/ for available"))
-  assertthat::assert_that(is.character(cds_gene_id_type))
-  assertthat::assert_that(is.character(marker_file_gene_id_type))
-  assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
-                          msg = paste("cds_gene_id_type must be one of",
-                                      "keytypes(db)"))
-  assertthat::assert_that(marker_file_gene_id_type %in%
-                            AnnotationDbi::keytypes(db),
-                          msg = paste("marker_file_gene_id_type must be one of",
-                                      "keytypes(db)"))
+
+  if (is(db, "character") && db == "none") {
+    cds_gene_id_type <- 'custom'
+    classifier_gene_id_type <- 'custom'
+    marker_file_gene_id_type <- 'custom'
+  } else {
+    assertthat::assert_that(is(db, "OrgDb"),
+                            msg = paste0("db must be an 'AnnotationDb' object ",
+                                         "or 'none' see ",
+                                         "http://bioconductor.org/packages/",
+                                         "3.8/data/annotation/ for available"))
+    assertthat::assert_that(is.character(cds_gene_id_type))
+    assertthat::assert_that(is.character(marker_file_gene_id_type))
+    assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
+                            msg = paste("cds_gene_id_type must be one of",
+                                        "keytypes(db)"))
+    assertthat::assert_that(marker_file_gene_id_type %in%
+                              AnnotationDbi::keytypes(db),
+                            msg = paste("marker_file_gene_id_type must be one",
+                                        "of keytypes(db)"))
+  }
+
+
   assertthat::assert_that(is.logical(propogate_markers))
   assertthat::assert_that(is.logical(use_tf_idf))
 

diff --git a/man/Parser.Rd b/man/Parser.Rd
diff --git a/man/check_markers.Rd b/man/check_markers.Rd
diff --git a/man/classify_cells.Rd b/man/classify_cells.Rd
diff --git a/man/get_feature_genes.Rd b/man/get_feature_genes.Rd
diff --git a/man/train_cell_classifier.Rd b/man/train_cell_classifier.Rd
diff --git a/tests/testthat/test-overall.R b/tests/testthat/test-overall.R
@@ -90,3 +90,30 @@ test_that("whole process is the same multi-core", {
   expect_equal(sum(pData(test_cds)$cluster_ext_type == "T cells"), 199)
 })
 
+data(test_cds)
+set.seed(260)
+test_classifier <- train_cell_classifier(cds = test_cds,
+                                         marker_file = "../pbmc_test.txt",
+                                         db='none',
+                                         min_observations = 10,
+                                         cds_gene_id_type = "SYMBOL",
+                                         num_unknown = 50,
+                                         cores = 2,
+                                         marker_file_gene_id_type = "SYMBOL")
+
+test_cds <- garnett::classify_cells(test_cds, test_classifier,
+                                    db = 'none',
+                                    rank_prob_ratio = 1.5,
+                                    cluster_extend = TRUE,
+                                    cds_gene_id_type = "SYMBOL")
+
+test_that("whole process is the same db = 'none'", {
+  expect_equal(sum(pData(test_cds)$cell_type == "B cells"), 211)
+  expect_equal(sum(pData(test_cds)$cell_type == "CD4 T cells"), 117)
+  expect_equal(sum(pData(test_cds)$cell_type == "CD8 T cells"), 62)
+  expect_equal(sum(pData(test_cds)$cell_type == "T cells"), 156)
+  expect_equal(sum(pData(test_cds)$cluster_ext_type == "B cells"), 401)
+  expect_equal(sum(pData(test_cds)$cluster_ext_type == "CD4 T cells"), 200)
+  expect_equal(sum(pData(test_cds)$cluster_ext_type == "T cells"), 199)
+})
+