From 3cd521be69b0c595b3e02ca4cbd19c1695846ceb Mon Sep 17 00:00:00 2001
From: Hannah A Pliner <hpliner@gmail.com>
Date: Tue, 19 Feb 2019 16:51:59 -0800
Subject: [PATCH] Allow db = 'none' if no annotationdbi available

---
 DESCRIPTION                   |  2 +-
 R/classify_cells.R            | 30 ++++++++++++++-------
 R/parser.R                    |  6 ++++-
 R/train_cell_classifier.R     | 47 +++++++++++++++++++++------------
 R/utils.R                     | 49 +++++++++++++++++++++++------------
 man/Parser.Rd                 |  6 ++++-
 man/check_markers.Rd          | 13 +++++++---
 man/classify_cells.Rd         |  8 ++++--
 man/get_feature_genes.Rd      |  2 +-
 man/train_cell_classifier.Rd  | 13 +++++++---
 tests/testthat/test-overall.R | 27 +++++++++++++++++++
 tests/testthat/test-utils.R   | 28 +++++++++++++++++++-
 12 files changed, 174 insertions(+), 57 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 85ef4d6..497d60c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: garnett
 Type: Package
 Title: Automated cell type classification
-Version: 0.1.3
+Version: 0.1.4
 Author: c(
     person("Hannah", "Pliner", email = "hpliner@uw.edu", role = c("aut", "cre")), 
     person("Cole", "Trapnell", email = "coletrap@uw.edu", role = c("aut")))
diff --git a/R/classify_cells.R b/R/classify_cells.R
index f852af0..4dd3756 100644
--- a/R/classify_cells.R
+++ b/R/classify_cells.R
@@ -10,8 +10,12 @@
 #' @param db Bioconductor AnnotationDb-class package for converting gene IDs.
 #'  For example, for humans use org.Hs.eg.db. See available packages at
 #'  \href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.
+#'  If your organism does not have an AnnotationDb-class database available,
+#'  you can specify "none", however then Garnett will not check/convert gene
+#'  IDs, so your CDS and marker file must have the same gene ID type.
 #' @param cds_gene_id_type The type of gene ID used in the CDS. Should be one
-#'  of the values in \code{columns(db)}. Default is "ENSEMBL".
+#'  of the values in \code{columns(db)}. Default is "ENSEMBL". Ignored if
+#'  db = "none".
 #' @param rank_prob_ratio Numeric value greater than 1. This is the minimum
 #'  odds ratio between the probability of the most likely cell type to the
 #'  second most likely cell type to allow assignment. Default is 1.5. Higher
@@ -72,14 +76,22 @@ classify_cells <- function(cds,
                           msg = paste("Must run estimateSizeFactors() on cds",
                                       "before calling classify_cells"))
   assertthat::assert_that(is(classifier, "garnett_classifier"))
-  assertthat::assert_that(is(db, "OrgDb"),
-                          msg = paste0("db must be an 'AnnotationDb' object ",
-                                       "see http://bioconductor.org/packages/",
-                                       "3.8/data/annotation/ for available"))
-  assertthat::assert_that(is.character(cds_gene_id_type))
-  assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
-                          msg = paste("cds_gene_id_type must be one of",
-                                      "keytypes(db)"))
+  if(is(db, "character") && db == "none") {
+    cds_gene_id_type <- 'custom'
+    classifier_gene_id_type <- 'custom'
+    marker_file_gene_id_type <- 'custom'
+  } else {
+    assertthat::assert_that(is(db, "OrgDb"),
+                            msg = paste0("db must be an 'AnnotationDb' object ",
+                                         "or 'none' see ",
+                                         "http://bioconductor.org/packages/",
+                                         "3.8/data/annotation/ for available"))
+    assertthat::assert_that(is.character(cds_gene_id_type))
+    assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
+                            msg = paste("cds_gene_id_type must be one of",
+                                        "keytypes(db)"))
+  }
+
   assertthat::assert_that(is.numeric(rank_prob_ratio))
   assertthat::assert_that(rank_prob_ratio > 1,
                           msg = "rank_prob_ratio must be greater than 1")
diff --git a/R/parser.R b/R/parser.R
index ab20fd9..54be5a2 100644
--- a/R/parser.R
+++ b/R/parser.R
@@ -68,7 +68,11 @@ Lexer <- R6::R6Class(
 #'   For example, for human, use \code{\link[org.Hs.eg.db]{org.Hs.eg.db}}. To
 #'   see available gene ID types, you can run \code{columns(db)}. You will
 #'   specify which gene ID type you used when calling
-#'   \code{\link{train_cell_classifier}}.} \item{not expressed:}{In addition to
+#'   \code{\link{train_cell_classifier}}.} If your species does not have an
+#'   annotation dataset of type \code{\link[AnnotationDbi]{AnnotationDb-class}},
+#'   you can set \code{db = 'none'}, however Garnett will then not convert gene
+#'   ID types, so CDS and marker file gene ID types need to be the same.
+#'   \item{not expressed:}{In addition to
 #'   specifying genes that the cell type should express, you can also specify
 #'   genes that your cell type should not express. Details on specifying genes
 #'   are the same as for \code{expressed:}.}
diff --git a/R/train_cell_classifier.R b/R/train_cell_classifier.R
index 02a7681..0936d7c 100644
--- a/R/train_cell_classifier.R
+++ b/R/train_cell_classifier.R
@@ -14,10 +14,15 @@
 #' @param db Bioconductor AnnotationDb-class package for converting gene IDs.
 #'  For example, for humans use org.Hs.eg.db. See available packages at
 #'  \href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.
+#'  If your organism does not have an AnnotationDb-class database available,
+#'  you can specify "none", however then Garnett will not check/convert gene
+#'  IDs, so your CDS and marker file must have the same gene ID type.
 #' @param cds_gene_id_type The type of gene ID used in the CDS. Should be one
-#'  of the values in \code{columns(db)}. Default is "ENSEMBL".
+#'  of the values in \code{columns(db)}. Default is "ENSEMBL". Ignored if
+#'  db = "none".
 #' @param marker_file_gene_id_type The type of gene ID used in the marker file.
 #'  Should be one of the values in \code{columns(db)}. Default is "SYMBOL".
+#'  Ignored if db = "none".
 #' @param min_observations An integer. The minimum number of representative
 #'  cells per cell type required to include the cell type in the predictive
 #'  model. Default is 8.
@@ -36,7 +41,7 @@
 #'  preset lambda values are used.
 #' @param classifier_gene_id_type The type of gene ID that will be used in the
 #'  classifier. If possible for your organism, this should be "ENSEMBL", which
-#'  is the default.
+#'  is the default. Ignored if db = "none".
 #'
 #' @details This function has three major parts: 1) parsing the marker file 2)
 #'  choosing cell representatives and 3) training the classifier. Details on
@@ -108,22 +113,29 @@ train_cell_classifier <- function(cds,
                                       "before calling train_cell_classifier"))
   assertthat::assert_that(is.character(marker_file))
   assertthat::is.readable(marker_file)
-  assertthat::assert_that(is(db, "OrgDb"),
-                          msg = paste0("db must be an 'AnnotationDb' object ",
-                                       "see http://bioconductor.org/packages/",
-                                       "3.8/data/annotation/ for available"))
-  assertthat::assert_that(is.character(cds_gene_id_type))
-  assertthat::assert_that(is.character(marker_file_gene_id_type))
-  assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
-                          msg = paste("cds_gene_id_type must be one of",
-                                      "keytypes(db)"))
-  assertthat::assert_that(classifier_gene_id_type %in% AnnotationDbi::keytypes(db),
-                          msg = paste("classifier_gene_id_type must be one of",
-                                      "keytypes(db)"))
-  assertthat::assert_that(marker_file_gene_id_type %in%
+  if (is(db, "character") && db == "none") {
+    cds_gene_id_type <- 'custom'
+    classifier_gene_id_type <- 'custom'
+    marker_file_gene_id_type <- 'custom'
+  } else {
+    assertthat::assert_that(is(db, "OrgDb"),
+                            msg = paste0("db must be an 'AnnotationDb' object ",
+                                         "or 'none' see ",
+                                         "http://bioconductor.org/packages/",
+                                         "3.8/data/annotation/ for available"))
+    assertthat::assert_that(is.character(cds_gene_id_type))
+    assertthat::assert_that(is.character(marker_file_gene_id_type))
+    assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
+                            msg = paste("cds_gene_id_type must be one of",
+                                        "keytypes(db)"))
+    assertthat::assert_that(classifier_gene_id_type %in% AnnotationDbi::keytypes(db),
+                            msg = paste("classifier_gene_id_type must be one of",
+                                        "keytypes(db)"))
+    assertthat::assert_that(marker_file_gene_id_type %in%
                             AnnotationDbi::keytypes(db),
-                          msg = paste("marker_file_gene_id_type must be one of",
-                                      "keytypes(db)"))
+                            msg = paste("marker_file_gene_id_type must be one of",
+                                        "keytypes(db)"))
+  }
   assertthat::is.count(num_unknown)
   assertthat::is.count(cores)
   assertthat::assert_that(is.logical(propogate_markers))
@@ -205,6 +217,7 @@ train_cell_classifier <- function(cds,
   ##### Make garnett_classifier #####
   classifier <- new_garnett_classifier()
   classifier@gene_id_type <- classifier_gene_id_type
+  if(is(db, "character") && db == "none") classifier@gene_id_type <- "custom"
 
   for(i in name_order) {
     # check meta data exists
diff --git a/R/utils.R b/R/utils.R
index 5f078af..8483bf0 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -77,13 +77,15 @@ convert_gene_ids <- function(gene_list,
 #'
 get_feature_genes <- function(classifier,
                               node = "root",
-                              convert_ids = TRUE,
+                              convert_ids = FALSE,
                               db=NULL) {
   assertthat::assert_that(is(classifier, "garnett_classifier"))
   assertthat::assert_that(is.character(node))
   assertthat::assert_that(is.logical(convert_ids))
   if (convert_ids) {
     if (is.null(db)) stop("If convert_ids = TRUE, db must be provided.")
+    if (is(db, "character") && db == "none")
+      stop("Cannot convert IDs if db = 'none'.")
     assertthat::assert_that(is(db, "OrgDb"),
                             msg = paste0("db must be an 'AnnotationDb' object ",
                                          "see http://bioconductor.org/",
@@ -160,10 +162,15 @@ get_classifier_references <- function(classifier,
 #' @param db Bioconductor AnnotationDb-class package for converting gene IDs.
 #'  For example, for humans use org.Hs.eg.db. See available packages at
 #'  \href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.
+#'  If your organism does not have an AnnotationDb-class database available,
+#'  you can specify "none", however then Garnett will not check/convert gene
+#'  IDs, so your CDS and marker file must have the same gene ID type.
 #' @param cds_gene_id_type The type of gene ID used in the CDS. Should be one
-#'  of the values in \code{columns(db)}. Default is "ENSEMBL".
+#'  of the values in \code{columns(db)}. Default is "ENSEMBL". Ignored if
+#'  db = "none".
 #' @param marker_file_gene_id_type The type of gene ID used in the marker file.
 #'  Should be one of the values in \code{columns(db)}. Default is "SYMBOL".
+#'  Ignored if db = "none".
 #' @param propogate_markers Logical. Should markers from child nodes of a cell
 #'  type be used in finding representatives of the parent type? Should
 #'  generally be \code{TRUE}.
@@ -172,7 +179,7 @@ get_classifier_references <- function(classifier,
 #'  calculation is slower with very large datasets.
 #' @param classifier_gene_id_type The type of gene ID that will be used in the
 #'  classifier. If possible for your organism, this should be "ENSEMBL", which
-#'  is the default.
+#'  is the default. Ignored if db = "none".
 #'
 #' @return Data.frame of marker check results.
 #'
@@ -245,19 +252,29 @@ check_markers <- function(cds,
                                       "before calling check_markers"))
   assertthat::assert_that(is.character(marker_file))
   assertthat::is.readable(marker_file)
-  assertthat::assert_that(is(db, "OrgDb"),
-                          msg = paste0("db must be an 'AnnotationDb' object ",
-                                       "see http://bioconductor.org/packages/",
-                                       "3.8/data/annotation/ for available"))
-  assertthat::assert_that(is.character(cds_gene_id_type))
-  assertthat::assert_that(is.character(marker_file_gene_id_type))
-  assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
-                          msg = paste("cds_gene_id_type must be one of",
-                                      "keytypes(db)"))
-  assertthat::assert_that(marker_file_gene_id_type %in%
-                            AnnotationDbi::keytypes(db),
-                          msg = paste("marker_file_gene_id_type must be one of",
-                                      "keytypes(db)"))
+
+  if (is(db, "character") && db == "none") {
+    cds_gene_id_type <- 'custom'
+    classifier_gene_id_type <- 'custom'
+    marker_file_gene_id_type <- 'custom'
+  } else {
+    assertthat::assert_that(is(db, "OrgDb"),
+                            msg = paste0("db must be an 'AnnotationDb' object ",
+                                         "or 'none' see ",
+                                         "http://bioconductor.org/packages/",
+                                         "3.8/data/annotation/ for available"))
+    assertthat::assert_that(is.character(cds_gene_id_type))
+    assertthat::assert_that(is.character(marker_file_gene_id_type))
+    assertthat::assert_that(cds_gene_id_type %in% AnnotationDbi::keytypes(db),
+                            msg = paste("cds_gene_id_type must be one of",
+                                        "keytypes(db)"))
+    assertthat::assert_that(marker_file_gene_id_type %in%
+                              AnnotationDbi::keytypes(db),
+                            msg = paste("marker_file_gene_id_type must be one",
+                                        "of keytypes(db)"))
+  }
+
+
   assertthat::assert_that(is.logical(propogate_markers))
   assertthat::assert_that(is.logical(use_tf_idf))
 
diff --git a/man/Parser.Rd b/man/Parser.Rd
index 8b91149..a9daf8a 100644
--- a/man/Parser.Rd
+++ b/man/Parser.Rd
@@ -56,7 +56,11 @@ The following are the potential descriptors:
   For example, for human, use \code{\link[org.Hs.eg.db]{org.Hs.eg.db}}. To
   see available gene ID types, you can run \code{columns(db)}. You will
   specify which gene ID type you used when calling
-  \code{\link{train_cell_classifier}}.} \item{not expressed:}{In addition to
+  \code{\link{train_cell_classifier}}.} If your species does not have an
+  annotation dataset of type \code{\link[AnnotationDbi]{AnnotationDb-class}},
+  you can set \code{db = 'none'}, however Garnett will then not convert gene
+  ID types, so CDS and marker file gene ID types need to be the same.
+  \item{not expressed:}{In addition to
   specifying genes that the cell type should express, you can also specify
   genes that your cell type should not express. Details on specifying genes
   are the same as for \code{expressed:}.}
diff --git a/man/check_markers.Rd b/man/check_markers.Rd
index 6ff1acb..a1988bf 100644
--- a/man/check_markers.Rd
+++ b/man/check_markers.Rd
@@ -17,13 +17,18 @@ See details and documentation for \code{\link{Parser}} by running
 
 \item{db}{Bioconductor AnnotationDb-class package for converting gene IDs.
 For example, for humans use org.Hs.eg.db. See available packages at
-\href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.}
+\href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.
+If your organism does not have an AnnotationDb-class database available,
+you can specify "none", however then Garnett will not check/convert gene
+IDs, so your CDS and marker file must have the same gene ID type.}
 
 \item{cds_gene_id_type}{The type of gene ID used in the CDS. Should be one
-of the values in \code{columns(db)}. Default is "ENSEMBL".}
+of the values in \code{columns(db)}. Default is "ENSEMBL". Ignored if
+db = "none".}
 
 \item{marker_file_gene_id_type}{The type of gene ID used in the marker file.
-Should be one of the values in \code{columns(db)}. Default is "SYMBOL".}
+Should be one of the values in \code{columns(db)}. Default is "SYMBOL".
+Ignored if db = "none".}
 
 \item{propogate_markers}{Logical. Should markers from child nodes of a cell
 type be used in finding representatives of the parent type? Should
@@ -35,7 +40,7 @@ calculation is slower with very large datasets.}
 
 \item{classifier_gene_id_type}{The type of gene ID that will be used in the
 classifier. If possible for your organism, this should be "ENSEMBL", which
-is the default.}
+is the default. Ignored if db = "none".}
 }
 \value{
 Data.frame of marker check results.
diff --git a/man/classify_cells.Rd b/man/classify_cells.Rd
index 9a0abde..a994b86 100644
--- a/man/classify_cells.Rd
+++ b/man/classify_cells.Rd
@@ -15,10 +15,14 @@ classify_cells(cds, classifier, db, cds_gene_id_type = "ENSEMBL",
 
 \item{db}{Bioconductor AnnotationDb-class package for converting gene IDs.
 For example, for humans use org.Hs.eg.db. See available packages at
-\href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.}
+\href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.
+If your organism does not have an AnnotationDb-class database available,
+you can specify "none", however then Garnett will not check/convert gene
+IDs, so your CDS and marker file must have the same gene ID type.}
 
 \item{cds_gene_id_type}{The type of gene ID used in the CDS. Should be one
-of the values in \code{columns(db)}. Default is "ENSEMBL".}
+of the values in \code{columns(db)}. Default is "ENSEMBL". Ignored if
+db = "none".}
 
 \item{rank_prob_ratio}{Numeric value greater than 1. This is the minimum
 odds ratio between the probability of the most likely cell type to the
diff --git a/man/get_feature_genes.Rd b/man/get_feature_genes.Rd
index 1074c2d..cb60643 100644
--- a/man/get_feature_genes.Rd
+++ b/man/get_feature_genes.Rd
@@ -4,7 +4,7 @@
 \alias{get_feature_genes}
 \title{Extract feature genes}
 \usage{
-get_feature_genes(classifier, node = "root", convert_ids = TRUE,
+get_feature_genes(classifier, node = "root", convert_ids = FALSE,
   db = NULL)
 }
 \arguments{
diff --git a/man/train_cell_classifier.Rd b/man/train_cell_classifier.Rd
index d141a15..5307d02 100644
--- a/man/train_cell_classifier.Rd
+++ b/man/train_cell_classifier.Rd
@@ -19,13 +19,18 @@ See details and documentation for \code{\link{Parser}} by running
 
 \item{db}{Bioconductor AnnotationDb-class package for converting gene IDs.
 For example, for humans use org.Hs.eg.db. See available packages at
-\href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.}
+\href{http://bioconductor.org/packages/3.8/data/annotation/}{Bioconductor}.
+If your organism does not have an AnnotationDb-class database available,
+you can specify "none", however then Garnett will not check/convert gene
+IDs, so your CDS and marker file must have the same gene ID type.}
 
 \item{cds_gene_id_type}{The type of gene ID used in the CDS. Should be one
-of the values in \code{columns(db)}. Default is "ENSEMBL".}
+of the values in \code{columns(db)}. Default is "ENSEMBL". Ignored if
+db = "none".}
 
 \item{marker_file_gene_id_type}{The type of gene ID used in the marker file.
-Should be one of the values in \code{columns(db)}. Default is "SYMBOL".}
+Should be one of the values in \code{columns(db)}. Default is "SYMBOL".
+Ignored if db = "none".}
 
 \item{min_observations}{An integer. The minimum number of representative
 cells per cell type required to include the cell type in the predictive
@@ -51,7 +56,7 @@ preset lambda values are used.}
 
 \item{classifier_gene_id_type}{The type of gene ID that will be used in the
 classifier. If possible for your organism, this should be "ENSEMBL", which
-is the default.}
+is the default. Ignored if db = "none".}
 }
 \description{
 This function takes single-cell expression data in the form of a CDS object
diff --git a/tests/testthat/test-overall.R b/tests/testthat/test-overall.R
index d0bf25e..f316ad4 100644
--- a/tests/testthat/test-overall.R
+++ b/tests/testthat/test-overall.R
@@ -90,3 +90,30 @@ test_that("whole process is the same multi-core", {
   expect_equal(sum(pData(test_cds)$cluster_ext_type == "T cells"), 199)
 })
 
+data(test_cds)
+set.seed(260)
+test_classifier <- train_cell_classifier(cds = test_cds,
+                                         marker_file = "../pbmc_test.txt",
+                                         db='none',
+                                         min_observations = 10,
+                                         cds_gene_id_type = "SYMBOL",
+                                         num_unknown = 50,
+                                         cores = 2,
+                                         marker_file_gene_id_type = "SYMBOL")
+
+test_cds <- garnett::classify_cells(test_cds, test_classifier,
+                                    db = 'none',
+                                    rank_prob_ratio = 1.5,
+                                    cluster_extend = TRUE,
+                                    cds_gene_id_type = "SYMBOL")
+
+test_that("whole process is the same db = 'none'", {
+  expect_equal(sum(pData(test_cds)$cell_type == "B cells"), 211)
+  expect_equal(sum(pData(test_cds)$cell_type == "CD4 T cells"), 117)
+  expect_equal(sum(pData(test_cds)$cell_type == "CD8 T cells"), 62)
+  expect_equal(sum(pData(test_cds)$cell_type == "T cells"), 156)
+  expect_equal(sum(pData(test_cds)$cluster_ext_type == "B cells"), 401)
+  expect_equal(sum(pData(test_cds)$cluster_ext_type == "CD4 T cells"), 200)
+  expect_equal(sum(pData(test_cds)$cluster_ext_type == "T cells"), 199)
+})
+
diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R
index 2dd14c8..2a45db3 100644
--- a/tests/testthat/test-utils.R
+++ b/tests/testthat/test-utils.R
@@ -22,7 +22,7 @@ out2 <- get_feature_genes(test_classifier, convert_ids = F, db=org.Hs.eg.db)
 out3 <- get_feature_genes(test_classifier, db=org.Hs.eg.db, node="T cells")
 
 test_that("get_feature_genes works", {
-  expect_error(get_feature_genes(test_classifier),
+  expect_error(get_feature_genes(test_classifier, convert_ids = TRUE),
                "If convert_ids = TRUE, db must be provided.")
   expect_is(out, "data.frame")
   expect_is(out2, "data.frame")
@@ -108,3 +108,29 @@ test_that("plot_markers works", {
                               fig = plot_markers(marker_check,
                                                  amb_marker_cutoff = .7))
 })
+
+marker_check <- check_markers(test_cds,
+                              "../pbmc_test.txt",
+                              db="none",
+                              cds_gene_id_type = "SYMBOL",
+                              marker_file_gene_id_type = "SYMBOL")
+
+marker_check2 <- check_markers(test_cds, use_tf_idf = F,
+                               "../pbmc_1mark.txt",
+                               db="none",
+                               cds_gene_id_type = "SYMBOL",
+                               marker_file_gene_id_type = "SYMBOL")
+
+test_that("check_markers works db='none'", {
+  expect_equal(sum(marker_check$marker_score), 414.3958, tol = 1e-3)
+  expect_equal(nrow(marker_check), 18)
+  expect_equal(sum(marker_check$summary != "Ok"), 3)
+  sub <- subset(marker_check, parent != "root")
+  expect_equal(sum(sub$nominates), 323)
+  expect_equal(sum(sub$exclusion_dismisses), 233)
+  expect_equal(sum(sub$inclusion_ambiguates), 63)
+  expect_equal(sub$most_overlap[5], "CD4 T cells")
+  expect_equal(sum(marker_check2$marker_score, na.rm=T), 388.5029, tol = 1e-4)
+  expect_equal(marker_check2[18, "total_nominated"], 46)
+})
+