Add code for pseudobulk computation from Seurat objects.

ddiez · Dec 8, 2023 · 7f7f212 · 7f7f212
1 parent 4c3a2f3
commit 7f7f212
Show file tree

Hide file tree

Showing 5 changed files with 134 additions and 1 deletion.
diff --git a/NAMESPACE b/NAMESPACE
@@ -75,6 +75,7 @@ S3method(plot_violin,Seurat)
 S3method(plot_violin,SingleCellExperiment)
 S3method(plot_violin,data.frame)
 S3method(process,Seurat)
+S3method(pseudobulk,Seurat)
 S3method(reduce_dim,SingleCellExperiment)
 S3method(reduce_dim,matrix)
 S3method(run_enrichment,data.frame)
@@ -138,6 +139,7 @@ export(plot_velocity)
 export(plot_violin)
 export(plot_volcano)
 export(process)
+export(pseudobulk)
 export(read_cpdb_out)
 export(reduce_dim)
 export(run_enrichment)
@@ -156,6 +158,7 @@ importFrom(ComplexHeatmap,Heatmap)
 importFrom(ComplexHeatmap,columnAnnotation)
 importFrom(GenomeInfoDb,standardChromosomes)
 importFrom(Matrix,t)
+importFrom(Seurat,DietSeurat)
 importFrom(Seurat,FindClusters)
 importFrom(Seurat,FindNeighbors)
 importFrom(Seurat,FindVariableFeatures)

diff --git a/R/pseudobulk.R b/R/pseudobulk.R
@@ -0,0 +1,57 @@
+#' pseudobulk
+#'
+#' Generate pseudobulk counts of single cell data per cell population and sample name. Return a list of DGEList objects.
+#'
+#' @param x Seurat object.
+#' @param split.by Split single cell data using this column. Typically cell populations or clusters.
+#' @param group.by Aggregate counts from cells in each group. Typically sample names.
+#' @param samples A data.frame with sample information to include in slot "samples" in the DGEList object.
+#' @param genes A data.frame with gene information to include in slot "genes" in the DGEList object.
+#' @param assay Seurat assay to use. Default: NULL (default assay).
+#' @param layers Seurat layers to use. Default: counts.
+#'
+#' @export
+pseudobulk <- function(x, split.by, group.by, samples=NULL, genes=NULL, ...) {
+  UseMethod("pseudobulk")
+}
+
+#' @rdname pseudobulk
+#' @export
+pseudobulk.Seurat <- function(x, split.by, group.by, samples=NULL, genes=NULL, assay=NULL, layers="counts", ...) {
+  assay <- DefaultAssay(x)
+  x <- Seurat::DietSeurat(x, layers=layers, assay=assay)
+
+  groups <- x[[]][[split.by]]
+  if (!is.factor(groups)) groups <- factor(groups)
+  group_levels <- levels(groups)
+  xl <- lapply(group_levels, function(group) {
+    x[, groups == group]
+  })
+  names(xl) <- group_levels
+
+  if (is.null(samples)) {
+    samples <- x[[]][[group.by]]
+    if (!is.factor(samples))
+      samples <- levels(factor(samples))
+    samples <- data.frame(row.names=samples)
+  }
+
+  xl <- lapply(xl, function(x) {
+    tmp <- Seurat::AggregateExpression(x, pb.method="aggregate", group.by=group.by, assay=assay)[[assay]]
+    if (!is.null(samples))
+      tmp <- tmp[, rownames(samples)]
+    tmp
+  })
+
+  dge <- lapply(names(xl), function(n) {
+    s <- samples
+    s$cluster <- n
+    tmp <- edgeR::DGEList(xl[[n]], samples=s)
+    tmp$genes <- genes
+    colnames(tmp) <- unname(colnames(tmp))
+    tmp
+  })
+  names(dge) <- names(xl)
+
+  dge
+}
diff --git a/R/scmisc-package.R b/R/scmisc-package.R
@@ -16,6 +16,6 @@
 #' @importFrom grid gpar grid.circle unit.c
 #' @importFrom stats prcomp kmeans dist hclust cutree as.dist cor p.adjust setNames
 #' @importFrom utils tail head
-#' @importFrom Seurat FindClusters FindNeighbors FindVariableFeatures RunPCA RunUMAP ScaleData
+#' @importFrom Seurat FindClusters FindNeighbors FindVariableFeatures RunPCA RunUMAP ScaleData DietSeurat
 ## usethis namespace: end
 NULL
diff --git a/man/pseudobulk.Rd b/man/pseudobulk.Rd
diff --git a/tests/testthat/test-pseudobulk.R b/tests/testthat/test-pseudobulk.R
@@ -0,0 +1,35 @@
+context("test-pseudobulk")
+
+m <- matrix(c(1:24), nrow=3, dimnames=list(paste0("gene-", 1:3), paste0("cell-", 1:8)))
+samplename <- rep(c("D1", "D2"), 4)
+celltype <- rep(c("A", "B"), each=4)
+meta <- data.frame(samplename=samplename, celltype=celltype)
+
+x <- CreateSeuratObject(as.sparse(m), meta=meta)
+
+samples <- data.frame(sex=c("F", "M"), row.names=c("D1", "D2"))
+
+y <- pseudobulk(x, split.by="celltype", group.by="samplename", assay="RNA", samples=samples)
+
+test_that("sort_features works", {
+  expect_equal(length(y), 2)
+  expect_equal(dim(y[[1]]), c(3, 2))
+  expect_equal(dim(y[[2]]), c(3, 2))
+  expect_equal(dim(y[[1]]$samples), c(2, 5))
+  expect_equal(dim(y[[2]]$samples), c(2, 5))
+  expect_null(y[[1]]$genes)
+  expect_null(y[[2]]$genes)
+})
+
+genes <- data.frame(row.names=rownames(m))
+y <- pseudobulk(x, split.by="celltype", group.by="samplename", assay="RNA", genes=genes)
+
+test_that("sort_features works", {
+  expect_equal(length(y), 2)
+  expect_equal(dim(y[[1]]), c(3, 2))
+  expect_equal(dim(y[[2]]), c(3, 2))
+  expect_equal(dim(y[[1]]$samples), c(2, 4))
+  expect_equal(dim(y[[2]]$samples), c(2, 4))
+  expect_equal(dim(y[[1]]$genes), c(3, 0))
+  expect_equal(dim(y[[2]]$genes), c(3, 0))
+})