Skip to content

Commit

Permalink
Added aggregate and split
Browse files Browse the repository at this point in the history
  • Loading branch information
lambdamoses committed Jun 13, 2024
1 parent aa523b3 commit 02812a8
Show file tree
Hide file tree
Showing 12 changed files with 732 additions and 174 deletions.
4 changes: 4 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ Imports:
sf,
sfheaders,
SingleCellExperiment,
sparseMatrixStats,
SpatialExperiment,
spdep (>= 1.1-7),
SummarizedExperiment,
Expand All @@ -50,6 +51,8 @@ Collate:
'AllGenerics.R'
'utils.R'
'SFE-class.R'
'aggregate.R'
'align.R'
'annotGeometries.R'
'cbind.R'
'coerce.R'
Expand All @@ -68,6 +71,7 @@ Collate:
'reexports.R'
'saveRDS.R'
'spatialGraphs.R'
'split.R'
'subset.R'
'transformation.R'
'updateObject.R'
Expand Down
10 changes: 10 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ export(addVisiumSpotPoly)
export(affine)
export(affineImg)
export(aggBboxes)
export(aggregateTx)
export(aggregateTxTech)
export(annotGeometries)
export(annotGeometry)
export(annotGeometryNames)
Expand Down Expand Up @@ -137,6 +139,8 @@ export(spatialCoordsNames)
export(spatialGraph)
export(spatialGraphNames)
export(spatialGraphs)
export(splitContiguity)
export(splitSamples)
export(spotPoly)
export(st_any_intersects)
export(st_any_pred)
Expand Down Expand Up @@ -174,6 +178,7 @@ exportMethods("spatialGraphNames<-")
exportMethods("spatialGraphs<-")
exportMethods(addImg)
exportMethods(affineImg)
exportMethods(aggregate)
exportMethods(annotGeometries)
exportMethods(annotGeometry)
exportMethods(annotGeometryNames)
Expand Down Expand Up @@ -203,6 +208,7 @@ exportMethods(show)
exportMethods(spatialGraph)
exportMethods(spatialGraphNames)
exportMethods(spatialGraphs)
exportMethods(split)
exportMethods(toExtImage)
exportMethods(toSpatRasterImage)
exportMethods(toSpatialFeatureExperiment)
Expand Down Expand Up @@ -273,9 +279,11 @@ importFrom(SpatialExperiment,spatialCoordsNames)
importFrom(SpatialExperiment,toSpatialExperiment)
importFrom(SummarizedExperiment,"colData<-")
importFrom(SummarizedExperiment,"rowData<-")
importFrom(SummarizedExperiment,assayNames)
importFrom(SummarizedExperiment,assays)
importFrom(SummarizedExperiment,colData)
importFrom(SummarizedExperiment,rowData)
importFrom(data.table,as.data.table)
importFrom(data.table,fread)
importFrom(data.table,is.data.table)
importFrom(data.table,merge.data.table)
Expand Down Expand Up @@ -312,6 +320,7 @@ importFrom(sf,st_bbox)
importFrom(sf,st_buffer)
importFrom(sf,st_cast)
importFrom(sf,st_centroid)
importFrom(sf,st_collection_extract)
importFrom(sf,st_coordinates)
importFrom(sf,st_covered_by)
importFrom(sf,st_disjoint)
Expand All @@ -326,6 +335,7 @@ importFrom(sf,st_is_empty)
importFrom(sf,st_is_valid)
importFrom(sf,st_join)
importFrom(sf,st_linestring)
importFrom(sf,st_make_grid)
importFrom(sf,st_multilinestring)
importFrom(sf,st_multipoint)
importFrom(sf,st_multipolygon)
Expand Down
284 changes: 284 additions & 0 deletions R/aggregate.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
.aggregate_num <- function(mat, inds, FUN, fun_name, BPPARAM) {
if (fun_name %in% c("sum", "mean")) {
if (fun_name == "sum")
mat_agg <- sparseMatrix(i = unlist(inds), j = seq_along(inds), x = 1,
dims = c(ncol(mat), length(inds)))
else if (fun_name == "mean") {
ll <- lengths(inds)
mat_agg <- sparseMatrix(i = unlist(inds), j = seq_along(inds),
x = rep(1/ll, times = ll),
dims = c(ncol(mat), length(inds)))
}
out_agg <- mat %*% mat_agg
} else if (fun_name %in% getNamespaceExports("sparseMatrixStats")) {
# For functions supported by sparseMatrixStats
if (!grepl("^row", fun_name))
stop("Only row* functions from sparseMatrixStats can be used for FUN.")
cnts_agg <- bplapply(inds, function(i) {
m <- mat[,i,drop = FALSE]
FUN(m)
}, BPPARAM = BPPARAM)
out_agg <- matrix(unlist(out_agg), ncol = length(inds))
} else stop("Function ", fun_name, " not supported for aggregating SFE.")
out_agg
}

# For one sample
.aggregate_sample_cell <- function(x, by, FUN, fun_name, colGeometryName, join,
new_geometry_name, BPPARAM) {
cg <- colGeometry(x, type = colGeometryName)
inds <- join(by, cg) # somewhat faster than join(cg, by) when by has fewer geometries than cg
not_empty <- which(length(inds))
by <- by[not_empty]
inds <- inds[not_empty]
cnts <- counts(x)
# Aggregate the gene count matrix, only do the counts assay
# Express this as a matrix multiplication for sum. Use sweep to deal with mean
cnts_agg <- .aggregate_num(cnts, inds, FUN, fun_name, BPPARAM)
# Aggregate numeric columns of colData; logical are converted to numeric
df <- colData(x)
which_logical <- which(vapply(df, is.logical, FUN.VALUE = logical(1)))
for (i in which_logical) {
df[,i] <- as.integer(df[,i])
}
numeric_cols <- vapply(df, is.numeric, FUN.VALUE = logical(1))
if (any(numeric_cols)) {
cd <- t(as.matrix(df[,numeric_cols, drop = FALSE]))
cd_agg <- .aggregate_num(cd, inds, FUN, fun_name, BPPARAM) |> t()
cd_agg <- as(cd_agg, "DFrame")
}
# Deal with anything that is neither numerical nor logical
# Primarily character and factor. What about list columns? I don't know.
if (!all(numeric_cols)) {
df_bin <- data.frame(bin = rep(seq_along(inds), times = lengths(inds)),
index = unlist(inds))
df_bin <- df_bin[order(df_bin$index),]
names_not_num <- names(df)[!numeric_cols]
for (n in names_not_num)
df_bin[[n]] <- split(df[[,i]][n], list(bin = df_bin$bin))
if (any(numeric_cols)) cd_agg <- cbind(df_bin, cd_agg)
}
cgs <- list(bins = by)
names(cgs) <- new_geometry_name
SpatialFeatureExperiment(assays = list(counts = cngs_agg), colData = cd_agg,
sample_id = sampleIDs(x),
colGeometries = cgs)
}

#' Aggregate transcript spots from file
#'
#' This function reads the transcript spot file from the standard output of the
#' commercial technologies (not GeoParquet) for spatial aggregation where the
#' spots are assigned to polygons such as cells or spatial bins. Presets for
#' Xenium, MERFISH, and CosMX are available.
#'
#' @param df If the file is already loaded into memory, a data frame (sf) with
#' columns for the x, y, and optionally z coordinates and gene assignment of
#' each transcript spot. If specified, then argument \code{file} will be
#' ignored.
#' @param by A \code{sfc} or \code{sf} object for spatial aggregation.
#' @inheritParams formatTxTech
#' @inheritParams formatTxSpots
#' @note The resulting SFE object often includes geometries (e.g. grid cells)
#' outside tissue, because there can be transcript spots detected outside the
#' tissue.
#' @return A SFE object with count matrix for number of spots of each gene in
#' each geometry. Geometries with no spot are removed.
#' @importFrom data.table as.data.table
#' @importFrom sf st_make_grid
#' @export
aggregateTx <- function(file, df = NULL, by = NULL, spatialCoordsNames = c("X", "Y", "Z"),
gene_col = "gene",
phred_col = "qv", min_phred = 20, flip = FALSE,
cellsize = NULL, square = TRUE, flat_topped = FALSE,
new_geometry_name = "bins", unit = "micron") {
# This is only for one file, one sample
if (!is.null(df)) file <- df
mols <- .check_tx_file(file, spatialCoordsNames, gene_col, phred_col,
min_phred, flip, BPPARAM)
mols <- df2sf(mols, spatialCoordsNames = spatialCoordsNames,
geometryType = "POINT")
if (is.null(by))
by <- st_make_grid(mols, cellsize = cellsize, square = square,
flat_topped = flat_topped)
else if (is(by, "sf")) by <- st_geometry(by)
grid_sf <- st_sf(grid_id = seq_along(by), geometry = by)
mols <- st_join(mols, grid_sf) # Took 5.87 minutes for 7171453 spots and 8555 bins
mols <- st_drop_geometry(mols) |> as.data.table()
mols <- mols[, .N, by = .(gene, grid_id)]
mols$gene <- factor(mols$gene) # The levels are alphabetically arranged
mols$gene_index <- as.integer(mols$gene)
new_mat <- sparseMatrix(i = mols$gene_index, j = mols$grid_id, x = mols$N)
rownames(new_mat) <- levels(mols$gene)
colnames(new_mat) <- seq_len(ncol(new_mat))
new_mat <- new_mat[,colSums(new_mat) > 0] # Remove empty grid cells
cgs <- list(bins = grid_sf[grid_sf$grid_id %in% mols$grid_id, "geometry"])
names(cgs) <- new_geometry_name
SpatialFeatureExperiment(assays = list(counts = new_mat),
colGeometries = cgs, unit = unit)
}

#' @rdname aggregateTx
#' @export
aggregateTxTech <- function(data_dir, df = NULL, by = NULL,
tech = c("Vizgen", "Xenium", "CosMX"),
min_phred = 20, flip = FALSE,
cellsize = NULL, square = TRUE, flat_topped = FALSE,
new_geometry_name = "bins") {
if (!is.null(df)) data_dir <- NULL
c(spatialCoordsNames, gene_col, cell_col, fn) %<-%
.get_tech_tx_fields(tech, data_dir)
aggregateTx(file = fn, df = df, by = by,
spatialCoordsNames = spatialCoordsNames,
gene_col = gene_col,
phred_col = "qv", min_phred = min_phred, flip = flip,
cellsize = cellsize, square = square, flat_topped = flat_topped,
new_geometry_name = new_geometry_name)
}


# Might turn this into an exported function
.aggregate_sample_tx <- function(x, by, rowGeometryName, new_geometry_name) {
rg <- rowGeometry(x, rowGeometryName)
if (!is.null(st_z_range(rg)))
by <- st_zm(by, drop = FALSE, what = "Z")
grid_sf <- st_sf(grid_id = seq_along(by), geometry = by)

# Probably faster than directly calling st_intersection, since I don't need
# the actual geometries of the intersections, maybe not
tx_coords <- st_coordinates(rg) # Might write another function similar to formatTxTech to skip this
tx_point <- df2sf(tx_coords, spatialCoordsNames = c("X", "Y", "Z"))
tx_ind <- as.data.table(st_drop_geometry(tx_point))
tx_info <- txSpots(sfe) |> st_drop_geometry()
tx_info$L1 <- seq_along(tx_info$gene) # it has to be "gene" if it's from formatTxSpots
tx_point <- merge(tx_point, tx_info, by = "L1") # takes a while
tx_point <- st_as_sf(tx_point) |> st_join(grid_sf) # takes a few minutes
tx_counts <- tx_point |> st_drop_geometry() |> as.data.table()
tx_counts <- tx_counts[, .N, by = .(gene, L1, grid_id)]

new_mat <- sparseMatrix(i = tx_counts$L1, j = tx_counts$grid_id, x = tx_counts$N)
cgs <- list(bins = by)
names(cgs) <- new_geometry_name
SpatialFeatureExperiment(assays = list(counts = new_mat),
colGeometries = cgs)
}

.aggregate_sample <- function(x, by = NULL, FUN = sum,
colGeometryName = 1L, rowGeometryName = NULL,
join = st_intersects, new_geometry_name = "bins",
BPPARAM = SerialParam()) {
# Here x is an SFE object with one sample
# by is sfc
# Can't do S4 method with signature for `by` because the argument `by` isn't
# in the generic and I don't want to mess with the `aggregate` function in
# other packages
if (is.null(rowGeometryName)) {
.aggregate_sample_tx(x, by, rowGeometryName, new_geometry_name)
} else {
.aggregate_sample_cell(x, by, FUN, fun_name, colGeometryName, join,
new_geometry_name, BPPARAM)
}
}

#' Aggregate data in SFE using geometry
#'
#' Gene expression and numeric columns of \code{colData} will be aggregated with
#' the function specified in \code{FUN}, according to another geometry supplied
#' and a geometry predicate (such as \code{st_intersects}). For example, when
#' the predicate is \code{st_intersects} and a spatial grid is used to
#' aggregate, then the data associated with all cells that intersect with each
#' grid cell will be aggregated with \code{FUN}, such as \code{mean} or
#' \code{sum}. The categorical columns will be collected into list columns, and
#' logical columns will be converted into numeric before applying \code{FUN}.
#'
#' For smFISH-based data where the transcript spots are available, the
#' transcript spots can be used instead of cells to aggregate the gene count
#' matrix, in which case all assays other than \code{counts} will be dropped and
#' \code{FUN} only applies to \code{colData} because the transcript spots are
#' simply counted.
#'
#' What this function does is similar to
#' \href{https://github.com/JEFworks-Lab/SEraster}{SEraster} but more general
#' because any geometry and more aggregation function can be used, not just
#' regular grids, and the aggregation can be performed on the transcript spots.
#'
#' @inheritParams sf::st_make_grid
#' @inheritParams sf::aggregate
#' @param x An SFE object to be aggregated.
#' @param by A \code{sf} data frame whose geometry column is used for
#' aggregation or \code{sfc} or for multiple samples a list of \code{sfc}
#' whose names are the sample IDs. For multiple samples, the \code{sf} data
#' frame must have a column \code{sample_id} to indicate which geometry for
#' which sample. This argument is optional if \code{cellsize} is specified.
#' @param FUN Function to aggregate the numerical columns in \code{colData} and
#' the gene count matrix. This can be \code{sum}, \code{mean}, or any
#' \code{row*} function in the \code{sparseMatrixStats} package such as
#' \code{\link{rowMedians}}. Aggregation is not done when aggregating by
#' transcript spots in \code{rowGeometry}.
#' @param sample_id Which samples to aggregate, defaults to "all".
#' @param colGeometryName Which \code{colGeometry} to spatially aggregate the
#' data, by default the first one.
#' @param rowGeometryName Which \code{rowGeometry} to spatially aggregate
#' @param new_geometry_name Name to give to the new \code{colGeometry} in the
#' output. Defaults to "bins".
#' @param BPPARAM A \code{\link{BiocParallelParam}} object specifying parallel
#' computing when aggregating different genes. Defaults to
#' \code{SerialParam()}.
#' @return An SFE object with \code{colGeometry} the same as the geometry
#' specified in \code{by} or same as the grid specified in \code{cellsize}.
#' \code{rowGeometries} and \code{rowData} remain the same as in the input
#' \code{x}. \code{reducedDims}, \code{localResults}, \code{colFeatureData}
#' (and its \code{colGeometry}, \code{annotGeometry}, and \code{reducedDim}
#' counterparts), and \code{spatialGraphs} are dropped because those results
#' no longer apply after aggregation.
#' @export
#' @concept Geometric operations
#' @examples
#' # example code
#'
setMethod("aggregate", "SpatialFeatureExperiment",
function(x, by = NULL, FUN = sum, sample_id = "all",
colGeometryName = 1L, rowGeometryName = NULL,
cellsize = NULL, square = TRUE, flat_topped = FALSE,
new_geometry_name = "bins",
BPPARAM = SerialParam()) {
sample_id <- .check_sample_id(x, sample_id, one = FALSE)
if (is.null(by) && is.null(cellsize)) {
stop("Either `by` or `cellsize` must be specified.")
}
# Make grid for multiple samples if `by` is not specified
if (is.null(by)) {
by <- .make_grid_samples(x, sample_id, colGeometryName,
cellsize, square, flat_topped)
}
if (!is(by, "sfc") && !is(by, "sf"))
stop("`by` must be either sf or sfc.")
if (length(sample_id) > 1L && (is(by, "sfc") || !"sample_id" %in% names(by))) {
stop("`by` must be a sf data frame with a column `sample_id`")
}
# Need to check new_geometry_name
fun_name <- substitute(FUN)
sfes <- splitSamples(x) # Output list should have sample IDs as names
if (length(sample_id) > 1L) {
by <- split(by$geometry, list(sample_id = by$sample_id))
}
sfes <- lapply(sample_id, function(sfe, df) {
.aggregate_sample(sfe, by = df, FUN = FUN,
colGeometryName = colGeometryName,
rowGeometryName = rowGeometryName)
})
out <- do.call(cbind, sfes)
# Add the original rowGeometries back
rowGeometries(out) <- rowGeometries(x, sample_id = sample_id)
out
})

# Function to make grid for multiple samples
.make_grid_samples <- function(x, sample_id, colGeometryName, cellsize, square,
flat_topped) {
cg <- colGeometry(x, sample_id = "all", type = colGeometryName)
cg$sample_id <- x$sample_id
cg <- cg[cg$sample_id %in% sample_id,]
cgs <- split(cg, f = cg$sample_id)
lapply(cgs, st_make_grid, cellsize = cellsize, square = square, flat_topped = flat_topped)
}
2 changes: 2 additions & 0 deletions R/align.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Tissue alignment using point cloud registration
# And plotting functions to inspect the alignment
Loading

0 comments on commit 02812a8

Please sign in to comment.