diff --git a/DESCRIPTION b/DESCRIPTION index 8f018643..e0d555ae 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,6 +36,7 @@ Imports: sf, sfheaders, SingleCellExperiment, + sparseMatrixStats, SpatialExperiment, spdep (>= 1.1-7), SummarizedExperiment, @@ -50,6 +51,8 @@ Collate: 'AllGenerics.R' 'utils.R' 'SFE-class.R' + 'aggregate.R' + 'align.R' 'annotGeometries.R' 'cbind.R' 'coerce.R' @@ -68,6 +71,7 @@ Collate: 'reexports.R' 'saveRDS.R' 'spatialGraphs.R' + 'split.R' 'subset.R' 'transformation.R' 'updateObject.R' diff --git a/NAMESPACE b/NAMESPACE index 695da3c3..18845c5a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,8 @@ export(addVisiumSpotPoly) export(affine) export(affineImg) export(aggBboxes) +export(aggregateTx) +export(aggregateTxTech) export(annotGeometries) export(annotGeometry) export(annotGeometryNames) @@ -137,6 +139,8 @@ export(spatialCoordsNames) export(spatialGraph) export(spatialGraphNames) export(spatialGraphs) +export(splitContiguity) +export(splitSamples) export(spotPoly) export(st_any_intersects) export(st_any_pred) @@ -174,6 +178,7 @@ exportMethods("spatialGraphNames<-") exportMethods("spatialGraphs<-") exportMethods(addImg) exportMethods(affineImg) +exportMethods(aggregate) exportMethods(annotGeometries) exportMethods(annotGeometry) exportMethods(annotGeometryNames) @@ -203,6 +208,7 @@ exportMethods(show) exportMethods(spatialGraph) exportMethods(spatialGraphNames) exportMethods(spatialGraphs) +exportMethods(split) exportMethods(toExtImage) exportMethods(toSpatRasterImage) exportMethods(toSpatialFeatureExperiment) @@ -273,9 +279,11 @@ importFrom(SpatialExperiment,spatialCoordsNames) importFrom(SpatialExperiment,toSpatialExperiment) importFrom(SummarizedExperiment,"colData<-") importFrom(SummarizedExperiment,"rowData<-") +importFrom(SummarizedExperiment,assayNames) importFrom(SummarizedExperiment,assays) importFrom(SummarizedExperiment,colData) importFrom(SummarizedExperiment,rowData) +importFrom(data.table,as.data.table) importFrom(data.table,fread) importFrom(data.table,is.data.table) importFrom(data.table,merge.data.table) @@ -312,6 +320,7 @@ importFrom(sf,st_bbox) importFrom(sf,st_buffer) importFrom(sf,st_cast) importFrom(sf,st_centroid) +importFrom(sf,st_collection_extract) importFrom(sf,st_coordinates) importFrom(sf,st_covered_by) importFrom(sf,st_disjoint) @@ -326,6 +335,7 @@ importFrom(sf,st_is_empty) importFrom(sf,st_is_valid) importFrom(sf,st_join) importFrom(sf,st_linestring) +importFrom(sf,st_make_grid) importFrom(sf,st_multilinestring) importFrom(sf,st_multipoint) importFrom(sf,st_multipolygon) diff --git a/R/aggregate.R b/R/aggregate.R new file mode 100644 index 00000000..af5c6eb4 --- /dev/null +++ b/R/aggregate.R @@ -0,0 +1,284 @@ +.aggregate_num <- function(mat, inds, FUN, fun_name, BPPARAM) { + if (fun_name %in% c("sum", "mean")) { + if (fun_name == "sum") + mat_agg <- sparseMatrix(i = unlist(inds), j = seq_along(inds), x = 1, + dims = c(ncol(mat), length(inds))) + else if (fun_name == "mean") { + ll <- lengths(inds) + mat_agg <- sparseMatrix(i = unlist(inds), j = seq_along(inds), + x = rep(1/ll, times = ll), + dims = c(ncol(mat), length(inds))) + } + out_agg <- mat %*% mat_agg + } else if (fun_name %in% getNamespaceExports("sparseMatrixStats")) { + # For functions supported by sparseMatrixStats + if (!grepl("^row", fun_name)) + stop("Only row* functions from sparseMatrixStats can be used for FUN.") + cnts_agg <- bplapply(inds, function(i) { + m <- mat[,i,drop = FALSE] + FUN(m) + }, BPPARAM = BPPARAM) + out_agg <- matrix(unlist(out_agg), ncol = length(inds)) + } else stop("Function ", fun_name, " not supported for aggregating SFE.") + out_agg +} + +# For one sample +.aggregate_sample_cell <- function(x, by, FUN, fun_name, colGeometryName, join, + new_geometry_name, BPPARAM) { + cg <- colGeometry(x, type = colGeometryName) + inds <- join(by, cg) # somewhat faster than join(cg, by) when by has fewer geometries than cg + not_empty <- which(length(inds)) + by <- by[not_empty] + inds <- inds[not_empty] + cnts <- counts(x) + # Aggregate the gene count matrix, only do the counts assay + # Express this as a matrix multiplication for sum. Use sweep to deal with mean + cnts_agg <- .aggregate_num(cnts, inds, FUN, fun_name, BPPARAM) + # Aggregate numeric columns of colData; logical are converted to numeric + df <- colData(x) + which_logical <- which(vapply(df, is.logical, FUN.VALUE = logical(1))) + for (i in which_logical) { + df[,i] <- as.integer(df[,i]) + } + numeric_cols <- vapply(df, is.numeric, FUN.VALUE = logical(1)) + if (any(numeric_cols)) { + cd <- t(as.matrix(df[,numeric_cols, drop = FALSE])) + cd_agg <- .aggregate_num(cd, inds, FUN, fun_name, BPPARAM) |> t() + cd_agg <- as(cd_agg, "DFrame") + } + # Deal with anything that is neither numerical nor logical + # Primarily character and factor. What about list columns? I don't know. + if (!all(numeric_cols)) { + df_bin <- data.frame(bin = rep(seq_along(inds), times = lengths(inds)), + index = unlist(inds)) + df_bin <- df_bin[order(df_bin$index),] + names_not_num <- names(df)[!numeric_cols] + for (n in names_not_num) + df_bin[[n]] <- split(df[[,i]][n], list(bin = df_bin$bin)) + if (any(numeric_cols)) cd_agg <- cbind(df_bin, cd_agg) + } + cgs <- list(bins = by) + names(cgs) <- new_geometry_name + SpatialFeatureExperiment(assays = list(counts = cngs_agg), colData = cd_agg, + sample_id = sampleIDs(x), + colGeometries = cgs) +} + +#' Aggregate transcript spots from file +#' +#' This function reads the transcript spot file from the standard output of the +#' commercial technologies (not GeoParquet) for spatial aggregation where the +#' spots are assigned to polygons such as cells or spatial bins. Presets for +#' Xenium, MERFISH, and CosMX are available. +#' +#' @param df If the file is already loaded into memory, a data frame (sf) with +#' columns for the x, y, and optionally z coordinates and gene assignment of +#' each transcript spot. If specified, then argument \code{file} will be +#' ignored. +#' @param by A \code{sfc} or \code{sf} object for spatial aggregation. +#' @inheritParams formatTxTech +#' @inheritParams formatTxSpots +#' @note The resulting SFE object often includes geometries (e.g. grid cells) +#' outside tissue, because there can be transcript spots detected outside the +#' tissue. +#' @return A SFE object with count matrix for number of spots of each gene in +#' each geometry. Geometries with no spot are removed. +#' @importFrom data.table as.data.table +#' @importFrom sf st_make_grid +#' @export +aggregateTx <- function(file, df = NULL, by = NULL, spatialCoordsNames = c("X", "Y", "Z"), + gene_col = "gene", + phred_col = "qv", min_phred = 20, flip = FALSE, + cellsize = NULL, square = TRUE, flat_topped = FALSE, + new_geometry_name = "bins", unit = "micron") { + # This is only for one file, one sample + if (!is.null(df)) file <- df + mols <- .check_tx_file(file, spatialCoordsNames, gene_col, phred_col, + min_phred, flip, BPPARAM) + mols <- df2sf(mols, spatialCoordsNames = spatialCoordsNames, + geometryType = "POINT") + if (is.null(by)) + by <- st_make_grid(mols, cellsize = cellsize, square = square, + flat_topped = flat_topped) + else if (is(by, "sf")) by <- st_geometry(by) + grid_sf <- st_sf(grid_id = seq_along(by), geometry = by) + mols <- st_join(mols, grid_sf) # Took 5.87 minutes for 7171453 spots and 8555 bins + mols <- st_drop_geometry(mols) |> as.data.table() + mols <- mols[, .N, by = .(gene, grid_id)] + mols$gene <- factor(mols$gene) # The levels are alphabetically arranged + mols$gene_index <- as.integer(mols$gene) + new_mat <- sparseMatrix(i = mols$gene_index, j = mols$grid_id, x = mols$N) + rownames(new_mat) <- levels(mols$gene) + colnames(new_mat) <- seq_len(ncol(new_mat)) + new_mat <- new_mat[,colSums(new_mat) > 0] # Remove empty grid cells + cgs <- list(bins = grid_sf[grid_sf$grid_id %in% mols$grid_id, "geometry"]) + names(cgs) <- new_geometry_name + SpatialFeatureExperiment(assays = list(counts = new_mat), + colGeometries = cgs, unit = unit) +} + +#' @rdname aggregateTx +#' @export +aggregateTxTech <- function(data_dir, df = NULL, by = NULL, + tech = c("Vizgen", "Xenium", "CosMX"), + min_phred = 20, flip = FALSE, + cellsize = NULL, square = TRUE, flat_topped = FALSE, + new_geometry_name = "bins") { + if (!is.null(df)) data_dir <- NULL + c(spatialCoordsNames, gene_col, cell_col, fn) %<-% + .get_tech_tx_fields(tech, data_dir) + aggregateTx(file = fn, df = df, by = by, + spatialCoordsNames = spatialCoordsNames, + gene_col = gene_col, + phred_col = "qv", min_phred = min_phred, flip = flip, + cellsize = cellsize, square = square, flat_topped = flat_topped, + new_geometry_name = new_geometry_name) +} + + +# Might turn this into an exported function +.aggregate_sample_tx <- function(x, by, rowGeometryName, new_geometry_name) { + rg <- rowGeometry(x, rowGeometryName) + if (!is.null(st_z_range(rg))) + by <- st_zm(by, drop = FALSE, what = "Z") + grid_sf <- st_sf(grid_id = seq_along(by), geometry = by) + + # Probably faster than directly calling st_intersection, since I don't need + # the actual geometries of the intersections, maybe not + tx_coords <- st_coordinates(rg) # Might write another function similar to formatTxTech to skip this + tx_point <- df2sf(tx_coords, spatialCoordsNames = c("X", "Y", "Z")) + tx_ind <- as.data.table(st_drop_geometry(tx_point)) + tx_info <- txSpots(sfe) |> st_drop_geometry() + tx_info$L1 <- seq_along(tx_info$gene) # it has to be "gene" if it's from formatTxSpots + tx_point <- merge(tx_point, tx_info, by = "L1") # takes a while + tx_point <- st_as_sf(tx_point) |> st_join(grid_sf) # takes a few minutes + tx_counts <- tx_point |> st_drop_geometry() |> as.data.table() + tx_counts <- tx_counts[, .N, by = .(gene, L1, grid_id)] + + new_mat <- sparseMatrix(i = tx_counts$L1, j = tx_counts$grid_id, x = tx_counts$N) + cgs <- list(bins = by) + names(cgs) <- new_geometry_name + SpatialFeatureExperiment(assays = list(counts = new_mat), + colGeometries = cgs) +} + +.aggregate_sample <- function(x, by = NULL, FUN = sum, + colGeometryName = 1L, rowGeometryName = NULL, + join = st_intersects, new_geometry_name = "bins", + BPPARAM = SerialParam()) { + # Here x is an SFE object with one sample + # by is sfc + # Can't do S4 method with signature for `by` because the argument `by` isn't + # in the generic and I don't want to mess with the `aggregate` function in + # other packages + if (is.null(rowGeometryName)) { + .aggregate_sample_tx(x, by, rowGeometryName, new_geometry_name) + } else { + .aggregate_sample_cell(x, by, FUN, fun_name, colGeometryName, join, + new_geometry_name, BPPARAM) + } +} + +#' Aggregate data in SFE using geometry +#' +#' Gene expression and numeric columns of \code{colData} will be aggregated with +#' the function specified in \code{FUN}, according to another geometry supplied +#' and a geometry predicate (such as \code{st_intersects}). For example, when +#' the predicate is \code{st_intersects} and a spatial grid is used to +#' aggregate, then the data associated with all cells that intersect with each +#' grid cell will be aggregated with \code{FUN}, such as \code{mean} or +#' \code{sum}. The categorical columns will be collected into list columns, and +#' logical columns will be converted into numeric before applying \code{FUN}. +#' +#' For smFISH-based data where the transcript spots are available, the +#' transcript spots can be used instead of cells to aggregate the gene count +#' matrix, in which case all assays other than \code{counts} will be dropped and +#' \code{FUN} only applies to \code{colData} because the transcript spots are +#' simply counted. +#' +#' What this function does is similar to +#' \href{https://github.com/JEFworks-Lab/SEraster}{SEraster} but more general +#' because any geometry and more aggregation function can be used, not just +#' regular grids, and the aggregation can be performed on the transcript spots. +#' +#' @inheritParams sf::st_make_grid +#' @inheritParams sf::aggregate +#' @param x An SFE object to be aggregated. +#' @param by A \code{sf} data frame whose geometry column is used for +#' aggregation or \code{sfc} or for multiple samples a list of \code{sfc} +#' whose names are the sample IDs. For multiple samples, the \code{sf} data +#' frame must have a column \code{sample_id} to indicate which geometry for +#' which sample. This argument is optional if \code{cellsize} is specified. +#' @param FUN Function to aggregate the numerical columns in \code{colData} and +#' the gene count matrix. This can be \code{sum}, \code{mean}, or any +#' \code{row*} function in the \code{sparseMatrixStats} package such as +#' \code{\link{rowMedians}}. Aggregation is not done when aggregating by +#' transcript spots in \code{rowGeometry}. +#' @param sample_id Which samples to aggregate, defaults to "all". +#' @param colGeometryName Which \code{colGeometry} to spatially aggregate the +#' data, by default the first one. +#' @param rowGeometryName Which \code{rowGeometry} to spatially aggregate +#' @param new_geometry_name Name to give to the new \code{colGeometry} in the +#' output. Defaults to "bins". +#' @param BPPARAM A \code{\link{BiocParallelParam}} object specifying parallel +#' computing when aggregating different genes. Defaults to +#' \code{SerialParam()}. +#' @return An SFE object with \code{colGeometry} the same as the geometry +#' specified in \code{by} or same as the grid specified in \code{cellsize}. +#' \code{rowGeometries} and \code{rowData} remain the same as in the input +#' \code{x}. \code{reducedDims}, \code{localResults}, \code{colFeatureData} +#' (and its \code{colGeometry}, \code{annotGeometry}, and \code{reducedDim} +#' counterparts), and \code{spatialGraphs} are dropped because those results +#' no longer apply after aggregation. +#' @export +#' @concept Geometric operations +#' @examples +#' # example code +#' +setMethod("aggregate", "SpatialFeatureExperiment", + function(x, by = NULL, FUN = sum, sample_id = "all", + colGeometryName = 1L, rowGeometryName = NULL, + cellsize = NULL, square = TRUE, flat_topped = FALSE, + new_geometry_name = "bins", + BPPARAM = SerialParam()) { + sample_id <- .check_sample_id(x, sample_id, one = FALSE) + if (is.null(by) && is.null(cellsize)) { + stop("Either `by` or `cellsize` must be specified.") + } + # Make grid for multiple samples if `by` is not specified + if (is.null(by)) { + by <- .make_grid_samples(x, sample_id, colGeometryName, + cellsize, square, flat_topped) + } + if (!is(by, "sfc") && !is(by, "sf")) + stop("`by` must be either sf or sfc.") + if (length(sample_id) > 1L && (is(by, "sfc") || !"sample_id" %in% names(by))) { + stop("`by` must be a sf data frame with a column `sample_id`") + } + # Need to check new_geometry_name + fun_name <- substitute(FUN) + sfes <- splitSamples(x) # Output list should have sample IDs as names + if (length(sample_id) > 1L) { + by <- split(by$geometry, list(sample_id = by$sample_id)) + } + sfes <- lapply(sample_id, function(sfe, df) { + .aggregate_sample(sfe, by = df, FUN = FUN, + colGeometryName = colGeometryName, + rowGeometryName = rowGeometryName) + }) + out <- do.call(cbind, sfes) + # Add the original rowGeometries back + rowGeometries(out) <- rowGeometries(x, sample_id = sample_id) + out + }) + +# Function to make grid for multiple samples +.make_grid_samples <- function(x, sample_id, colGeometryName, cellsize, square, + flat_topped) { + cg <- colGeometry(x, sample_id = "all", type = colGeometryName) + cg$sample_id <- x$sample_id + cg <- cg[cg$sample_id %in% sample_id,] + cgs <- split(cg, f = cg$sample_id) + lapply(cgs, st_make_grid, cellsize = cellsize, square = square, flat_topped = flat_topped) +} diff --git a/R/align.R b/R/align.R new file mode 100644 index 00000000..97f867e3 --- /dev/null +++ b/R/align.R @@ -0,0 +1,2 @@ +# Tissue alignment using point cloud registration +# And plotting functions to inspect the alignment diff --git a/R/formatTxSpots.R b/R/formatTxSpots.R index c84f8bd5..b5bf2954 100644 --- a/R/formatTxSpots.R +++ b/R/formatTxSpots.R @@ -320,15 +320,11 @@ formatTxSpots <- function(file, dest = c("rowGeometry", "colGeometry"), file <- normalizePath(file, mustWork = TRUE) dest <- match.arg(dest) z_option <- match.arg(z_option) - ext <- file_ext(file) if (dest == "colGeometry") { return <- FALSE if (is.null(file_out)) stop("file_out must be specified for dest = 'colGeometry'.") } - if (!ext %in% c("csv", "gz", "tsv", "txt", "parquet")) { - stop("The file must be one of csv, gz, tsv, txt, or parquet") - } if (!is.null(file_out) && (file.exists(file_out) || dir.exists(file_path_sans_ext(file_out)))) { out <- .read_tx_output(file_out, z, z_option, "gene", return) @@ -337,34 +333,8 @@ formatTxSpots <- function(file, dest = c("rowGeometry", "colGeometry"), if (!is.numeric(z) && z != "all") { stop("z must either be numeric or be 'all' indicating all z-planes.") } - if (ext == "parquet") { - check_installed("arrow") - mols <- arrow::read_parquet(file) - # convert cols with raw bytes to character - # NOTE: can take a while. - mols <- .rawToChar_df(mols, BPPARAM = BPPARAM) - # sanity, convert to data.table - if (!is(mols, "data.table")) { - mols <- data.table::as.data.table(mols) - } - } else { - mols <- fread(file) - } - names(mols)[names(mols) == gene_col] <- "gene" - gene_col <- "gene" - ind <- !spatialCoordsNames[1:2] %in% names(mols) - if (any(ind)) { - col_offending <- setdiff(spatialCoordsNames[1:2], names(mols)) - ax <- c("x", "y") - stop(paste(ax[ind], collapse = ", "), " coordinate column(s) ", - paste(col_offending, collapse = ", "), " not found.") - } - spatialCoordsNames <- intersect(spatialCoordsNames, names(mols)) - if (flip) { - y_name <- spatialCoordsNames[2] - if (!is.data.table(mols)) ..y_name <- y_name - mols[,y_name] <- -mols[,..y_name] - } + mols <- .check_tx_file(file, spatialCoordsNames, gene_col, phred_col, + min_phred, flip, BPPARAM) # Check z use_z <- length(spatialCoordsNames) == 3L if (use_z) { @@ -389,9 +359,6 @@ formatTxSpots <- function(file, dest = c("rowGeometry", "colGeometry"), z_option <- "3d" } } - if (phred_col %in% names(mols) && !is.null(min_phred)) { - mols <- mols[mols[[phred_col]] >= min_phred,] - } message(">>> Converting transcript spots to geometry") if (dest == "colGeometry") { if (!length(cell_col) || any(!cell_col %in% names(mols))) diff --git a/R/geometry_operation.R b/R/geometry_operation.R index 3af6f672..eb96d07e 100644 --- a/R/geometry_operation.R +++ b/R/geometry_operation.R @@ -732,133 +732,3 @@ removeEmptySpace <- function(sfe, sample_id = "all") { } sfe } - -# Aggregate---------- -# For one sample -.aggregate_sample <- function(x, by = NULL, FUN = sum, - colGeometryName = 1L, rowGeometryName = NULL, - cellsize = NULL, square = TRUE, flat_topped = FALSE, - join = st_intersects) { - # Here x is an SFE object with one sample - -} - -#' Aggregate data in SFE using geometry -#' -#' Gene expression and numeric columns of \code{colData} will be aggregated with -#' the function specified in \code{FUN}, according to another geometry supplied -#' and a geometry predicate (such as \code{st_intersects}). For example, when -#' the predicate is \code{st_intersects} and a spatial grid is used to -#' aggregate, then the data associated with all cells that intersect with each -#' grid cell will be aggregated with \code{FUN}, such as \code{mean} or -#' \code{sum}. The categorical columns will be collected into list columns, and -#' logical columns will be converted into numeric before applying \code{FUN}. -#' -#' For smFISH-based data where the transcript spots are available, the -#' transcript spots can be used instead of cells to aggregate the gene count -#' matrix, in which case all assays other than \code{counts} will be dropped and -#' \code{FUN} only applies to \code{colData} because the transcript spots are -#' simply counted. -#' -#' @inheritParams sf::st_make_grid -#' @inheritParams sf::aggregate -#' @param x An SFE object to be aggregated. -#' @param by A \code{sf} data frame whose geometry column is used for -#' aggregation or \code{sfc}. For multiple samples, the \code{sf} data frame -#' must have a column \code{sample_id} to indicate which geometry for which -#' sample. This argument is optional if \code{cellsize} is specified. -#' @param FUN Function to aggregate the numerical columns in \code{colData} and -#' the gene count matrix. The function must take a numeric vector as input and -#' return a numeric scalar. Aggregation is not done when aggregating by -#' transcript spots in \code{rowGeometry}. -#' @param sample_id Which samples to aggregate, defaults to "all". -#' @param colGeometryName Which \code{colGeometry} to spatially aggregate the -#' data, by default the first one. -#' @param rowGeometryName Which \code{rowGeometry} to spatially aggregate -#' @return An SFE object with \code{colGeometry} the same as the geometry -#' specified in \code{by} or same as the grid specified in \code{cellsize}. -#' \code{rowGeometries} and \code{rowData} remain the same as in the input -#' \code{x}. \code{reducedDims}, \code{localResults}, \code{colFeatureData} -#' (and its \code{colGeometry}, \code{annotGeometry}, and \code{reducedDim} -#' counterparts), and \code{spatialGraphs} are dropped because those results -#' no longer apply after aggregation. -#' @export -#' @concept Geometric operations -#' @examples -#' # example code -#' -setMethod("aggregate", "SpatialFeatureExperiment", - function(x, by = NULL, FUN = sum, sample_id = "all", - colGeometryName = 1L, rowGeometryName = NULL, - cellsize = NULL, square = TRUE, flat_topped = FALSE, - join = st_intersects) { - sample_id <- .check_sample_id(x, sample_id, one = FALSE) - if (is.null(by) && is.null(cellsize)) { - stop("Either `by` or `cellsize` must be specified.") - } - if (!is(by, "sfc") && !is(by, "sf")) - stop("`by` must be either sf or sfc.") - if (length(sample_id) > 1L && (is(by, "sfc") || !"sample_id" %in% names(by))) { - stop("`b`y must be a sf data frame with a column `sample_id`") - } - sfes <- split(x, "sample_id") - sfes <- lapply(sfes, .aggregate_sample, - by = by, FUN = FUN, - colGeometryName = colGeometryName, - rowGeometryName = rowGeometryName, cellsize = cellsize, - square = square, flat_topped = flat_topped, - join = join) - do.call(cbind, sfes) - }) - -# Split------------- - -#' Split SFE object with categorical vector or geometry -#' -#' This function splits an SFE object into multiple SFE objects according to a -#' categorical vector (such as sample IDs) or geometries (all cells/spots -#' intersecting with each geometry will become a separate SFE object). -#' -#' @param x An SFE object -#' @param f A character or vector of length 1 or same length as \code{ncol(x)}. -#' When length 1, it should be the name of a column in `colData(x)`, and if -#' not found in \code{colData} then a name of an \code{annotGeometry}. It can -#' also be a \code{sf} data frame or \code{sfc} to split by geometry. Each row -#' of the \code{sf} data frame or each element in the \code{sfc} will -#' correspond to a new SFE object. The \code{sf} data frame must have a column -#' \code{sample_id} when splitting multiple samples. -#' @param sample_id Which samples to split. -#' @param colGeometryName Which \code{colGeometry} to use to determine which -#' cells or spots should belong to which new SFE object when splitting by -#' \code{sf} or \code{sfc}. Default to the first one. -#' @param join Logical spatial predicate function to use when \code{f} is -#' \code{sf} or \code{sfc}. See \code{\link{st_join}}. -#' @concept Geometric operations -#' @rdname split-SpatialFeatureExperiment -#' @examples -#' -NULL - -#' @rdname split-SpatialFeatureExperiment -#' @export -setMethod("split", c("SpatialFeatureExperiment", "ANY"), - function(x, f) { - - }) - -#' @rdname split-SpatialFeatureExperiment -#' @export -setMethod("split", c("SpatialFeatureExperiment", "sf"), - function(x, f, sample_id = "all", colGeometryName = 1L, - join = st_intersects) { - - }) - -#' @rdname split-SpatialFeatureExperiment -#' @export -setMethod("split", c("SpatialFeatureExperiment", "sfc"), - function(x, f, sample_id = 1L, colGeometryName = 1L, - join = st_intersects) { - - }) - diff --git a/R/split.R b/R/split.R new file mode 100644 index 00000000..0efecaa8 --- /dev/null +++ b/R/split.R @@ -0,0 +1,105 @@ +# Split------------- + +#' Split SFE object with categorical vector or geometry +#' +#' The \code{split} methods for SFE split an SFE object into multiple SFE +#' objects by geometries (all cells/spots intersecting with each geometry will +#' become a separate SFE object). The \code{splitSamples} function splits the +#' SFE object by \code{sample_id} so each sample will become a separate SFE +#' object. The \code{splitContiguity} function splits the SFE object by +#' contiguity of an \code{annotGeometry}, which by default is "tissueBoundary". +#' +#' @param x An SFE object +#' @param f It can be a \code{sf} data frame or \code{sfc} to split by geometry. +#' Each row of the \code{sf} data frame or each element in the \code{sfc} will +#' correspond to a new SFE object. The \code{sf} data frame must have a column +#' \code{sample_id} when splitting multiple samples. Can also be a list of +#' \code{sfc} whose names correspond to \code{sample_id}s to split. +#' @param sample_id Which samples to split. +#' @param colGeometryName Which \code{colGeometry} to use to determine which +#' cells or spots should belong to which new SFE object when splitting by +#' \code{sf} or \code{sfc}. Default to the first one. +#' @param join Logical spatial predicate function to use when \code{f} is +#' \code{sf} or \code{sfc}. See \code{\link{st_join}}. +#' @param annotGeometryName Name of \code{annotGeometry} to use to split by +#' contiguity. +#' @param min_area Minimum area in the same unit as the geometry coordinates +#' (squared) for each piece to be considered a separate piece when splitting +#' by contiguity. Only pieces that are large enough are considered. +#' @return A list of SFE objects. +#' @concept Geometric operations +#' @name split-SpatialFeatureExperiment +#' @examples +#' # example code +NULL + +#' @rdname split-SpatialFeatureExperiment +#' @export +setMethod("split", c("SpatialFeatureExperiment", "sf"), + function(x, f, sample_id = "all", colGeometryName = 1L) { + sample_id <- .check_sample_id(x, sample_id, one = FALSE) + if (!"sample_id" %in% names(f) && length(sample_id) > 1L) + stop("f must have a column sample_id when multiple samples are specified.") + l <- split(st_geometry(f), f$sample_id) + split(x, l, sample_id = sample_id, colGeometryName = colGeometryName) + }) + +#' @rdname split-SpatialFeatureExperiment +#' @export +setMethod("split", c("SpatialFeatureExperiment", "sfc"), + function(x, f, sample_id = 1L, colGeometryName = 1L) { + sample_id <- .check_sample_id(x, sample_id) + x <- x[, x$sample_id == sample_id] + lapply(sfc, function(g) { + crop(x, g, colGeometryName = colGeometryName, + keep_whole = "col", cover = TRUE) + }) + }) + +#' @rdname split-SpatialFeatureExperiment +#' @export +setMethod("split", c("SpatialFeatureExperiment", "list"), + function(x, f, sample_id = "all", colGeometryName = 1L) { + sample_id <- .check_sample_id(x, sample_id, one = FALSE) + f <- f[sample_id] + if (!length(f)) + stop("None of the geometries correspond to sample_id") + out <- lapply(sample_id, function(s) { + split(x, f[[s]], sample_id = s, colGeometryName = colGeometryName) + }) + names(out) <- sample_id + unlist(out, recursive = FALSE) + }) + +#' @rdname split-SpatialFeatureExperiment +#' @export +splitSamples <- function(x) { + ss <- sampleIDs(x) + if (length(ss) == 1L) return(x) + out <- lapply(ss, function(s) x[, x$sample_id == s]) + names(out) <- ss + out +} + +#' @rdname split-SpatialFeatureExperiment +#' @export +#' @importFrom sf st_collection_extract +splitContiguity <- function(x, colGeometryName = 1L, + annotGeometryName = "tissueBoundary", + min_area = 0, join = st_intersects) { + ag <- annotGeometry(x, annotGeometryName) + gt <- st_geometry_type(ag, by_geometry = FALSE) + # Will I allow points and linestrings in the future? + if (!gt %in% c("POLYGON", "MULTIPOLYGON", "GEOMETRY")) + stop("The geometries must be POLYGON, MULTIPOLYGON, or GEOMETRY.") + if (gt == "GEOMETRY") { + tryCatch(ag <- st_collection_extract(ag, type = c("POLYGON", "MULTIPOLYGON")), + warning = function(w) stop("None of the geometries are POLYGON or MULTIPOLYGON")) + } + if (gt != "POLYGON") ag <- st_cast(ag, "POLYGON", warn = FALSE) + if (min_area > 0) { + ag$area <- st_area(ag) + ag <- ag[ag$area > min_area,] + } + split(x, ag, colGeometryName = colGeometryName) +} diff --git a/R/utils.R b/R/utils.R index cdcb500a..f0837795 100644 --- a/R/utils.R +++ b/R/utils.R @@ -294,7 +294,7 @@ gdalParquetAvailable <- function() { (major_version == 1L && minor_version > 4L) || major_version == 2L } -.get_tech_tx_fields <- function(tech, data_dir) { +.get_tech_tx_fields <- function(tech, data_dir = NULL) { spatialCoordsNames <- switch( tech, Vizgen = c("global_x", "global_y", "global_z"), @@ -313,14 +313,17 @@ gdalParquetAvailable <- function() { Xenium = "cell_id", CosMX = "cell_ID" ) - fn <- switch( - tech, - Vizgen = .check_vizgen_fns(data_dir, "detected_transcripts.csv"), - CosMX = grep("tx_file.csv", - list.files(data_dir, pattern = "\\.csv$", full.names = TRUE), - value = TRUE), - Xenium = .check_xenium_fns(data_dir, "transcripts", .no_raw_bytes(data_dir)) - ) + if (is.null(data_dir)) fn <- NULL + else { + fn <- switch( + tech, + Vizgen = .check_vizgen_fns(data_dir, "detected_transcripts.csv"), + CosMX = grep("tx_file.csv", + list.files(data_dir, pattern = "\\.csv$", full.names = TRUE), + value = TRUE), + Xenium = .check_xenium_fns(data_dir, "transcripts", .no_raw_bytes(data_dir)) + ) + } list(spatialCoordsNames = spatialCoordsNames, gene_col = gene_col, cell_col = cell_col, @@ -334,3 +337,46 @@ gdalParquetAvailable <- function() { } else features <- ids features } + +.check_tx_file <- function(file, spatialCoordsNames, gene_col, phred_col, + min_phred, flip, BPPARAM = SerialParam()) { + if (is.character(file)) { + ext <- file_ext(file) + if (!ext %in% c("csv", "gz", "tsv", "txt", "parquet")) { + stop("The file must be one of csv, gz, tsv, txt, or parquet") + } + if (ext == "parquet") { + check_installed("arrow") + mols <- arrow::read_parquet(file) + # convert cols with raw bytes to character + # NOTE: can take a while. + mols <- .rawToChar_df(mols, BPPARAM = BPPARAM) + # sanity, convert to data.table + if (!is(mols, "data.table")) { + mols <- data.table::as.data.table(mols) + } + } else { + mols <- fread(file) + } + } else mols <- file + + names(mols)[names(mols) == gene_col] <- "gene" + gene_col <- "gene" + ind <- !spatialCoordsNames[1:2] %in% names(mols) + if (any(ind)) { + col_offending <- setdiff(spatialCoordsNames[1:2], names(mols)) + ax <- c("x", "y") + stop(paste(ax[ind], collapse = ", "), " coordinate column(s) ", + paste(col_offending, collapse = ", "), " not found.") + } + spatialCoordsNames <- intersect(spatialCoordsNames, names(mols)) + if (flip) { + y_name <- spatialCoordsNames[2] + if (!is.data.table(mols)) ..y_name <- y_name + mols[,y_name] <- -mols[,..y_name] + } + if (phred_col %in% names(mols) && !is.null(min_phred)) { + mols <- mols[mols[[phred_col]] >= min_phred,] + } + mols +} diff --git a/man/aggregate-SpatialFeatureExperiment-method.Rd b/man/aggregate-SpatialFeatureExperiment-method.Rd new file mode 100644 index 00000000..c3bc3015 --- /dev/null +++ b/man/aggregate-SpatialFeatureExperiment-method.Rd @@ -0,0 +1,91 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aggregate.R +\name{aggregate,SpatialFeatureExperiment-method} +\alias{aggregate,SpatialFeatureExperiment-method} +\title{Aggregate data in SFE using geometry} +\usage{ +\S4method{aggregate}{SpatialFeatureExperiment}( + x, + by = NULL, + FUN = sum, + sample_id = "all", + colGeometryName = 1L, + rowGeometryName = NULL, + cellsize = NULL, + square = TRUE, + flat_topped = FALSE, + new_geometry_name = "bins", + BPPARAM = SerialParam() +) +} +\arguments{ +\item{x}{An SFE object to be aggregated.} + +\item{by}{A \code{sf} data frame whose geometry column is used for +aggregation or \code{sfc} or for multiple samples a list of \code{sfc} +whose names are the sample IDs. For multiple samples, the \code{sf} data +frame must have a column \code{sample_id} to indicate which geometry for +which sample. This argument is optional if \code{cellsize} is specified.} + +\item{FUN}{Function to aggregate the numerical columns in \code{colData} and +the gene count matrix. This can be \code{sum}, \code{mean}, or any +\code{row*} function in the \code{sparseMatrixStats} package such as +\code{\link{rowMedians}}. Aggregation is not done when aggregating by +transcript spots in \code{rowGeometry}.} + +\item{sample_id}{Which samples to aggregate, defaults to "all".} + +\item{colGeometryName}{Which \code{colGeometry} to spatially aggregate the +data, by default the first one.} + +\item{rowGeometryName}{Which \code{rowGeometry} to spatially aggregate} + +\item{cellsize}{numeric of length 1 or 2 with target cellsize: for square or rectangular cells the width and height, for hexagonal cells the distance between opposite edges (edge length is cellsize/sqrt(3)). A length units object can be passed, or an area unit object with area size of the square or hexagonal cell.} + +\item{square}{logical; if \code{FALSE}, create hexagonal grid} + +\item{flat_topped}{logical; if \code{TRUE} generate flat topped hexagons, else generate pointy topped} + +\item{new_geometry_name}{Name to give to the new \code{colGeometry} in the +output. Defaults to "bins".} + +\item{BPPARAM}{A \code{\link{BiocParallelParam}} object specifying parallel +computing when aggregating different genes. Defaults to +\code{SerialParam()}.} +} +\value{ +An SFE object with \code{colGeometry} the same as the geometry + specified in \code{by} or same as the grid specified in \code{cellsize}. + \code{rowGeometries} and \code{rowData} remain the same as in the input + \code{x}. \code{reducedDims}, \code{localResults}, \code{colFeatureData} + (and its \code{colGeometry}, \code{annotGeometry}, and \code{reducedDim} + counterparts), and \code{spatialGraphs} are dropped because those results + no longer apply after aggregation. +} +\description{ +Gene expression and numeric columns of \code{colData} will be aggregated with +the function specified in \code{FUN}, according to another geometry supplied +and a geometry predicate (such as \code{st_intersects}). For example, when +the predicate is \code{st_intersects} and a spatial grid is used to +aggregate, then the data associated with all cells that intersect with each +grid cell will be aggregated with \code{FUN}, such as \code{mean} or +\code{sum}. The categorical columns will be collected into list columns, and +logical columns will be converted into numeric before applying \code{FUN}. +} +\details{ +For smFISH-based data where the transcript spots are available, the +transcript spots can be used instead of cells to aggregate the gene count +matrix, in which case all assays other than \code{counts} will be dropped and +\code{FUN} only applies to \code{colData} because the transcript spots are +simply counted. + +What this function does is similar to +\href{https://github.com/JEFworks-Lab/SEraster}{SEraster} but more general +because any geometry and more aggregation function can be used, not just +regular grids, and the aggregation can be performed on the transcript spots. +} +\examples{ +# example code + +} +\concept{Geometric operations} diff --git a/man/aggregateTx.Rd b/man/aggregateTx.Rd new file mode 100644 index 00000000..669eabcd --- /dev/null +++ b/man/aggregateTx.Rd @@ -0,0 +1,84 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aggregate.R +\name{aggregateTx} +\alias{aggregateTx} +\alias{aggregateTxTech} +\title{Aggregate transcript spots from file} +\usage{ +aggregateTx( + file, + df = NULL, + by = NULL, + spatialCoordsNames = c("X", "Y", "Z"), + gene_col = "gene", + phred_col = "qv", + min_phred = 20, + flip = FALSE, + cellsize = NULL, + square = TRUE, + flat_topped = FALSE, + new_geometry_name = "bins", + unit = "micron" +) + +aggregateTxTech( + data_dir, + df = NULL, + by = NULL, + tech = c("Vizgen", "Xenium", "CosMX"), + min_phred = 20, + flip = FALSE, + cellsize = NULL, + square = TRUE, + flat_topped = FALSE, + new_geometry_name = "bins" +) +} +\arguments{ +\item{file}{File with the transcript spot coordinates. Should be one row per +spot when read into R and should have columns for coordinates on each axis, +gene the transcript is assigned to, and optionally cell the transcript is +assigned to. Must be csv, tsv, or parquet.} + +\item{df}{If the file is already loaded into memory, a data frame (sf) with +columns for the x, y, and optionally z coordinates and gene assignment of +each transcript spot. If specified, then argument \code{file} will be +ignored.} + +\item{by}{A \code{sfc} or \code{sf} object for spatial aggregation.} + +\item{spatialCoordsNames}{Column names for the x, y, and optionally z +coordinates of the spots. The defaults are for Vizgen.} + +\item{gene_col}{Column name for genes.} + +\item{phred_col}{Column name for Phred scores of the spots.} + +\item{min_phred}{Minimum Phred score to keep spot. By default 20, the +conventional threshold indicating "acceptable", meaning that there's 1% +chance that the spot was decoded in error.} + +\item{flip}{Logical, whether to flip the geometry to match image. Here the y +coordinates are simply set to -y, so the original bounding box is not +preserved. This is consistent with \code{readVizgen} and \code{readXenium}.} + +\item{data_dir}{Top level output directory.} + +\item{tech}{Which technology whose output to read, must be one of "Vizgen", +"Xenium", or "CosMX" though more technologies may be added later.} +} +\value{ +A SFE object with count matrix for number of spots of each gene in + each geometry. Geometries with no spot are removed. +} +\description{ +This function reads the transcript spot file from the standard output of the +commercial technologies (not GeoParquet) for spatial aggregation where the +spots are assigned to polygons such as cells or spatial bins. Presets for +Xenium, MERFISH, and CosMX are available. +} +\note{ +The resulting SFE object often includes geometries (e.g. grid cells) + outside tissue, because there can be transcript spots detected outside the + tissue. +} diff --git a/man/read10xVisiumSFE.Rd b/man/read10xVisiumSFE.Rd index 48bb504c..dc3addae 100644 --- a/man/read10xVisiumSFE.Rd +++ b/man/read10xVisiumSFE.Rd @@ -8,10 +8,12 @@ read10xVisiumSFE( samples = "", dirs = file.path(samples, "outs"), sample_id = paste0("sample", sprintf("\%02d", seq_along(samples))), + bin_size = NULL, type = c("HDF5", "sparse"), data = c("filtered", "raw"), images = c("lowres", "hires"), unit = c("full_res_image_pixel", "micron"), + add_Graph = TRUE, style = "W", zero.policy = NULL, load = FALSE @@ -32,6 +34,10 @@ have moved or renamed the output directory.} one for each directory specified via \code{samples}; ignored if \code{!is.null(names(samples))}} +\item{bin_size}{\code{c(character)}, use this only when loading VisiumHD. +Specify which bin resolution to load, default is \code{NULL} which assumes that data is standard Visium. +Eg, single resolution is \code{c("8")}, if to load all three resolutions, use \code{c("2", "8", "16")}.} + \item{type}{Either "HDF5", and the matrix will be represented as \code{TENxMatrix}, or "sparse", and the matrix will be read as a \code{dgCMatrix}.} @@ -47,6 +53,9 @@ unit. If using microns, then spacing between spots in pixels will be used to convert the coordinates into microns, as the spacing is known to be 100 microns. This is used to plot scale bar.} +\item{add_Graph}{\code{c(local)}, if to add spatial neighborhood graph for spots and only if \code{c(data = "filtered")}. +Default is \code{c(TRUE)}} + \item{style}{\code{style} can take values \dQuote{W}, \dQuote{B}, \dQuote{C}, \dQuote{U}, \dQuote{minmax} and \dQuote{S}} \item{zero.policy}{default NULL, use global option value; if FALSE stop with error for any empty neighbour sets, if TRUE permit the weights list to be formed with zero-length weights vectors} @@ -87,5 +96,24 @@ list.files(file.path(samples[1], "spatial")) type = "sparse", data = "filtered", load = FALSE )) + +# load VisiumHD +# path to "binned_outputs" directory containing: +└── binned_outputs + ├── square_002um + ├── square_008um + └── square_016um +dir_hd <- "~/Downloads/Visium_HD_Mouse_Brain/binned_outputs/" +# this is public dataset -> https://www.10xgenomics.com/datasets/visium-hd-cytassist-gene-expression-libraries-of-mouse-brain-he +sfe_hd <- +read10xVisiumSFE(samples = list.files(dir_hd), # 1:3 resolutions + dirs = dir_hd, + bin_size = c("8", "16"), # this defines which of 1:3 resolutions to load + type = "HDF5", # Note, "sparse" -> takes longer to load + data = "filtered", # spots under tissue + images = c("lowres"), # for now low res. image only + add_Graph = FALSE # Note, if VisiumHD this can take time for 2 or 8µm res. + ) + } \concept{Read data into SFE} diff --git a/man/split-SpatialFeatureExperiment.Rd b/man/split-SpatialFeatureExperiment.Rd new file mode 100644 index 00000000..2c4a37c0 --- /dev/null +++ b/man/split-SpatialFeatureExperiment.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/split.R +\name{split-SpatialFeatureExperiment} +\alias{split-SpatialFeatureExperiment} +\alias{split,SpatialFeatureExperiment,sf-method} +\alias{split,SpatialFeatureExperiment,sfc-method} +\alias{split,SpatialFeatureExperiment,list-method} +\alias{splitSamples} +\alias{splitContiguity} +\title{Split SFE object with categorical vector or geometry} +\usage{ +\S4method{split}{SpatialFeatureExperiment,sf}(x, f, sample_id = "all", colGeometryName = 1L) + +\S4method{split}{SpatialFeatureExperiment,sfc}(x, f, sample_id = 1L, colGeometryName = 1L) + +\S4method{split}{SpatialFeatureExperiment,list}(x, f, sample_id = "all", colGeometryName = 1L) + +splitSamples(x) + +splitContiguity( + x, + colGeometryName = 1L, + annotGeometryName = "tissueBoundary", + min_area = 0, + join = st_intersects +) +} +\arguments{ +\item{x}{An SFE object} + +\item{f}{It can be a \code{sf} data frame or \code{sfc} to split by geometry. +Each row of the \code{sf} data frame or each element in the \code{sfc} will +correspond to a new SFE object. The \code{sf} data frame must have a column +\code{sample_id} when splitting multiple samples. Can also be a list of +\code{sfc} whose names correspond to \code{sample_id}s to split.} + +\item{sample_id}{Which samples to split.} + +\item{colGeometryName}{Which \code{colGeometry} to use to determine which +cells or spots should belong to which new SFE object when splitting by +\code{sf} or \code{sfc}. Default to the first one.} + +\item{annotGeometryName}{Name of \code{annotGeometry} to use to split by +contiguity.} + +\item{min_area}{Minimum area in the same unit as the geometry coordinates +(squared) for each piece to be considered a separate piece when splitting +by contiguity. Only pieces that are large enough are considered.} + +\item{join}{Logical spatial predicate function to use when \code{f} is +\code{sf} or \code{sfc}. See \code{\link{st_join}}.} +} +\value{ +A list of SFE objects. +} +\description{ +The \code{split} methods for SFE split an SFE object into multiple SFE +objects by geometries (all cells/spots intersecting with each geometry will +become a separate SFE object). The \code{splitSamples} function splits the +SFE object by \code{sample_id} so each sample will become a separate SFE +object. The \code{splitContiguity} function splits the SFE object by +contiguity of an \code{annotGeometry}, which by default is "tissueBoundary". +} +\examples{ +# example code +} +\concept{Geometric operations}