Skip to content

Commit

Permalink
fix(tabix): move scanTabix up before DF logic; separate tabix R file
Browse files Browse the repository at this point in the history
  • Loading branch information
jamespeapen committed Nov 20, 2024
1 parent 7ef328e commit 17f1b3f
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 120 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,14 @@ export(tabix)
import(Matrix)
import(stringfish)
importFrom(Rcpp,sourceCpp)
importFrom(data.table,":=")
importFrom(data.table,as.data.table)
importFrom(data.table,rbindlist)
importFrom(data.table,set)
importFrom(data.table,tstrsplit)
importFrom(parallel,mclapply)
importFrom(parallelly,availableCores)
importFrom(stats,setNames)
importFrom(stringfish,sf_grepl)
importFrom(tools,file_path_sans_ext)
useDynLib(iscream, .registration = TRUE)
119 changes: 0 additions & 119 deletions R/query.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,122 +12,3 @@ query_chroms <- function(bedfiles, nthreads = NULL) {

Cpp_query_chroms(bedfiles, n_threads)
}

single_tabix <- function(bedfile, regions, result_colnames, mergecg) {
lines <- Cpp_query_interval(bedfile, regions)
if (length(lines) == 0) {
warning(paste("No records found in", bedfile))
return(NULL)
}
lines_dt <- as.data.table(lines)

lines_dt <- lines_dt[, tstrsplit(lines, "\t", fixed = TRUE)]
n_col <- ncol(lines_dt)
if (length(result_colnames) < n_col) {
warning(paste(
"Did not use input 'colnames' - only",
length(result_colnames), "names provided for", n_col, "column data.table"
))
return(lines_dt)
} else if (length(result_colnames) > n_col) {
warning("Fewer columns in data than provided colnames")
}

colnames(lines_dt) <- result_colnames[1:n_col]
end_col <- ifelse(mergecg, ncol(lines_dt) - 1, ncol(lines_dt))
for (i in 2:end_col) set(lines_dt, j = i, value = as.numeric(lines_dt[[i]]))

return(lines_dt)
}

#' Query lines from a tabixed bedfile
#' @param bedfiles The bedfiles to be queried
#' @param regions A vector of genomic region strings
#' @param aligner The aligner used to produce the BED files - one of "biscuit",
#' "bismark", "bsbolt". Will set the result data.table's column names based on
#' this argument.
#' @param colnames A vector of column names for the result data.table. Set if
#' your bedfile is not from the supported aligners or is a general bedfile.
#' @param raw Set true to give a named list of raw strings from the regions in
#' the style of `Rsamtools::scanTabix` instead of a data.table
#' @param nthreads Set number of threads to use overriding the
#' `"iscream.threads"` option. See `?set_threads` for more information.
#'
#' @importFrom data.table as.data.table tstrsplit set
#' @importFrom parallel mclapply
#' @importFrom tools file_path_sans_ext
#' @return A data.table
#'
#' @export
#' @examples
#' bedfiles <- system.file("extdata", package = "iscream") |>
#' list.files(pattern = "[a|b|c|d].bed.gz$", full.names = TRUE)
#' regions <- c("chr1:1-6", "chr1:7-10", "chr1:11-14")
#' tabix(bedfiles[1], regions, colnames = c("chr", "start", "end", "beta", "coverage"))
tabix <- function(bedfiles, regions, aligner = "biscuit", colnames = NULL, raw = FALSE, nthreads = NULL) {
verify_files_or_stop(bedfiles)
if (class(regions)[1] == "GRanges"){
regions <- get_granges_string(regions)
}
verify_regions_or_stop(regions, nthreads)
verify_aligner_or_stop(aligner)
verify_filetype(bedfiles, aligner)
base_colnames <- c("chr", "start", "end")
biscuit_colnames <- c("beta", "coverage")
bismark_colnames <- c("methylation.percentage", "count.methylated", "count.unmethylated")

mergecg <- FALSE
if (!is.null(colnames)) {
result_colnames <- colnames
} else if (aligner == "biscuit") {
result_colnames <- c(base_colnames, biscuit_colnames)
if (grepl("mergecg", bedfiles[1])) {
result_colnames <- c(result_colnames, "mergecg")
mergecg <- TRUE
}
} else {
result_colnames <- c(base_colnames, bismark_colnames)
}

if (length(bedfiles) > 1) {
if (raw) {
bedline_list <- mclapply(bedfiles, function(file) {
scan_tabix(file, regions)
}, mc.cores = .get_threads(nthreads))

setNames(
nm = file_path_sans_ext(basename(bedfiles), compression = TRUE),
object = bedline_list
)
return(out)
}

dt_list <- mclapply(bedfiles, function(file) {
tbx_query <- single_tabix(
bedfile = file,
regions = regions,
result_colnames = result_colnames,
mergecg = mergecg
)
if (!is.null(tbx_query)) {
tbx_query[, sample := file_path_sans_ext(basename(file), compression = TRUE)]
}
return(tbx_query)
},
mc.cores = .get_threads(nthreads)
)
rbindlist(dt_list)
} else {
if (raw) {
scan_tabix(bedfiles, regions)
} else {
single_tabix(
bedfile = bedfiles,
regions = regions,
result_colnames = result_colnames,
mergecg = mergecg
)
}
}
}

117 changes: 117 additions & 0 deletions R/tabix.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#' Query lines from a tabixed bedfile
#' @param bedfiles The bedfiles to be queried
#' @param regions A vector of genomic region strings
#' @param aligner The aligner used to produce the BED files - one of "biscuit",
#' "bismark", "bsbolt". Will set the result data.table's column names based on
#' this argument.
#' @param colnames A vector of column names for the result data.table. Set if
#' your bedfile is not from the supported aligners or is a general bedfile.
#' @param raw Set true to give a named list of raw strings from the regions in
#' the style of `Rsamtools::scanTabix` instead of a data.table
#' @param nthreads Set number of threads to use overriding the
#' `"iscream.threads"` option. See `?set_threads` for more information.
#'
#' @importFrom data.table as.data.table tstrsplit set := rbindlist
#' @importFrom parallel mclapply
#' @importFrom tools file_path_sans_ext
#' @importFrom stats setNames
#' @return A data.table
#'
#' @export
#' @examples
#' bedfiles <- system.file("extdata", package = "iscream") |>
#' list.files(pattern = "[a|b|c|d].bed.gz$", full.names = TRUE)
#' regions <- c("chr1:1-6", "chr1:7-10", "chr1:11-14")
#' tabix(bedfiles[1], regions, colnames = c("chr", "start", "end", "beta", "coverage"))
tabix <- function(bedfiles, regions, aligner = "biscuit", colnames = NULL, raw = FALSE, nthreads = NULL) {
verify_files_or_stop(bedfiles)
if (class(regions)[1] == "GRanges") {
regions <- get_granges_string(regions)
}
verify_aligner_or_stop(aligner)
verify_filetype(bedfiles, aligner)

if (raw) {
if (length(bedfiles) == 1) {
return (scan_tabix(bedfiles, regions))
} else {
bedline_list <- mclapply(bedfiles, function(file) {
scan_tabix(file, regions)
}, mc.cores = .get_threads(nthreads)) |>
setNames(
nm = file_path_sans_ext(basename(bedfiles), compression = TRUE),
object = _
)
return(bedline_list)
}
}

base_colnames <- c("chr", "start", "end")
biscuit_colnames <- c("beta", "coverage")
bismark_colnames <- c("methylation.percentage", "count.methylated", "count.unmethylated")

mergecg <- FALSE
if (!is.null(colnames)) {
result_colnames <- colnames
} else if (aligner == "biscuit") {
result_colnames <- c(base_colnames, biscuit_colnames)
if (grepl("mergecg", bedfiles[1])) {
result_colnames <- c(result_colnames, "mergecg")
mergecg <- TRUE
}
} else {
result_colnames <- c(base_colnames, bismark_colnames)
}

if (length(bedfiles) == 1) {
single_tabix(
bedfile = bedfiles,
regions = regions,
result_colnames = result_colnames,
mergecg = mergecg
)
} else {
dt_list <- mclapply(bedfiles, function(file) {
tbx_query <- single_tabix(
bedfile = file,
regions = regions,
result_colnames = result_colnames,
mergecg = mergecg
)
if (!is.null(tbx_query)) {
tbx_query[, sample := file_path_sans_ext(basename(file), compression = TRUE)]
}
return(tbx_query)
},
mc.cores = .get_threads(nthreads)
)
rbindlist(dt_list)
}
}

single_tabix <- function(bedfile, regions, result_colnames, mergecg) {
lines <- Cpp_query_interval(bedfile, regions)
if (length(lines) == 0) {
warning(paste("No records found in", bedfile, "- if this is unexpected check that your region format matches your bedfiles"))
return(NULL)
}
lines_dt <- as.data.table(lines)

lines_dt <- lines_dt[, tstrsplit(lines, "\t", fixed = TRUE)]
n_col <- ncol(lines_dt)
if (length(result_colnames) < n_col) {
warning(paste(
"Did not use input 'colnames' - only",
length(result_colnames), "names provided for", n_col, "column data.table"
))
return(lines_dt)
} else if (length(result_colnames) > n_col) {
warning("Fewer columns in data than provided colnames")
}

colnames(lines_dt) <- result_colnames[1:n_col]
end_col <- ifelse(mergecg, ncol(lines_dt) - 1, ncol(lines_dt))
for (i in 2:end_col) set(lines_dt, j = i, value = as.numeric(lines_dt[[i]]))

return(lines_dt)
}
2 changes: 1 addition & 1 deletion man/tabix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 17f1b3f

Please sign in to comment.