-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix(tabix): move scanTabix up before DF logic; separate tabix R file
- Loading branch information
1 parent
7ef328e
commit 17f1b3f
Showing
4 changed files
with
121 additions
and
120 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
#' Query lines from a tabixed bedfile | ||
#' @param bedfiles The bedfiles to be queried | ||
#' @param regions A vector of genomic region strings | ||
#' @param aligner The aligner used to produce the BED files - one of "biscuit", | ||
#' "bismark", "bsbolt". Will set the result data.table's column names based on | ||
#' this argument. | ||
#' @param colnames A vector of column names for the result data.table. Set if | ||
#' your bedfile is not from the supported aligners or is a general bedfile. | ||
#' @param raw Set true to give a named list of raw strings from the regions in | ||
#' the style of `Rsamtools::scanTabix` instead of a data.table | ||
#' @param nthreads Set number of threads to use overriding the | ||
#' `"iscream.threads"` option. See `?set_threads` for more information. | ||
#' | ||
#' @importFrom data.table as.data.table tstrsplit set := rbindlist | ||
#' @importFrom parallel mclapply | ||
#' @importFrom tools file_path_sans_ext | ||
#' @importFrom stats setNames | ||
#' @return A data.table | ||
#' | ||
#' @export | ||
#' @examples | ||
#' bedfiles <- system.file("extdata", package = "iscream") |> | ||
#' list.files(pattern = "[a|b|c|d].bed.gz$", full.names = TRUE) | ||
#' regions <- c("chr1:1-6", "chr1:7-10", "chr1:11-14") | ||
#' tabix(bedfiles[1], regions, colnames = c("chr", "start", "end", "beta", "coverage")) | ||
tabix <- function(bedfiles, regions, aligner = "biscuit", colnames = NULL, raw = FALSE, nthreads = NULL) { | ||
verify_files_or_stop(bedfiles) | ||
if (class(regions)[1] == "GRanges") { | ||
regions <- get_granges_string(regions) | ||
} | ||
verify_aligner_or_stop(aligner) | ||
verify_filetype(bedfiles, aligner) | ||
|
||
if (raw) { | ||
if (length(bedfiles) == 1) { | ||
return (scan_tabix(bedfiles, regions)) | ||
} else { | ||
bedline_list <- mclapply(bedfiles, function(file) { | ||
scan_tabix(file, regions) | ||
}, mc.cores = .get_threads(nthreads)) |> | ||
setNames( | ||
nm = file_path_sans_ext(basename(bedfiles), compression = TRUE), | ||
object = _ | ||
) | ||
return(bedline_list) | ||
} | ||
} | ||
|
||
base_colnames <- c("chr", "start", "end") | ||
biscuit_colnames <- c("beta", "coverage") | ||
bismark_colnames <- c("methylation.percentage", "count.methylated", "count.unmethylated") | ||
|
||
mergecg <- FALSE | ||
if (!is.null(colnames)) { | ||
result_colnames <- colnames | ||
} else if (aligner == "biscuit") { | ||
result_colnames <- c(base_colnames, biscuit_colnames) | ||
if (grepl("mergecg", bedfiles[1])) { | ||
result_colnames <- c(result_colnames, "mergecg") | ||
mergecg <- TRUE | ||
} | ||
} else { | ||
result_colnames <- c(base_colnames, bismark_colnames) | ||
} | ||
|
||
if (length(bedfiles) == 1) { | ||
single_tabix( | ||
bedfile = bedfiles, | ||
regions = regions, | ||
result_colnames = result_colnames, | ||
mergecg = mergecg | ||
) | ||
} else { | ||
dt_list <- mclapply(bedfiles, function(file) { | ||
tbx_query <- single_tabix( | ||
bedfile = file, | ||
regions = regions, | ||
result_colnames = result_colnames, | ||
mergecg = mergecg | ||
) | ||
if (!is.null(tbx_query)) { | ||
tbx_query[, sample := file_path_sans_ext(basename(file), compression = TRUE)] | ||
} | ||
return(tbx_query) | ||
}, | ||
mc.cores = .get_threads(nthreads) | ||
) | ||
rbindlist(dt_list) | ||
} | ||
} | ||
|
||
single_tabix <- function(bedfile, regions, result_colnames, mergecg) { | ||
lines <- Cpp_query_interval(bedfile, regions) | ||
if (length(lines) == 0) { | ||
warning(paste("No records found in", bedfile, "- if this is unexpected check that your region format matches your bedfiles")) | ||
return(NULL) | ||
} | ||
lines_dt <- as.data.table(lines) | ||
|
||
lines_dt <- lines_dt[, tstrsplit(lines, "\t", fixed = TRUE)] | ||
n_col <- ncol(lines_dt) | ||
if (length(result_colnames) < n_col) { | ||
warning(paste( | ||
"Did not use input 'colnames' - only", | ||
length(result_colnames), "names provided for", n_col, "column data.table" | ||
)) | ||
return(lines_dt) | ||
} else if (length(result_colnames) > n_col) { | ||
warning("Fewer columns in data than provided colnames") | ||
} | ||
|
||
colnames(lines_dt) <- result_colnames[1:n_col] | ||
end_col <- ifelse(mergecg, ncol(lines_dt) - 1, ncol(lines_dt)) | ||
for (i in 2:end_col) set(lines_dt, j = i, value = as.numeric(lines_dt[[i]])) | ||
|
||
return(lines_dt) | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.