fix(tabix): move scanTabix up before DF logic; separate tabix R file

huishenlab · Nov 20, 2024 · 17f1b3f · 17f1b3f
1 parent 7ef328e
commit 17f1b3f
Show file tree

Hide file tree

Showing 4 changed files with 121 additions and 120 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -18,11 +18,14 @@ export(tabix)
 import(Matrix)
 import(stringfish)
 importFrom(Rcpp,sourceCpp)
+importFrom(data.table,":=")
 importFrom(data.table,as.data.table)
+importFrom(data.table,rbindlist)
 importFrom(data.table,set)
 importFrom(data.table,tstrsplit)
 importFrom(parallel,mclapply)
 importFrom(parallelly,availableCores)
+importFrom(stats,setNames)
 importFrom(stringfish,sf_grepl)
 importFrom(tools,file_path_sans_ext)
 useDynLib(iscream, .registration = TRUE)
diff --git a/R/query.R b/R/query.R
@@ -12,122 +12,3 @@ query_chroms <- function(bedfiles, nthreads = NULL) {
 
   Cpp_query_chroms(bedfiles, n_threads)
 }
-
-single_tabix <- function(bedfile, regions, result_colnames, mergecg) {
-  lines <- Cpp_query_interval(bedfile, regions)
-  if (length(lines) == 0) {
-    warning(paste("No records found in", bedfile))
-    return(NULL)
-  }
-  lines_dt <- as.data.table(lines)
-
-  lines_dt <- lines_dt[, tstrsplit(lines, "\t", fixed = TRUE)]
-  n_col <- ncol(lines_dt)
-  if (length(result_colnames) < n_col) {
-    warning(paste(
-        "Did not use input 'colnames' - only",
-        length(result_colnames), "names provided for", n_col, "column data.table"
-      ))
-    return(lines_dt)
-  } else if (length(result_colnames) > n_col) {
-    warning("Fewer columns in data than provided colnames")
-  }
-
-  colnames(lines_dt) <- result_colnames[1:n_col]
-  end_col <- ifelse(mergecg, ncol(lines_dt) - 1, ncol(lines_dt))
-  for (i in 2:end_col) set(lines_dt, j = i, value = as.numeric(lines_dt[[i]]))
-
-  return(lines_dt)
-}
-
-#' Query lines from a tabixed bedfile
-#' @param bedfiles The bedfiles to be queried
-#' @param regions A vector of genomic region strings
-#' @param aligner The aligner used to produce the BED files - one of "biscuit",
-#' "bismark", "bsbolt". Will set the result data.table's column names based on
-#' this argument.
-#' @param colnames A vector of column names for the result data.table. Set if
-#' your bedfile is not from the supported aligners or is a general bedfile.
-#' @param raw Set true to give a named list of raw strings from the regions in
-#' the style of `Rsamtools::scanTabix` instead of a data.table
-#' @param nthreads Set number of threads to use overriding the
-#' `"iscream.threads"` option. See `?set_threads` for more information.
-#'
-#' @importFrom data.table as.data.table tstrsplit set
-#' @importFrom parallel mclapply
-#' @importFrom tools file_path_sans_ext
-#' @return A data.table
-#'
-#' @export
-#' @examples
-#' bedfiles <- system.file("extdata", package = "iscream") |>
-#'   list.files(pattern = "[a|b|c|d].bed.gz$", full.names = TRUE)
-#' regions <- c("chr1:1-6", "chr1:7-10", "chr1:11-14")
-#' tabix(bedfiles[1], regions, colnames = c("chr", "start", "end", "beta", "coverage"))
-tabix <- function(bedfiles, regions, aligner = "biscuit", colnames = NULL, raw = FALSE, nthreads = NULL) {
-  verify_files_or_stop(bedfiles)
-  if (class(regions)[1] == "GRanges"){
-    regions <- get_granges_string(regions)
-  }
-  verify_regions_or_stop(regions, nthreads)
-  verify_aligner_or_stop(aligner)
-  verify_filetype(bedfiles, aligner)
-  base_colnames <- c("chr", "start", "end")
-  biscuit_colnames <- c("beta", "coverage")
-  bismark_colnames <- c("methylation.percentage", "count.methylated", "count.unmethylated")
-
-  mergecg <- FALSE
-  if (!is.null(colnames)) {
-    result_colnames <- colnames
-  } else if (aligner == "biscuit") {
-    result_colnames <- c(base_colnames, biscuit_colnames)
-    if (grepl("mergecg", bedfiles[1])) {
-      result_colnames <- c(result_colnames, "mergecg")
-      mergecg <- TRUE
-    }
-  } else {
-    result_colnames <- c(base_colnames, bismark_colnames)
-  }
-
-  if (length(bedfiles) > 1) {
-    if (raw) {
-      bedline_list <- mclapply(bedfiles, function(file) {
-        scan_tabix(file, regions)
-      }, mc.cores = .get_threads(nthreads))
-
-      setNames(
-        nm = file_path_sans_ext(basename(bedfiles), compression = TRUE),
-        object = bedline_list
-      )
-      return(out)
-    }
-
-    dt_list <- mclapply(bedfiles, function(file) {
-      tbx_query <- single_tabix(
-        bedfile = file,
-        regions = regions,
-        result_colnames = result_colnames,
-        mergecg = mergecg
-      )
-      if (!is.null(tbx_query)) {
-        tbx_query[, sample := file_path_sans_ext(basename(file), compression = TRUE)]
-      }
-      return(tbx_query)
-    },
-      mc.cores = .get_threads(nthreads)
-    )
-    rbindlist(dt_list)
-  } else {
-    if (raw) {
-      scan_tabix(bedfiles, regions)
-    } else {
-      single_tabix(
-        bedfile = bedfiles,
-        regions = regions,
-        result_colnames = result_colnames,
-        mergecg = mergecg
-      )
-    }
-  }
-}
-
diff --git a/R/tabix.R b/R/tabix.R
@@ -0,0 +1,117 @@
+#' Query lines from a tabixed bedfile
+#' @param bedfiles The bedfiles to be queried
+#' @param regions A vector of genomic region strings
+#' @param aligner The aligner used to produce the BED files - one of "biscuit",
+#' "bismark", "bsbolt". Will set the result data.table's column names based on
+#' this argument.
+#' @param colnames A vector of column names for the result data.table. Set if
+#' your bedfile is not from the supported aligners or is a general bedfile.
+#' @param raw Set true to give a named list of raw strings from the regions in
+#' the style of `Rsamtools::scanTabix` instead of a data.table
+#' @param nthreads Set number of threads to use overriding the
+#' `"iscream.threads"` option. See `?set_threads` for more information.
+#'
+#' @importFrom data.table as.data.table tstrsplit set := rbindlist
+#' @importFrom parallel mclapply
+#' @importFrom tools file_path_sans_ext
+#' @importFrom stats setNames
+#' @return A data.table
+#'
+#' @export
+#' @examples
+#' bedfiles <- system.file("extdata", package = "iscream") |>
+#'   list.files(pattern = "[a|b|c|d].bed.gz$", full.names = TRUE)
+#' regions <- c("chr1:1-6", "chr1:7-10", "chr1:11-14")
+#' tabix(bedfiles[1], regions, colnames = c("chr", "start", "end", "beta", "coverage"))
+tabix <- function(bedfiles, regions, aligner = "biscuit", colnames = NULL, raw = FALSE, nthreads = NULL) {
+  verify_files_or_stop(bedfiles)
+  if (class(regions)[1] == "GRanges") {
+    regions <- get_granges_string(regions)
+  }
+  verify_aligner_or_stop(aligner)
+  verify_filetype(bedfiles, aligner)
+
+  if (raw) {
+    if (length(bedfiles) == 1) {
+      return (scan_tabix(bedfiles, regions))
+    } else {
+      bedline_list <- mclapply(bedfiles, function(file) {
+        scan_tabix(file, regions)
+      }, mc.cores = .get_threads(nthreads)) |>
+      setNames(
+        nm = file_path_sans_ext(basename(bedfiles), compression = TRUE),
+        object = _
+      )
+      return(bedline_list)
+    }
+  }
+
+  base_colnames <- c("chr", "start", "end")
+  biscuit_colnames <- c("beta", "coverage")
+  bismark_colnames <- c("methylation.percentage", "count.methylated", "count.unmethylated")
+
+  mergecg <- FALSE
+  if (!is.null(colnames)) {
+    result_colnames <- colnames
+  } else if (aligner == "biscuit") {
+    result_colnames <- c(base_colnames, biscuit_colnames)
+    if (grepl("mergecg", bedfiles[1])) {
+      result_colnames <- c(result_colnames, "mergecg")
+      mergecg <- TRUE
+    }
+  } else {
+    result_colnames <- c(base_colnames, bismark_colnames)
+  }
+
+  if (length(bedfiles) == 1) {
+      single_tabix(
+        bedfile = bedfiles,
+        regions = regions,
+        result_colnames = result_colnames,
+        mergecg = mergecg
+      )
+  } else {
+    dt_list <- mclapply(bedfiles, function(file) {
+      tbx_query <- single_tabix(
+        bedfile = file,
+        regions = regions,
+        result_colnames = result_colnames,
+        mergecg = mergecg
+      )
+      if (!is.null(tbx_query)) {
+        tbx_query[, sample := file_path_sans_ext(basename(file), compression = TRUE)]
+      }
+      return(tbx_query)
+    },
+      mc.cores = .get_threads(nthreads)
+    )
+    rbindlist(dt_list)
+  }
+}
+
+single_tabix <- function(bedfile, regions, result_colnames, mergecg) {
+  lines <- Cpp_query_interval(bedfile, regions)
+  if (length(lines) == 0) {
+    warning(paste("No records found in", bedfile, "- if this is unexpected check that your region format matches your bedfiles"))
+    return(NULL)
+  }
+  lines_dt <- as.data.table(lines)
+
+  lines_dt <- lines_dt[, tstrsplit(lines, "\t", fixed = TRUE)]
+  n_col <- ncol(lines_dt)
+  if (length(result_colnames) < n_col) {
+    warning(paste(
+        "Did not use input 'colnames' - only",
+        length(result_colnames), "names provided for", n_col, "column data.table"
+      ))
+    return(lines_dt)
+  } else if (length(result_colnames) > n_col) {
+    warning("Fewer columns in data than provided colnames")
+  }
+
+  colnames(lines_dt) <- result_colnames[1:n_col]
+  end_col <- ifelse(mergecg, ncol(lines_dt) - 1, ncol(lines_dt))
+  for (i in 2:end_col) set(lines_dt, j = i, value = as.numeric(lines_dt[[i]]))
+
+  return(lines_dt)
+}
diff --git a/man/tabix.Rd b/man/tabix.Rd