diff --git a/DESCRIPTION b/DESCRIPTION index 5c8b76f..661c5f2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: bsseq -Version: 1.39.3 +Version: 1.39.2 Encoding: UTF-8 Title: Analyze, manage and store whole-genome methylation data Description: A collection of tools for analyzing and visualizing whole-genome diff --git a/R/read.modkit.R b/R/read.modkit.R index 6a3f46e..d4f6af2 100644 --- a/R/read.modkit.R +++ b/R/read.modkit.R @@ -3,7 +3,7 @@ read.modkit <- function(files, rmZeroCov = FALSE, strandCollapse = TRUE){ gr_list <- list() - sampleNames <- sub("\\.bed$", "", basename(files)) + sampleNames <- sub("\\.bed.gz$", "", basename(files)) if (!is.null(colData)){ rownames(colData) <- sampleNames } @@ -11,68 +11,67 @@ read.modkit <- function(files, for (i in seq_along(files)){ data <- read.table(files[i], header = FALSE, sep="\t", stringsAsFactors=FALSE, quote="") - gr <- GRanges(seqnames = data$V1, - ranges = IRanges(start = data$V2+1, end = data$V3)) - mcols(gr)$m <- data$V12 - mcols(gr)$cov <- data$V12 + data$V13 - mcols(gr)$filter <- data$V16 - if (any(data$V14 != 0)){ - mcols(gr)$other_mod <- data$V14 - mcols(gr)$cov_other_mod <- data$V14 + data$V13 - } + + if (length(unique(data$V4)) == 2){ + gr <- GRanges(seqnames = data[data$V4 == "m", ]$V1, + ranges = IRanges(start = data[data$V4 == "m", ]$V2+1, + end = data[data$V4 == "m", ]$V3)) + + mcols(gr)$m <- data[data$V4 == "m", ]$V12 + mcols(gr)$h <- data[data$V4 != "m", ]$V12 + mcols(gr)$u <- data[data$V4 == "m", ]$V13 + mcols(gr)$filter <- data[data$V4 == "m", ]$V16 + }else{ + gr <- GRanges(seqnames = data$V1, + ranges = IRanges(start = data$V2+1, end = data$V3)) + + mcols(gr)$m <- data$V12 + mcols(gr)$u <- data$V13 + mcols(gr)$filter <- data$V16 + } + names(gr) <- sampleNames[i] gr_list[[sampleNames[i]]] <- gr } overlap_gr <- Reduce(subsetByOverlaps, gr_list) - m_cov_list <- lapply(gr_list, function(gr){ + m_u_list <- lapply(gr_list, function(gr){ overlap_data <- gr[gr %over% overlap_gr] - if (!is.null(gr$other_mod)) { - data.frame(m = overlap_data$m, cov = overlap_data$cov, - filter = overlap_data$filter, - other_mod = overlap_data$other_mod, - cov_other_mod = overlap_data$cov_other_mod) + if (!is.null(gr$h)) { + data.frame(m = overlap_data$m, u = overlap_data$u, + h = overlap_data$h, filter = overlap_data$filter) } else { - data.frame(m = overlap_data$m, cov = overlap_data$cov, - filter = overlap_data$filter) + data.frame(m = overlap_data$m, u = overlap_data$u, + filter = overlap_data$filter) } }) - m <- do.call(cbind, lapply(m_cov_list, `[[`, "m")) - cov <- do.call(cbind, lapply(m_cov_list, `[[`, "cov")) - filter <- do.call(cbind, lapply(m_cov_list, `[[`, "filter")) - other_mod <- do.call(cbind, lapply(m_cov_list, `[[`, "other_mod")) - cov_other_mod <- do.call(cbind, lapply(m_cov_list, `[[`, "cov_other_mod")) - - bsseq_objs <- list() - bsseq_obj_m <- BSseq(M = as.matrix(m), Cov = as.matrix(cov), - Filtered = as.matrix(filter), - coef = NULL, se.coef = NULL, - pos = start(overlap_gr), trans = NULL, - parameters = NULL, pData = colData, gr = NULL, - chr = as.vector(seqnames(overlap_gr)), - sampleNames = sampleNames, rmZeroCov = rmZeroCov) + m <- do.call(cbind, lapply(m_u_list, `[[`, "m")) + u <- do.call(cbind, lapply(m_u_list, `[[`, "u")) + h <- do.call(cbind, lapply(m_u_list, `[[`, "h")) + filter <- do.call(cbind, lapply(m_u_list, `[[`, "filter")) - bsseq_objs[[1]] <- bsseq_obj_m - - if (!is.null(other_mod)){ - bsseq_obj_other_mod <- BSseq(M = as.matrix(other_mod), - Cov = as.matrix(cov_other_mod), - Filtered = as.matrix(filter), - coef = NULL, se.coef = NULL, - pos = start(overlap_gr), trans = NULL, - parameters = NULL, pData = colData, gr = NULL, - chr = as.vector(seqnames(overlap_gr)), - sampleNames = sampleNames, rmZeroCov = rmZeroCov) - - bsseq_objs[[2]] <- bsseq_obj_other_mod - } - - if (strandCollapse){ - for (i in seq_along(bsseq_objs)) { - bsseq:::strandCollapse(bsseq_objs[[i]])} - } + if (!is.null(h)){ + bsseq_obj <- BSseq(M = as.matrix(m + h), + Cov = as.matrix(m + u + h), + Filtered = as.matrix(filter), + coef = NULL, se.coef = NULL, + pos = start(overlap_gr), trans = NULL, + parameters = NULL, pData = colData, gr = NULL, + chr = as.vector(seqnames(overlap_gr)), + sampleNames = sampleNames, rmZeroCov = rmZeroCov) + if (strandCollapse) {strandCollapse(bsseq_obj)} + }else{ + bsseq_obj <- BSseq(M = as.matrix(m), Cov = as.matrix(u + m), + Filtered = as.matrix(filter), + coef = NULL, se.coef = NULL, + pos = start(overlap_gr), trans = NULL, + parameters = NULL, pData = colData, gr = NULL, + chr = as.vector(seqnames(overlap_gr)), + sampleNames = sampleNames, rmZeroCov = rmZeroCov) + if (strandCollapse) {strandCollapse(bsseq_obj)} + } - return(bsseq_objs) + return(bsseq_obj) } diff --git a/inst/extdata/modkit/Hypo1.first50Bed.txt b/inst/extdata/modkit/Hypo1.first50Bed.txt new file mode 100644 index 0000000..024905b --- /dev/null +++ b/inst/extdata/modkit/Hypo1.first50Bed.txt @@ -0,0 +1,50 @@ +chr1 3000826 3000827 h 28 . 3000826 3000827 255,0,0 28 0.00 0 26 2 1 0 0 1 +chr1 3000826 3000827 m 28 . 3000826 3000827 255,0,0 28 7.14 2 26 0 1 0 0 1 +chr1 3001006 3001007 h 27 . 3001006 3001007 255,0,0 27 0.00 0 27 0 0 1 1 1 +chr1 3001006 3001007 m 27 . 3001006 3001007 255,0,0 27 0.00 0 27 0 0 1 1 1 +chr1 3001017 3001018 h 25 . 3001017 3001018 255,0,0 25 0.00 0 25 0 1 3 1 0 +chr1 3001017 3001018 m 25 . 3001017 3001018 255,0,0 25 0.00 0 25 0 1 3 1 0 +chr1 3001276 3001277 h 24 . 3001276 3001277 255,0,0 24 0.00 0 24 0 1 0 1 2 +chr1 3001276 3001277 m 24 . 3001276 3001277 255,0,0 24 0.00 0 24 0 1 0 1 2 +chr1 3001628 3001629 h 25 . 3001628 3001629 255,0,0 25 0.00 0 25 0 0 0 0 1 +chr1 3001628 3001629 m 25 . 3001628 3001629 255,0,0 25 0.00 0 25 0 0 0 0 1 +chr1 3003225 3003226 h 24 . 3003225 3003226 255,0,0 24 4.17 1 12 11 1 5 0 0 +chr1 3003225 3003226 m 24 . 3003225 3003226 255,0,0 24 45.83 11 12 1 1 5 0 0 +chr1 3003338 3003339 h 30 . 3003338 3003339 255,0,0 30 3.33 1 7 22 0 3 0 0 +chr1 3003338 3003339 m 30 . 3003338 3003339 255,0,0 30 73.33 22 7 1 0 3 0 0 +chr1 3003378 3003379 h 31 . 3003378 3003379 255,0,0 31 0.00 0 18 13 0 0 1 2 +chr1 3003378 3003379 m 31 . 3003378 3003379 255,0,0 31 41.94 13 18 0 0 0 1 2 +chr1 3003581 3003582 h 30 . 3003581 3003582 255,0,0 30 0.00 0 24 6 0 3 1 2 +chr1 3003581 3003582 m 30 . 3003581 3003582 255,0,0 30 20.00 6 24 0 0 3 1 2 +chr1 3003639 3003640 h 4 . 3003639 3003640 255,0,0 4 0.00 0 4 0 8 19 0 6 +chr1 3003639 3003640 m 4 . 3003639 3003640 255,0,0 4 0.00 0 4 0 8 19 0 6 +chr1 3003720 3003721 h 32 . 3003720 3003721 255,0,0 32 0.00 0 24 8 0 0 1 4 +chr1 3003720 3003721 m 32 . 3003720 3003721 255,0,0 32 25.00 8 24 0 0 0 1 4 +chr1 3003884 3003885 h 36 . 3003884 3003885 255,0,0 36 0.00 0 21 15 0 2 0 0 +chr1 3003884 3003885 m 36 . 3003884 3003885 255,0,0 36 41.67 15 21 0 0 2 0 0 +chr1 3003897 3003898 h 27 . 3003897 3003898 255,0,0 27 0.00 0 13 14 0 9 1 1 +chr1 3003897 3003898 m 27 . 3003897 3003898 255,0,0 27 51.85 14 13 0 0 9 1 1 +chr1 3004529 3004530 h 35 . 3004529 3004530 255,0,0 35 0.00 0 35 0 1 0 1 1 +chr1 3004529 3004530 m 35 . 3004529 3004530 255,0,0 35 0.00 0 35 0 1 0 1 1 +chr1 3005997 3005998 h 46 . 3005997 3005998 255,0,0 46 0.00 0 46 0 0 1 0 1 +chr1 3005997 3005998 m 46 . 3005997 3005998 255,0,0 46 0.00 0 46 0 0 1 0 1 +chr1 3006186 3006187 h 51 . 3006186 3006187 255,0,0 51 0.00 0 48 3 0 0 0 2 +chr1 3006186 3006187 m 51 . 3006186 3006187 255,0,0 51 5.88 3 48 0 0 0 0 2 +chr1 3006415 3006416 h 48 . 3006415 3006416 255,0,0 48 0.00 0 37 11 0 1 0 1 +chr1 3006415 3006416 m 48 . 3006415 3006416 255,0,0 48 22.92 11 37 0 0 1 0 1 +chr1 3006781 3006782 h 48 . 3006781 3006782 255,0,0 48 0.00 0 39 9 0 0 1 0 +chr1 3006781 3006782 m 48 . 3006781 3006782 255,0,0 48 18.75 9 39 0 0 0 1 0 +chr1 3006880 3006881 h 40 . 3006880 3006881 255,0,0 40 0.00 0 40 0 3 0 4 1 +chr1 3006880 3006881 m 40 . 3006880 3006881 255,0,0 40 0.00 0 40 0 3 0 4 1 +chr1 3007169 3007170 h 41 . 3007169 3007170 255,0,0 41 0.00 0 36 5 0 1 1 2 +chr1 3007169 3007170 m 41 . 3007169 3007170 255,0,0 41 12.20 5 36 0 0 1 1 2 +chr1 3007429 3007430 h 43 . 3007429 3007430 255,0,0 43 0.00 0 29 14 0 2 0 0 +chr1 3007429 3007430 m 43 . 3007429 3007430 255,0,0 43 32.56 14 29 0 0 2 0 0 +chr1 3007531 3007532 h 40 . 3007531 3007532 255,0,0 40 0.00 0 40 0 3 0 1 3 +chr1 3007531 3007532 m 40 . 3007531 3007532 255,0,0 40 0.00 0 40 0 3 0 1 3 +chr1 3007579 3007580 h 43 . 3007579 3007580 255,0,0 43 0.00 0 43 0 0 0 0 5 +chr1 3007579 3007580 m 43 . 3007579 3007580 255,0,0 43 0.00 0 43 0 0 0 0 5 +chr1 3007681 3007682 h 44 . 3007681 3007682 255,0,0 44 0.00 0 44 0 0 0 3 5 +chr1 3007681 3007682 m 44 . 3007681 3007682 255,0,0 44 0.00 0 44 0 0 0 3 5 +chr1 3008544 3008545 h 47 . 3008544 3008545 255,0,0 47 0.00 0 44 3 0 1 3 2 +chr1 3008544 3008545 m 47 . 3008544 3008545 255,0,0 47 6.38 3 44 0 0 1 3 2 \ No newline at end of file diff --git a/inst/extdata/modkit/chr21.chr22.HG002.top1000.bed.gz b/inst/extdata/modkit/chr21.chr22.HG002.top1000.bed.gz new file mode 100644 index 0000000..af77aa3 Binary files /dev/null and b/inst/extdata/modkit/chr21.chr22.HG002.top1000.bed.gz differ diff --git a/inst/extdata/modkit/chr22.HG002.top1000.bed.gz b/inst/extdata/modkit/chr22.HG002.top1000.bed.gz deleted file mode 100644 index c803fd6..0000000 Binary files a/inst/extdata/modkit/chr22.HG002.top1000.bed.gz and /dev/null differ diff --git a/inst/extdata/modkit/chr22.HG002.top1000.other_mod.bed.gz b/inst/extdata/modkit/chr22.HG002.top1000.other_mod.bed.gz deleted file mode 100644 index de25e23..0000000 Binary files a/inst/extdata/modkit/chr22.HG002.top1000.other_mod.bed.gz and /dev/null differ diff --git a/man/read.modkit.Rd b/man/read.modkit.Rd index c4e14e9..58a0538 100644 --- a/man/read.modkit.Rd +++ b/man/read.modkit.Rd @@ -32,15 +32,15 @@ are converted to bedMethyl (BED) files using \href{https://github.com/nanoporete Modkit outputs modified reads, unmodified reads, ambiguous modification reads (reads where the probability was below the threshold and usually failing the lowest 10th percentile), and other modified reads.} \subsection{modkit to Bsseq object}{ - After creating BED files using modkit, the BED files are read in and the Bsseq object is constructed via \code{read.modkit()} function. The function reads in BED files, extract genomic regions, methylation, coverage, ambiguous modification status data and sample information and then construct Bsseq object using \code{BSseq} function within the package. If there are other modified reads, two Bsseq objects are constructed with the first one using major modified reads and their corresponding coverage and the second one using the other modified reads and their corresponding coverage.} + After creating BED files using modkit, the BED files are read in and the Bsseq object is constructed via \code{read.modkit()} function. The function reads in BED files, extract genomic regions, methylation, coverage, ambiguous modification status data and sample information and then construct Bsseq object using \code{BSseq} function within the package. Other modification bases such as hydroxymethylation are extracted and added to the methylation matrix when present.} } \examples{ -# only one modification and one BSseq object is constructed. -files <- c(system.file("extdata/modkit/chr22.HG002.top1000.bed.gz", package = "bsseq")) -bsseq_nano <- bsseq::read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE) +# No other modification present +files <- c(system.file("extdata/modkit/chr21.chr22.HG002.top1000.bed.gz", package = "bsseq")) +bsseq_nano <- read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE) -# there is other modification, so two BSseq objects are constructed. -files <- c(system.file("extdata/modkit/chr22.HG002.top1000.other_mod.bed.gz",package = "bsseq")) -bsseq_nano <- bsseq::read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE) +# Other modification present +files <- c(system.file("extdata/modkit/Hypo1.first50Bed.txt",package = "bsseq")) +bsseq_nano <- read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE) } diff --git a/tests/testthat/test_read.modkit.R b/tests/testthat/test_read.modkit.R index af67251..e68f750 100644 --- a/tests/testthat/test_read.modkit.R +++ b/tests/testthat/test_read.modkit.R @@ -1,13 +1,24 @@ context("read.modkit") # TODO: Re-factor read.modkit() and update tests accordingly -test_that("read.modkit() works for BED files", { - infile <- system.file("extdata", "modkit/chr22.HG002.top1000.other_mod.bed.gz", +test_that("read.modkit() works for BED files without 5hmc", { + infile <- system.file("extdata", "modkit/chr21.chr22.HG002.top1000.bed.gz", package = "bsseq") bsseq <- read.modkit(files = infile, colData = NULL, rmZeroCov = FALSE, strandCollapse = TRUE) - - lapply(bsseq, function(x) {expect_is(x, "BSseq")}) + + expect_is(bsseq, "BSseq") +}) + +test_that("read.modkit() works for BED files with 5hmc", { + infile <- system.file("extdata", "modkit/Hypo1.first50Bed.txt", + package = "bsseq") + bsseq <- read.modkit(files = infile, + colData = NULL, + rmZeroCov = FALSE, + strandCollapse = TRUE) + + expect_is(bsseq, "BSseq") })