hansenlab · kasperdanielhansen · Apr 8, 2024 · Apr 3, 2024 · Apr 4, 2024 · Apr 4, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: bsseq
-Version: 1.39.3
+Version: 1.39.2
 Encoding: UTF-8
 Title: Analyze, manage and store whole-genome methylation data
 Description: A collection of tools for analyzing and visualizing whole-genome

diff --git a/R/read.modkit.R b/R/read.modkit.R
@@ -3,76 +3,75 @@ read.modkit <- function(files,
                         rmZeroCov = FALSE,
                         strandCollapse = TRUE){
     gr_list <- list()
-    sampleNames <- sub("\\.bed$", "", basename(files))
+    sampleNames <- sub("\\.bed.gz$", "", basename(files))
     if (!is.null(colData)){
         rownames(colData) <- sampleNames
         }
 
     for (i in seq_along(files)){
         data <- read.table(files[i], header = FALSE, sep="\t",
                     stringsAsFactors=FALSE, quote="")
-        gr <- GRanges(seqnames = data$V1,
-                  ranges = IRanges(start = data$V2+1, end = data$V3))
-        mcols(gr)$m <- data$V12
-        mcols(gr)$cov <- data$V12 + data$V13
-        mcols(gr)$filter <- data$V16
-        if (any(data$V14 != 0)){
-            mcols(gr)$other_mod <- data$V14
-            mcols(gr)$cov_other_mod <- data$V14 + data$V13
-            }
+
+        if (length(unique(data$V4)) == 2){
+            gr <- GRanges(seqnames = data[data$V4 == "m", ]$V1,
+                    ranges = IRanges(start = data[data$V4 == "m", ]$V2+1,
+                                     end = data[data$V4 == "m", ]$V3))
+
+            mcols(gr)$m <- data[data$V4 == "m", ]$V12
+            mcols(gr)$h <- data[data$V4 != "m", ]$V12
+            mcols(gr)$u <- data[data$V4 == "m", ]$V13
+            mcols(gr)$filter <- data[data$V4 == "m", ]$V16
+        }else{
+            gr <- GRanges(seqnames = data$V1,
+                          ranges = IRanges(start = data$V2+1, end = data$V3))
+
+            mcols(gr)$m <- data$V12
+            mcols(gr)$u <- data$V13
+            mcols(gr)$filter <- data$V16
+        }
+
         names(gr) <- sampleNames[i]
         gr_list[[sampleNames[i]]] <- gr
         }
 
     overlap_gr <- Reduce(subsetByOverlaps, gr_list)
 
-    m_cov_list <- lapply(gr_list, function(gr){
+    m_u_list <- lapply(gr_list, function(gr){
         overlap_data <- gr[gr %over% overlap_gr]
-        if (!is.null(gr$other_mod)) {
-            data.frame(m = overlap_data$m, cov = overlap_data$cov,
-                filter = overlap_data$filter,
-                other_mod = overlap_data$other_mod,
-                cov_other_mod = overlap_data$cov_other_mod)
+        if (!is.null(gr$h)) {
+            data.frame(m = overlap_data$m, u = overlap_data$u,
+                       h = overlap_data$h, filter = overlap_data$filter)
             } else {
-                data.frame(m = overlap_data$m, cov = overlap_data$cov,
-                    filter = overlap_data$filter)
+                data.frame(m = overlap_data$m,  u = overlap_data$u,
+                           filter = overlap_data$filter)
                 }
         })
 
-    m <- do.call(cbind, lapply(m_cov_list, `[[`, "m"))
-    cov <- do.call(cbind, lapply(m_cov_list, `[[`, "cov"))
-    filter <- do.call(cbind, lapply(m_cov_list, `[[`, "filter"))
-    other_mod <- do.call(cbind, lapply(m_cov_list, `[[`, "other_mod"))
-    cov_other_mod <- do.call(cbind, lapply(m_cov_list, `[[`, "cov_other_mod"))
-
-    bsseq_objs <- list()
-    bsseq_obj_m <- BSseq(M = as.matrix(m), Cov = as.matrix(cov),
-                      Filtered = as.matrix(filter),
-                      coef = NULL, se.coef = NULL,
-                      pos = start(overlap_gr), trans = NULL,
-                      parameters = NULL, pData = colData, gr = NULL,
-                      chr = as.vector(seqnames(overlap_gr)),
-                      sampleNames = sampleNames, rmZeroCov = rmZeroCov)
+    m <- do.call(cbind, lapply(m_u_list, `[[`, "m"))
+    u <- do.call(cbind, lapply(m_u_list, `[[`, "u"))
+    h <- do.call(cbind, lapply(m_u_list, `[[`, "h"))
+    filter <- do.call(cbind, lapply(m_u_list, `[[`, "filter"))
 
-    bsseq_objs[[1]] <- bsseq_obj_m
-
-    if (!is.null(other_mod)){
-        bsseq_obj_other_mod <- BSseq(M = as.matrix(other_mod),
-                                   Cov = as.matrix(cov_other_mod),
-                                   Filtered = as.matrix(filter),
-                                   coef = NULL, se.coef = NULL,
-                                   pos = start(overlap_gr), trans = NULL,
-                                   parameters = NULL, pData = colData, gr = NULL,
-                                   chr = as.vector(seqnames(overlap_gr)),
-                                   sampleNames = sampleNames, rmZeroCov = rmZeroCov)
-
-        bsseq_objs[[2]] <- bsseq_obj_other_mod
-        }
-
-    if (strandCollapse){
-        for (i in seq_along(bsseq_objs)) {
-            bsseq:::strandCollapse(bsseq_objs[[i]])}
-        }
+    if (!is.null(h)){
+          bsseq_obj <- BSseq(M = as.matrix(m + h),
+                             Cov = as.matrix(m + u + h),
+                             Filtered = as.matrix(filter),
+                             coef = NULL, se.coef = NULL,
+                             pos = start(overlap_gr), trans = NULL,
+                             parameters = NULL, pData = colData, gr = NULL,
+                             chr = as.vector(seqnames(overlap_gr)),
+                             sampleNames = sampleNames, rmZeroCov = rmZeroCov)
+          if (strandCollapse) {strandCollapse(bsseq_obj)}
+    }else{
+      bsseq_obj <- BSseq(M = as.matrix(m), Cov = as.matrix(u + m),
+                         Filtered = as.matrix(filter),
+                         coef = NULL, se.coef = NULL,
+                         pos = start(overlap_gr), trans = NULL,
+                         parameters = NULL, pData = colData, gr = NULL,
+                         chr = as.vector(seqnames(overlap_gr)),
+                         sampleNames = sampleNames, rmZeroCov = rmZeroCov)
+      if (strandCollapse) {strandCollapse(bsseq_obj)}
+    }
 
-    return(bsseq_objs)
+    return(bsseq_obj)
 }
diff --git a/inst/extdata/modkit/Hypo1.first50Bed.txt b/inst/extdata/modkit/Hypo1.first50Bed.txt
@@ -0,0 +1,50 @@
+chr1	3000826	3000827	h	28	.	3000826	3000827	255,0,0	28	0.00	0	26	2	1	0	0	1
+chr1	3000826	3000827	m	28	.	3000826	3000827	255,0,0	28	7.14	2	26	0	1	0	0	1
+chr1	3001006	3001007	h	27	.	3001006	3001007	255,0,0	27	0.00	0	27	0	0	1	1	1
+chr1	3001006	3001007	m	27	.	3001006	3001007	255,0,0	27	0.00	0	27	0	0	1	1	1
+chr1	3001017	3001018	h	25	.	3001017	3001018	255,0,0	25	0.00	0	25	0	1	3	1	0
+chr1	3001017	3001018	m	25	.	3001017	3001018	255,0,0	25	0.00	0	25	0	1	3	1	0
+chr1	3001276	3001277	h	24	.	3001276	3001277	255,0,0	24	0.00	0	24	0	1	0	1	2
+chr1	3001276	3001277	m	24	.	3001276	3001277	255,0,0	24	0.00	0	24	0	1	0	1	2
+chr1	3001628	3001629	h	25	.	3001628	3001629	255,0,0	25	0.00	0	25	0	0	0	0	1
+chr1	3001628	3001629	m	25	.	3001628	3001629	255,0,0	25	0.00	0	25	0	0	0	0	1
+chr1	3003225	3003226	h	24	.	3003225	3003226	255,0,0	24	4.17	1	12	11	1	5	0	0
+chr1	3003225	3003226	m	24	.	3003225	3003226	255,0,0	24	45.83	11	12	1	1	5	0	0
+chr1	3003338	3003339	h	30	.	3003338	3003339	255,0,0	30	3.33	1	7	22	0	3	0	0
+chr1	3003338	3003339	m	30	.	3003338	3003339	255,0,0	30	73.33	22	7	1	0	3	0	0
+chr1	3003378	3003379	h	31	.	3003378	3003379	255,0,0	31	0.00	0	18	13	0	0	1	2
+chr1	3003378	3003379	m	31	.	3003378	3003379	255,0,0	31	41.94	13	18	0	0	0	1	2
+chr1	3003581	3003582	h	30	.	3003581	3003582	255,0,0	30	0.00	0	24	6	0	3	1	2
+chr1	3003581	3003582	m	30	.	3003581	3003582	255,0,0	30	20.00	6	24	0	0	3	1	2
+chr1	3003639	3003640	h	4	.	3003639	3003640	255,0,0	4	0.00	0	4	0	8	19	0	6
+chr1	3003639	3003640	m	4	.	3003639	3003640	255,0,0	4	0.00	0	4	0	8	19	0	6
+chr1	3003720	3003721	h	32	.	3003720	3003721	255,0,0	32	0.00	0	24	8	0	0	1	4
+chr1	3003720	3003721	m	32	.	3003720	3003721	255,0,0	32	25.00	8	24	0	0	0	1	4
+chr1	3003884	3003885	h	36	.	3003884	3003885	255,0,0	36	0.00	0	21	15	0	2	0	0
+chr1	3003884	3003885	m	36	.	3003884	3003885	255,0,0	36	41.67	15	21	0	0	2	0	0
+chr1	3003897	3003898	h	27	.	3003897	3003898	255,0,0	27	0.00	0	13	14	0	9	1	1
+chr1	3003897	3003898	m	27	.	3003897	3003898	255,0,0	27	51.85	14	13	0	0	9	1	1
+chr1	3004529	3004530	h	35	.	3004529	3004530	255,0,0	35	0.00	0	35	0	1	0	1	1
+chr1	3004529	3004530	m	35	.	3004529	3004530	255,0,0	35	0.00	0	35	0	1	0	1	1
+chr1	3005997	3005998	h	46	.	3005997	3005998	255,0,0	46	0.00	0	46	0	0	1	0	1
+chr1	3005997	3005998	m	46	.	3005997	3005998	255,0,0	46	0.00	0	46	0	0	1	0	1
+chr1	3006186	3006187	h	51	.	3006186	3006187	255,0,0	51	0.00	0	48	3	0	0	0	2
+chr1	3006186	3006187	m	51	.	3006186	3006187	255,0,0	51	5.88	3	48	0	0	0	0	2
+chr1	3006415	3006416	h	48	.	3006415	3006416	255,0,0	48	0.00	0	37	11	0	1	0	1
+chr1	3006415	3006416	m	48	.	3006415	3006416	255,0,0	48	22.92	11	37	0	0	1	0	1
+chr1	3006781	3006782	h	48	.	3006781	3006782	255,0,0	48	0.00	0	39	9	0	0	1	0
+chr1	3006781	3006782	m	48	.	3006781	3006782	255,0,0	48	18.75	9	39	0	0	0	1	0
+chr1	3006880	3006881	h	40	.	3006880	3006881	255,0,0	40	0.00	0	40	0	3	0	4	1
+chr1	3006880	3006881	m	40	.	3006880	3006881	255,0,0	40	0.00	0	40	0	3	0	4	1
+chr1	3007169	3007170	h	41	.	3007169	3007170	255,0,0	41	0.00	0	36	5	0	1	1	2
+chr1	3007169	3007170	m	41	.	3007169	3007170	255,0,0	41	12.20	5	36	0	0	1	1	2
+chr1	3007429	3007430	h	43	.	3007429	3007430	255,0,0	43	0.00	0	29	14	0	2	0	0
+chr1	3007429	3007430	m	43	.	3007429	3007430	255,0,0	43	32.56	14	29	0	0	2	0	0
+chr1	3007531	3007532	h	40	.	3007531	3007532	255,0,0	40	0.00	0	40	0	3	0	1	3
+chr1	3007531	3007532	m	40	.	3007531	3007532	255,0,0	40	0.00	0	40	0	3	0	1	3
+chr1	3007579	3007580	h	43	.	3007579	3007580	255,0,0	43	0.00	0	43	0	0	0	0	5
+chr1	3007579	3007580	m	43	.	3007579	3007580	255,0,0	43	0.00	0	43	0	0	0	0	5
+chr1	3007681	3007682	h	44	.	3007681	3007682	255,0,0	44	0.00	0	44	0	0	0	3	5
+chr1	3007681	3007682	m	44	.	3007681	3007682	255,0,0	44	0.00	0	44	0	0	0	3	5
+chr1	3008544	3008545	h	47	.	3008544	3008545	255,0,0	47	0.00	0	44	3	0	1	3	2
+chr1	3008544	3008545	m	47	.	3008544	3008545	255,0,0	47	6.38	3	44	0	0	1	3	2
diff --git a/inst/extdata/modkit/chr21.chr22.HG002.top1000.bed.gz b/inst/extdata/modkit/chr21.chr22.HG002.top1000.bed.gz
diff --git a/inst/extdata/modkit/chr22.HG002.top1000.bed.gz b/inst/extdata/modkit/chr22.HG002.top1000.bed.gz
diff --git a/inst/extdata/modkit/chr22.HG002.top1000.other_mod.bed.gz b/inst/extdata/modkit/chr22.HG002.top1000.other_mod.bed.gz
diff --git a/man/read.modkit.Rd b/man/read.modkit.Rd
@@ -32,15 +32,15 @@ are converted to bedMethyl (BED) files using \href{https://github.com/nanoporete
     Modkit outputs modified reads, unmodified reads, ambiguous modification reads (reads where the probability was below the threshold and usually failing the lowest 10th percentile), and other modified reads.}
 
 \subsection{modkit to Bsseq object}{
-    After creating BED files using modkit, the BED files are read in and the Bsseq object is constructed via \code{read.modkit()} function. The function reads in BED files, extract genomic regions, methylation, coverage, ambiguous modification status data and sample information and then construct Bsseq object using \code{BSseq} function within the package. If there are other modified reads, two Bsseq objects are constructed with the first one using major modified reads and their corresponding coverage and the second one using the other modified reads and their corresponding coverage.}
+    After creating BED files using modkit, the BED files are read in and the Bsseq object is constructed via \code{read.modkit()} function. The function reads in BED files, extract genomic regions, methylation, coverage, ambiguous modification status data and sample information and then construct Bsseq object using \code{BSseq} function within the package. Other modification bases such as hydroxymethylation are extracted and added to the methylation matrix when present.}
 }
 
 \examples{
-# only one modification and one BSseq object is constructed.
-files <- c(system.file("extdata/modkit/chr22.HG002.top1000.bed.gz", package = "bsseq"))
-bsseq_nano <- bsseq::read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE)
+# No other modification present
+files <- c(system.file("extdata/modkit/chr21.chr22.HG002.top1000.bed.gz", package = "bsseq"))
+bsseq_nano <- read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE)
 
-# there is other modification, so two BSseq objects are constructed.
-files <- c(system.file("extdata/modkit/chr22.HG002.top1000.other_mod.bed.gz",package = "bsseq"))
-bsseq_nano <- bsseq::read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE)
+# Other modification present
+files <- c(system.file("extdata/modkit/Hypo1.first50Bed.txt",package = "bsseq"))
+bsseq_nano <- read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE)
 }
diff --git a/tests/testthat/test_read.modkit.R b/tests/testthat/test_read.modkit.R
@@ -1,13 +1,24 @@
 context("read.modkit")
 
 # TODO: Re-factor read.modkit() and update tests accordingly
-test_that("read.modkit() works for BED files", {
-    infile <- system.file("extdata", "modkit/chr22.HG002.top1000.other_mod.bed.gz",
+test_that("read.modkit() works for BED files without 5hmc", {
+    infile <- system.file("extdata", "modkit/chr21.chr22.HG002.top1000.bed.gz",
                           package = "bsseq")
     bsseq <- read.modkit(files = infile,
                          colData = NULL,
                          rmZeroCov = FALSE,
                          strandCollapse = TRUE)
-
-    lapply(bsseq, function(x) {expect_is(x, "BSseq")})
+
+    expect_is(bsseq, "BSseq")
+})
+
+test_that("read.modkit() works for BED files with 5hmc", {
+    infile <- system.file("extdata", "modkit/Hypo1.first50Bed.txt",
+                          package = "bsseq")
+    bsseq <- read.modkit(files = infile,
+                         colData = NULL,
+                         rmZeroCov = FALSE,
+                         strandCollapse = TRUE)
+
+    expect_is(bsseq, "BSseq")
 })