Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update functions on handling other modification bases #137

Merged
merged 4 commits into from
Apr 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Package: bsseq
Version: 1.39.3
Version: 1.39.2
Encoding: UTF-8
Title: Analyze, manage and store whole-genome methylation data
Description: A collection of tools for analyzing and visualizing whole-genome
Expand Down
103 changes: 51 additions & 52 deletions R/read.modkit.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,76 +3,75 @@ read.modkit <- function(files,
rmZeroCov = FALSE,
strandCollapse = TRUE){
gr_list <- list()
sampleNames <- sub("\\.bed$", "", basename(files))
sampleNames <- sub("\\.bed.gz$", "", basename(files))
if (!is.null(colData)){
rownames(colData) <- sampleNames
}

for (i in seq_along(files)){
data <- read.table(files[i], header = FALSE, sep="\t",
stringsAsFactors=FALSE, quote="")
gr <- GRanges(seqnames = data$V1,
ranges = IRanges(start = data$V2+1, end = data$V3))
mcols(gr)$m <- data$V12
mcols(gr)$cov <- data$V12 + data$V13
mcols(gr)$filter <- data$V16
if (any(data$V14 != 0)){
mcols(gr)$other_mod <- data$V14
mcols(gr)$cov_other_mod <- data$V14 + data$V13
}

if (length(unique(data$V4)) == 2){
gr <- GRanges(seqnames = data[data$V4 == "m", ]$V1,
ranges = IRanges(start = data[data$V4 == "m", ]$V2+1,
end = data[data$V4 == "m", ]$V3))

mcols(gr)$m <- data[data$V4 == "m", ]$V12
mcols(gr)$h <- data[data$V4 != "m", ]$V12
mcols(gr)$u <- data[data$V4 == "m", ]$V13
mcols(gr)$filter <- data[data$V4 == "m", ]$V16
}else{
gr <- GRanges(seqnames = data$V1,
ranges = IRanges(start = data$V2+1, end = data$V3))

mcols(gr)$m <- data$V12
mcols(gr)$u <- data$V13
mcols(gr)$filter <- data$V16
}

names(gr) <- sampleNames[i]
gr_list[[sampleNames[i]]] <- gr
}

overlap_gr <- Reduce(subsetByOverlaps, gr_list)

m_cov_list <- lapply(gr_list, function(gr){
m_u_list <- lapply(gr_list, function(gr){
overlap_data <- gr[gr %over% overlap_gr]
if (!is.null(gr$other_mod)) {
data.frame(m = overlap_data$m, cov = overlap_data$cov,
filter = overlap_data$filter,
other_mod = overlap_data$other_mod,
cov_other_mod = overlap_data$cov_other_mod)
if (!is.null(gr$h)) {
data.frame(m = overlap_data$m, u = overlap_data$u,
h = overlap_data$h, filter = overlap_data$filter)
} else {
data.frame(m = overlap_data$m, cov = overlap_data$cov,
filter = overlap_data$filter)
data.frame(m = overlap_data$m, u = overlap_data$u,
filter = overlap_data$filter)
}
})

m <- do.call(cbind, lapply(m_cov_list, `[[`, "m"))
cov <- do.call(cbind, lapply(m_cov_list, `[[`, "cov"))
filter <- do.call(cbind, lapply(m_cov_list, `[[`, "filter"))
other_mod <- do.call(cbind, lapply(m_cov_list, `[[`, "other_mod"))
cov_other_mod <- do.call(cbind, lapply(m_cov_list, `[[`, "cov_other_mod"))

bsseq_objs <- list()
bsseq_obj_m <- BSseq(M = as.matrix(m), Cov = as.matrix(cov),
Filtered = as.matrix(filter),
coef = NULL, se.coef = NULL,
pos = start(overlap_gr), trans = NULL,
parameters = NULL, pData = colData, gr = NULL,
chr = as.vector(seqnames(overlap_gr)),
sampleNames = sampleNames, rmZeroCov = rmZeroCov)
m <- do.call(cbind, lapply(m_u_list, `[[`, "m"))
u <- do.call(cbind, lapply(m_u_list, `[[`, "u"))
h <- do.call(cbind, lapply(m_u_list, `[[`, "h"))
filter <- do.call(cbind, lapply(m_u_list, `[[`, "filter"))

bsseq_objs[[1]] <- bsseq_obj_m

if (!is.null(other_mod)){
bsseq_obj_other_mod <- BSseq(M = as.matrix(other_mod),
Cov = as.matrix(cov_other_mod),
Filtered = as.matrix(filter),
coef = NULL, se.coef = NULL,
pos = start(overlap_gr), trans = NULL,
parameters = NULL, pData = colData, gr = NULL,
chr = as.vector(seqnames(overlap_gr)),
sampleNames = sampleNames, rmZeroCov = rmZeroCov)

bsseq_objs[[2]] <- bsseq_obj_other_mod
}

if (strandCollapse){
for (i in seq_along(bsseq_objs)) {
bsseq:::strandCollapse(bsseq_objs[[i]])}
}
if (!is.null(h)){
bsseq_obj <- BSseq(M = as.matrix(m + h),
Cov = as.matrix(m + u + h),
Filtered = as.matrix(filter),
coef = NULL, se.coef = NULL,
pos = start(overlap_gr), trans = NULL,
parameters = NULL, pData = colData, gr = NULL,
chr = as.vector(seqnames(overlap_gr)),
sampleNames = sampleNames, rmZeroCov = rmZeroCov)
if (strandCollapse) {strandCollapse(bsseq_obj)}
}else{
bsseq_obj <- BSseq(M = as.matrix(m), Cov = as.matrix(u + m),
Filtered = as.matrix(filter),
coef = NULL, se.coef = NULL,
pos = start(overlap_gr), trans = NULL,
parameters = NULL, pData = colData, gr = NULL,
chr = as.vector(seqnames(overlap_gr)),
sampleNames = sampleNames, rmZeroCov = rmZeroCov)
if (strandCollapse) {strandCollapse(bsseq_obj)}
}

return(bsseq_objs)
return(bsseq_obj)
}
50 changes: 50 additions & 0 deletions inst/extdata/modkit/Hypo1.first50Bed.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
chr1 3000826 3000827 h 28 . 3000826 3000827 255,0,0 28 0.00 0 26 2 1 0 0 1
chr1 3000826 3000827 m 28 . 3000826 3000827 255,0,0 28 7.14 2 26 0 1 0 0 1
chr1 3001006 3001007 h 27 . 3001006 3001007 255,0,0 27 0.00 0 27 0 0 1 1 1
chr1 3001006 3001007 m 27 . 3001006 3001007 255,0,0 27 0.00 0 27 0 0 1 1 1
chr1 3001017 3001018 h 25 . 3001017 3001018 255,0,0 25 0.00 0 25 0 1 3 1 0
chr1 3001017 3001018 m 25 . 3001017 3001018 255,0,0 25 0.00 0 25 0 1 3 1 0
chr1 3001276 3001277 h 24 . 3001276 3001277 255,0,0 24 0.00 0 24 0 1 0 1 2
chr1 3001276 3001277 m 24 . 3001276 3001277 255,0,0 24 0.00 0 24 0 1 0 1 2
chr1 3001628 3001629 h 25 . 3001628 3001629 255,0,0 25 0.00 0 25 0 0 0 0 1
chr1 3001628 3001629 m 25 . 3001628 3001629 255,0,0 25 0.00 0 25 0 0 0 0 1
chr1 3003225 3003226 h 24 . 3003225 3003226 255,0,0 24 4.17 1 12 11 1 5 0 0
chr1 3003225 3003226 m 24 . 3003225 3003226 255,0,0 24 45.83 11 12 1 1 5 0 0
chr1 3003338 3003339 h 30 . 3003338 3003339 255,0,0 30 3.33 1 7 22 0 3 0 0
chr1 3003338 3003339 m 30 . 3003338 3003339 255,0,0 30 73.33 22 7 1 0 3 0 0
chr1 3003378 3003379 h 31 . 3003378 3003379 255,0,0 31 0.00 0 18 13 0 0 1 2
chr1 3003378 3003379 m 31 . 3003378 3003379 255,0,0 31 41.94 13 18 0 0 0 1 2
chr1 3003581 3003582 h 30 . 3003581 3003582 255,0,0 30 0.00 0 24 6 0 3 1 2
chr1 3003581 3003582 m 30 . 3003581 3003582 255,0,0 30 20.00 6 24 0 0 3 1 2
chr1 3003639 3003640 h 4 . 3003639 3003640 255,0,0 4 0.00 0 4 0 8 19 0 6
chr1 3003639 3003640 m 4 . 3003639 3003640 255,0,0 4 0.00 0 4 0 8 19 0 6
chr1 3003720 3003721 h 32 . 3003720 3003721 255,0,0 32 0.00 0 24 8 0 0 1 4
chr1 3003720 3003721 m 32 . 3003720 3003721 255,0,0 32 25.00 8 24 0 0 0 1 4
chr1 3003884 3003885 h 36 . 3003884 3003885 255,0,0 36 0.00 0 21 15 0 2 0 0
chr1 3003884 3003885 m 36 . 3003884 3003885 255,0,0 36 41.67 15 21 0 0 2 0 0
chr1 3003897 3003898 h 27 . 3003897 3003898 255,0,0 27 0.00 0 13 14 0 9 1 1
chr1 3003897 3003898 m 27 . 3003897 3003898 255,0,0 27 51.85 14 13 0 0 9 1 1
chr1 3004529 3004530 h 35 . 3004529 3004530 255,0,0 35 0.00 0 35 0 1 0 1 1
chr1 3004529 3004530 m 35 . 3004529 3004530 255,0,0 35 0.00 0 35 0 1 0 1 1
chr1 3005997 3005998 h 46 . 3005997 3005998 255,0,0 46 0.00 0 46 0 0 1 0 1
chr1 3005997 3005998 m 46 . 3005997 3005998 255,0,0 46 0.00 0 46 0 0 1 0 1
chr1 3006186 3006187 h 51 . 3006186 3006187 255,0,0 51 0.00 0 48 3 0 0 0 2
chr1 3006186 3006187 m 51 . 3006186 3006187 255,0,0 51 5.88 3 48 0 0 0 0 2
chr1 3006415 3006416 h 48 . 3006415 3006416 255,0,0 48 0.00 0 37 11 0 1 0 1
chr1 3006415 3006416 m 48 . 3006415 3006416 255,0,0 48 22.92 11 37 0 0 1 0 1
chr1 3006781 3006782 h 48 . 3006781 3006782 255,0,0 48 0.00 0 39 9 0 0 1 0
chr1 3006781 3006782 m 48 . 3006781 3006782 255,0,0 48 18.75 9 39 0 0 0 1 0
chr1 3006880 3006881 h 40 . 3006880 3006881 255,0,0 40 0.00 0 40 0 3 0 4 1
chr1 3006880 3006881 m 40 . 3006880 3006881 255,0,0 40 0.00 0 40 0 3 0 4 1
chr1 3007169 3007170 h 41 . 3007169 3007170 255,0,0 41 0.00 0 36 5 0 1 1 2
chr1 3007169 3007170 m 41 . 3007169 3007170 255,0,0 41 12.20 5 36 0 0 1 1 2
chr1 3007429 3007430 h 43 . 3007429 3007430 255,0,0 43 0.00 0 29 14 0 2 0 0
chr1 3007429 3007430 m 43 . 3007429 3007430 255,0,0 43 32.56 14 29 0 0 2 0 0
chr1 3007531 3007532 h 40 . 3007531 3007532 255,0,0 40 0.00 0 40 0 3 0 1 3
chr1 3007531 3007532 m 40 . 3007531 3007532 255,0,0 40 0.00 0 40 0 3 0 1 3
chr1 3007579 3007580 h 43 . 3007579 3007580 255,0,0 43 0.00 0 43 0 0 0 0 5
chr1 3007579 3007580 m 43 . 3007579 3007580 255,0,0 43 0.00 0 43 0 0 0 0 5
chr1 3007681 3007682 h 44 . 3007681 3007682 255,0,0 44 0.00 0 44 0 0 0 3 5
chr1 3007681 3007682 m 44 . 3007681 3007682 255,0,0 44 0.00 0 44 0 0 0 3 5
chr1 3008544 3008545 h 47 . 3008544 3008545 255,0,0 47 0.00 0 44 3 0 1 3 2
chr1 3008544 3008545 m 47 . 3008544 3008545 255,0,0 47 6.38 3 44 0 0 1 3 2
Binary file not shown.
Binary file removed inst/extdata/modkit/chr22.HG002.top1000.bed.gz
Binary file not shown.
Binary file not shown.
14 changes: 7 additions & 7 deletions man/read.modkit.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@ are converted to bedMethyl (BED) files using \href{https://github.com/nanoporete
Modkit outputs modified reads, unmodified reads, ambiguous modification reads (reads where the probability was below the threshold and usually failing the lowest 10th percentile), and other modified reads.}

\subsection{modkit to Bsseq object}{
After creating BED files using modkit, the BED files are read in and the Bsseq object is constructed via \code{read.modkit()} function. The function reads in BED files, extract genomic regions, methylation, coverage, ambiguous modification status data and sample information and then construct Bsseq object using \code{BSseq} function within the package. If there are other modified reads, two Bsseq objects are constructed with the first one using major modified reads and their corresponding coverage and the second one using the other modified reads and their corresponding coverage.}
After creating BED files using modkit, the BED files are read in and the Bsseq object is constructed via \code{read.modkit()} function. The function reads in BED files, extract genomic regions, methylation, coverage, ambiguous modification status data and sample information and then construct Bsseq object using \code{BSseq} function within the package. Other modification bases such as hydroxymethylation are extracted and added to the methylation matrix when present.}
}

\examples{
# only one modification and one BSseq object is constructed.
files <- c(system.file("extdata/modkit/chr22.HG002.top1000.bed.gz", package = "bsseq"))
bsseq_nano <- bsseq::read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE)
# No other modification present
files <- c(system.file("extdata/modkit/chr21.chr22.HG002.top1000.bed.gz", package = "bsseq"))
bsseq_nano <- read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE)

# there is other modification, so two BSseq objects are constructed.
files <- c(system.file("extdata/modkit/chr22.HG002.top1000.other_mod.bed.gz",package = "bsseq"))
bsseq_nano <- bsseq::read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE)
# Other modification present
files <- c(system.file("extdata/modkit/Hypo1.first50Bed.txt",package = "bsseq"))
bsseq_nano <- read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE)
}
19 changes: 15 additions & 4 deletions tests/testthat/test_read.modkit.R
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
context("read.modkit")

# TODO: Re-factor read.modkit() and update tests accordingly
test_that("read.modkit() works for BED files", {
infile <- system.file("extdata", "modkit/chr22.HG002.top1000.other_mod.bed.gz",
test_that("read.modkit() works for BED files without 5hmc", {
infile <- system.file("extdata", "modkit/chr21.chr22.HG002.top1000.bed.gz",
package = "bsseq")
bsseq <- read.modkit(files = infile,
colData = NULL,
rmZeroCov = FALSE,
strandCollapse = TRUE)

lapply(bsseq, function(x) {expect_is(x, "BSseq")})

expect_is(bsseq, "BSseq")
})

test_that("read.modkit() works for BED files with 5hmc", {
infile <- system.file("extdata", "modkit/Hypo1.first50Bed.txt",
package = "bsseq")
bsseq <- read.modkit(files = infile,
colData = NULL,
rmZeroCov = FALSE,
strandCollapse = TRUE)

expect_is(bsseq, "BSseq")
})
Loading