-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #131 from yixuan-chen-elisa/read.modkit
Create a function read.modkit to read in methyl Bed files created by modkit to Bsseq object
- Loading branch information
Showing
9 changed files
with
141 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
read.modkit = function(files, | ||
colData = NULL, | ||
rmZeroCov = FALSE, | ||
strandCollapse = TRUE){ | ||
gr_list = list() | ||
sampleNames = sub("\\.bed$", "", basename(files)) | ||
if (!is.null(colData)){ | ||
rownames(colData) <- sampleNames | ||
} | ||
|
||
for (i in seq_along(files)){ | ||
data = read.table(files[i], header = FALSE, sep="\t", | ||
stringsAsFactors=FALSE, quote="") | ||
|
||
gr = GRanges(seqnames = data$V1, | ||
ranges = IRanges(start = data$V2+1, end = data$V3)) | ||
mcols(gr)$m = data$V12 | ||
mcols(gr)$cov = data$V12 + data$V13 | ||
mcols(gr)$filter = data$V16 | ||
if (any(data$V14 != 0)){ | ||
mcols(gr)$other_mod = data$V14 | ||
mcols(gr)$cov_other_mod = data$V14 + data$V13} | ||
names(gr) = sampleNames[i] | ||
gr_list[[sampleNames[i]]] = gr | ||
} | ||
|
||
overlap_gr = Reduce(subsetByOverlaps, gr_list) | ||
|
||
m_cov_list = lapply(gr_list, function(gr){ | ||
overlap_data = gr[gr %over% overlap_gr] | ||
if (!is.null(gr$other_mod)) { | ||
data.frame(m = overlap_data$m, cov = overlap_data$cov, | ||
filter = overlap_data$filter, | ||
other_mod = overlap_data$other_mod, | ||
cov_other_mod = overlap_data$cov_other_mod) | ||
} else { | ||
data.frame(m = overlap_data$m, cov = overlap_data$cov, | ||
filter = overlap_data$filter) | ||
} | ||
}) | ||
|
||
m = do.call(cbind, lapply(m_cov_list, `[[`, "m")) | ||
cov = do.call(cbind, lapply(m_cov_list, `[[`, "cov")) | ||
filter = do.call(cbind, lapply(m_cov_list, `[[`, "filter")) | ||
other_mod = do.call(cbind, lapply(m_cov_list, `[[`, "other_mod")) | ||
cov_other_mod = do.call(cbind, lapply(m_cov_list, `[[`, "cov_other_mod")) | ||
|
||
bsseq_objs = list() | ||
bsseq_obj_m = BSseq(M = as.matrix(m), Cov = as.matrix(cov), | ||
Filtered = as.matrix(filter), | ||
coef = NULL, se.coef = NULL, | ||
pos = start(overlap_gr), trans = NULL, | ||
parameters = NULL, pData = colData, gr = NULL, | ||
chr = as.vector(seqnames(overlap_gr)), | ||
sampleNames = sampleNames, rmZeroCov = rmZeroCov) | ||
|
||
bsseq_objs[[1]] = bsseq_obj_m | ||
|
||
if (!is.null(other_mod)){ | ||
bsseq_obj_other_mod = BSseq(M = as.matrix(other_mod), | ||
Cov = as.matrix(cov_other_mod), | ||
Filtered = as.matrix(filter), | ||
coef = NULL, se.coef = NULL, | ||
pos = start(overlap_gr), trans = NULL, | ||
parameters = NULL, pData = colData, gr = NULL, | ||
chr = as.vector(seqnames(overlap_gr)), | ||
sampleNames = sampleNames, rmZeroCov = rmZeroCov) | ||
|
||
bsseq_objs[[2]] = bsseq_obj_other_mod | ||
} | ||
|
||
if (strandCollapse){ | ||
for (i in 1:length(bsseq_objs)) { | ||
bsseq:::strandCollapse(bsseq_objs[[i]])} | ||
} | ||
|
||
return(bsseq_objs) | ||
} |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
\name{read.modkit} | ||
\alias{read.modkit} | ||
\title{Construct BSseq objects from nanopore BED files} | ||
\description{ | ||
Construct BSseq objects from nanopore BED files | ||
} | ||
\usage{ | ||
read.modkit( | ||
files, | ||
colData = NULL, | ||
rmZeroCov = FALSE, | ||
strandCollapse = TRUE | ||
) | ||
} | ||
\arguments{ | ||
\item{files}{vector, BED files} | ||
\item{colData}{data frame, phenotypic data with samples as rows and variables as columns} | ||
\item{rmZeroCov}{A logical (1) indicating whether methylation loci that have zero coverage in all samples be removed} | ||
\item{strandCollapse}{A logical (1) indicating whether stand-symmetric methylation loci (i.e. CpGs) should be collapsed across strands} | ||
} | ||
|
||
\value{ | ||
BSseq objects | ||
} | ||
\details{ | ||
This function reads in nanopore sequencing modified BED files | ||
to Bsseq objects. Nanopore sequencing data (i.e. aggregated modified base | ||
counts) is stored in modified-base BAM files. These modified-base BAM files | ||
are converted to bedMethyl (BED) files using \href{https://github.com/nanoporetech/modkit}{modkit}. | ||
|
||
\subsection{Details for modkit}{ | ||
Modkit outputs modified reads, unmodified reads, ambiguous modification reads (reads where the probability was below the threshold and usually failing the lowest 10th percentile), and other modified reads.} | ||
|
||
\subsection{modkit to Bsseq object}{ | ||
After creating BED files using modkit, the BED files are read in and the Bsseq object is constructed via \code{read.modkit()} function. The function reads in BED files, extract genomic regions, methylation, coverage, ambiguous modification status data and sample information and then construct Bsseq object using \code{BSseq} function within the package. If there are other modified reads, two Bsseq objects are constructed with the first one using major modified reads and their corresponding coverage and the second one using the other modified reads and their corresponding coverage.} | ||
} | ||
|
||
\examples{ | ||
# only one modification and one BSseq object is constructed. | ||
files <- c(system.file("extdata/modkit/chr22.HG002.top1000.bed.gz", package = "bsseq")) | ||
bsseq_nano <- bsseq::read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE) | ||
|
||
# there is other modification, so two BSseq objects are constructed. | ||
files <- c(system.file("extdata/modkit/chr22.HG002.top1000.other_mod.bed.gz",package = "bsseq")) | ||
bsseq_nano <- bsseq::read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
context("read.modkit") | ||
|
||
# TODO: Re-factor read.modkit() and update tests accordingly | ||
test_that("read.modkit() works for BED files", { | ||
infile <- system.file("extdata", "modkit/chr22.HG002.top1000.other_mod.bed.gz", | ||
package = "bsseq") | ||
bsseq <- read.modkit(files = infile, | ||
colData = NULL, | ||
rmZeroCov = FALSE, | ||
strandCollapse = TRUE) | ||
|
||
lapply(bsseq, function(x) {expect_is(x, "BSseq")}) | ||
}) |