Removed digestion code

lucasnell · Mar 19, 2019 · 63fb595 · 63fb595
1 parent 2808864
commit 63fb595
Show file tree

Hide file tree

Showing 25 changed files with 24 additions and 2,111 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,6 +14,5 @@ inst/doc
 Gac
 jackal_*.tgz
 *.pdf
-methods
 _crash_test.R
 !inst/art_profiles/*.txt.gz
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -9,8 +9,7 @@ Description: `jackal` efficiently (i) reads and simulates reference genomes;
     (ii) generates variants using summary statistics, phylogenies, Variant
     Call Format (VCF) files, and coalescent simulations—the latter of which can include
     selection, recombination, and demographic fluctuations;
-    (iii) simulates sequencing error, mapping qualities, restriction-enzyme digestion, 
-    and variance in coverage among sites; and
+    (iii) simulates sequencing error, mapping qualities, and optical/PCR duplicates; and
     (iv) writes outputs to standard file formats.
     `jackal` can simulate single, paired-end, or mate-pair Illumina reads, as well as
     reads from Pacific BioSciences.

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,7 +2,6 @@
 
 export(create_genome)
 export(create_variants)
-export(digest)
 export(illumina)
 export(make_mevo)
 export(mevo)

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -94,81 +94,6 @@ rando_seqs <- function(n_seqs, len_mean, len_sd = 0, pi_tcag = numeric(0), n_cor
     .Call(`_jackal_rando_seqs`, n_seqs, len_mean, len_sd, pi_tcag, n_cores)
 }
 
-#' Calculate how many bases come before a cleavage site.
-#'
-#'
-#' @noRd
-#'
-get_precleavage_lens <- function(seqs) {
-    .Call(`_jackal_get_precleavage_lens`, seqs)
-}
-
-#' Expand sequences for reverse complements and for non-specific nucleobases.
-#'
-#'
-#' @noRd
-#'
-expand_seqs <- function(seqs) {
-    .Call(`_jackal_expand_seqs`, seqs)
-}
-
-#' Internal C++ function to digest all sequences for all variants in a variant set.
-#'
-#'
-#'
-#' @param var_set_ptr An external pointer to a C++ \code{VarSet} object
-#'     representing variants from the reference genome.
-#' @inheritParams bind_sites digest_ref
-#' @inheritParams len5s digest_ref
-#' @param chunk_size The size of chunks to divide sequences into when digesting.
-#' @param n_cores The number of cores to use for processing. Defaults to \code{1}.
-#'
-#' @return A list of lists, each sub-list containing multiple vectors representing
-#'     the locations of cut sites for a given variant on a given sequence.
-#'     Indexing the output list would be done as such:
-#'     \code{output_list[[variant_index]][[sequence_index]][position_index]}.
-#'
-#' @noRd
-#'
-digest_var_set <- function(var_set_ptr, bind_sites, len5s, chunk_size, n_cores = 1L) {
-    .Call(`_jackal_digest_var_set`, var_set_ptr, bind_sites, len5s, chunk_size, n_cores)
-}
-
-#' Internal C++ function to digest all sequences in a reference genome.
-#'
-#'
-#'
-#' @param ref_genome_ptr An external pointer to a C++ \code{RefGenome} object
-#'     representing the reference genome.
-#' @param bind_sites Vector of enzyme full recognition site(s).
-#' @param len5s A vector of the numbers of characters of the prime5 sites for each
-#'     recognition site.
-#' @param chunk_size Size of chunks to break sequences into for processing.
-#'     This value is ignored if it's set to zero.
-#'     Ideally this is set to a value that results in a number of chunks divisible by
-#'     the number of cores you're using, and is most useful when `n_cores` is greater
-#'     than the number of scaffolds.
-#'     Breaking into increasingly small chunks results in increasing overhead, so
-#'     beware of making this argument very small.
-#'     Reference genome sequences are not copied during this function, so using
-#'     this argument for a reference genome does NOT decrease memory usage
-#'     appreciably.
-#'     Defaults to \code{0}.
-#' @param n_cores The number of cores to use for processing. This value is ignored
-#'     if the input reference genome is merged and \code{chunk_size == 0}.
-#'     Defaults to \code{1}.
-#'
-#' @return A list of vectors, each vector representing the locations of cut sites
-#'     on a given sequence.
-#'     Indexing the output list would be done as such:
-#'     \code{output_list[[sequence_index]][position_index]}.
-#'
-#' @noRd
-#'
-digest_ref <- function(ref_genome_ptr, bind_sites, len5s, chunk_size = 0L, n_cores = 1L) {
-    .Call(`_jackal_digest_ref`, ref_genome_ptr, bind_sites, len5s, chunk_size, n_cores)
-}
-
 #' Illumina sequence for reference object.
 #'
 #'

diff --git a/R/aaa-classes.R b/R/aaa-classes.R
@@ -11,8 +11,6 @@
 #'
 #' @field genome An \code{externalptr} to a C++ object storing the sequences
 #'     representing the genome.
-#' @field digests An \code{externalptr} to a C++ object storing the digestion of the
-#'     genome, if a digestion has been carried out. It's \code{NULL} otherwise.
 #'
 #' @section Methods:
 #' \describe{
@@ -55,7 +53,6 @@ ref_genome <- R6::R6Class(
     public = list(
 
         genome = NULL,
-        digests = NULL,
 
         initialize = function(genome_ptr) {
             if (!inherits(genome_ptr, "externalptr")) {
@@ -384,8 +381,6 @@ mevo$lock()
 #'
 #' @field genome An \code{externalptr} to a C++ object storing the sequences
 #'     representing the genome.
-#' @field digests An \code{externalptr} to a C++ object storing the digestion of the
-#'     genome, if a digestion has been carried out. It's \code{NULL} otherwise.
 #' @field reference An \code{externalptr} to a C++ object storing the sequences
 #'     representing the genome.
 #'     There are a few extra notes for this field:
@@ -437,7 +432,6 @@ variants <- R6::R6Class(
     public = list(
 
         genomes = NULL,
-        digests = NULL,
 
         initialize = function(genomes_ptr, reference_ptr) {
             if (!inherits(genomes_ptr, "externalptr")) {

diff --git a/R/data.R b/R/data.R
@@ -1,34 +1,3 @@
-#' Binding sites for selected restriction enzymes.
-#'
-#' @format A 695 x 2 data frame with the following columns:
-#' \describe{
-#'   \item{enzyme}{Restriction enzyme name.}
-#'   \item{sites}{Enzyme binding site sequences.
-#'       See \code{\link{nucleobase_legend}} for what bases other than `T`, `C`, `A`,
-#'       and `G` mean.
-#'       Each `/` indicates a cleavage point.
-#'       According to NEB (see link below under Source):
-#'       "Numbers in parentheses indicate point of cleaveage for non-palindromic
-#'       enzymes."
-#'       These types of enzymes are not implemented.}
-#' }
-#' @source \url{https://www.neb.com/tools-and-resources/selection-charts/alphabetized-list-of-recognition-specificities}
-"binding_sites"
-
-
-
-#' Legend for the single-letter code of nucleobases indicating restriction sequences.
-#'
-#' @format A data frame of 28 rows and two columns:
-#' \describe{
-#'   \item{code}{The letter indicating more than one possible nucleotides.}
-#'   \item{nucleotides}{One of the multiple nucleotides the code can refer to.}
-#' }
-#' @source \url{https://en.wikipedia.org/wiki/List_of_restriction_enzyme_cutting_sites:_A#Whole_list_navigation}
-"nucleobase_legend"
-
-
-
 
 #' Table of evolutionary rates.
 #'

diff --git a/R/digest.R b/R/digest.R
diff --git a/R/jackal.R b/R/jackal.R
@@ -4,13 +4,10 @@
 #' (ii) generates variants using summary statistics, phylogenies, Variant
 #' Call Format (VCF) files, and coalescent simulations—the latter of which can include
 #' selection, recombination, and demographic fluctuations;
-#' (iii) simulates sequencing error, mapping qualities, restriction-enzyme digestion,
-#' and variance in coverage among sites; and
+#' (iii) simulates sequencing error, mapping qualities, and optical/PCR duplicates; and
 #' (iv) writes outputs to standard file formats.
-#' `jackal` can simulate single or paired-ended reads for WGS on the Illumina platform,
-#' and can be extended to simulate different methods (e.g., original RADseq,
-#' double-digest RADseq, and genotyping-by-sequencing), and sequencing technologies
-#' (e.g., Pacific BioSciences, Oxford Nanopore Technologies).
+#' `jackal` can simulate single, paired-end, or mate-pair Illumina reads, as well as
+#' reads from Pacific BioSciences.
 #'
 #'
 #' @importFrom Rcpp evalCpp

diff --git a/R/mevo_phylo.R b/R/mevo_phylo.R
@@ -137,7 +137,7 @@ read_phy_obj <- function(phy, n_seqs, chunked, err_msg = "") {
 #'
 read_coal_obj <- function(coal_obj, seq_sizes, chunked, err_msg) {
 
-    # Check for coal_obj begin a list and either having a `trees` field or all its
+    # Check for coal_obj being a list and either having a `trees` field or all its
     # items within having `trees` fields
     err <- FALSE
     nested <- FALSE

diff --git a/README.Rmd b/README.Rmd
@@ -33,8 +33,7 @@ __An efficient, versatile molecular evolution and sequencing simulator__
 (ii) generates variants using summary statistics, phylogenies, Variant
 Call Format (VCF) files, and coalescent simulations—the latter of which can include
 selection, recombination, and demographic fluctuations;
-(iii) simulates sequencing error, mapping qualities, restriction-enzyme digestion, 
-and variance in coverage among sites; and
+(iii) simulates sequencing error, mapping qualities, and optical/PCR duplicates; and
 (iv) writes outputs to standard file formats.
 `jackal` can simulate single, paired-end, or mate-pair Illumina reads, as well as
 reads from Pacific BioSciences.