diff --git a/.gitignore b/.gitignore index 9e6ac64..0756e95 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,5 @@ inst/doc Gac jackal_*.tgz *.pdf -methods _crash_test.R !inst/art_profiles/*.txt.gz diff --git a/DESCRIPTION b/DESCRIPTION index 5c73ee8..85197fb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,8 +9,7 @@ Description: `jackal` efficiently (i) reads and simulates reference genomes; (ii) generates variants using summary statistics, phylogenies, Variant Call Format (VCF) files, and coalescent simulations—the latter of which can include selection, recombination, and demographic fluctuations; - (iii) simulates sequencing error, mapping qualities, restriction-enzyme digestion, - and variance in coverage among sites; and + (iii) simulates sequencing error, mapping qualities, and optical/PCR duplicates; and (iv) writes outputs to standard file formats. `jackal` can simulate single, paired-end, or mate-pair Illumina reads, as well as reads from Pacific BioSciences. diff --git a/NAMESPACE b/NAMESPACE index b4abcd4..6f67973 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,7 +2,6 @@ export(create_genome) export(create_variants) -export(digest) export(illumina) export(make_mevo) export(mevo) diff --git a/R/RcppExports.R b/R/RcppExports.R index bae1749..6844a56 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -94,81 +94,6 @@ rando_seqs <- function(n_seqs, len_mean, len_sd = 0, pi_tcag = numeric(0), n_cor .Call(`_jackal_rando_seqs`, n_seqs, len_mean, len_sd, pi_tcag, n_cores) } -#' Calculate how many bases come before a cleavage site. -#' -#' -#' @noRd -#' -get_precleavage_lens <- function(seqs) { - .Call(`_jackal_get_precleavage_lens`, seqs) -} - -#' Expand sequences for reverse complements and for non-specific nucleobases. -#' -#' -#' @noRd -#' -expand_seqs <- function(seqs) { - .Call(`_jackal_expand_seqs`, seqs) -} - -#' Internal C++ function to digest all sequences for all variants in a variant set. -#' -#' -#' -#' @param var_set_ptr An external pointer to a C++ \code{VarSet} object -#' representing variants from the reference genome. -#' @inheritParams bind_sites digest_ref -#' @inheritParams len5s digest_ref -#' @param chunk_size The size of chunks to divide sequences into when digesting. -#' @param n_cores The number of cores to use for processing. Defaults to \code{1}. -#' -#' @return A list of lists, each sub-list containing multiple vectors representing -#' the locations of cut sites for a given variant on a given sequence. -#' Indexing the output list would be done as such: -#' \code{output_list[[variant_index]][[sequence_index]][position_index]}. -#' -#' @noRd -#' -digest_var_set <- function(var_set_ptr, bind_sites, len5s, chunk_size, n_cores = 1L) { - .Call(`_jackal_digest_var_set`, var_set_ptr, bind_sites, len5s, chunk_size, n_cores) -} - -#' Internal C++ function to digest all sequences in a reference genome. -#' -#' -#' -#' @param ref_genome_ptr An external pointer to a C++ \code{RefGenome} object -#' representing the reference genome. -#' @param bind_sites Vector of enzyme full recognition site(s). -#' @param len5s A vector of the numbers of characters of the prime5 sites for each -#' recognition site. -#' @param chunk_size Size of chunks to break sequences into for processing. -#' This value is ignored if it's set to zero. -#' Ideally this is set to a value that results in a number of chunks divisible by -#' the number of cores you're using, and is most useful when `n_cores` is greater -#' than the number of scaffolds. -#' Breaking into increasingly small chunks results in increasing overhead, so -#' beware of making this argument very small. -#' Reference genome sequences are not copied during this function, so using -#' this argument for a reference genome does NOT decrease memory usage -#' appreciably. -#' Defaults to \code{0}. -#' @param n_cores The number of cores to use for processing. This value is ignored -#' if the input reference genome is merged and \code{chunk_size == 0}. -#' Defaults to \code{1}. -#' -#' @return A list of vectors, each vector representing the locations of cut sites -#' on a given sequence. -#' Indexing the output list would be done as such: -#' \code{output_list[[sequence_index]][position_index]}. -#' -#' @noRd -#' -digest_ref <- function(ref_genome_ptr, bind_sites, len5s, chunk_size = 0L, n_cores = 1L) { - .Call(`_jackal_digest_ref`, ref_genome_ptr, bind_sites, len5s, chunk_size, n_cores) -} - #' Illumina sequence for reference object. #' #' diff --git a/R/aaa-classes.R b/R/aaa-classes.R index 0087adc..9e83a3e 100644 --- a/R/aaa-classes.R +++ b/R/aaa-classes.R @@ -11,8 +11,6 @@ #' #' @field genome An \code{externalptr} to a C++ object storing the sequences #' representing the genome. -#' @field digests An \code{externalptr} to a C++ object storing the digestion of the -#' genome, if a digestion has been carried out. It's \code{NULL} otherwise. #' #' @section Methods: #' \describe{ @@ -55,7 +53,6 @@ ref_genome <- R6::R6Class( public = list( genome = NULL, - digests = NULL, initialize = function(genome_ptr) { if (!inherits(genome_ptr, "externalptr")) { @@ -384,8 +381,6 @@ mevo$lock() #' #' @field genome An \code{externalptr} to a C++ object storing the sequences #' representing the genome. -#' @field digests An \code{externalptr} to a C++ object storing the digestion of the -#' genome, if a digestion has been carried out. It's \code{NULL} otherwise. #' @field reference An \code{externalptr} to a C++ object storing the sequences #' representing the genome. #' There are a few extra notes for this field: @@ -437,7 +432,6 @@ variants <- R6::R6Class( public = list( genomes = NULL, - digests = NULL, initialize = function(genomes_ptr, reference_ptr) { if (!inherits(genomes_ptr, "externalptr")) { diff --git a/R/data.R b/R/data.R index 8fb12ae..a2255bb 100644 --- a/R/data.R +++ b/R/data.R @@ -1,34 +1,3 @@ -#' Binding sites for selected restriction enzymes. -#' -#' @format A 695 x 2 data frame with the following columns: -#' \describe{ -#' \item{enzyme}{Restriction enzyme name.} -#' \item{sites}{Enzyme binding site sequences. -#' See \code{\link{nucleobase_legend}} for what bases other than `T`, `C`, `A`, -#' and `G` mean. -#' Each `/` indicates a cleavage point. -#' According to NEB (see link below under Source): -#' "Numbers in parentheses indicate point of cleaveage for non-palindromic -#' enzymes." -#' These types of enzymes are not implemented.} -#' } -#' @source \url{https://www.neb.com/tools-and-resources/selection-charts/alphabetized-list-of-recognition-specificities} -"binding_sites" - - - -#' Legend for the single-letter code of nucleobases indicating restriction sequences. -#' -#' @format A data frame of 28 rows and two columns: -#' \describe{ -#' \item{code}{The letter indicating more than one possible nucleotides.} -#' \item{nucleotides}{One of the multiple nucleotides the code can refer to.} -#' } -#' @source \url{https://en.wikipedia.org/wiki/List_of_restriction_enzyme_cutting_sites:_A#Whole_list_navigation} -"nucleobase_legend" - - - #' Table of evolutionary rates. #' diff --git a/R/digest.R b/R/digest.R deleted file mode 100644 index 2fcb29b..0000000 --- a/R/digest.R +++ /dev/null @@ -1,156 +0,0 @@ -# -# Digest genome(s) -# - - - -#' Check validity, adjust for degeneracy, and remove duplicates for input enzymes. -#' -#' @inheritParams digest -#' -#' @noRd -#' -process_enzymes <- function(enzyme_names, custom_enzymes) { - - if (missing(enzyme_names) & missing(custom_enzymes)) { - stop("\nWhen digesting a reference genome, you must provide either an ", - "enzyme name, a custom enzyme, or both.", - call. = FALSE) - } - enzyme_seqs <- character(0) - if (!missing(enzyme_names)) { - enzyme_seqs <- character(length(enzyme_names)) - for (i in 1:length(enzyme_names)) { - x <- enzyme_names[i] - z <- jackal::binding_sites$sequence[jackal::binding_sites == x] - if (length(z) == 0) { - stop(paste0("\nEnzyme name \"", x, "\" not found."), - call. = TRUE) - } - if (grepl("\\(|\\)", z)) { - stop(paste0("\n", x, " is a non-palindromic enzyme. ", - "See the link in ?binding_sites for ", - "more info."), - call. = TRUE) - } - if (!grepl("/", z)) { - stop(paste("\n", x, "doesn't have a cleavage site", - "according to NEB.", - "Choose another enzyme,", - "or provide a custom enzyme if you're", - "sure I've erred in this error."), - call. = TRUE) - } - enzyme_seqs[i] <- z - } - - } - if (!missing(custom_enzymes)) { - stopifnot(inherits(custom_enzymes, "character")) - - allowed_chars <- sort(unique(c(jackal::nucleobase_legend, "/", - recursive = TRUE))) - allowed_chars <- paste(allowed_chars, collapse = "") - weird_chars <- grepl(sprintf("[^%s]", allowed_chars), custom_enzymes) - if (any(weird_chars)) { - stop(paste("\nThe only allowed characters for restriction enzymes are", - allowed_chars), - call. = FALSE) - } - - no_cleavage <- ! grepl("/", custom_enzymes) - - if (any(no_cleavage)) { - stop("\nYou must include a cleavage site for each custom restriction ", - "site using \"/\" (e.g., \"TTA/TAA\" for the enzyme AanI).", - call. = TRUE) - } - - enzyme_seqs <- c(enzyme_seqs, custom_enzymes) - } - - # Get number of nucleotides before the cleavage site: - len5s <- get_precleavage_lens(enzyme_seqs) - - # Now we no longer want/need the cleavage indicators: - bind_sites <- gsub("/", "", enzyme_seqs) - - bind_sites <- expand_seqs(bind_sites) - - return(list(len5s = len5s, bind_sites = bind_sites)) -} - - - - - - - -#' Digest reference genome or variants from a genome. -#' -#' \emph{Note:} This will override any digestions currently in place in the -#' object. If you want to add a new digestion, re-run this function with the names -#' of all enzymes you're interested in included in the \code{enzyme_names} argument. -#' -#' -#' @inheritParams create_genome -#' @param object Either a \code{ref_genome} or \code{variants} object. -#' @param enzyme_names Name of enzymes that should be present inside the package data -#' object `binding_sites` (in the `enzyme` column). -#' @param custom_enzymes Custom enzymes for those not present in internal data. -#' @param chunk_size The size of chunks to break up sqeuences into when digesting -#' a \code{ref_genome} or \code{variants} object. -#' Changing this might affect performance, for better or worse. -#' The default worked best on my computer. Defaults to \code{1000}. -#' -#' @return All changes occur in place, but the input object is returned invisibly -#' so that piping works. -#' -#' -#' @export -#' -#' -digest <- function(object, - enzyme_names, - custom_enzymes, - chunk_size = 1000, - n_cores = 1) { - - if (missing(enzyme_names) & missing(custom_enzymes)) { - stop("\nWhen digesting a reference genome or variants, you must provide ", - "either an enzyme name, a custom enzyme, or both.", - call. = FALSE) - } - - enz_info <- process_enzymes(enzyme_names, custom_enzymes) - - if (inherits(object, "ref_genome")) { - if (!inherits(object$genome, "externalptr")) { - stop("\nYou're attempting a digestion on a ref_genome object with ", - "a genome field that is not an externalptr. ", - "Restart by reading a FASTA file or by simulating a genome. ", - "And do NOT change the genome field manually.", - call. = FALSE) - } - object$digests <- digest_ref(object$genome, - enz_info$bind_sites, enz_info$len5s, - chunk_size, n_cores) - } else if (inherits(object, "variants")) { - if (!inherits(object$genomes, "externalptr")) { - stop("\nYou're attempting a digestion on a variants object with ", - "a genomes field that is not an externalptr. ", - "Restart by reading a FASTA file or by simulating a genome, ", - "then generating variants. ", - "And do NOT change the genomes field manually.", - call. = FALSE) - } - object$digests <- digest_var_set(object$genomes, - enz_info$bind_sites, enz_info$len5s, - chunk_size, n_cores) - } else { - stop("\nA digestion can only be applied to a ref_genome or variants object.", - call. = FALSE) - } - invisible(object) -} - diff --git a/R/jackal.R b/R/jackal.R index b789e6a..76e3a61 100644 --- a/R/jackal.R +++ b/R/jackal.R @@ -4,13 +4,10 @@ #' (ii) generates variants using summary statistics, phylogenies, Variant #' Call Format (VCF) files, and coalescent simulations—the latter of which can include #' selection, recombination, and demographic fluctuations; -#' (iii) simulates sequencing error, mapping qualities, restriction-enzyme digestion, -#' and variance in coverage among sites; and +#' (iii) simulates sequencing error, mapping qualities, and optical/PCR duplicates; and #' (iv) writes outputs to standard file formats. -#' `jackal` can simulate single or paired-ended reads for WGS on the Illumina platform, -#' and can be extended to simulate different methods (e.g., original RADseq, -#' double-digest RADseq, and genotyping-by-sequencing), and sequencing technologies -#' (e.g., Pacific BioSciences, Oxford Nanopore Technologies). +#' `jackal` can simulate single, paired-end, or mate-pair Illumina reads, as well as +#' reads from Pacific BioSciences. #' #' #' @importFrom Rcpp evalCpp diff --git a/R/mevo_phylo.R b/R/mevo_phylo.R index 2e6ab30..edf6898 100644 --- a/R/mevo_phylo.R +++ b/R/mevo_phylo.R @@ -137,7 +137,7 @@ read_phy_obj <- function(phy, n_seqs, chunked, err_msg = "") { #' read_coal_obj <- function(coal_obj, seq_sizes, chunked, err_msg) { - # Check for coal_obj begin a list and either having a `trees` field or all its + # Check for coal_obj being a list and either having a `trees` field or all its # items within having `trees` fields err <- FALSE nested <- FALSE diff --git a/README.Rmd b/README.Rmd index 232bb66..3c8f419 100644 --- a/README.Rmd +++ b/README.Rmd @@ -33,8 +33,7 @@ __An efficient, versatile molecular evolution and sequencing simulator__ (ii) generates variants using summary statistics, phylogenies, Variant Call Format (VCF) files, and coalescent simulations—the latter of which can include selection, recombination, and demographic fluctuations; -(iii) simulates sequencing error, mapping qualities, restriction-enzyme digestion, -and variance in coverage among sites; and +(iii) simulates sequencing error, mapping qualities, and optical/PCR duplicates; and (iv) writes outputs to standard file formats. `jackal` can simulate single, paired-end, or mate-pair Illumina reads, as well as reads from Pacific BioSciences. diff --git a/README.md b/README.md index 5426c7e..2f370d2 100644 --- a/README.md +++ b/README.md @@ -18,10 +18,10 @@ Status](https://travis-ci.com/lucasnell/jackal.svg?branch=master)](https://travi generates variants using summary statistics, phylogenies, Variant Call Format (VCF) files, and coalescent simulations—the latter of which can include selection, recombination, and demographic fluctuations; (iii) -simulates sequencing error, mapping qualities, restriction-enzyme -digestion, and variance in coverage among sites; and (iv) writes outputs -to standard file formats. `jackal` can simulate single, paired-end, or -mate-pair Illumina reads, as well as reads from Pacific BioSciences. +simulates sequencing error, mapping qualities, and optical/PCR +duplicates; and (iv) writes outputs to standard file formats. `jackal` +can simulate single, paired-end, or mate-pair Illumina reads, as well as +reads from Pacific BioSciences. ## Installation @@ -42,20 +42,20 @@ ref_variants <- create_variants(reference, "phylo", phy, mevo_info) ref_variants #> << Variants object >> #> # Variants: 5 -#> # Mutations: 21,301 +#> # Mutations: 5,130 #> #> << Reference genome info: >> #> < Set of 10 sequences > #> # Total size: 10,000 bp #> name sequence length -#> seq0 CGTACCCCGTCGTGATTCTAGACTC...GTGTGAAAATGGCGATGTTACATGTG 1000 -#> seq1 TCTCCTCCTGATACCTGAACTCGCC...ACGATCGCGGGTGTTGCTCCGACCGG 1000 -#> seq2 ATCTTAAGGCTCACCACTGAGCCAG...AAATAGCATTAGTCCCGTTCGTAAAA 1000 -#> seq3 TCCAACAAGTGGGCCAGCTGCTTTA...ACGGTGTCGCCCAGCGTATTATGATA 1000 -#> seq4 ACGATATGCTTTCGCGCCTAGGATC...GAGAGTGTTCTCTTCACATCTGTCCG 1000 -#> seq5 GCGTTCTAGTTACGTGTTACCCAGG...GTCATAACATTAGATAATCCATCCGA 1000 -#> seq6 CGGCTCTAAACTTACCGGAAGACCA...CGGGACGGCGTGCTTCTAGTAACCGC 1000 -#> seq7 CCGCGGGCCAGGACAATCTTAACAT...ACTGAGAGTAATGCGTAGCACTGGCA 1000 -#> seq8 ATAACGGAGCTCCACACCGTCCTTG...CGATTGGCTTAAGGGTCCGGAATTAA 1000 -#> seq9 ATCCAATTGGACTCGAGGGTGATAG...CACCGGGCACACTAGCGTCCTCGTGA 1000 +#> seq0 GACTACGGCGCAGAACCTACAGAAC...TACCGGCGACAATAGTTTACCACTAC 1000 +#> seq1 AAGCATCCACATATTAGGGTTTTAC...GGCCCTAACGTATCGTGTGGCCTGAT 1000 +#> seq2 CGCTTATAACAACGCGTTTCGCGAT...ACAACAAGCGGTCCCACCTTAGAATT 1000 +#> seq3 CCTTGTCATCGGCTTAGTTGAGCAT...AGGCTAGAGAATAGTAACATGCATAC 1000 +#> seq4 AGGCACTATGCACTGTTCCAACGCA...CGTACGTACACTCCAATGTGTAGGAC 1000 +#> seq5 CGTCAGGACACGTTGGAGTTGAAAA...GTGTAGGTTATAGTCCATTGATGCGT 1000 +#> seq6 CTTGACGGGGGCCTTAACCGGCGAG...GTAGTTCTCCCTCTCCGACTGCTGCA 1000 +#> seq7 GGACTAAGGCAGTCCAGCTACTCAT...ACGCAGGGTTATCATTCATACACGTC 1000 +#> seq8 CATACACGACGTCCCATTAGGATGA...ATAAAGATTCCTGAGATCCGCAATGA 1000 +#> seq9 CAACGGTATGCACGATTGCATGGGA...GGGGCTTAACCTCTCGTTTCATTATC 1000 ``` diff --git a/data-raw/binding_sites.R b/data-raw/binding_sites.R deleted file mode 100644 index 8da473f..0000000 --- a/data-raw/binding_sites.R +++ /dev/null @@ -1,139 +0,0 @@ - -library(dplyr) -library(readr) -library(tidyr) -library(purrr) -library(rvest) - - - -# Altered version of rvest::html_table that doesn't remove first row: -html_table2 <- function (x, header = NA, trim = TRUE, fill = FALSE, dec = ".") { - stopifnot(html_name(x) == "table") - rows <- html_nodes(x, "tr") - n <- length(rows) - cells <- lapply(rows, "html_nodes", xpath = ".//td|.//th") - ncols <- lapply(cells, html_attr, "colspan", default = "1") - ncols <- lapply(ncols, as.integer) - nrows <- lapply(cells, html_attr, "rowspan", default = "1") - nrows <- lapply(nrows, as.integer) - p <- unique(vapply(ncols, sum, integer(1))) - maxp <- max(p) - if (length(p) > 1 & maxp * n != sum(unlist(nrows)) & maxp * - n != sum(unlist(ncols))) { - if (!fill) { - stop("Table has inconsistent number of columns. ", - "Do you want fill = TRUE?", call. = FALSE) - } - } - values <- lapply(cells, html_text, trim = trim) - out <- matrix(NA_character_, nrow = n, ncol = maxp) - for (i in seq_len(n)) { - row <- values[[i]] - ncol <- ncols[[i]] - col <- 1 - for (j in seq_len(length(ncol))) { - out[i, col:(col + ncol[j] - 1)] <- row[[j]] - col <- col + ncol[j] - } - } - # Not sure why, but this part removes the first row: - # for (i in seq_len(maxp)) { - # for (j in seq_len(n)) { - # rowspan <- nrows[[j]][i] - # colspan <- ncols[[j]][i] - # if (!is.na(rowspan) & (rowspan > 1)) { - # if (!is.na(colspan) & (colspan > 1)) { - # nrows[[j]] <- c(utils::head(nrows[[j]], i), - # rep(rowspan, colspan - 1), utils::tail(nrows[[j]], - # length(rowspan) - (i + 1))) - # rowspan <- nrows[[j]][i] - # } - # for (k in seq_len(rowspan - 1)) { - # l <- utils::head(out[j + k, ], i - 1) - # r <- utils::tail(out[j + k, ], maxp - i + 1) - # out[j + k, ] <- utils::head(c(l, out[j, i], - # r), maxp) - # } - # } - # } - # } - if (is.na(header)) { - header <- all(html_name(cells[[1]]) == "th") - } - if (header) { - col_names <- out[1, , drop = FALSE] - out <- out[-1, , drop = FALSE] - } - else { - col_names <- paste0("X", seq_len(ncol(out))) - } - df <- lapply(seq_len(maxp), function(i) { - utils::type.convert(out[, i], as.is = TRUE, dec = dec) - }) - names(df) <- col_names - class(df) <- "data.frame" - attr(df, "row.names") <- .set_row_names(length(df[[1]])) - if (length(unique(col_names)) < length(col_names)) { - warning("At least two columns have the same name") - } - df -} - - - -binding_sites <- paste0("https://www.neb.com/tools-and-resources/selection-charts/", - "alphabetized-list-of-recognition-specificities") %>% - read_html() %>% - html_node("table") %>% - html_table2() %>% - setNames(c("sequence", "enzyme")) %>% - tbl_df() %>% - mutate(enzyme = strsplit(gsub("®", "", enzyme), " ")) %>% - unnest() %>% - mutate(sequence = ifelse(grepl("BstZ17I", x = enzyme), - sprintf("%.3s/%s", sequence, substr(sequence,4,6)), - sequence)) - - - - -process_isos <- function(isos) { - strsplit(isos, ", ") %>% - map(function(x) { - xx <- keep(x, ~ !grepl("\\^$", x = .x)) - return(xx) - }) -} - - -other_isos <- "https://www.neb.com/tools-and-resources/selection-charts/isoschizomers" %>% - read_html() %>% - html_node("table") %>% - html_table2() %>% - tbl_df() %>% - select(sequence = `Sequence`, enzyme = `Enzyme`, isos = `Other Isoschizomers`) %>% - filter(enzyme != "", sequence != "", isos != "") %>% - mutate(isos = process_isos(isos), - enzyme = gsub(" x$", "", enzyme)) %>% - filter(map_lgl(isos, ~ length(.x) > 0)) - - - -binding_sites <- full_join(binding_sites, other_isos, - by = c("enzyme", "sequence")) %>% - mutate(isos = map(isos, ~ {if (length(.x) == 0) character(0) else .x})) %>% - group_by(enzyme) %>% - summarize(sequence = sequence[nchar(sequence) == max(nchar(sequence))][1], - isos = isos[sapply(isos, length) == max(sapply(isos, length))]) %>% - ungroup() %>% - group_by(sequence) %>% - summarize(enzyme = list(unique(c(enzyme, isos, recursive = TRUE)))) %>% - unnest() %>% - select(enzyme, sequence) %>% - as.data.frame() - - -write_csv(binding_sites, "data-raw/binding_sites.csv") -devtools::use_data(binding_sites, overwrite = TRUE) - diff --git a/data-raw/binding_sites.csv b/data-raw/binding_sites.csv deleted file mode 100644 index 8f1ddb6..0000000 --- a/data-raw/binding_sites.csv +++ /dev/null @@ -1,696 +0,0 @@ -enzyme,sequence -Nt.CviPII,(0/-1)CCD -BcgI,(10/12)CGANNNNNNTGC(12/10) -BaeI,(10/15)ACNNNNGTAYC(12/7) -CspCI,(11/13)CAANNNNNGTGG(12/10) -BsaXI,(9/12)ACNNNNNCTCC(10/7) -MluCI,/AATT -Sse9I,/AATT -TasI,/AATT -TspEI,/AATT -FatI,/CATG -BstSCI,/CCNGG -StyD4I,/CCNGG -AjnI,/CCWGG -EcoRII,/CCWGG -Psp6I,/CCWGG -PspGI,/CCWGG -Bsp143I,/GATC -BssMI,/GATC -BstMBI,/GATC -DpnII,/GATC -Kzo9I,/GATC -MboI,/GATC -NdeII,/GATC -Sau3AI,/GATC -NmuCI,/GTSAC -TseFI,/GTSAC -Tsp45I,/GTSAC -HindIII,A/AGCTT -HindIII-HF,A/AGCTT -BspLU11I,A/CATGT -PciI,A/CATGT -PscI,A/CATGT -AgeI,A/CCGGT -AgeI-HF,A/CCGGT -AsiGI,A/CCGGT -BshTI,A/CCGGT -CspAI,A/CCGGT -PinAI,A/CCGGT -CsiI,A/CCWGGT -MabI,A/CCWGGT -SexAI,A/CCWGGT -MluI,A/CGCGT -MluI-HF,A/CGCGT -HpyCH4IV,A/CGT -HpySE526I,A/CGT -MaeII,A/CGT -AflIII,A/CRYGT -AhlI,A/CTAGT -BcuI,A/CTAGT -SpeI,A/CTAGT -SpeI-HF,A/CTAGT -BglII,A/GATCT -AclI,AA/CGTT -Psp1406I,AA/CGTT -SspI,AAT/ATT -SspI-HF,AAT/ATT -Acc36I,ACCTGC(4/8) -BfuAI,ACCTGC(4/8) -BspMI,ACCTGC(4/8) -BveI,ACCTGC(4/8) -BceAI,ACGGC(12/14) -Bst4CI,ACN/GT -HpyCH4III,ACN/GT -TaaI,ACN/GT -Tsp4CI,ACN/GT -Bse1I,ACTGG(1/-1) -BseNI,ACTGG(1/-1) -BsrI,ACTGG(1/-1) -BfiI,ACTGGG(5/4) -BmrI,ACTGGG(5/4) -BmuI,ACTGGG(5/4) -AluBI,AG/CT -AluI,AG/CT -AfeI,AGC/GCT -Aor51HI,AGC/GCT -Eco47III,AGC/GCT -Eco147I,AGG/CCT -PceI,AGG/CCT -SseBI,AGG/CCT -StuI,AGG/CCT -BmcAI,AGT/ACT -ScaI-HF,AGT/ACT -ZrmI,AGT/ACT -Bsa29I,AT/CGAT -BseCI,AT/CGAT -BshVI,AT/CGAT -BspDI,AT/CGAT -Bsu15I,AT/CGAT -BsuTUI,AT/CGAT -ClaI,AT/CGAT -AseI,AT/TAAT -PshBI,AT/TAAT -VspI,AT/TAAT -PI-SceI,ATCTATGTCGGGTGCGGAGAAAGAGGTAAT(-15/-19) -EcoT22I,ATGCA/T -Mph1103I,ATGCA/T -NsiI,ATGCA/T -NsiI-HF,ATGCA/T -Zsp2I,ATGCA/T -AvaIII,ATGCAT -EcoT22I,ATGCAT -Mph1103I,ATGCAT -NsiI,ATGCAT -NsiI-HF,ATGCAT -Zsp2I,ATGCAT -SmiI,ATTT/AAAT -SwaI,ATTT/AAAT -MfeI,C/AATTG -MfeI-HF,C/AATTG -MunI,C/AATTG -CviAII,C/ATG -Bsp19I,C/CATGG -NcoI,C/CATGG -NcoI-HF,C/CATGG -Cfr9I,C/CCGGG -TspMI,C/CCGGG -XmaI,C/CCGGG -BsiSI,C/CGG -HapII,C/CGG -HpaII,C/CGG -MspI,C/CGG -BsaJI,C/CNNGG -BseDI,C/CNNGG -BssECI,C/CNNGG -SecI,C/CNNGG -BstDSI,C/CRYGG -BtgI,C/CRYGG -DsaI,C/CRYGG -AspA2I,C/CTAGG -AvrII,C/CTAGG -BlnI,C/CTAGG -XmaJI,C/CTAGG -BssT1I,C/CWWGG -Eco130I,C/CWWGG -EcoT14I,C/CWWGG -ErhI,C/CWWGG -StyI,C/CWWGG -StyI-HF,C/CWWGG -BseX3I,C/GGCCG -BstZI,C/GGCCG -EagI,C/GGCCG -EagI-HF,C/GGCCG -EclXI,C/GGCCG -Eco52I,C/GGCCG -XmaIII,C/GGCCG -BsiWI,C/GTACG -BsiWI-HF,C/GTACG -Pfl23II,C/GTACG -PspLI,C/GTACG -SplI,C/GTACG -BfaI,C/TAG -FspBI,C/TAG -MaeI,C/TAG -SspMI,C/TAG -XspI,C/TAG -PaeR7I,C/TCGAG -Sfr274I,C/TCGAG -SlaI,C/TCGAG -XhoI,C/TCGAG -BstDEI,C/TNAG -DdeI,C/TNAG -HpyF3I,C/TNAG -BfmI,C/TRYAG -BstSFI,C/TRYAG -SfcI,C/TRYAG -SfeI,C/TRYAG -AflII,C/TTAAG -BfrI,C/TTAAG -BspTI,C/TTAAG -BstAFI,C/TTAAG -MspCI,C/TTAAG -Vha464I,C/TTAAG -SmlI,C/TYRAG -SmoI,C/TYRAG -Ama87I,C/YCGRG -AvaI,C/YCGRG -BmeT110I,C/YCGRG -BsiHKCI,C/YCGRG -BsoBI,C/YCGRG -Eco88I,C/YCGRG -FauNDI,CA/TATG -NdeI,CA/TATG -AcvI,CAC/GTG -BbrPI,CAC/GTG -Eco72I,CAC/GTG -PmaCI,CAC/GTG -PmlI,CAC/GTG -PspCI,CAC/GTG -Nb.BssSI,CACGAG -BauI,CACGAG(-5/-1) -BsiI,CACGAG(-5/-1) -BssSI,CACGAG(-5/-1) -BssSαI,CACGAG(-5/-1) -Bst2BI,CACGAG(-5/-1) -AjiI,CACGTC(-3/-3) -BmgBI,CACGTC(-3/-3) -BtrI,CACGTC(-3/-3) -AleI,CACNN/NNGTG -OliI,CACNN/NNGTG -AdeI,CACNNN/GTG -DraIII-HF,CACNNN/GTG -PvuII,CAG/CTG -PvuII-HF,CAG/CTG -EcoP15I,CAGCAG(25/27) -AlwNI,CAGNNN/CTG -CaiI,CAGNNN/CTG -PstNI,CAGNNN/CTG -BtsIMutI,CAGTG(2/0) -TscAI,CASTGNN/ -TspRI,CASTGNN/ -FaeI,CATG/ -Hin1II,CATG/ -Hsp92II,CATG/ -NlaIII,CATG/ -MslI,CAYNN/NNRTG -RseI,CAYNN/NNRTG -SmiMI,CAYNN/NNRTG -FspEI,CC(12/16) -Bme1390I,CC/NGG -BmrFI,CC/NGG -MspR9I,CC/NGG -ScrFI,CC/NGG -AsuC2I,CC/SGG -BcnI,CC/SGG -BpuMI,CC/SGG -CauII,CC/SGG -NciI,CC/SGG -AxyI,CC/TNAGG -Bse21I,CC/TNAGG -Bsu36I,CC/TNAGG -Eco81I,CC/TNAGG -SauI,CC/TNAGG -BciT130I,CC/WGG -BseBI,CC/WGG -Bst2UI,CC/WGG -BstNI,CC/WGG -MvaI,CC/WGG -AccB7I,CCANNNN/NTGG -PflMI,CCANNNN/NTGG -Van91I,CCANNNN/NTGG -XcmI,CCANNNNN/NNNNTGG -BstXI,CCANNNNN/NTGG -BccI,CCATC(4/5) -SmaI,CCC/GGG -BseYI,CCCAGC(-5/-1) -PspFI,CCCAGC(-5/-1) -FauI,CCCGC(4/6) -LpnPI,CCDG(10/14) -AciI,CCGC(-3/-1) -BspACI,CCGC(-3/-1) -SsiI,CCGC(-3/-1) -Cfr42I,CCGC/GG -KspI,CCGC/GG -SacII,CCGC/GG -Sfr303I,CCGC/GG -SgrBI,CCGC/GG -AccBSI,CCGCTC(-3/-3) -BsrBI,CCGCTC(-3/-3) -MbiI,CCGCTC(-3/-3) -AfiI,CCNNNNN/NNGG -Bsc4I,CCNNNNN/NNGG -BseLI,CCNNNNN/NNGG -BsiYI,CCNNNNN/NNGG -BslI,CCNNNNN/NNGG -MnlI,CCTC(7/6) -Nb.BbvCI,CCTCAGC -BbvCI,CCTCAGC(-5/-2) -Nt.BbvCI,CCTCAGC(-5/-7) -SbfI,CCTGCA/GG -SbfI-HF,CCTGCA/GG -SdaI,CCTGCA/GG -Sse8387I,CCTGCA/GG -Bpu10I,CCTNAGC(-5/-2) -BstENI,CCTNN/NNNAGG -EcoNI,CCTNN/NNNAGG -XagI,CCTNN/NNNAGG -Hin4II,CCTTC(6/5) -HpyAV,CCTTC(6/5) -Sse232I,CG/CCGGCG -MreI,CG/CCGGCG -AccII,CG/CG -Bsh1236I,CG/CG -BspFNI,CG/CG -BstFNI,CG/CG -BstUI,CG/CG -FnuDII,CG/CG -MvnI,CG/CG -CpoI,CG/GWCCG -CspI,CG/GWCCG -Rsr2I,CG/GWCCG -RsrII,CG/GWCCG -Ple19I,CGAT/CG -PvuI,CGAT/CG -PvuI-HF,CGAT/CG -Bsh1285I,CGRY/CG -BsiEI,CGRY/CG -BstMCI,CGRY/CG -McrI,CGRY/CG -BsmBI,CGTCTC(1/5) -Esp3I,CGTCTC(1/5) -Hpy99I,CGWCG/ -MspA1I,CMG/CKG -NspBII,CMG/CKG -MspJI,CNNR(9/13) -SgrAI,CR/CCGGYG -BspCNI,CTCAG(9/7) -Bst6I,CTCTTC(1/4) -Eam1104I,CTCTTC(1/4) -EarI,CTCTTC(1/4) -Ksp632I,CTCTTC(1/4) -AcuI,CTGAAG(16/14) -Eco57I,CTGAAG(16/14) -BspMAI,CTGCA/G -PstI,CTGCA/G -PstI-HF,CTGCA/G -BpmI,CTGGAG(16/14) -GsuI,CTGGAG(16/14) -Bce83I,CTTGAG(16/14) -BpuEI,CTTGAG(16/14) -EcoRI,G/AATTC -EcoRI-HF,G/AATTC -HinfI,G/ANTC -PfeI,G/AWTC -TfiI,G/AWTC -MroNI,G/CCGGC -NgoMIV,G/CCGGC -Hin6I,G/CGC -HinP1I,G/CGC -HspAI,G/CGC -BsePI,G/CGCGC -BssHII,G/CGCGC -PauI,G/CGCGC -PteI,G/CGCGC -AsuNHI,G/CTAGC -NheI,G/CTAGC -NheI-HF,G/CTAGC -ApeKI,G/CWGC -TseI,G/CWGC -BamHI,G/GATCC -BamHI-HF,G/GATCC -KasI,G/GCGCC -SspDI,G/GCGCC -Bsp120I,G/GGCCC -PspOMI,G/GGCCC -AspS9I,G/GNCC -AsuI,G/GNCC -BmgT120I,G/GNCC -Cfr13I,G/GNCC -PspPI,G/GNCC -Sau96I,G/GNCC -Acc65I,G/GTACC -Asp718I,G/GTACC -BstEII,G/GTNACC -BstEII-HF,G/GTNACC -BstPI,G/GTNACC -Eco91I,G/GTNACC -EcoO65I,G/GTNACC -PspEI,G/GTNACC -AvaII,G/GWCC -Bme18I,G/GWCC -Eco47I,G/GWCC -SinI,G/GWCC -VpaK11BI,G/GWCC -AccB1I,G/GYRCC -BanI,G/GYRCC -BshNI,G/GYRCC -BspT107I,G/GYRCC -HgiCI,G/GYRCC -Csp6I,G/TAC -CviQI,G/TAC -RsaNI,G/TAC -SalI,G/TCGAC -SalI-HF,G/TCGAC -Alw44I,G/TGCAC -ApaLI,G/TGCAC -VneI,G/TGCAC -DpnI,GA/TC -MalI,GA/TC -MboII,GAAGA(8/7) -BbsI,GAAGAC(2/6) -BbsI-HF,GAAGAC(2/6) -BbvII,GAAGAC(2/6) -BpiI,GAAGAC(2/6) -BstV2I,GAAGAC(2/6) -Asp700I,GAANN/NNTTC -MroXI,GAANN/NNTTC -PdmI,GAANN/NNTTC -XmnI,GAANN/NNTTC -Nb.BsmI,GAATGC -BsmI,GAATGC(1/-1) -Mva1269I,GAATGC(1/-1) -PctI,GAATGC(1/-1) -ZraI,GAC/GTC -CseI,GACGC(5/10) -HgaI,GACGC(5/10) -AatII,GACGT/C -PflFI,GACN/NNGTC -PsyI,GACN/NNGTC -Tth111I,GACN/NNGTC -BoxI,GACNN/NNGTC -BstPAI,GACNN/NNGTC -PshAI,GACNN/NNGTC -AhdI,GACNNN/NNGTC -BmeRI,GACNNN/NNGTC -DriI,GACNNN/NNGTC -Eam1105I,GACNNN/NNGTC -AasI,GACNNNN/NNGTC -DrdI,GACNNNN/NNGTC -DseDI,GACNNNN/NNGTC -Ecl136II,GAG/CTC -Eco53kI,GAG/CTC -EcoICRI,GAG/CTC -Psp124BI,GAGCT/C -SacI,GAGCT/C -SacI-HF,GAGCT/C -SstI,GAGCT/C -BseRI,GAGGAG(10/8) -Nt.BstNBI,GAGTC(4/-5) -PleI,GAGTC(4/5) -PpsI,GAGTC(4/5) -MlyI,GAGTC(5/5) -SchI,GAGTC(5/5) -Eco32I,GAT/ATC -EcoRV,GAT/ATC -EcoRV-HF,GAT/ATC -BsaBI,GATNN/NNATC -Bse8I,GATNN/NNATC -BseJI,GATNN/NNATC -CciNI,GC/GGCCGC -NotI,GC/GGCCGC -NotI-HF,GC/GGCCGC -BisI,GC/NGC -Fnu4HI,GC/NGC -Fsp4HI,GC/NGC -GluI,GC/NGC -SatI,GC/NGC -BlpI,GC/TNAGC -Bpu1102I,GC/TNAGC -Bsp1720I,GC/TNAGC -EspI,GC/TNAGC -Nb.BsrDI,GCAATG -Bse3DI,GCAATG(2/0) -BseMI,GCAATG(2/0) -BsrDI,GCAATG(2/0) -BbvI,GCAGC(8/12) -BseXI,GCAGC(8/12) -BstV1I,GCAGC(8/12) -Lsp1109I,GCAGC(8/12) -Nb.BtsI,GCAGTG -BtsαI,GCAGTG(2/0) -BstAPI,GCANNNN/NTGC -BmsI,GCATC(5/9) -LweI,GCATC(5/9) -SfaNI,GCATC(5/9) -PaeI,GCATG/C -SphI,GCATG/C -SphI-HF,GCATG/C -NaeI,GCC/GGC -PdiI,GCC/GGC -SrfI,GCCC/GGGC -NmeAIII,GCCGAG(21/19) -BglI,GCCNNNN/NGGC -AspLEI,GCG/C -BstHHI,GCG/C -CfoI,GCG/C -HhaI,GCG/C -AsiSI,GCGAT/CGC -RgaI,GCGAT/CGC -SfaAI,GCGAT/CGC -SgfI,GCGAT/CGC -BtgZI,GCGATG(10/14) -BlsI,GCN/GC -PkrI,GCN/GC -BstC8I,GCN/NGC -Cac8I,GCN/NGC -BstMWI,GCNNNNN/NNGC -HpyF10VI,GCNNNNN/NNGC -MwoI,GCNNNNN/NNGC -BmtI,GCTAG/C -BmtI-HF,GCTAG/C -BspOI,GCTAG/C -AsuNHI,GCTAG/C -NheI,GCTAG/C -Nt.BspQI,GCTCTTC(1/-7) -BspQI,GCTCTTC(1/4) -LguI,GCTCTTC(1/4) -PciSI,GCTCTTC(1/4) -SapI,GCTCTTC(1/4) -Bsp1286I,GDGCH/C -MhlI,GDGCH/C -SduI,GDGCH/C -BshFI,GG/CC -BsnI,GG/CC -BspANI,GG/CC -BsuRI,GG/CC -HaeIII,GG/CC -Mly113I,GG/CGCC -NarI,GG/CGCC -AscI,GG/CGCGCC -PalAI,GG/CGCGCC -SgsI,GG/CGCGCC -SanDI,GG/GWCCC -KflI,GG/GWCCC -Nt.AlwI,GGATC(4/-5) -AclWI,GGATC(4/5) -AlwI,GGATC(4/5) -BinI,GGATC(4/5) -BspPI,GGATC(4/5) -BseGI,GGATG(2/0) -BstF5I,GGATG(2/0) -BtsCI,GGATG(2/0) -FokI,GGATG(9/13) -DinI,GGC/GCC -EgeI,GGC/GCC -EheI,GGC/GCC -SfoI,GGC/GCC -FseI,GGCCGG/CC -RigI,GGCCGG/CC -SfiI,GGCCNNNN/NGGCC -PluTI,GGCGC/C -EciI,GGCGGA(11/9) -FinI,GGGAC -BslFI,GGGAC -BsmFI,GGGAC -FaqI,GGGAC -BslFI,GGGAC(10/14) -BsmFI,GGGAC(10/14) -FaqI,GGGAC(10/14) -ApaI,GGGCC/C -BmiI,GGN/NCC -BspLI,GGN/NCC -NlaIV,GGN/NCC -PspN4I,GGN/NCC -KpnI,GGTAC/C -KpnI-HF,GGTAC/C -BsaI,GGTCTC(1/5) -BsaI-HF,GGTCTC(1/5) -BsaI-HFv2,GGTCTC(1/5) -Bso31I,GGTCTC(1/5) -BspTNI,GGTCTC(1/5) -Eco31I,GGTCTC(1/5) -AsuHPI,GGTGA(8/7) -HphI,GGTGA(8/7) -BaeGI,GKGCM/C -BseSI,GKGCM/C -BstSLI,GKGCM/C -AcyI,GR/CGYC -BsaHI,GR/CGYC -BssNI,GR/CGYC -BstACI,GR/CGYC -Hin1I,GR/CGYC -Hsp92I,GR/CGYC -BanII,GRGCY/C -Eco24I,GRGCY/C -EcoT38I,GRGCY/C -FriOI,GRGCY/C -HgiJII,GRGCY/C -AfaI,GT/AC -RsaI,GT/AC -AccI,GT/MKAC -FblI,GT/MKAC -XmiI,GT/MKAC -BssNAI,GTA/TAC -Bst1107I,GTA/TAC -BstZ17I-HF,GTA/TAC -SnaI,GTATAC -BssNAI,GTATAC -Bst1107I,GTATAC -BstZ17I-HF,GTATAC -BciVI,GTATCC(6/5) -BfuI,GTATCC(6/5) -BsuI,GTATCC(6/5) -Nt.BsmAI,GTCTC(1/-5) -Alw26I,GTCTC(1/5) -BcoDI,GTCTC(1/5) -BsmAI,GTCTC(1/5) -BstMAI,GTCTC(1/5) -BsgI,GTGCAG(16/14) -Hpy166II,GTN/NAC -Hpy8I,GTN/NAC -MjaIV,GTNNAC -Hpy8I,GTNNAC -Hpy166II,GTNNAC -HpaI,GTT/AAC -KspAI,GTT/AAC -MssI,GTTT/AAAC -PmeI,GTTT/AAAC -HincII,GTY/RAC -HindII,GTY/RAC -Alw21I,GWGCW/C -Bbv12I,GWGCW/C -BsiHKAI,GWGCW/C -HgiAI,GWGCW/C -TspRI,NNCASTGNN/ -TscAI,NNCASTGNN/ -AcsI,R/AATTY -ApoI,R/AATTY -ApoI-HF,R/AATTY -XapI,R/AATTY -Bse118I,R/CCGGY -BsrFI,R/CCGGY -BsrFαI,R/CCGGY -BssAI,R/CCGGY -Cfr10I,R/CCGGY -BstX2I,R/GATCY -BstYI,R/GATCY -MflI,R/GATCY -PsuI,R/GATCY -XhoII,R/GATCY -BstNSI,RCATG/Y -NspI,RCATG/Y -XceI,RCATG/Y -CviJI,RG/CY -CviKI-1,RG/CY -DraII,RG/GNCCY -EcoO109I,RG/GNCCY -PpuMI,RG/GWCCY -Psp5II,RG/GWCCY -PspPPI,RG/GWCCY -BfoI,RGCGC/Y -BstH2I,RGCGC/Y -HaeII,RGCGC/Y -BspHI,T/CATGA -CciI,T/CATGA -PagI,T/CATGA -AccIII,T/CCGGA -Aor13HI,T/CCGGA -BseAI,T/CCGGA -Bsp13I,T/CCGGA -BspEI,T/CCGGA -BspMII,T/CCGGA -Kpn2I,T/CCGGA -MroI,T/CCGGA -TaqαI,T/CGA -XbaI,T/CTAGA -BclI,T/GATCA -BclI-HF,T/GATCA -FbaI,T/GATCA -Ksp22I,T/GATCA -Bsp1407I,T/GTACA -BsrGI,T/GTACA -BsrGI-HF,T/GTACA -BstAUI,T/GTACA -MseI,T/TAA -SaqAI,T/TAA -Tru1I,T/TAA -Tru9I,T/TAA -I-CeuI,TAACTATAACGGTCCTAAGGTAGCGAA(-9/-13) -BstSNI,TAC/GTA -Eco105I,TAC/GTA -SnaBI,TAC/GTA -I-SceI,TAGGGATAACAGGGTAAT(-9/-13) -Hpy178III,TC/NNGA -Hpy188III,TC/NNGA -MmeI,TCCRAC(20/18) -Bsp68I,TCG/CGA -BtuMI,TCG/CGA -NruI,TCG/CGA -NruI-HF,TCG/CGA -RruI,TCG/CGA -Hpy188I,TCN/GA -CviRI,TG/CA -HpyCH4V,TG/CA -Acc16I,TGC/GCA -FspI,TGC/GCA -MstI,TGC/GCA -NsbI,TGC/GCA -BalI,TGG/CCA -MlsI,TGG/CCA -MluNI,TGG/CCA -Mox20I,TGG/CCA -MscI,TGG/CCA -Msp20I,TGG/CCA -PI-PspI,TGGCAAACAGCTATTATGGGTATTATGGGT(-13/-17) -AsuII,TT/CGAA -Bpu14I,TT/CGAA -Bsp119I,TT/CGAA -BspT104I,TT/CGAA -BstBI,TT/CGAA -NspV,TT/CGAA -SfuI,TT/CGAA -AanI,TTA/TAA -PsiI,TTA/TAA -PacI,TTAAT/TAA -AhaIII,TTT/AAA -DraI,TTT/AAA -PspXI,VC/TCGAGB -BetI,W/CCGGW -BsaWI,W/CCGGW -AcoI,Y/GGCCR -CfrI,Y/GGCCR -EaeI,Y/GGCCR -BsaAI,YAC/GTR -BstBAI,YAC/GTR -Ppu21I,YAC/GTR diff --git a/data-raw/nucleobase_legend.R b/data-raw/nucleobase_legend.R deleted file mode 100644 index ac3ac81..0000000 --- a/data-raw/nucleobase_legend.R +++ /dev/null @@ -1,24 +0,0 @@ - - -library(dplyr) -library(readr) -library(tidyr) -library(rvest) - - -page <- read_html(paste0("https://en.wikipedia.org/wiki/List_of_restriction_enzyme", - "_cutting_sites:_A#Whole_list_navigation")) - - -nucleobase_legend <- page %>% - html_nodes("table") %>% - .[[1]] %>% - html_table() %>% - setNames(c("code", "nucleotides")) %>% - slice(-1:-5) %>% - mutate(nucleotides = strsplit(nucleotides, " or |, ")) %>% - unnest() - -write_csv(nucleobase_legend, "data-raw/nucleobase_legend.csv") -devtools::use_data(nucleobase_legend, overwrite = TRUE) - diff --git a/data-raw/nucleobase_legend.csv b/data-raw/nucleobase_legend.csv deleted file mode 100644 index d7e3b78..0000000 --- a/data-raw/nucleobase_legend.csv +++ /dev/null @@ -1,29 +0,0 @@ -code,nucleotides -N,A -N,C -N,G -N,T -M,A -M,C -R,A -R,G -W,A -W,T -Y,C -Y,T -S,C -S,G -K,G -K,T -H,A -H,C -H,T -B,C -B,G -B,T -V,A -V,C -V,G -D,A -D,G -D,T diff --git a/man/binding_sites.Rd b/man/binding_sites.Rd deleted file mode 100644 index c30b7ec..0000000 --- a/man/binding_sites.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{binding_sites} -\alias{binding_sites} -\title{Binding sites for selected restriction enzymes.} -\format{A 695 x 2 data frame with the following columns: -\describe{ -\item{enzyme}{Restriction enzyme name.} -\item{sites}{Enzyme binding site sequences. -See \code{\link{nucleobase_legend}} for what bases other than \code{T}, \code{C}, \code{A}, -and \code{G} mean. -Each \code{/} indicates a cleavage point. -According to NEB (see link below under Source): -"Numbers in parentheses indicate point of cleaveage for non-palindromic -enzymes." -These types of enzymes are not implemented.} -}} -\source{ -\url{https://www.neb.com/tools-and-resources/selection-charts/alphabetized-list-of-recognition-specificities} -} -\usage{ -binding_sites -} -\description{ -Binding sites for selected restriction enzymes. -} -\keyword{datasets} diff --git a/man/digest.Rd b/man/digest.Rd deleted file mode 100644 index 6e6ae83..0000000 --- a/man/digest.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/digest.R -\name{digest} -\alias{digest} -\title{Digest reference genome or variants from a genome.} -\usage{ -digest(object, enzyme_names, custom_enzymes, chunk_size = 1000, - n_cores = 1) -} -\arguments{ -\item{object}{Either a \code{ref_genome} or \code{variants} object.} - -\item{enzyme_names}{Name of enzymes that should be present inside the package data -object \code{binding_sites} (in the \code{enzyme} column).} - -\item{custom_enzymes}{Custom enzymes for those not present in internal data.} - -\item{chunk_size}{The size of chunks to break up sqeuences into when digesting -a \code{ref_genome} or \code{variants} object. -Changing this might affect performance, for better or worse. -The default worked best on my computer. Defaults to \code{1000}.} - -\item{n_cores}{Number of cores to use for parallel processing. This argument is -ignored if OpenMP is not enabled. Defaults to \code{1}.} -} -\value{ -All changes occur in place, but the input object is returned invisibly -so that piping works. -} -\description{ -\emph{Note:} This will override any digestions currently in place in the -object. If you want to add a new digestion, re-run this function with the names -of all enzymes you're interested in included in the \code{enzyme_names} argument. -} diff --git a/man/jackal.Rd b/man/jackal.Rd index d47fa53..9576413 100644 --- a/man/jackal.Rd +++ b/man/jackal.Rd @@ -10,11 +10,8 @@ (ii) generates variants using summary statistics, phylogenies, Variant Call Format (VCF) files, and coalescent simulations—the latter of which can include selection, recombination, and demographic fluctuations; -(iii) simulates sequencing error, mapping qualities, restriction-enzyme digestion, -and variance in coverage among sites; and +(iii) simulates sequencing error, mapping qualities, and optical/PCR duplicates; and (iv) writes outputs to standard file formats. -\code{jackal} can simulate single or paired-ended reads for WGS on the Illumina platform, -and can be extended to simulate different methods (e.g., original RADseq, -double-digest RADseq, and genotyping-by-sequencing), and sequencing technologies -(e.g., Pacific BioSciences, Oxford Nanopore Technologies). +\code{jackal} can simulate single, paired-end, or mate-pair Illumina reads, as well as +reads from Pacific BioSciences. } diff --git a/man/nucleobase_legend.Rd b/man/nucleobase_legend.Rd deleted file mode 100644 index dba7df0..0000000 --- a/man/nucleobase_legend.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{nucleobase_legend} -\alias{nucleobase_legend} -\title{Legend for the single-letter code of nucleobases indicating restriction sequences.} -\format{A data frame of 28 rows and two columns: -\describe{ -\item{code}{The letter indicating more than one possible nucleotides.} -\item{nucleotides}{One of the multiple nucleotides the code can refer to.} -}} -\source{ -\url{https://en.wikipedia.org/wiki/List_of_restriction_enzyme_cutting_sites:_A#Whole_list_navigation} -} -\usage{ -nucleobase_legend -} -\description{ -Legend for the single-letter code of nucleobases indicating restriction sequences. -} -\keyword{datasets} diff --git a/man/ref_genome.Rd b/man/ref_genome.Rd index b1b873a..8346889 100644 --- a/man/ref_genome.Rd +++ b/man/ref_genome.Rd @@ -23,9 +23,6 @@ For safe ways of manipulating the reference genome, see the "Methods" section. \describe{ \item{\code{genome}}{An \code{externalptr} to a C++ object storing the sequences representing the genome.} - -\item{\code{digests}}{An \code{externalptr} to a C++ object storing the digestion of the -genome, if a digestion has been carried out. It's \code{NULL} otherwise.} }} \section{Methods}{ diff --git a/man/variants.Rd b/man/variants.Rd index 76172fd..4759d71 100644 --- a/man/variants.Rd +++ b/man/variants.Rd @@ -24,9 +24,6 @@ For safe ways of manipulating the variants' information, see the "Methods" secti \item{\code{genome}}{An \code{externalptr} to a C++ object storing the sequences representing the genome.} -\item{\code{digests}}{An \code{externalptr} to a C++ object storing the digestion of the -genome, if a digestion has been carried out. It's \code{NULL} otherwise.} - \item{\code{reference}}{An \code{externalptr} to a C++ object storing the sequences representing the genome. There are a few extra notes for this field: diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 0ee426d..9410107 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -72,58 +72,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// get_precleavage_lens -std::vector get_precleavage_lens(const std::vector& seqs); -RcppExport SEXP _jackal_get_precleavage_lens(SEXP seqsSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const std::vector& >::type seqs(seqsSEXP); - rcpp_result_gen = Rcpp::wrap(get_precleavage_lens(seqs)); - return rcpp_result_gen; -END_RCPP -} -// expand_seqs -std::vector expand_seqs(const std::vector& seqs); -RcppExport SEXP _jackal_expand_seqs(SEXP seqsSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const std::vector& >::type seqs(seqsSEXP); - rcpp_result_gen = Rcpp::wrap(expand_seqs(seqs)); - return rcpp_result_gen; -END_RCPP -} -// digest_var_set -SEXP digest_var_set(SEXP var_set_ptr, const std::vector& bind_sites, const std::vector& len5s, const uint32& chunk_size, const uint32& n_cores); -RcppExport SEXP _jackal_digest_var_set(SEXP var_set_ptrSEXP, SEXP bind_sitesSEXP, SEXP len5sSEXP, SEXP chunk_sizeSEXP, SEXP n_coresSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type var_set_ptr(var_set_ptrSEXP); - Rcpp::traits::input_parameter< const std::vector& >::type bind_sites(bind_sitesSEXP); - Rcpp::traits::input_parameter< const std::vector& >::type len5s(len5sSEXP); - Rcpp::traits::input_parameter< const uint32& >::type chunk_size(chunk_sizeSEXP); - Rcpp::traits::input_parameter< const uint32& >::type n_cores(n_coresSEXP); - rcpp_result_gen = Rcpp::wrap(digest_var_set(var_set_ptr, bind_sites, len5s, chunk_size, n_cores)); - return rcpp_result_gen; -END_RCPP -} -// digest_ref -SEXP digest_ref(SEXP ref_genome_ptr, const std::vector& bind_sites, const std::vector& len5s, const uint32& chunk_size, const uint32& n_cores); -RcppExport SEXP _jackal_digest_ref(SEXP ref_genome_ptrSEXP, SEXP bind_sitesSEXP, SEXP len5sSEXP, SEXP chunk_sizeSEXP, SEXP n_coresSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type ref_genome_ptr(ref_genome_ptrSEXP); - Rcpp::traits::input_parameter< const std::vector& >::type bind_sites(bind_sitesSEXP); - Rcpp::traits::input_parameter< const std::vector& >::type len5s(len5sSEXP); - Rcpp::traits::input_parameter< const uint32& >::type chunk_size(chunk_sizeSEXP); - Rcpp::traits::input_parameter< const uint32& >::type n_cores(n_coresSEXP); - rcpp_result_gen = Rcpp::wrap(digest_ref(ref_genome_ptr, bind_sites, len5s, chunk_size, n_cores)); - return rcpp_result_gen; -END_RCPP -} // illumina_ref_cpp void illumina_ref_cpp(SEXP ref_genome_ptr, const bool& paired, const bool& matepair, const std::string& out_prefix, const bool& compress, const uint32& n_reads, const double& prob_dup, const uint32& n_cores, const bool& show_progress, const uint32& read_chunk_size, const double& frag_len_shape, const double& frag_len_scale, const uint32& frag_len_min, const uint32& frag_len_max, const std::vector>>& qual_probs1, const std::vector>>& quals1, const double& ins_prob1, const double& del_prob1, const std::vector>>& qual_probs2, const std::vector>>& quals2, const double& ins_prob2, const double& del_prob2, const std::vector& barcodes); RcppExport SEXP _jackal_illumina_ref_cpp(SEXP ref_genome_ptrSEXP, SEXP pairedSEXP, SEXP matepairSEXP, SEXP out_prefixSEXP, SEXP compressSEXP, SEXP n_readsSEXP, SEXP prob_dupSEXP, SEXP n_coresSEXP, SEXP show_progressSEXP, SEXP read_chunk_sizeSEXP, SEXP frag_len_shapeSEXP, SEXP frag_len_scaleSEXP, SEXP frag_len_minSEXP, SEXP frag_len_maxSEXP, SEXP qual_probs1SEXP, SEXP quals1SEXP, SEXP ins_prob1SEXP, SEXP del_prob1SEXP, SEXP qual_probs2SEXP, SEXP quals2SEXP, SEXP ins_prob2SEXP, SEXP del_prob2SEXP, SEXP barcodesSEXP) { @@ -863,10 +811,6 @@ static const R_CallMethodDef CallEntries[] = { {"_jackal_replace_Ns_cpp", (DL_FUNC) &_jackal_replace_Ns_cpp, 4}, {"_jackal_create_genome_", (DL_FUNC) &_jackal_create_genome_, 5}, {"_jackal_rando_seqs", (DL_FUNC) &_jackal_rando_seqs, 5}, - {"_jackal_get_precleavage_lens", (DL_FUNC) &_jackal_get_precleavage_lens, 1}, - {"_jackal_expand_seqs", (DL_FUNC) &_jackal_expand_seqs, 1}, - {"_jackal_digest_var_set", (DL_FUNC) &_jackal_digest_var_set, 5}, - {"_jackal_digest_ref", (DL_FUNC) &_jackal_digest_ref, 5}, {"_jackal_illumina_ref_cpp", (DL_FUNC) &_jackal_illumina_ref_cpp, 23}, {"_jackal_illumina_var_cpp", (DL_FUNC) &_jackal_illumina_var_cpp, 24}, {"_jackal_make_mutation_sampler_base", (DL_FUNC) &_jackal_make_mutation_sampler_base, 4}, diff --git a/src/digest.cpp b/src/digest.cpp deleted file mode 100644 index 520760e..0000000 --- a/src/digest.cpp +++ /dev/null @@ -1,565 +0,0 @@ -// -// This file digests sequences from variant set and sequence set objects -// - - -#include -#include // find, string::npos, string::size_type, ... -#include // vector class -#include // deque class -#ifdef _OPENMP -#include // omp -#endif - - -#include "jackal_types.h" // integer types -#include "seq_classes_ref.h" // Ref* classes -#include "seq_classes_var.h" // Var* classes -#include "digest.h" // DigestInfo, MultiOut, -#include "str_manip.h" // rev_comp - -using namespace Rcpp; - - - -/* - Remove duplicates from a vector of strings in place. - */ -void unique_(std::vector& x) { - std::sort(x.begin(), x.end()); - std::vector::iterator iter = std::unique(x.begin(), x.end()); - x.resize(iter - x.begin()); - return; -} - - - - -//' Calculate how many bases come before a cleavage site. -//' -//' -//' @noRd -//' -//[[Rcpp::export]] -std::vector get_precleavage_lens(const std::vector& seqs) { - - std::vector out(seqs.size()); - - for (uint32 i = 0; i < seqs.size(); i++) { - uint32 cleav = seqs[i].find('/'); - out[i] = cleav; - } - - return out; -} - - -/* - Expand sequences with non-specific nucleobases. - */ -void expand_sites(const std::vector& sites, - std::vector& seqs_out, - const bool& add_rev_comp = true) { - - uint32 n_combs = 1; - for (uint32 i = 0; i < sites.size(); i++) n_combs *= sites[i].size(); - seqs_out.reserve(n_combs); - - for (uint32 i = 0; i < sites[0].size(); i++) { - seqs_out.push_back(std::string(1, sites[0][i])); - } - - for (uint32 i = 1; i < sites.size(); i++) { - const std::string& site_i(sites[i]); - uint32 n = seqs_out.size(); - for (uint32 j = 1; j < site_i.size(); j++) { - for (uint32 k = 0; k < n; k++) { - seqs_out.push_back(seqs_out[k] + site_i[j]); - } - } - for (uint32 k = 0; k < n; k++) { - seqs_out[k] += site_i[0]; - } - } - // Add all reverse complements, whether or not they already exist in the output vector - if (add_rev_comp) { - seqs_out.reserve(n_combs * 2); - for (uint32 i = 0; i < n_combs; i++) { - std::string rc = seqs_out[i]; - rev_comp(rc); - seqs_out.push_back(rc); - } - } - - return; -} - - - - -//' Expand sequences for reverse complements and for non-specific nucleobases. -//' -//' -//' @noRd -//' -//[[Rcpp::export]] -std::vector expand_seqs(const std::vector& seqs) { - - std::vector out; - std::string bases = "TCAG"; - - for (uint32 i = 0; i < seqs.size(); i++) { - const std::string& seq(seqs[i]); - // See if the sequence is degenerate (i.e., ambiguous) or not: - bool degenerate = false; - for (const char& c : seq) { - if (bases.find(c) == std::string::npos) degenerate = true; - } - if (!degenerate) { - out.push_back(seq); - std::string rc = seq; - rev_comp(rc); - if (rc != seq) out.push_back(rc); - } else { - std::vector sites(seq.size()); - for (uint32 j = 0; j < seq.size(); j++) { - sites[j] = digest::deg_bases[seq[j]]; - } - // Create all combos of these sites and add to `out` - expand_sites(sites, out); - } - } - // Remove duplicates: - unique_(out); - - return out; -} - - - - - - -/* - Search for multiple substrings simulateously, change the `search_obj` object so that it - stores... - (1) the first position in `s` that matches one of the strings in `bind_sites` and - (2) the index in `bind_sites` for the site that matches to a position in `s` - */ -void multi_search( - MultiOut& search_obj, - const DigestInfo& dinfo, - const std::string& s, - const uint32& last_pos) { - - search_obj.reset(); - - std::string::size_type new_pos; - - for (uint32 j = 0; j < dinfo.bind_sites.size(); j++) { - new_pos = s.find(dinfo.bind_sites[j], last_pos); - if (new_pos < search_obj.new_pos) { - search_obj.new_pos = new_pos; - search_obj.j = j; - } - } - - return; -} - - -// ------------------ -// Digest one string -// ------------------ - -/* - (`pos_mod` is to modify positions because `s` is a chunk not from the start of the - sequence.) - */ -void digest_seq(std::deque& pos_vec, - uint32& start_pos, - const std::string& s, - const DigestInfo& dinfo, - const uint32& pos_mod = 0, - const uint32& end = 0) { - - uint32 cut_site_pos; - MultiOut search_obj; - - multi_search(search_obj, dinfo, s, start_pos); - - if (end == 0) { - while (search_obj.new_pos != std::string::npos) { - cut_site_pos = search_obj.new_pos + dinfo.len5s[search_obj.j] + pos_mod; - pos_vec.push_back(cut_site_pos); - start_pos = search_obj.new_pos + dinfo.to_skip(search_obj.j); - multi_search(search_obj, dinfo, s, start_pos); - } - } else { - while (search_obj.new_pos != std::string::npos && search_obj.new_pos <= end) { - cut_site_pos = search_obj.new_pos + dinfo.len5s[search_obj.j] + pos_mod; - pos_vec.push_back(cut_site_pos); - start_pos = search_obj.new_pos + dinfo.to_skip(search_obj.j); - multi_search(search_obj, dinfo, s, start_pos); - } - } - - return; -} - - - - -/* - ============================================================================= - ============================================================================= - ============================================================================= - ============================================================================= - - Variant digestion - - ============================================================================= - ============================================================================= - ============================================================================= - ============================================================================= - */ - - - - -// Digest one variant genome's sequence - -void VarSeqDigest::digest(const DigestInfo& dinfo) { - - // Reset these if it's been digested before - start = 0; - if (!cut_sites.empty()) cut_sites.clear(); - - if (chunk_size >= var_info.seq_size) { - std::string seq = var_info.get_seq_full(); - digest_seq(cut_sites, start, seq, dinfo); - return; - } - - // index to the nearest mutation for a given chunk's starting position: - uint32 mut_i = 0; - - // Initialize chunk sequence string - std::string chunk_str(chunk_size, 'x'); - - /* - I'm iterating by `chunk_size - (max_size - 1)` to make sure I catch any - matches that overlap the "seam" between chunks. - */ - uint32 it = chunk_size - (dinfo.max_size - 1); - - /* - The last chunk start position in this sequence (without allowing only overlap - at the end) - */ - uint32 last_chunk = var_info.seq_size - dinfo.max_size; - - // Digesting each chunk: - for (uint32 chunk_start = 0; chunk_start <= last_chunk; chunk_start += it) { - // Filling new chunk string: - var_info.set_seq_chunk(chunk_str, chunk_start, chunk_size, mut_i); - // Compensating for matches at the end of the previous chunk: - if (start > it) { - start -= it; - } else start = 0; - // Digest chunk: - digest_seq(cut_sites, start, chunk_str, dinfo, chunk_start); - } - - return; -} - - - - -// ------------------ -// Digest all sequences, all variants -// ------------------ - - - -//' Internal C++ function to digest all sequences for all variants in a variant set. -//' -//' -//' -//' @param var_set_ptr An external pointer to a C++ \code{VarSet} object -//' representing variants from the reference genome. -//' @inheritParams bind_sites digest_ref -//' @inheritParams len5s digest_ref -//' @param chunk_size The size of chunks to divide sequences into when digesting. -//' @param n_cores The number of cores to use for processing. Defaults to \code{1}. -//' -//' @return A list of lists, each sub-list containing multiple vectors representing -//' the locations of cut sites for a given variant on a given sequence. -//' Indexing the output list would be done as such: -//' \code{output_list[[variant_index]][[sequence_index]][position_index]}. -//' -//' @noRd -//' -// [[Rcpp::export]] -SEXP digest_var_set( - SEXP var_set_ptr, - const std::vector& bind_sites, - const std::vector& len5s, - const uint32& chunk_size, - const uint32& n_cores = 1) { - - XPtr var_set(var_set_ptr); - - const uint32 n_vars = var_set->size(); - const uint32 n_seqs = var_set->reference->size(); - - XPtr out_digests(new GenomeSetDigest(n_vars, n_seqs)); - - const DigestInfo dinfo(bind_sites, len5s); - - if (chunk_size < (2 * dinfo.max_size - 1)) { - stop("chunk size needs to be at least 2 times the maximum binding " - "site length minus 1 (i.e., `chunk_size >= 2 * " - "max() - 1`)"); - } - - - #ifdef _OPENMP - #pragma omp parallel for default(shared) num_threads(n_cores) schedule(dynamic) collapse(2) - #endif - for (uint32 v = 0; v < n_vars; v++) { - for (uint32 s = 0; s < n_seqs; s++) { - VarSeqDigest vsd(*var_set, v, s, chunk_size, (*out_digests)[v][s]); - vsd.digest(dinfo); - } - } - - return out_digests; -} - - - - - - - - - - - -/* - ============================================================================= - ============================================================================= - ============================================================================= - ============================================================================= - - Reference genome digestion - - ============================================================================= - ============================================================================= - ============================================================================= - ============================================================================= - */ - - - - -// Digest chunk of a sequence - -void RefSeqChunk::digest(const DigestInfo& dinfo) { - - // Reset this if it's been digested already - if (!cut_sites.empty()) cut_sites.clear(); - - /* - `next_start` is in the RefSeqChunk class and keeps track of earliest possible - start for the next chunk in this sequence (`start` is const, `next_start` isn't): - */ - next_start = start; - - digest_seq(cut_sites, next_start, ref, dinfo, 0, end); - - return; -} - - -/* - Make sure no positions overlap in the "seam" between two chunks. - `prev` should come before the focal `RefSeqChunk` - */ -void RefSeqChunk::merge_seam(const RefSeqChunk& prev, const DigestInfo& dinfo) { - - // If they're not on the same sequence, don't do anything - if (prev.seq != seq) return; - // If there were no cut sites on this chunk or the previous one, don't do anything - if (cut_sites.empty() || prev.cut_sites.empty()) return; - - std::deque& pos_vec2(cut_sites); - - // Earliest possible starting position in `pos_vec2` - uint32 start_pos = prev.next_start; - - if (pos_vec2.front() >= start_pos) return; - - // First remove position(ref) from `pos_vec2` that are < `start_pos` - while (pos_vec2.front() < start_pos) { - pos_vec2.pop_front(); - if (pos_vec2.empty()) break; - } - - // Now search through sequence again until positions are "merged" - MultiOut search_obj; - multi_search(search_obj, dinfo, ref, start_pos); - while (search_obj.new_pos != std::string::npos) { - uint32 cut_site_pos = search_obj.new_pos + dinfo.len5s[search_obj.j]; - while (pos_vec2.front() < cut_site_pos) pos_vec2.pop_front(); - if (pos_vec2.front() == cut_site_pos) { // <---------- how I'm defining "merged" - break; - } else { - pos_vec2.push_front(cut_site_pos); - } - start_pos = search_obj.new_pos + dinfo.to_skip(search_obj.j); - multi_search(search_obj, dinfo, ref, start_pos); - } - - /* - If we've changed the whole deque of cut sites, then adjust the `next_start` field - (this should virtually never happen with adequately large chunk sizes) - */ - if (start_pos > next_start) next_start = start_pos; - - return; -} - - - - - - -//' Internal C++ function to digest all sequences in a reference genome. -//' -//' -//' -//' @param ref_genome_ptr An external pointer to a C++ \code{RefGenome} object -//' representing the reference genome. -//' @param bind_sites Vector of enzyme full recognition site(s). -//' @param len5s A vector of the numbers of characters of the prime5 sites for each -//' recognition site. -//' @param chunk_size Size of chunks to break sequences into for processing. -//' This value is ignored if it's set to zero. -//' Ideally this is set to a value that results in a number of chunks divisible by -//' the number of cores you're using, and is most useful when `n_cores` is greater -//' than the number of scaffolds. -//' Breaking into increasingly small chunks results in increasing overhead, so -//' beware of making this argument very small. -//' Reference genome sequences are not copied during this function, so using -//' this argument for a reference genome does NOT decrease memory usage -//' appreciably. -//' Defaults to \code{0}. -//' @param n_cores The number of cores to use for processing. This value is ignored -//' if the input reference genome is merged and \code{chunk_size == 0}. -//' Defaults to \code{1}. -//' -//' @return A list of vectors, each vector representing the locations of cut sites -//' on a given sequence. -//' Indexing the output list would be done as such: -//' \code{output_list[[sequence_index]][position_index]}. -//' -//' @noRd -//' -//[[Rcpp::export]] -SEXP digest_ref( - SEXP ref_genome_ptr, - const std::vector& bind_sites, - const std::vector& len5s, - const uint32& chunk_size = 0, - const uint32& n_cores = 1) { - - const XPtr ref_genome(ref_genome_ptr); - - XPtr out_digest_xptr(new GenomeDigest(ref_genome->size())); - GenomeDigest& out_digest(*out_digest_xptr); - - DigestInfo dinfo(bind_sites, len5s); - - if (chunk_size > 0) { - if (chunk_size < (2 * dinfo.max_size - 1)) { - stop("chunk size needs to be at least 2 times the maximum binding " - "site length minus 1 (i.e., `chunk_size >= 2 * " - "max() - 1`)"); - } - /* - I'm iterating by `chunk_size - (max_size - 1)` to make sure I catch any - matches that overlap the "seam" between chunks. - */ - uint32 iter_by = chunk_size - (dinfo.max_size - 1); - // Set up temporary deque containing a deque of RefSeqChunk objects: - std::deque chunk_dq; - for (uint32 i = 0; i < ref_genome->size(); i++) { - const std::string& seq((*ref_genome)[i].nucleos); - - if (chunk_size >= seq.size()) { - RefSeqChunk tmp_sc(seq, i, 0, chunk_size); - chunk_dq.push_back(tmp_sc); - } else { - /* - The last chunk start position (without allowing only overlap - at the end): - */ - uint32 last_chunk = seq.size() - dinfo.max_size; - for (uint32 s = 0; s < last_chunk; s += iter_by) { - RefSeqChunk tmp_sc(seq, i, s, chunk_size); - chunk_dq.push_back(tmp_sc); - } - } - } - - #ifdef _OPENMP - #pragma omp parallel for default(shared) num_threads(n_cores) schedule(static) - #endif - for (uint32 i = 0; i < chunk_dq.size(); i++) { - chunk_dq[i].digest(dinfo); - } - - /* - Now check for points over the seams and make sure they aren't too close to - each other. - */ - #ifdef _OPENMP - #pragma omp parallel for default(shared) num_threads(n_cores) schedule(static) - #endif - for (uint32 i = 1; i < chunk_dq.size(); i++) { - chunk_dq[i].merge_seam(chunk_dq[(i-1)], dinfo); - } - - /* - Lastly combine all RefSeqChunk objects for a given sequence together into - one deque. - */ - for (uint32 i = 0; i < ref_genome->size(); i++) { - std::deque& out_dq(out_digest[i]); - while (chunk_dq.front().seq == i) { - chunk_dq.front().combine(out_dq); - chunk_dq.pop_front(); - if (chunk_dq.empty()) break; - } - } - } else if (ref_genome->merged) { - uint32 i = 0, j = 0; // j is only used for the call to digest_seq - const std::string& seq((*ref_genome)[i].nucleos); - digest_seq(out_digest[i], j, seq, dinfo); - } else { - #ifdef _OPENMP - #pragma omp parallel for default(shared) num_threads(n_cores) schedule(dynamic) - #endif - for (uint32 i = 0; i < ref_genome->size(); i++) { - const std::string& seq_s((*ref_genome)[i].nucleos); - uint32 j = 0; // j is only used for the call to digest_seq - digest_seq(out_digest[i], j, seq_s, dinfo); - } - } - - return out_digest_xptr; -} - diff --git a/src/digest.h b/src/digest.h deleted file mode 100644 index 29cfcfc..0000000 --- a/src/digest.h +++ /dev/null @@ -1,209 +0,0 @@ -#ifndef __JACKAL_DIGEST_H -#define __JACKAL_DIGEST_H - -#include -#include // find, string::npos, string::size_type, ... -#include // vector class -#include // deque class -#include // unordered_map - -#include "seq_classes_ref.h" // Ref* classes -#include "seq_classes_var.h" // Var* classes - - -namespace digest { - -// Degenerate bases and what nucleotides they represent: -std::unordered_map deg_bases = { - {'N', "ACGT"}, - {'M', "AC"}, - {'R', "AG"}, - {'W', "AT"}, - {'Y', "CT"}, - {'S', "CG"}, - {'K', "GT"}, - {'H', "ACT"}, - {'B', "CGT"}, - {'V', "ACG"}, - {'D', "AGT"} -}; - -} - - - - -/* - Organizes info needed for digestion - */ -struct DigestInfo { - std::vector bind_sites; - std::vector len5s; - uint32 max_size; - - // Constructors - DigestInfo() : bind_sites(), len5s(), max_size() {}; - - DigestInfo(const std::vector& bind_sites_, - const std::vector& len5s_) - : bind_sites(bind_sites_), len5s(len5s_), max_size(bind_sites_[0].size()) { - - for (uint32 i = 1; i < bind_sites_.size(); i++) { - if (bind_sites_[i].size() > max_size) max_size = bind_sites_[i].size(); - } - - }; - - // Get the # bp to skip before looking for another match - uint32 to_skip(const uint32& j) const noexcept { - return bind_sites[j].size(); - } -}; - - - - - -/* - When searching for multiple strings simultaneously, this is a way to return the - first position where one of them match and the string match's index - */ -struct MultiOut { - // fields - std::string::size_type new_pos; - uint32 j; - // Constructor - MultiOut() { - new_pos = std::string::npos; - j = 1000000; - } - // reset to default values - void reset() { - new_pos = std::string::npos; - j = 1000000; - } -}; - - - -// Info on the digestion of a sequence's chunk - -class RefSeqChunk { - -public: - - const uint32 seq; // index of the sequence this refers to - const uint32 start; // index of the starting position in the sequence of this chunk - const uint32 end; // index of the ending position in the sequence of this chunk - std::deque cut_sites; // cut sites found - /* - Earliest possible start for the next chunk in this sequence. This is used - when merging the "seam" between this chunk and the one after it. - */ - uint32 next_start; - - RefSeqChunk(const std::string& ref_, - const uint32& seq_, - const uint32& start_, - const uint32& chunk_size) - : seq(seq_), start(start_), - end(std::min(static_cast(start_ + chunk_size - 1), - static_cast(ref_.size() - 1))), - cut_sites() , - ref(ref_) {}; - - // To return the number of cut sites - uint32 size() const noexcept { - return cut_sites.size(); - } - // For combining with an output deque - void combine(std::deque& out_dq) { - for (uint32 j = 0; j < cut_sites.size(); j++) { - out_dq.push_back(cut_sites[j]); - } - } - - // Digest this chunk - void digest(const DigestInfo& dinfo); - // Make sure no positions overlap in the "seam" between two chunks. - void merge_seam(const RefSeqChunk& prev, const DigestInfo& dinfo); - -private: - const std::string& ref; // reference genome string this refers to -}; - - - - -// Info on the digestion of a variant's sequence -class VarSeqDigest { - -public: - - const VarSequence& var_info; // info on this variant's sequence - const uint32 var; // index of the variant this refers to - const uint32 seq; // index of the sequence this refers to - const uint32 chunk_size; // size of chunks to digest sequence by - std::deque& cut_sites; // cut sites found - - VarSeqDigest(VarSet& var_info_, - const uint32& var_, - const uint32& seq_, - const uint32& chunk_size_, - std::deque& cut_sites_) - : var_info(var_info_[var_][seq_]), var(var_), seq(seq_), - chunk_size(chunk_size_), cut_sites(cut_sites_), start(0) {}; - - // Digest this sequence - void digest(const DigestInfo& dinfo); - -private: - uint32 start; // starting positions of chunks used as it's digested -}; - - - -// Used to store a single genome's digestion info: - -class GenomeDigest { - -public: - - std::vector< std::deque > cut_sites; - - GenomeDigest() {} - GenomeDigest(const uint32& n_seqs) : cut_sites(n_seqs) {} - - std::deque& operator[](const uint& idx) { - return cut_sites[idx]; - } - const std::deque& operator[](const uint& idx) const { - return cut_sites[idx]; - } -}; - - -// Used to store multiple digestions, like from a `VarSet` object -class GenomeSetDigest { - -public: - - std::vector< GenomeDigest > digestions; - - GenomeSetDigest() {} - GenomeSetDigest(const uint32& n_vars) : digestions(n_vars) {} - GenomeSetDigest(const uint32& n_vars, const uint32& n_seqs) - : digestions(n_vars, GenomeDigest(n_seqs)) {} - - GenomeDigest& operator[](const uint& idx) { - return digestions[idx]; - } - const GenomeDigest& operator[](const uint& idx) const { - return digestions[idx]; - } -}; - - - - -#endif diff --git a/tests/testthat/test-R_classes.R b/tests/testthat/test-R_classes.R index 0bc990a..bac7bbe 100644 --- a/tests/testthat/test-R_classes.R +++ b/tests/testthat/test-R_classes.R @@ -39,7 +39,6 @@ test_that(paste("Random sequences using `create_genome` aren't significantly dif ref <- ref_genome$new(jackal:::make_ref_genome(seqs)) test_that("ref_genome class starts with the correct fields", { expect_is(ref$genome, "externalptr") - expect_null(ref$digests) }) test_that("ref_genome class methods", { @@ -89,7 +88,6 @@ phy <- ape::rcoal(n_vars) vars <- create_variants(ref, "phy", phy, mevo_obj = mev) test_that("variants class starts with the correct fields", { expect_is(vars$genomes, "externalptr") - expect_null(vars$digests) })