diff --git a/.gitignore b/.gitignore index 03984b8..3fbbeaa 100644 --- a/.gitignore +++ b/.gitignore @@ -12,7 +12,6 @@ src/*.dll *.desc gfiles Gac -man gemino_*.tgz *.pdf methods diff --git a/man/add_mutations.Rd b/man/add_mutations.Rd new file mode 100644 index 0000000..64ef3ed --- /dev/null +++ b/man/add_mutations.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{add_mutations} +\alias{add_mutations} +\alias{add_substitution} +\alias{add_insertion} +\alias{add_deletion} +\title{Add mutations manually from R.} +\usage{ +add_substitution(vs_, var_ind, seq_ind, nucleo_, new_pos_) + +add_insertion(vs_, var_ind, seq_ind, nucleos_, new_pos_) + +add_deletion(vs_, var_ind, seq_ind, size_, new_pos_) +} +\arguments{ +\item{vs_}{External pointer to a C++ \code{VarSet} object} + +\item{var_ind}{Integer index to the desired variant. Uses 0-based indexing!} + +\item{seq_ind}{Integer index to the desired sequence. Uses 0-based indexing!} + +\item{nucleo_}{Character to substitute for existing one.} + +\item{new_pos_}{Integer index to the desired substitution location. +Uses 0-based indexing!} + +\item{nucleos_}{Nucleotides to insert at the desired location.} + +\item{size_}{Size of deletion.} +} +\description{ +Note that all indices are in 0-based C++ indexing. This means that the first +item is indexed by \code{0}, and so forth. +} +\section{Functions}{ +\itemize{ +\item \code{add_substitution}: Add a substitution. + +\item \code{add_insertion}: Add an insertion. + +\item \code{add_deletion}: Add a deletion. +}} + diff --git a/man/binding_sites.Rd b/man/binding_sites.Rd new file mode 100644 index 0000000..6e94bf9 --- /dev/null +++ b/man/binding_sites.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{binding_sites} +\alias{binding_sites} +\title{Binding sites for selected restriction enzymes.} +\format{A list of length 11. 
For each element in the list... +\describe{ +\item{name}{The element's name is the restriction enzyme's name. +The enzymes present in this list are +\emph{AclI}, \emph{ApeKI}, \emph{AscI}, \emph{BspEI}, \emph{BstBI}, +\emph{EcoT22I}, \emph{FspI}, \emph{MluI-HF}, \emph{NruI-HF}, \emph{PstI}, +and \emph{SbfI}.} +\item{sites}{Enzyme binding site sequences. This vector is of the binding sites, +5' then 3', for each unique site that the enzyme can bind to.} +}} +\source{ +\url{http://www.neb.com} +} +\usage{ +binding_sites +} +\description{ +Binding sites for selected restriction enzymes. +} +\keyword{datasets} diff --git a/man/create_genome.Rd b/man/create_genome.Rd new file mode 100644 index 0000000..b99a49e --- /dev/null +++ b/man/create_genome.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{create_genome} +\alias{create_genome} +\alias{rando_seqs} +\title{Create \code{RefGenome} pointer based on nucleotide equilibrium frequencies.} +\usage{ +create_genome(n_seqs, len_mean, len_sd = 0, equil_freqs = numeric(0), + n_cores = 1L) + +rando_seqs(n_seqs, len_mean, len_sd = 0, equil_freqs = numeric(0), + n_cores = 1L) +} +\arguments{ +\item{n_seqs}{Number of sequences.} + +\item{len_mean}{Mean for the gamma distribution for sequence sizes.} + +\item{len_sd}{Standard deviation for the gamma distribution for sequence sizes. +If set to \code{<= 0}, all sequences will be the same length. Defaults to \code{0}.} + +\item{equil_freqs}{Vector of nucleotide equilibrium frequencies for +"T", "C", "A", and "G", respectively. Defaults to \code{rep(0.25, 4)}.} + +\item{n_cores}{Number of cores to use via OpenMP.} +} +\value{ +External pointer to a \code{RefGenome} C++ object. + +Character vector of sequence strings. +} +\description{ +Function to create random sequences for a new reference genome object. +} +\details{ +Note that this function will never return empty sequences. 
+} +\section{Functions}{ +\itemize{ +\item \code{rando_seqs}: create random sequences as a character vector. +}} + +\examples{ + +\dontrun{ +genome <- create_genome(10, 100e6, 10e6, equil_freqs = c(0.1, 0.2, 0.3, 0.4)) +} + +\dontrun{ +randos <- rando_seqs(10, 1000, 10) +} + +} diff --git a/man/digest.Rd b/man/digest.Rd new file mode 100644 index 0000000..729aa2e --- /dev/null +++ b/man/digest.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/digest.R +\name{digest} +\alias{digest} +\title{Digest genome(s).} +\usage{ +digest(object, enzyme_names, n_cores = 1, chunk_size = 1000, + enz_list = binding_sites, in_place = FALSE) +} +\arguments{ +\item{object}{Either a \code{dna_set} or \code{variants} object.} + +\item{enzyme_names}{Name of enzyme(s).} + +\item{n_cores}{Number of cores to use for parallel processing. This argument is +ignored if OpenMP is not enabled. Defaults to \code{1}.} + +\item{chunk_size}{The size of chunks to break up scaffolds into when digesting +a \code{variants} object. +(This argument is ignored if digesting a \code{dna_set}.) +Changing this might affect performance, for better or worse. +The default worked best on my computer. Defaults to \code{1000}.} + +\item{enz_list}{List of enzymes with binding sites. Default is the internal +\code{binding_sites} list (see \code{\link{binding_sites}}).} + +\item{in_place}{Boolean for whether to edit the object in place without +making a new copy. Defaults to \code{FALSE}.} +} +\value{ +If \code{in_place == FALSE}, a \code{variants} or \code{dna_set} object +with the \code{digests} field filled in. +If \code{in_place == TRUE}, it returns \code{NULL}, but it changes the input +object in place. +} +\description{ +\emph{Note:} This will override any digestions currently in place in the +object. If you want to add a new digestion, re-run this function with the names +of all enzymes you're interested in included in the \code{enzyme_names} argument. 
+} +\examples{ + +\dontrun{ + +ref_genome <- dna_set$new(rando_seqs(100, len_mean = 1e3, len_sd = 1e2)) +digest(ref_genome, 'ApeKI', n_cores = 1, in_place = TRUE) +ref_genome + +variants_obj <- make_variants(ref_genome, n_vars = 10) +variants_obj + +# Returns a new variants object +digest(variants_obj, 'ApeKI') + +# Returns nothing, but changes variants_obj object +digest(variants_obj, 'AscI', in_place = TRUE) +# To see the changes... +variants_obj + +} + +} diff --git a/man/evo_rates.Rd b/man/evo_rates.Rd new file mode 100644 index 0000000..2709401 --- /dev/null +++ b/man/evo_rates.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{evo_rates} +\alias{evo_rates} +\title{Table of evolutionary rates.} +\format{A data frame with 15 rows and 4 variables: +\describe{ +\item{domain}{Either \code{Bacteria} or \code{Eukarya} for what type of organism +the species is.} +\item{species}{Species name.} +\item{indels}{Rate of insertions and deletions (events per site per generation).} +\item{subs}{Base-substitution mutation rate (events per site per generation).} +}} +\source{ +\url{http://dx.doi.org/10.1534/g3.116.030890} +} +\usage{ +evo_rates +} +\description{ +From Table 1 in Sung et al. (2016). +} +\references{ +Sung, W., M. S. Ackerman, M. M. Dillon, T. G. Platt, C. Fuqua, V. S. Cooper, and +M. Lynch. 2016. Evolution of the insertion-deletion mutation rate across the +tree of life. \emph{G3: Genes | Genomes | Genetics} \strong{6}:2583–2591. 
+} +\keyword{datasets} diff --git a/man/gemino.Rd b/man/gemino.Rd new file mode 100644 index 0000000..cbeb207 --- /dev/null +++ b/man/gemino.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gemino.R +\docType{package} +\name{gemino} +\alias{gemino} +\alias{gemino-package} +\title{gemino: An efficient, flexible molecular evolution and sequencing simulator.} +\description{ +\code{gemino} efficiently (i) reads and simulates reference genomes; +(ii) generates variants using summary statistics, phylogenies, Variant +Call Format (VCF) files, and coalescent simulations—the latter of which can include +selection, recombination, and demographic fluctuations; +(iii) simulates sequencing error, mapping qualities, restriction-enzyme digestion, +and variance in coverage among sites; and +(iv) writes outputs to standard file formats. +\code{gemino} can simulate single- or paired-end reads for WGS on the Illumina platform, +and can be extended to simulate different methods (e.g., original RADseq, +double-digest RADseq, and genotyping-by-sequencing), and sequencing technologies +(e.g., Pacific Biosciences, Oxford Nanopore Technologies). 
+} +\section{gemino functions}{ + +} + diff --git a/man/random_variants.Rd b/man/random_variants.Rd new file mode 100644 index 0000000..f094bec --- /dev/null +++ b/man/random_variants.Rd @@ -0,0 +1,72 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/random_variants.R +\name{random_variants} +\alias{random_variants} +\title{Construct a \code{VarSet} object from a reference genome and summary statistics.} +\usage{ +random_variants(dna_set_in, n_vars, theta_w, theta_pi, + indel_probs = exp(-1:-10), snp_probs = rep(0.25, 4), + snp_proportion = NULL, n_cores = 1, n2N = 50, alpha = 0.8) +} +\arguments{ +\item{dna_set_in}{A \code{dna_set} object of sequences representing the reference +genome.} + +\item{n_vars}{The number of variants to create.} + +\item{theta_w}{Watterson's estimator for the focal population.} + +\item{theta_pi}{Average nucleotide diversity for the focal population.} + +\item{indel_probs}{Relative probabilities of indel types and sizes. +If insertions and deletions have the same probabilities, this is simply +a numeric vector where the value in location \code{i} indicates the relative +probability of an indel of size \code{i}. +If insertions and deletions do not have the same probabilities, then +this argument should be a list where the \code{insertion} and \code{deletion} fields are +numeric vectors specifying relative probabilities for insertions and deletions, +respectively. +Note that if specifying a list, the proportion of insertions to deletions will +be \code{sum(indel_probs$insertions) / sum(indel_probs$deletions)}. +Defaults to \code{exp(-1:-10)}.} + +\item{snp_probs}{Relative probabilities of substitution types: +"A", "C", "G", and "T" respectively. Defaults to \code{rep(0.25, 4)}.} + +\item{snp_proportion}{The proportion of mutations (not sites) that are SNPs. +Defaults to the proportion calculated using the average ratio of indels +to substitutions in eukaryotes from Sung et al. 
(2016).} + +\item{n_cores}{Number of cores to use. Defaults to 1.} + +\item{n2N}{A numeric threshold placed on the algorithm used to find new locations. +This is not recommended to be changed. Defaults to 50.} + +\item{alpha}{A numeric threshold placed on the algorithm used to find new locations. +This is not recommended to be changed. Defaults to 0.8.} +} +\value{ +A \code{VarSet} object. +} +\description{ +This function creates a \code{VarSet} object, which is designed to be a +low-memory way to store variants. +(The variants class also prevents what might otherwise be an annoyingly long list +from ever printing in the console.) +} +\examples{ +\dontrun{ +n_vars <- 10 +dna_set_in <- dna_set$new(rando_seqs(100, 100)) +set.seed(1) +varseq_out <- random_variants(dna_set_in, n_vars, +  theta_w = 0.0045, theta_pi = 0.005) +} + +} +\references{ +Sung, W., M. S. Ackerman, M. M. Dillon, T. G. Platt, C. Fuqua, V. S. Cooper, and M. +Lynch. 2016. +Evolution of the insertion-deletion mutation rate across the tree of life. +\emph{G3: Genes|Genomes|Genetics} \strong{6}:2583-2591. +} diff --git a/man/read_fasta.Rd b/man/read_fasta.Rd new file mode 100644 index 0000000..7fbf27c --- /dev/null +++ b/man/read_fasta.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read_write.R +\name{read_fasta} +\alias{read_fasta} +\title{Read a fasta file to an external pointer object.} +\usage{ +read_fasta(fasta_file, fai_file = NULL, cut_names = TRUE, +  rm_soft_mask = TRUE) +} +\arguments{ +\item{fasta_file}{File name of the fasta file.} + +\item{fai_file}{File name of the fasta index file. +Providing this argument speeds up the reading process significantly. +Defaults to \code{NULL}, which indicates the fasta file is not indexed.} + +\item{cut_names}{Boolean for whether to cut sequence names at the first space. +Defaults to \code{TRUE}.} + +\item{rm_soft_mask}{Boolean for whether to remove soft-masking by making +sequences all uppercase. 
Defaults to \code{TRUE}.} +} +\value{ +An external pointer to a \code{RefGenome} object in C++. +} +\description{ +Accepts uncompressed and gzipped fasta files. +} diff --git a/man/table_gammas.Rd b/man/table_gammas.Rd new file mode 100644 index 0000000..b8ae4b2 --- /dev/null +++ b/man/table_gammas.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{table_gammas} +\alias{table_gammas} +\title{Faster version of table function to count the number of mutations in Gamma regions.} +\usage{ +table_gammas(gamma_ends, positions) +} +\arguments{ +\item{gamma_ends}{Vector of endpoints for gamma regions} + +\item{positions}{Vector of positions that you want to bin into gamma regions.} +} +\description{ +Faster version of table function to count the number of mutations in Gamma regions. +} diff --git a/man/write_fasta.Rd b/man/write_fasta.Rd new file mode 100644 index 0000000..1a91855 --- /dev/null +++ b/man/write_fasta.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read_write.R +\name{write_fasta} +\alias{write_fasta} +\title{Write to FASTA file.} +\usage{ +write_fasta(file_name, ptr, text_width = 80, compression = "none") +} +\arguments{ +\item{file_name}{File name of the output fasta file.} + +\item{ptr}{External pointer to a \code{RefGenome} C++ object.} + +\item{text_width}{The number of characters per line in the output fasta file. +Defaults to 80.} + +\item{compression}{Type of compression. Takes either \code{"none"} or \code{"gzip"}. +Defaults to \code{"none"}.} +} +\value{ +Nothing. +} +\description{ +Write to FASTA file. +}