diff --git a/vignettes/biogram_pub.bib b/vignettes/biogram_pub.bib new file mode 100644 index 0000000..5d1709c --- /dev/null +++ b/vignettes/biogram_pub.bib @@ -0,0 +1,268 @@ + +@article{choi_cancer-associated_2014, + title = {Cancer-associated fibroblast promote transmigration through endothelial brain cells in {3D} in vitro models}, + issn = {1097-0215}, + doi = {10.1002/ijc.28848}, + abstract = {Brain metastases are associated with high morbidity as well as with poor prognosis and survival in breast cancer patients. Despite its clinical importance, metastasis of breast cancer cells through the blood-brain barrier (BBB) is poorly understood. The objective of our study was to investigate whether cancer-associated fibroblasts (CAFs) play crucial roles in breast cancer brain metastasis. Using a cell adhesion assays, in vitro BBB permeability and transmigration assays, and soft agar colony formation assays, we investigated the physical roles of CAFs in breast cancer brain metastasis. We also performed immunofluorescence, flow cytometric analysis, Droplet Digital PCR, and Simon(TM) Simple Western System to confirm changes in expression levels. We established two novel three-dimensional (3-D) culture systems using a perpendicular slide chamber and applying 3-D embedded culture method to reflect brain metastasis conditions. With a newly developed device, CAFs was proven to promote cell adhesion to human brain microvascular endothelial cells, in vitro BBB permeability and transmigration, and colony formation of breast cancer cells. Furthermore, CAFs enhanced the invasive migration of breast cancer cells in two kinds of 3-D cultures. These 3-D models also reliably recapitulate the initial steps of BBB transmigration, micro-metastasis, and colonization. Expression of integrin α5β1 and αvβ3, c-MET, and α2,6-sialyltransferase was increased in breast cancer cells that migrated through the BBB. 
In conclusion, based on our in vitro BBB and co-culture models, our data suggest that CAFs may play a role in breast cancer brain metastasis. © 2014 Wiley Periodicals, Inc.}, + language = {eng}, + journal = {International Journal of Cancer}, + author = {Choi, Yoon Pyo and Lee, Joo Hyun and Gao, Ming-Qing and Kim, Baek Gil and Kang, Suki and Kim, Se Hoon and Cho, Nam Hoon}, + month = mar, + year = {2014}, + pmid = {24643985} +} + +@article{hindson_absolute_2013, + title = {Absolute quantification by droplet digital {PCR} versus analog real-time {PCR}}, + volume = {10}, + issn = {1548-7105}, + doi = {10.1038/nmeth.2633}, + abstract = {Nanoliter-sized droplet technology paired with digital PCR (ddPCR) holds promise for highly precise, absolute nucleic acid quantification. Our comparison of microRNA quantification by ddPCR and real-time PCR revealed greater precision (coefficients of variation decreased 37-86\%) and improved day-to-day reproducibility (by a factor of seven) of ddPCR but with comparable sensitivity. 
When we applied ddPCR to serum microRNA biomarker analysis, this translated to superior diagnostic performance for identifying individuals with cancer.}, + language = {eng}, + number = {10}, + journal = {Nature Methods}, + author = {Hindson, Christopher M and Chevillet, John R and Briggs, Hilary A and Gallichotte, Emily N and Ruf, Ingrid K and Hindson, Benjamin J and Vessella, Robert L and Tewari, Muneesh}, + month = oct, + year = {2013}, + pmid = {23995387}, + pages = {1003--1005} +} + +@article{chen_allelic_2011, + title = {Allelic discrimination of cis-trans relationships by digital polymerase chain reaction: {GJB2} (p.{V27I}/p.{E114G}) and {CFTR} (p.{R117H}/{5T})}, + volume = {13}, + issn = {1530-0366}, + doi = {10.1097/GIM.0b013e3182272e0b}, + abstract = {PURPOSE: To distinguish the cis-trans relationship of two sequence changes and to arrive at an accurate molecular diagnosis for autosomal recessive disorders, methods such as Sanger sequencing cannot differentiate whether sequence changes are in cis or trans. In addition, most techniques theoretically appropriate for allelic discrimination depend on the specific identified sequence changes for assay design, need extensive optimization, or may not be suitable. We developed a method that does not fully depend on the specific nucleotide changes. It enables efficient assay design and practical implementation of allelic discrimination. +METHODS: Digital polymerase chain reaction (PCR) was used to separate and amplify alleles. Sanger sequencing was subsequently used to identify sequence changes. +RESULTS: We developed a cost-effective digital PCR method for allelic discrimination of short amplicons and demonstrated it with p.Val27Ile and p.Glu114Gly in GJB2 as an example. 
We also successfully developed a long-range digital PCR approach to determine the cis-trans relationship of p.Arg117His and 5T in the CFTR gene. +CONCLUSION: Digital PCR for allelic discrimination can be clinically implemented to determine the allelic configuration of relatively common sequence changes which frequently appear together and have clinical ramifications, such as the combination of p.Val27Ile and p.Glu114Gly in the GJB2 gene and p.Arg117His and 5T in CFTR.}, + language = {eng}, + number = {12}, + journal = {Genetics in Medicine}, + author = {Chen, Neng and Schrijver, Iris}, + month = dec, + year = {2011}, + pmid = {21836520}, + pages = {1025--1031} +} + +@article{tsui_synergy_2010, + title = {Synergy of total {PLAC4} {RNA} concentration and measurement of the {RNA} single-nucleotide polymorphism allelic ratio for the noninvasive prenatal detection of trisomy 21}, + volume = {56}, + issn = {1530-8561}, + doi = {10.1373/clinchem.2009.132662}, + abstract = {BACKGROUND: Maternal plasma mRNA encoded by the PLAC4 gene (placenta-specific 4), which is transcribed from chromosome 21 in placental cells, is a potential marker for the noninvasive assessment of chromosome 21 dosage in the fetus. We evaluated the diagnostic sensitivities and specificities of 2 trisomy 21-screening approaches that use maternal plasma PLAC4 mRNA. +METHODS: We studied maternal plasma samples from 153 pregnant women carrying euploid and trisomy 21 fetuses. For the samples in which the fetuses were heterozygous for the studied PLAC4 single-nucleotide polymorphism (SNP), we measured the ratio between 2 alleles of the SNP in maternal plasma PLAC4 mRNA (RNA-SNP) by mass spectrometric (MS) and digital PCR methods. For pregnancies involving fetuses homozygous for the SNP, we quantified the total PLAC4 mRNA concentration in maternal plasma by real-time PCR and digital PCR. +RESULTS: For the RNA-SNP approach, we achieved a diagnostic sensitivity and specificity of 100\% (95\% CI, 40.2\%-100\%) and 89.7\% (95\% CI, 78.8\%-96.1\%), respectively, for both the MS and the digital PCR methods. 
For the mRNA-quantification approach, the areas under the ROC curves were 0.859 (95\% CI, 0.741-0.903) and 0.833 (95\% CI, 0.770-0.923) for plasma PLAC4 mRNA concentrations measured by the real-time PCR and the digital PCR methods, respectively. +CONCLUSIONS: For prenatal screening of trisomy 21, the quantification of the total PLAC4 mRNA concentration can be used in a synergistic manner with the RNA-SNP allelic ratio approach to increase the population coverage of cases in which diagnostic information can be obtained.}, + language = {eng}, + number = {1}, + journal = {Clinical Chemistry}, + author = {Tsui, Nancy B Y and Akolekar, Ranjit and Chiu, Rossa W K and Chow, Katherine C K and Leung, Tak Y and Lau, Tze K and Nicolaides, Kypros H and Lo, Y M Dennis}, + month = jan, + year = {2010}, + pmid = {19892844}, + pages = {73--81} +} + +@article{tomovic_n-gram-based_2006, + title = {n-gram-based classification and unsupervised hierarchical clustering of genome sequences}, + volume = {81}, + issn = {0169-2607}, + doi = {10.1016/j.cmpb.2005.11.007}, + abstract = {In this paper we address the problem of automated classification of isolates, i.e., the problem of determining the family of genomes to which a given genome belongs. Additionally, we address the problem of automated unsupervised hierarchical clustering of isolates according only to their statistical substring properties. For both of these problems we present novel algorithms based on nucleotide n-grams, with no required preprocessing steps such as sequence alignment. Results obtained experimentally are very positive and suggest that the proposed techniques can be successfully used in a variety of related problems. The reported experiments demonstrate better performance than some of the state-of-the-art methods. 
We report on a new distance measure between n-gram profiles, which shows superior performance compared to many other measures, including commonly used Euclidean distance.}, + language = {eng}, + number = {2}, + journal = {Computer Methods and Programs in Biomedicine}, + author = {Tomović, Andrija and Janičić, Predrag and Kešelj, Vlado}, + month = feb, + year = {2006}, + pmid = {16423423}, + keywords = {Algorithms, DNA, Genome, Human, Humans, Multigene Family, Sequence Analysis}, + pages = {137--153} +} + +@article{murphy_simplified_2000, + title = {Simplified amino acid alphabets for protein fold recognition and implications for folding}, + volume = {13}, + issn = {1741-0126, 1741-0134}, + url = {http://peds.oxfordjournals.org/content/13/3/149}, + doi = {10.1093/protein/13.3.149}, + abstract = {Protein design experiments have shown that the use of specific subsets of amino acids can produce foldable proteins. This prompts the question of whether there is a minimal amino acid alphabet which could be used to fold all proteins. In this work we make an analogy between sequence patterns which produce foldable sequences and those which make it possible to detect structural homologs by aligning sequences, and use it to suggest the possible size of such a reduced alphabet. We estimate that reduced alphabets containing 10–12 letters can be used to design foldable sequences for a large number of protein families. 
This estimate is based on the observation that there is little loss of the information necessary to pick out structural homologs in a clustered protein sequence database when a suitable reduction of the amino acid alphabet from 20 to 10 letters is made, but that this information is rapidly degraded when further reductions in the alphabet are made.}, + language = {en}, + number = {3}, + urldate = {2016-01-24}, + journal = {Protein Engineering}, + author = {Murphy, Lynne Reed and Wallqvist, Anders and Levy, Ronald M.}, + month = mar, + year = {2000}, + pmid = {10775656}, + keywords = {minimal alphabet, protein fold recognition, Sequence Alignment}, + pages = {149--152} +} + +@article{navarro-gomez_phy-mer:_2015, + title = {{Phy-Mer}: a novel alignment-free and reference-independent mitochondrial haplogroup classifier}, + volume = {31}, + issn = {1367-4811}, + shorttitle = {{Phy-Mer}}, + doi = {10.1093/bioinformatics/btu825}, + abstract = {MOTIVATION: All current mitochondrial haplogroup classification tools require variants to be detected from an alignment with the reference sequence and to be properly named according to the canonical nomenclature standards for describing mitochondrial variants, before they can be compared with the haplogroup determining polymorphisms. With the emergence of high-throughput sequencing technologies and hence greater availability of mitochondrial genome sequences, there is a strong need for an automated haplogroup classification tool that is alignment-free and agnostic to reference sequence. RESULTS: We have developed a novel mitochondrial genome haplogroup-defining algorithm using a k-mer approach namely Phy-Mer. 
Phy-Mer performs equally well as the leading haplogroup classifier, HaploGrep, while avoiding the errors that may occur when preparing variants to required formats and notations. We have further expanded Phy-Mer functionality such that next-generation sequencing data can be used directly as input. AVAILABILITY AND IMPLEMENTATION: Phy-Mer is publicly available under the GNU Affero General Public License v3.0 on GitHub (https://github.com/danielnavarrogomez/phy-mer). CONTACT: Xiaowu\_Gai@meei.harvard.edu SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.}, + language = {eng}, + number = {8}, + journal = {Bioinformatics}, + author = {Navarro-Gomez, Daniel and Leipzig, Jeremy and Shen, Lishuang and Lott, Marie and Stassen, Alphons P. M. and Wallace, Douglas C. and Wiggs, Janey L. and Falk, Marni J. and van Oven, Mannis and Gai, Xiaowu}, + month = apr, + year = {2015}, + pmid = {25505086}, + pmcid = {PMC4393525}, + keywords = {Algorithms, DNA, Genetic Variation, Haplotypes, High-Throughput Nucleotide Sequencing, Humans, Mitochondrial, Sequence Analysis, Software}, + pages = {1310--1312} +} + +@article{osmanbeyoglu_n-gram_2011, + title = {N-gram analysis of 970 microbial organisms reveals presence of biological language models}, + volume = {12}, + issn = {1471-2105}, + url = {http://dx.doi.org/10.1186/1471-2105-12-12}, + doi = {10.1186/1471-2105-12-12}, + abstract = {It has been suggested previously that genome and proteome sequences show characteristics typical of natural-language texts such as "signature-style" word usage indicative of authors or topics, and that the algorithms originally developed for natural language processing may therefore be applied to genome sequences to draw biologically relevant conclusions. Following this approach of 'biological language modeling', statistical n-gram analysis has been applied for comparative analysis of whole proteome sequences of 44 organisms. 
It has been shown that a few particular amino acid n-grams are found in abundance in one organism but occurring very rarely in other organisms, thereby serving as genome signatures. At that time proteomes of only 44 organisms were available, thereby limiting the generalization of this hypothesis. Today nearly 1,000 genome sequences and corresponding translated sequences are available, making it feasible to test the existence of biological language models over the evolutionary tree.}, + urldate = {2016-01-24}, + journal = {BMC Bioinformatics}, + author = {Osmanbeyoglu, Hatice Ulku and Ganapathiraju, Madhavi K.}, + year = {2011}, + pages = {12} +} + +@incollection{radivojac_feature_2004, + series = {Lecture {Notes} in {Computer} {Science}}, + title = {Feature {Selection} {Filters} {Based} on the {Permutation} {Test}}, + copyright = {©2004 Springer-Verlag Berlin Heidelberg}, + isbn = {978-3-540-23105-9 978-3-540-30115-8}, + url = {http://link.springer.com/chapter/10.1007/978-3-540-30115-8_32}, + abstract = {We investigate the problem of supervised feature selection within the filtering framework. In our approach, applicable to the two-class problems, the feature strength is inversely proportional to the p-value of the null hypothesis that its class-conditional densities, p(X {\textbar} Y = 0) and p(X {\textbar} Y = 1), are identical. To estimate the p-values, we use Fisher’s permutation test combined with the four simple filtering criteria in the roles of test statistics: sample mean difference, symmetric Kullback-Leibler distance, information gain, and chi-square statistic. 
The experimental results of our study, performed using naive Bayes classifier and support vector machines, strongly indicate that the permutation test improves the above-mentioned filters and can be used effectively when sample size is relatively small and number of features relatively large.}, + language = {en}, + volume = {3201}, + urldate = {2016-02-03}, + booktitle = {Machine {Learning}: {ECML} 2004}, + publisher = {Springer Berlin Heidelberg}, + author = {Radivojac, Predrag and Obradovic, Zoran and Dunker, A. Keith and Vucetic, Slobodan}, + editor = {Boulicaut, Jean-François and Esposito, Floriana and Giannotti, Fosca and Pedreschi, Dino}, + month = sep, + year = {2004}, + doi = {10.1007/978-3-540-30115-8_32}, + keywords = {Algorithm Analysis and Problem Complexity, Artificial Intelligence (incl. Robotics), Database Management, Mathematical Logic and Formal Languages}, + pages = {334--346} +} + +@inproceedings{francois_permutation_2006, + title = {The permutation test for feature selection by mutual information}, + abstract = {The estimation of mutual information for feature selection is often subject to inaccuracies due to noise, small sample size, bad choice of parameter for the estimator, etc. The choice of a threshold above which a feature will be considered useful is thus difficult to make. Therefore, the use of the permutation test to assess the reliability of the estimation is proposed. The permutation test allows performing a non-parametric hypothesis test to select the relevant features and to build a Feature Relevance Diagram that visually synthesizes the result of the test.}, + booktitle = {{ESANN} 2006, {European} {Symposium} on {Artificial} {Neural} {Networks}}, + author = {François, D. and Wertz, V. 
and Verleysen, M.}, + year = {2006}, + pages = {239--244} +} + +@article{morse_exact_2014, + title = {Exact {Permutation} {Algorithm} for {Paired} {Observations}: {A} {General} and {Efficient} {Version}}, + volume = {10}, + issn = {1549-3644}, + shorttitle = {Exact {Permutation} {Algorithm} for {Paired} {Observations}}, + url = {http://thescipub.com/abstract/10.3844/jmssp.2014.448.452}, + doi = {10.3844/jmssp.2014.448.452}, + number = {4}, + urldate = {2016-02-03}, + journal = {Journal of Mathematics and Statistics}, + author = {Morse, David T.}, + month = apr, + year = {2014}, + pages = {448--452} +} + +@article{solis_amino_2015, + title = {Amino acid alphabet reduction preserves fold information contained in contact interactions in proteins}, + volume = {83}, + issn = {1097-0134}, + url = {http://onlinelibrary.wiley.com/doi/10.1002/prot.24936/abstract}, + doi = {10.1002/prot.24936}, + abstract = {To reduce complexity, understand generalized rules of protein folding, and facilitate de novo protein design, the 20-letter amino acid alphabet is commonly reduced to a smaller alphabet by clustering amino acids based on some measure of similarity. In this work, we seek the optimal alphabet that preserves as much of the structural information found in long-range (contact) interactions among amino acids in natively-folded proteins. We employ the Information Maximization Device, based on information theory, to partition the amino acids into well-defined clusters. 
Numbering from 2 to 19 groups, these optimal clusters of amino acids, while generated automatically, embody well-known properties of amino acids such as hydrophobicity/polarity, charge, size, and aromaticity, and are demonstrated to maintain the discriminative power of long-range interactions with minimal loss of mutual information. Our measurements suggest that reduced alphabets (of less than 10) are able to capture virtually all of the information residing in native contacts and may be sufficient for fold recognition, as demonstrated by extensive threading tests. In an expansive survey of the literature, we observe that alphabets derived from various approaches—including those derived from physicochemical intuition, local structure considerations, and sequence alignments of remote homologs—fare consistently well in preserving contact interaction information, highlighting a convergence in the various factors thought to be relevant to the folding code. Moreover, we find that alphabets commonly used in experimental protein design are nearly optimal and are largely coherent with observations that have arisen in this work. Proteins 2015; 83:2198–2216. 
© 2015 Wiley Periodicals, Inc.}, + language = {en}, + number = {12}, + urldate = {2016-07-14}, + journal = {Proteins: Structure, Function, and Bioinformatics}, + author = {Solis, Armando D.}, + month = dec, + year = {2015}, + keywords = {Amino Acid Sequence, contact potential, knowledge-based potential, Protein Structure, sequence representation, threading}, + pages = {2198--2216}, + file = {Full Text PDF:/home/michal/.zotero/zotero/pwethkyi.default/zotero/storage/E2GZBRCE/Solis - 2015 - Amino acid alphabet reduction preserves fold infor.pdf:application/pdf;Snapshot:/home/michal/.zotero/zotero/pwethkyi.default/zotero/storage/872QM6ID/abstract.html:text/html} +} + +@article{melo_accuracy_2006, + title = {Accuracy of sequence alignment and fold assessment using reduced amino acid alphabets}, + volume = {63}, + issn = {1097-0134}, + doi = {10.1002/prot.20881}, + abstract = {Reduced or simplified amino acid alphabets group the 20 naturally occurring amino acids into a smaller number of representative protein residues. To date, several reduced amino acid alphabets have been proposed, which have been derived and optimized by a variety of methods. The resulting reduced amino acid alphabets have been applied to pattern recognition, generation of consensus sequences from multiple alignments, protein folding, and protein structure prediction. In this work, amino acid substitution matrices and statistical potentials were derived based on several reduced amino acid alphabets and their performance assessed in a large benchmark for the tasks of sequence alignment and fold assessment of protein structure models, using as a reference frame the standard alphabet of 20 amino acids. The results showed that a large reduction in the total number of residue types does not necessarily translate into a significant loss of discriminative power for sequence alignment and fold assessment. 
Therefore, some definitions of a few residue types are able to encode most of the relevant sequence/structure information that is present in the 20 standard amino acids. Based on these results, we suggest that the use of reduced amino acid alphabets may allow to increasing the accuracy of current substitution matrices and statistical potentials for the prediction of protein structure of remote homologs.}, + language = {eng}, + number = {4}, + journal = {Proteins: Structure, Function, and Bioinformatics}, + author = {Melo, Francisco and Marti-Renom, Marc A.}, + month = jun, + year = {2006}, + pmid = {16506243}, + keywords = {Amino acids, Amino Acid Sequence, Consensus Sequence, Molecular Sequence Data, Oxidation-Reduction, Protein, Protein Folding, Proteins, Sequence Alignment, Structural Homology}, + pages = {986--995} +} + +@article{stephenson_unearthing_2013, + title = {Unearthing the root of amino acid similarity}, + volume = {77}, + issn = {1432-1432}, + doi = {10.1007/s00239-013-9565-0}, + abstract = {Similarities and differences between amino acids define the rates at which they substitute for one another within protein sequences and the patterns by which these sequences form protein structures. However, there exist many ways to measure similarity, whether one considers the molecular attributes of individual amino acids, the roles that they play within proteins, or some nuanced contribution of each. One popular approach to representing these relationships is to divide the 20 amino acids of the standard genetic code into groups, thereby forming a simplified amino acid alphabet. Here, we develop a method to compare or combine different simplified alphabets, and apply it to 34 simplified alphabets from the scientific literature.
We use this method to show that while different suggestions vary and agree in non-intuitive ways, they combine to reveal a consensus view of amino acid similarity that is clearly rooted in physico-chemistry.}, + language = {eng}, + number = {4}, + journal = {Journal of Molecular Evolution}, + author = {Stephenson, James D. and Freeland, Stephen J.}, + month = oct, + year = {2013}, + pmid = {23743923}, + keywords = {Algorithms, Amino acids, Genetic Code, Protein, Proteins, Sequence Alignment, Sequence Analysis}, + pages = {159--169} +} + +@article{zuo_psekraac:_2016, + title = {{PseKRAAC}: a flexible web server for generating pseudo {K}-tuple reduced amino acids composition}, + issn = {1367-4803, 1460-2059}, + shorttitle = {{PseKRAAC}}, + url = {http://bioinformatics.oxfordjournals.org/content/early/2016/09/12/bioinformatics.btw564}, + doi = {10.1093/bioinformatics/btw564}, + abstract = {Summary: The reduced amino acids perform powerful ability for both simplifying protein complexity and identifying functional conserved regions. However, dealing with different protein problems may need different kinds of cluster methods. Encouraged by the success of pseudo-amino acid composition algorithm, we developed a freely available web server, called PseKRAAC (the pseudo K-tuple reduced amino acids composition). By implementing reduced amino acid alphabets, the protein complexity can be significantly simplified, which leads to decrease chance of overfitting, lower computational handicap and reduce information redundancy. PseKRAAC delivers more capability for protein research by incorporating three crucial parameters that describes protein composition. Users can easily generate many different modes of PseKRAAC tailored to their needs by selecting various reduced amino acids alphabets and other characteristic parameters. It is anticipated that the PseKRAAC web server will become a very useful tool in computational proteomics and protein sequence analysis. 
+Availability and Implementation: Freely available on the web at http://bigdata.imu.edu.cn/psekraac +Contacts: yczuo@imu.edu.cn or imu.hema@foxmail.com or yanglei\_hmu@163.com. +Supplementary information: Supplementary data are available at Bioinformatics online.}, + language = {en}, + urldate = {2016-09-30}, + journal = {Bioinformatics}, + author = {Zuo, Yongchun and Li, Yuan and Chen, Yingli and Li, Guangpeng and Yan, Zhenhe and Yang, Lei}, + month = aug, + year = {2016}, + pmid = {27565583}, + pages = {btw564}, + file = {Snapshot:/home/michal/.zotero/zotero/pwethkyi.default/zotero/storage/Z8I5EII7/bioinformatics.btw564.html:text/html} +} \ No newline at end of file diff --git a/vignettes/overview.Rmd b/vignettes/overview.Rmd index a8040e5..76afaad 100644 --- a/vignettes/overview.Rmd +++ b/vignettes/overview.Rmd @@ -1,10 +1,11 @@ --- title: "biogram package" author: "Michał Burdukiewicz, Piotr Sobczyk" -date: "17.10.2015" +date: "5.01.2017" output: rmarkdown::html_vignette: toc: true +bibliography: "biogram_pub.bib" vignette: > %\VignetteIndexEntry{biogram package - an overview} %\VignetteEngine{knitr::rmarkdown} @@ -40,21 +41,23 @@ my_theme <- theme(plot.background=element_rect(fill = "transparent", # Reduction of dimensionality -Since the number of potential n-grams grows exponentially with the $n$, n-gram datasets are often very large. To deal with the curse of dimensionality, the **biogram** package offers two solutions. The first relies on the reduction of an alphabet, which is a common approach in case of analysis of amino acid sequences (CITATIONS) and less applied in studies of nucleic acids. +Since the number of potential n-grams grows exponentially with the $n$, n-gram datasets are often very large. To deal with the curse of dimensionality, the **biogram** package offers two solutions. 
The first relies on the reduction of an alphabet, which is a common approach in the analysis of amino acid sequences [@murphy_simplified_2000] and is less commonly applied in studies of nucleic acids. The alternate solution is to filter the non-informative n-grams. The **biogram** package employs feature selection algorithm QuiPT, which allows very fast feature filtering. ## Alphabet reduction -*degenerate* +In many cases the properties of the sequences do not depend on the exact sequence of the amino acids, but rather on their physicochemical properties. In this case, the full amino acid alphabet may be replaced with a shorter alphabet, where amino acids are aggregated into larger groups using design criteria such as physicochemical properties. The **biogram** package supports creation of reduced amino acid alphabets and their analysis by two distance measures: similarity index and encoding distance. ### Similarity index -The similarity index was firstly introduced as the unnamed distance measure for reduced alphabets by XXX. Briefly, if a pair of elements is in both encodings in the same group or in different groups, the pair scores 1 and in the opposite case 0. The pairs of the identical elements are ignored. The score is later divided by the number of possible pairs ($20 \times 19$ in case of the amino acid alphabet). +The similarity index, as computed by the *calc_si* function, was first introduced as an unnamed distance measure for reduced alphabets by @stephenson_unearthing_2013. Briefly, if a pair of elements is in the same group in both encodings, or in different groups in both encodings, the pair scores 1; otherwise it scores 0. Pairs of identical elements are ignored. The score is then divided by the number of possible pairs ($20 \times 19$ in case of the amino acid alphabet). -$A$: an alphabet. -$a_1$: an element in the reduced alphabet 1. -$a_2$: an element in the reduced alphabet 2. +$A$: an alphabet. + +$a_1$: an element in the reduced alphabet 1.
+ +$a_2$: an element in the reduced alphabet 2. $$ S = \sum_{a_1 \in A} \sum_{a_2 \in A, a_2 \neq a_1 } \sum_{enc_1 \in encodings_1} \sum_{enc_2 \in encodings_2} 1_{a_1 \in enc_1 \land a_2 \in enc_2} @@ -122,7 +125,7 @@ ggplot(dat, aes(x = f1, y = f2, colour = pair, label = label)) + geom_line() + geom_point(aes(x = f1, y = f2, colour = enc), size = 4) + facet_wrap(~ id) + - geom_text(aes(x = f1, y = f2, colour = enc, label = label), vjust = 1.5, size = 4) + + geom_text(aes(x = f1, y = f2, colour = enc, label = label), vjust = 1.8, size = 4) + scale_color_brewer(palette="Dark2", guide = "none") + my_theme ``` @@ -232,12 +235,16 @@ $$P((Target, Feature) = (1,0)) = p \cdot (1-q)$$ $$P((Target, Feature) = (0,1)) = (1-p) \cdot q$$ $$P((Target, Feature) = (0,0)) = (1-p) \cdot (1-q)$$ -This means that a target-feature can be described as multinomial distribution. +This means that a target-feature can be described as multinomial distribution: -$$ {n \choose n_{1,1}} (p\cdot q)^{n_{1,1}} +$$ +\begin{aligned} +{n \choose n_{1,1}} (p\cdot q)^{n_{1,1}} {n - n_{1,1} \choose n_{1,0}} (p\cdot (1-q))^{n_{1,0}} -{n - n_{1,1} - n_{1,0} \choose n_{0,1}} ((1-p)\cdot q)^{n_{0,1}} -{n - n_{1,1} - n_{1,0} -n_{0,1}\choose n_{0,0}} ((1-p)\cdot (1-q))^{n_{0,0}}$$ +{n - n_{1,1} - n_{1,0} \choose n_{0,1}} ((1-p)\cdot q)^{n_{0,1}} \\ +{n - n_{1,1} - n_{1,0} -n_{0,1}\choose n_{0,0}} ((1-p)\cdot (1-q))^{n_{0,0}} +\end{aligned} +$$ However we have important restriction that $n_{1,\cdot} = n_{1,1} + n_{1,0}$ and $n_{\cdot, 1} = n_{1,1} + n_{0,1}$ are known and fixed as they describe the number @@ -258,3 +265,4 @@ no longer need to perform any replications. Furthermore, by using exact test we will get precise values of tails which was not guaranteed with random permutations. +# References \ No newline at end of file