Chapter3.bib

%% This BibTeX bibliography file was created using BibDesk.
%% http://bibdesk.sourceforge.net/


%% Created for Michael P. Cummings at 2015-02-03 22:42:20 +0100


%% Saved with string encoding Unicode (UTF-8)

@article{hartigan1979algorithm,
      title={Algorithm {AS} 136: A k-means clustering algorithm},
      author={Hartigan, John A and Wong, Manchek A},
      journal={Applied statistics},
      pages={100--108},
      year={1979},
      publisher={JSTOR}}

@article{zhou2014compression,
    Author = {Zhou, Jiarui and Ji, Zhen and Zhu, Zexuan and He, Shan},
    Date-Added = {2015-02-03 20:08:27 +0000},
    Date-Modified = {2015-02-03 20:08:27 +0000},
    Journal = {BMC bioinformatics},
    Number = {Suppl 15},
    Pages = {S10},
    Publisher = {BioMed Central},
    Title = {Compression of next-generation sequencing quality scores using memetic algorithm},
    Volume = {15},
    Year = {2014}}

@article{janin2013adaptive,
    Author = {Janin, Lilian and Rosone, Giovanna and Cox, Anthony J},
    Date-Added = {2015-02-03 19:53:44 +0000},
    Date-Modified = {2015-02-03 19:53:44 +0000},
    Journal = {Bioinformatics},
    Pages = {btt257},
    Publisher = {Oxford Univ Press},
    Title = {Adaptive reference-free compression of sequence quality scores},
    Year = {2013}}

@electronic{sickle,
    Author = {Joshi NA and Fass JN},
    Date-Added = {2015-02-03 09:52:02 +0000},
    Date-Modified = {2015-02-03 10:14:06 +0000},
    Title = {Sickle: A sliding-window, adaptive, quality-based trimming tool for {FastQ} files (Version 1.33)},
    Url = {https://github.com/najoshi/sickle},
    Year = {2013}}

@article{asnani2012lossy,
    Author = {Asnani, Himanshu and Bharadia, Dinesh and Chowdhury, Mainak and Ochoa, Idoia and Sharon, Itai and Weissman, Tsachy},
    Date-Added = {2015-02-02 18:12:19 +0000},
    Date-Modified = {2015-02-02 18:12:19 +0000},
    Journal = {arXiv preprint arXiv:1207.5184},
    Title = {Lossy compression of quality values via rate distortion theory},
    Year = {2012}}

@article{Deorowicz:2013hq,
    Abstract = {: Post-Sanger sequencing methods produce tons of data, and there is a general agreement that the challenge to store and process them must be addressed with data compression. In this review we first answer the question "why compression" in a quantitative manner. Then we also answer the questions "what" and "how", by sketching the fundamental compression ideas, describing the main sequencing data types and formats, and comparing the specialized compression algorithms and tools. Finally, we go back to the question "why compression" and give other, perhaps surprising answers, demonstrating the pervasiveness of data compression techniques in computational biology.},
    Author = {Deorowicz, Sebastian and Grabowski, Szymon},
    Date-Added = {2015-02-02 17:58:05 +0000},
    Date-Modified = {2015-02-02 17:58:05 +0000},
    Doi = {10.1186/1748-7188-8-25},
    Journal = {Algorithms Mol Biol},
    Journal-Full = {Algorithms for molecular biology : AMB},
    Number = {1},
    Pages = {25},
    Pmc = {PMC3868316},
    Pmid = {24252160},
    Pst = {epublish},
    Title = {Data compression for sequencing data},
    Volume = {8},
    Year = {2013},
    Bdsk-Url-1 = {http://dx.doi.org/10.1186/1748-7188-8-25}}

@article{Koren:2013ye,
    Abstract = {BACKGROUND: The short reads output by first- and second-generation DNA sequencing instruments cannot completely reconstruct microbial chromosomes. Therefore, most genomes have been left unfinished due to the significant resources required to manually close gaps in draft assemblies. Third-generation, single-molecule sequencing addresses this problem by greatly increasing sequencing read length, which simplifies the assembly problem.
RESULTS: To measure the benefit of single-molecule sequencing on microbial genome assembly, we sequenced and assembled the genomes of six bacteria and analyzed the repeat complexity of 2,267 complete bacteria and archaea. Our results indicate that the majority of known bacterial and archaeal genomes can be assembled without gaps, at finished-grade quality, using a single PacBio RS sequencing library. These single-library assemblies are also more accurate than typical short-read assemblies and hybrid assemblies of short and long reads.
CONCLUSIONS: Automated assembly of long, single-molecule sequencing data reduces the cost of microbial finishing to $1,000 for most genomes, and future advances in this technology are expected to drive the cost lower. This is expected to increase the number of completed genomes, improve the quality of microbial genome databases, and enable high-fidelity, population-scale studies of pan-genomes and chromosomal organization.},
    Author = {Koren, Sergey and Harhay, Gregory P and Smith, Timothy P L and Bono, James L and Harhay, Dayna M and Mcvey, Scott D and Radune, Diana and Bergman, Nicholas H and Phillippy, Adam M},
    Date-Added = {2015-02-02 13:28:10 +0000},
    Date-Modified = {2015-02-02 13:28:10 +0000},
    Doi = {10.1186/gb-2013-14-9-r101},
    Journal = {Genome Biol},
    Journal-Full = {Genome biology},
    Mesh = {Algorithms; Base Sequence; Contig Mapping; Escherichia coli; Francisella tularensis; Genome Size; Genome, Archaeal; Genome, Bacterial; Genomic Library; Mannheimia haemolytica; Molecular Sequence Data; Salmonella enterica; Sequence Analysis, DNA; Software},
    Number = {9},
    Pages = {R101},
    Pmc = {PMC4053942},
    Pmid = {24034426},
    Pst = {ppublish},
    Title = {Reducing assembly complexity of microbial genomes with single-molecule sequencing},
    Volume = {14},
    Year = {2013},
    Bdsk-Url-1 = {http://dx.doi.org/10.1186/gb-2013-14-9-r101}}

@article{Ferrarini:2013vf,
    Abstract = {BACKGROUND: Second generation sequencing has permitted detailed sequence characterisation at the whole genome level of a growing number of non-model organisms, but the data produced have short read-lengths and biased genome coverage leading to fragmented genome assemblies. The PacBio RS long-read sequencing platform offers the promise of increased read length and unbiased genome coverage and thus the potential to produce genome sequence data of a finished quality containing fewer gaps and longer contigs. However, these advantages come at a much greater cost per nucleotide and with a perceived increase in error-rate. In this investigation, we evaluated the performance of the PacBio RS sequencing platform through the sequencing and de novo assembly of the Potentilla micrantha chloroplast genome.
RESULTS: Following error-correction, a total of 28,638 PacBio RS reads were recovered with a mean read length of 1,902 bp totalling 54,492,250 nucleotides and representing an average depth of coverage of 320× the chloroplast genome. The dataset covered the entire 154,959 bp of the chloroplast genome in a single contig (100% coverage) compared to seven contigs (90.59% coverage) recovered from an Illumina data, and revealed no bias in coverage of GC rich regions. Post-assembly the data were largely concordant with the Illumina data generated and allowed 187 ambiguities in the Illumina data to be resolved. The additional read length also permitted small differences in the two inverted repeat regions to be assigned unambiguously.
CONCLUSIONS: This is the first report to our knowledge of a chloroplast genome assembled de novo using PacBio sequence data. The PacBio RS data generated here were assembled into a single large contig spanning the P. micrantha chloroplast genome, with a higher degree of accuracy than an Illumina dataset generated at a much greater depth of coverage, due to longer read lengths and lower GC bias in the data. The results we present suggest PacBio data will be of immense utility for the development of genome sequence assemblies containing fewer unresolved gaps and ambiguities and a significantly smaller number of contigs than could be produced using short-read sequence data alone.},
    Author = {Ferrarini, Marco and Moretto, Marco and Ward, Judson A and {\v S}urbanovski, Nada and Stevanovi{\'c}, Vladimir and Giongo, Lara and Viola, Roberto and Cavalieri, Duccio and Velasco, Riccardo and Cestaro, Alessandro and Sargent, Daniel J},
    Date-Added = {2015-02-02 13:26:39 +0000},
    Date-Modified = {2015-02-02 13:30:22 +0000},
    Doi = {10.1186/1471-2164-14-670},
    Journal = {BMC Genomics},
    Journal-Full = {BMC genomics},
    Mesh = {Base Composition; Base Sequence; Databases, Genetic; Genome, Chloroplast; Potentilla; Sequence Analysis, DNA; Software},
    Pages = {670},
    Pmc = {PMC3853357},
    Pmid = {24083400},
    Pst = {epublish},
    Title = {An evaluation of the {PacBio RS} platform for sequencing and de novo assembly of a chloroplast genome},
    Volume = {14},
    Year = {2013},
    Bdsk-Url-1 = {http://dx.doi.org/10.1186/1471-2164-14-670}}

@article{pedregosa2011scikit,
    Author = {Pedregosa, Fabian and Varoquaux, Ga{\"e}l and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and others},
    Date-Added = {2015-02-02 13:10:41 +0000},
    Date-Modified = {2015-02-02 13:10:41 +0000},
    Journal = {The Journal of Machine Learning Research},
    Pages = {2825--2830},
    Publisher = {JMLR. org},
    Title = {Scikit-learn: Machine learning in Python},
    Volume = {12},
    Year = {2011}}

@inproceedings{macqueen1967some,
    Author = {MacQueen, James and others},
    Booktitle = {Proceedings of the fifth Berkeley symposium on mathematical statistics and probability},
    Date-Added = {2015-02-02 13:08:18 +0000},
    Date-Modified = {2015-02-02 13:08:18 +0000},
    Number = {14},
    Organization = {Oakland, CA, USA.},
    Pages = {281--297},
    Title = {Some methods for classification and analysis of multivariate observations},
    Volume = {1},
    Year = {1967}}

@article{mcgill1978variations,
    Author = {McGill, Robert and Tukey, John W and Larsen, Wayne A},
    Date-Added = {2015-02-02 13:03:37 +0000},
    Date-Modified = {2015-02-02 13:03:37 +0000},
    Journal = {The American Statistician},
    Number = {1},
    Pages = {12--16},
    Publisher = {Taylor \& Francis Group},
    Title = {Variations of box plots},
    Volume = {32},
    Year = {1978}}

@article{Kozanitis:2011kl,
    Abstract = {With the advent of next generation sequencing technologies, the cost of sequencing whole genomes is poised to go below $1000 per human individual in a few years. As more and more genomes are sequenced, analysis methods are undergoing rapid development, making it tempting to store sequencing data for long periods of time so that the data can be re-analyzed with the latest techniques. The challenging open research problems, huge influx of data, and rapidly improving analysis techniques have created the need to store and transfer very large volumes of data. Compression can be achieved at many levels, including trace level (compressing image data), sequence level (compressing a genomic sequence), and fragment-level (compressing a set of short, redundant fragment reads, along with quality-values on the base-calls). We focus on fragment-level compression, which is the pressing need today. Our article makes two contributions, implemented in a tool, SlimGene. First, we introduce a set of domain specific loss-less compression schemes that achieve over 40× compression of fragments, outperforming bzip2 by over 6×. Including quality values, we show a 5× compression using less running time than bzip2. Second, given the discrepancy between the compression factor obtained with and without quality values, we initiate the study of using "lossy" quality values. Specifically, we show that a lossy quality value quantization results in 14× compression but has minimal impact on downstream applications like SNP calling that use the quality values. Discrepancies between SNP calls made between the lossy and loss-less versions of the data are limited to low coverage areas where even the SNP calls made by the loss-less version are marginal.},
    Author = {Kozanitis, Christos and Saunders, Chris and Kruglyak, Semyon and Bafna, Vineet and Varghese, George},
    Date-Added = {2015-02-02 12:39:19 +0000},
    Date-Modified = {2015-02-02 12:39:37 +0000},
    Doi = {10.1089/cmb.2010.0253},
    Journal = {J Comput Biol},
    Journal-Full = {Journal of computational biology : a journal of computational molecular cell biology},
    Mesh = {Algorithms; Data Compression; Genome, Human; Genomics; Humans; Polymorphism, Single Nucleotide; Sequence Analysis, DNA},
    Month = {Mar},
    Number = {3},
    Pages = {401-13},
    Pmc = {PMC3123913},
    Pmid = {21385043},
    Pst = {ppublish},
    Title = {Compressing genomic sequence fragments using {SlimGene}},
    Volume = {18},
    Year = {2011},
    Bdsk-Url-1 = {http://dx.doi.org/10.1089/cmb.2010.0253}}

@article{McKenna:2010bh,
    Abstract = {Next-generation DNA sequencing (NGS) projects, such as the 1000 Genomes Project, are already revolutionizing our understanding of genetic variation among individuals. However, the massive data sets generated by NGS--the 1000 Genome pilot alone includes nearly five terabases--make writing feature-rich, efficient, and robust analysis tools difficult for even computationally sophisticated individuals. Indeed, many professionals are limited in the scope and the ease with which they can answer scientific questions by the complexity of accessing and manipulating the data produced by these machines. Here, we discuss our Genome Analysis Toolkit (GATK), a structured programming framework designed to ease the development of efficient and robust analysis tools for next-generation DNA sequencers using the functional programming philosophy of MapReduce. The GATK provides a small but rich set of data access patterns that encompass the majority of analysis tool needs. Separating specific analysis calculations from common data management infrastructure enables us to optimize the GATK framework for correctness, stability, and CPU and memory efficiency and to enable distributed and shared memory parallelization. We highlight the capabilities of the GATK by describing the implementation and application of robust, scale-tolerant tools like coverage calculators and single nucleotide polymorphism (SNP) calling. We conclude that the GATK programming framework enables developers and analysts to quickly and easily write efficient and robust NGS tools, many of which have already been incorporated into large-scale sequencing projects like the 1000 Genomes Project and The Cancer Genome Atlas.},
    Author = {McKenna, Aaron and Hanna, Matthew and Banks, Eric and Sivachenko, Andrey and Cibulskis, Kristian and Kernytsky, Andrew and Garimella, Kiran and Altshuler, David and Gabriel, Stacey and Daly, Mark and DePristo, Mark A},
    Date-Added = {2015-02-02 12:29:22 +0000},
    Date-Modified = {2015-02-02 13:34:27 +0000},
    Doi = {10.1101/gr.107524.110},
    Journal = {Genome Res},
    Journal-Full = {Genome research},
    Mesh = {Base Sequence; Genome; Genomics; Sequence Analysis, DNA; Software},
    Month = {Sep},
    Number = {9},
    Pages = {1297-303},
    Pmc = {PMC2928508},
    Pmid = {20644199},
    Pst = {ppublish},
    Title = {The {Genome Analysis Toolkit}: a {MapReduce} framework for analyzing next-generation {DNA} sequencing data},
    Volume = {20},
    Year = {2010},
    Bdsk-Url-1 = {http://dx.doi.org/10.1101/gr.107524.110}}

@article{Pathak:2014zl,
    Abstract = {MOTIVATION: Next-generation sequencing (NGS) technologies have revolutionized genomic research by reducing the cost of whole-genome sequencing. One of the biggest challenges posed by modern sequencing technology is economic storage of NGS data. Storing raw data is infeasible because of its enormous size and high redundancy. In this article, we address the problem of storage and transmission of large Fastq files using innovative compression techniques.
RESULTS: We introduce a new lossless non-reference-based fastq compression algorithm named lossless FastQ compressor. We have compared our algorithm with other state of the art big data compression algorithms namely gzip, bzip2, fastqz, fqzcomp, G-SQZ, SCALCE, Quip, DSRC, DSRC-LZ etc. This comparison reveals that our algorithm achieves better compression ratios. The improvement obtained is up to 225%. For example, on one of the datasets (SRR065390_1), the average improvement (over all the algorithms compared) is 74.62%. Availability and implementation: The implementations are freely available for non-commercial purposes. They can be downloaded from http://engr.uconn.edu/∼rajasek/FastqPrograms.zip.
CONTACT: rajasek@engr.uconn.edu.},
    Author = {Pathak, Sudipta and Rajasekaran, Sanguthevar},
    Date-Added = {2015-02-02 11:52:40 +0000},
    Date-Modified = {2015-02-02 11:52:40 +0000},
    Doi = {10.1093/bioinformatics/btu701},
    Journal = {Bioinformatics},
    Journal-Full = {Bioinformatics (Oxford, England)},
    Month = {Oct},
    Pmid = {25344499},
    Pst = {aheadofprint},
    Title = {LFQC: a lossless compression algorithm for FASTQ files},
    Year = {2014},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btu701}}

@article{Canovas:2014fr,
    Abstract = {MOTIVATION: Next-generation sequencing technologies are revolutionizing medicine. Data from sequencing technologies are typically represented as a string of bases, an associated sequence of per-base quality scores and other metadata, and in aggregate can require a large amount of space. The quality scores show how accurate the bases are with respect to the sequencing process, that is, how confident the sequencer is of having called them correctly, and are the largest component in datasets in which they are retained. Previous research has examined how to store sequences of bases effectively; here we add to that knowledge by examining methods for compressing quality scores. The quality values originate in a continuous domain, and so if a fidelity criterion is introduced, it is possible to introduce flexibility in the way these values are represented, allowing lossy compression over the quality score data.
RESULTS: We present existing compression options for quality score data, and then introduce two new lossy techniques. Experiments measuring the trade-off between compression ratio and information loss are reported, including quantifying the effect of lossy representations on a downstream application that carries out single nucleotide polymorphism and insert/deletion detection. The new methods are demonstrably superior to other techniques when assessed against the spectrum of possible trade-offs between storage required and fidelity of representation.
AVAILABILITY AND IMPLEMENTATION: An implementation of the methods described here is available at https://github.com/rcanovas/libCSAM.
CONTACT: rcanovas@student.unimelb.edu.au
SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
    Author = {C{\'a}novas, Rodrigo and Moffat, Alistair and Turpin, Andrew},
    Date-Added = {2015-02-02 11:50:05 +0000},
    Date-Modified = {2015-02-02 11:50:05 +0000},
    Doi = {10.1093/bioinformatics/btu183},
    Journal = {Bioinformatics},
    Journal-Full = {Bioinformatics (Oxford, England)},
    Mesh = {Algorithms; Base Sequence; Data Compression; Genome; Genomics; High-Throughput Nucleotide Sequencing; Polymorphism, Single Nucleotide; Quality Control},
    Month = {Aug},
    Number = {15},
    Pages = {2130-6},
    Pmid = {24728856},
    Pst = {ppublish},
    Title = {Lossy compression of quality scores in genomic data},
    Volume = {30},
    Year = {2014},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btu183}}

@article{Ochoa:2013rt,
    Abstract = {BACKGROUND: Next Generation Sequencing technologies have revolutionized many fields in biology by reducing the time and cost required for sequencing. As a result, large amounts of sequencing data are being generated. A typical sequencing data file may occupy tens or even hundreds of gigabytes of disk space, prohibitively large for many users. This data consists of both the nucleotide sequences and per-base quality scores that indicate the level of confidence in the readout of these sequences. Quality scores account for about half of the required disk space in the commonly used FASTQ format (before compression), and therefore the compression of the quality scores can significantly reduce storage requirements and speed up analysis and transmission of sequencing data.
RESULTS: In this paper, we present a new scheme for the lossy compression of the quality scores, to address the problem of storage. Our framework allows the user to specify the rate (bits per quality score) prior to compression, independent of the data to be compressed. Our algorithm can work at any rate, unlike other lossy compression algorithms. We envisage our algorithm as being part of a more general compression scheme that works with the entire FASTQ file. Numerical experiments show that we can achieve a better mean squared error (MSE) for small rates (bits per quality score) than other lossy compression schemes. For the organism PhiX, whose assembled genome is known and assumed to be correct, we show that it is possible to achieve a significant reduction in size with little compromise in performance on downstream applications (e.g., alignment).
CONCLUSIONS: QualComp is an open source software package, written in C and freely available for download at https://sourceforge.net/projects/qualcomp.},
    Author = {Ochoa, Idoia and Asnani, Himanshu and Bharadia, Dinesh and Chowdhury, Mainak and Weissman, Tsachy and Yona, Golan},
    Date-Added = {2015-02-02 11:49:42 +0000},
    Date-Modified = {2015-02-02 13:35:38 +0000},
    Doi = {10.1186/1471-2105-14-187},
    Journal = {BMC Bioinformatics},
    Journal-Full = {BMC bioinformatics},
    Mesh = {Algorithms; Animals; Data Compression; Genome; Genomics; High-Throughput Nucleotide Sequencing; Mice; Polymorphism, Single Nucleotide; Software},
    Pages = {187},
    Pmc = {PMC3698011},
    Pmid = {23758828},
    Pst = {epublish},
    Title = {{QualComp}: a new lossy compressor for quality scores based on rate distortion theory},
    Volume = {14},
    Year = {2013},
    Bdsk-Url-1 = {http://dx.doi.org/10.1186/1471-2105-14-187}}

@article{Wan:2012kq,
    Abstract = {MOTIVATION: The growth of next-generation sequencing means that more effective and efficient archiving methods are needed to store the generated data for public dissemination and in anticipation of more mature analytical methods later. This article examines methods for compressing the quality score component of the data to partly address this problem.
RESULTS: We compare several compression policies for quality scores, in terms of both compression effectiveness and overall efficiency. The policies employ lossy and lossless transformations with one of several coding schemes. Experiments show that both lossy and lossless transformations are useful, and that simple coding methods, which consume less computing resources, are highly competitive, especially when random access to reads is needed.
AVAILABILITY AND IMPLEMENTATION: Our C++ implementation, released under the Lesser General Public License, is available for download at http://www.cb.k.u-tokyo.ac.jp/asailab/members/rwan.
SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
    Author = {Wan, Raymond and Anh, Vo Ngoc and Asai, Kiyoshi},
    Date-Added = {2015-02-02 11:44:29 +0000},
    Date-Modified = {2015-02-03 20:18:20 +0000},
    Doi = {10.1093/bioinformatics/btr689},
    Journal = {Bioinformatics},
    Journal-Full = {Bioinformatics (Oxford, England)},
    Mesh = {Data Compression; Sequence Analysis, DNA; Software},
    Month = {Mar},
    Number = {5},
    Pages = {628-35},
    Pmid = {22171329},
    Pst = {ppublish},
    Title = {Transformations for the compression of {FASTQ} quality scores of next-generation sequencing data},
    Volume = {28},
    Year = {2012},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btr689}}

@misc{webp,
    Date-Added = {2014-10-14 13:13:26 +0000},
    Date-Modified = {2014-10-14 13:21:51 +0000},
    Description = {Documentation for {WebP}},
    Howpublished = {https://developers.google.com/speed/webp/},
    Title = {Documentation for {WebP}},
    Url = {https://developers.google.com/speed/webp/},
    Viewport = {width=device-width, initial-scale=1, minimum-scale=1, maximum-scale=1, user-scalable=0},
    Xsrf_Token = {6s9fUrp-qHI4xuQD4IXT-joxNDEzMjkyMzkyNDY1NTkw},
    Bdsk-Url-1 = {https://developers.google.com/speed/webp/}}

@article{Giancarlo:2014rw,
    Abstract = {High-throughput sequencing technologies produce large collections of data, mainly DNA sequences with additional information, requiring the design of efficient and effective methodologies for both their compression and storage. In this context, we first provide a classification of the main techniques that have been proposed, according to three specific research directions that have emerged from the literature and, for each, we provide an overview of the current techniques. Finally, to make this review useful to researchers and technicians applying the existing software and tools, we include a synopsis of the main characteristics of the described approaches, including details on their implementation and availability. Performance of the various methods is also highlighted, although the state of the art does not lend itself to a consistent and coherent comparison among all the methods presented here.},
    Author = {Giancarlo, R and Rombo, SE and Utro, F},
    Date-Added = {2014-10-09 15:45:39 +0000},
    Date-Modified = {2014-10-11 17:37:09 +0000},
    Doi = {10.1093/bib/bbt088},
    Journal = {Brief Bioinform},
    Journal-Full = {Briefings in bioinformatics},
    Keywords = {analysis of large biological sequence collections; compressive sequence analysis; data compression in bioinformatics; data compression of large sequence collections; storage and management of HTS data; succinct data structures for bioinformatics},
    Month = {May},
    Number = {3},
    Pages = {390-406},
    Pmid = {24347576},
    Pst = {ppublish},
    Title = {Compressive biological sequence analysis and archival in the era of high-throughput sequencing technologies},
    Volume = {15},
    Year = {2014},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/bib/bbt088}}

@article{Zhu:2013qr,
    Abstract = {The exponential growth of high-throughput DNA sequence data has posed great challenges to genomic data storage, retrieval and transmission. Compression is a critical tool to address these challenges, where many methods have been developed to reduce the storage size of the genomes and sequencing data (reads, quality scores and metadata). However, genomic data are being generated faster than they could be meaningfully analyzed, leaving a large scope for developing novel compression algorithms that could directly facilitate data analysis beyond data transfer and storage. In this article, we categorize and provide a comprehensive review of the existing compression methods specialized for genomic data and present experimental results on compression ratio, memory usage, time for compression and decompression. We further present the remaining challenges and potential directions for future research.},
    Author = {Zhu, Z and Zhang, Y and Ji, Z and He, S and Yang, X},
    Date-Added = {2014-10-09 15:45:03 +0000},
    Date-Modified = {2014-10-11 17:38:02 +0000},
    Doi = {10.1093/bib/bbt087},
    Journal = {Brief Bioinform},
    Journal-Full = {Briefings in bioinformatics},
    Keywords = {compression; next-generation sequencing; reference-based compression; reference-free compression},
    Month = {Dec},
    Pmid = {24300111},
    Pst = {aheadofprint},
    Title = {High-throughput {DNA} sequence data compression},
    Year = {2013},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/bib/bbt087}}

@electronic{wc3-png:qy,
    Author = {WC3},
    Date-Added = {2014-09-27 20:42:43 +0000},
    Date-Modified = {2014-09-27 20:43:54 +0000},
    Title = {{Portable Network Graphics (PNG) Specification (Second Edition)}},
    Url = {http://www.w3.org/TR/PNG/},
    Bdsk-Url-1 = {http://www.w3.org/TR/PNG/}}

@article{Minoche:2011km,
    Abstract = {BACKGROUND: The generation and analysis of high-throughput sequencing data are becoming a major component of many studies in molecular biology and medical research. Illumina's Genome Analyzer (GA) and HiSeq instruments are currently the most widely used sequencing devices. Here, we comprehensively evaluate properties of genomic HiSeq and GAIIx data derived from two plant genomes and one virus, with read lengths of 95 to 150 bases.
RESULTS: We provide quantifications and evidence for GC bias, error rates, error sequence context, effects of quality filtering, and the reliability of quality values. By combining different filtering criteria we reduced error rates 7-fold at the expense of discarding 12.5% of alignable bases. While overall error rates are low in HiSeq data we observed regions of accumulated wrong base calls. Only 3% of all error positions accounted for 24.7% of all substitution errors. Analyzing the forward and reverse strands separately revealed error rates of up to 18.7%. Insertions and deletions occurred at very low rates on average but increased to up to 2% in homopolymers. A positive correlation between read coverage and GC content was found depending on the GC content range.
CONCLUSIONS: The errors and biases we report have implications for the use and the interpretation of Illumina sequencing data. GAIIx and HiSeq data sets show slightly different error profiles. Quality filtering is essential to minimize downstream analysis artifacts. Supporting previous recommendations, the strand-specificity provides a criterion to distinguish sequencing errors from low abundance polymorphisms.},
    Author={Minoche, Andr{\'e} E and Dohm, Juliane C and Himmelbauer, Heinz and others},
    Date-Added = {2014-09-27 14:59:19 +0000},
    Date-Modified = {2014-09-27 15:00:13 +0000},
    Doi = {10.1186/gb-2011-12-11-r112},
    Journal = {Genome Biology},
    Journal-Full = {Genome Biology},
    Mesh = {Arabidopsis; Artifacts; Automation, Laboratory; Bacteriophage phi X 174; Base Composition; Base Sequence; Beta vulgaris; Genomics; High-Throughput Nucleotide Sequencing; Molecular Sequence Data; Mutagenesis, Insertional; Polymorphism, Genetic; Reproducibility of Results; Sensitivity and Specificity; Sequence Analysis, DNA; Sequence Deletion},
    Number = {11},
    Pages = {R112},
    Pmc = {PMC3334598},
    Pmid = {22067484},
    Pst = {epublish},
    Title = {Evaluation of genomic high-throughput sequencing data generated on {Illumina HiSeq} and {Genome Analyzer} systems},
    Volume = {12},
    Year = {2011},
    Bdsk-Url-1 = {http://dx.doi.org/10.1186/gb-2011-12-11-r112}}

@article{Carneiro:2012xw,
    Abstract = {BACKGROUND: Pacific Biosciences technology provides a fundamentally new data type that provides the potential to overcome some limitations of current next generation sequencing platforms by providing significantly longer reads, single molecule sequencing, low composition bias and an error profile that is orthogonal to other platforms. With these potential advantages in mind, we here evaluate the utility of the Pacific Biosciences RS platform for human medical amplicon resequencing projects.
RESULTS: We evaluated the Pacific Biosciences technology for SNP discovery in medical resequencing projects using the Genome Analysis Toolkit, observing high sensitivity and specificity for calling differences in amplicons containing known true or false SNPs. We assessed data quality: most errors were indels (~14%) with few apparent miscalls (~1%). In this work, we define a custom data processing pipeline for Pacific Biosciences data for human data analysis.
CONCLUSION: Critically, the error properties were largely free of the context-specific effects that affect other sequencing technologies. These data show excellent utility for follow-up validation and extension studies in human data and medical genetics projects, but can be extended to other organisms with a reference genome.},
    Author = {Carneiro, MO and Russ, C and Ross, MG and Gabriel, SB and Nusbaum, C and DePristo, MA},
    Date-Added = {2014-09-27 14:01:15 +0000},
    Date-Modified = {2014-09-27 14:02:23 +0000},
    Doi = {10.1186/1471-2164-13-375},
    Journal = {BMC Genomics},
    Journal-Full = {BMC genomics},
    Mesh = {Genetic Variation; Genome, Human; Genotype; Humans; Polymorphism, Single Nucleotide; Sequence Analysis, DNA; Software; User-Computer Interface},
    Pages = {375},
    Pmc = {PMC3443046},
    Pmid = {22863213},
    Pst = {epublish},
    Title = {{Pacific Biosciences} sequencing technology for genotyping and variation discovery in human data},
    Volume = {13},
    Year = {2012},
    Bdsk-Url-1 = {http://dx.doi.org/10.1186/1471-2164-13-375}}

@inproceedings{DBLP:conf/icip/Queiroz02b,
    Author = {de Queiroz, RL},
    Bibsource = {dblp computer science bibliography, http://dblp.org},
    Biburl = {http://dblp.uni-trier.de/rec/bib/conf/icip/Queiroz02b},
    Booktitle = {{ICIP} {(2)}},
    Date-Added = {2014-09-26 20:21:17 +0000},
    Date-Modified = {2014-09-26 20:21:43 +0000},
    Doi = {10.1109/ICIP.2002.1039967},
    Pages = {381--384},
    Timestamp = {Fri, 26 Sep 2014 22:20:58 +0200},
    Title = {Improved transforms for the compression of color and multispectral images},
    Url = {http://dx.doi.org/10.1109/ICIP.2002.1039967},
    Year = {2002},
    Bdsk-Url-1 = {http://dx.doi.org/10.1109/ICIP.2002.1039967}}

@electronic{png:yb,
    Date-Added = {2014-09-26 16:40:39 +0000},
    Date-Modified = {2014-09-26 16:41:38 +0000},
    Title = {{Portable Network Graphics (PNG) Specification and Extensions}},
    Url = {http://www.libpng.org/pub/png/spec/},
    Bdsk-Url-1 = {http://www.libpng.org/pub/png/spec/}}

@book{png-book,
    Added-At = {2011-04-20T00:00:00.000+0200},
    Author = {Roelofs, G},
    Biburl = {http://www.bibsonomy.org/bibtex/2a9f1eaa558e79224b93774fcf9c8e2ac/dblp},
    Date-Added = {2014-09-26 16:32:16 +0000},
    Date-Modified = {2014-09-26 16:37:44 +0000},
    Interhash = {9dfe1dae8aabff346ce36dc20c96637f},
    Intrahash = {a9f1eaa558e79224b93774fcf9c8e2ac},
    Isbn = {978-1-56592-542-7},
    Keywords = {dblp},
    Pages = {I-XIX, 1-321},
    Publisher = {O'Reilly},
    Timestamp = {2011-04-20T00:00:00.000+0200},
    Title = {{PNG} --- the definitive guide: creating and programming portable network graphics.},
    Year = 1999}

@electronic{mng-vlc:jo,
    Date-Added = {2014-09-26 16:16:38 +0000},
    Date-Modified = {2014-09-26 16:17:31 +0000},
    Title = {{MNG-VLC (Multiple-image Network Graphics--Very Low Complexity) Format Version 1.0}},
    Url = {http://www.libpng.org/pub/mng/spec/mng-vlc.html},
    Bdsk-Url-1 = {http://www.libpng.org/pub/mng/spec/mng-vlc.html}}

@electronic{mng-lc:bv,
    Date-Added = {2014-09-26 16:13:16 +0000},
    Date-Modified = {2014-09-26 16:18:30 +0000},
    Title = {{MNG-LC (Multiple-image Network Graphics--Low Complexity) Format Version 1.0}},
    Url = {http://www.libpng.org/pub/mng/spec/mng-lc.html},
    Bdsk-Url-1 = {http://www.libpng.org/pub/mng/spec/mng-lc.html}}

@electronic{mng:hb,
    Date-Added = {2014-09-26 15:54:19 +0000},
    Date-Modified = {2014-09-26 16:15:22 +0000},
    Lastchecked = {26 September 2014},
    Title = {{MNG (Multiple-image Network Graphics) Format Version 1.0}},
    Url = {http://www.libpng.org/pub/mng/spec/},
    Bdsk-Url-1 = {http://www.libpng.org/pub/mng/spec/}}

@electronic{libpng:ph,
    Date-Added = {2014-09-26 15:01:44 +0000},
    Date-Modified = {2014-09-26 15:50:51 +0000},
    Lastchecked = {26 September 2014},
    Title = {libpng},
    Url = {http://www.libpng.org/pub/png/libpng.html},
    Bdsk-Url-1 = {http://www.libpng.org/pub/png/libpng.html}}

@article{Ziv77auniversal,
    Author={Ziv, Jacob and Lempel, Abraham},
    Date-Added = {2014-09-25 12:19:25 +0000},
    Date-Modified = {2014-09-25 12:20:50 +0000},
    Journal = {IEEE Transactions on Information Theory},
    Number = {3},
    Pages = {337--343},
    Title = {A universal algorithm for sequential data compression},
    Volume = {23},
    Year = {1977}}

@webpage{zopfli:kx,
    Author={Alakuijala, Jyrki and Vandevenne, Lode},
    Date-Added = {2014-09-24 21:59:04 +0000},
    Date-Modified = {2014-09-24 22:02:10 +0000},
    Keywords = {PNG compression},
    Lastchecked = {24 September 2014},
    Robots = {NOARCHIVE},
    Title = {Zopfli Compression Algorithm},
    Url = {https://code.google.com/p/zopfli/},
    Bdsk-Url-1 = {https://code.google.com/p/zopfli/}}

@inproceedings{DBLP:conf/recomb/YuYB14,
    Author={Yu, Y William and Y{\"{o}}r{\"{u}}koglu, Deniz and Berger, Bonnie},
    Bibsource = {dblp computer science bibliography, http://dblp.org},
    Biburl = {http://dblp.uni-trier.de/rec/bib/conf/recomb/YuYB14},
    Booktitle = {Research in Computational Molecular Biology - 18th Annual International Conference, {RECOMB} 2014, Pittsburgh, PA, USA, April 2-5, 2014, Proceedings},
    Crossref = {DBLP:conf/recomb/2014},
    Date-Added = {2014-09-24 14:04:37 +0000},
    Date-Modified = {2015-02-03 20:31:56 +0000},
    Pages = {385--399},
    Timestamp = {Wed, 24 Sep 2014 16:04:29 +0200},
    Title = {Traversing the $k$-mer Landscape of {NGS} Read Datasets for Quality Score Sparsification},
    Year = {2014},
    Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-319-05269-4_31}}

@proceedings{DBLP:conf/recomb/2014,
    Bibsource = {dblp computer science bibliography, http://dblp.org},
    Biburl = {http://dblp.uni-trier.de/rec/bib/conf/recomb/2014},
    Date-Added = {2014-09-24 14:04:37 +0000},
    Date-Modified = {2014-09-24 14:17:12 +0000},
    Editor = {Roded Sharan},
    Isbn = {978-3-319-05268-7},
    Publisher = {Springer},
    Series = {Lecture Notes in Computer Science},
    Timestamp = {Wed, 24 Sep 2014 16:04:29 +0200},
    Title = {Research in Computational Molecular Biology - 18th Annual International Conference, {RECOMB} 2014, Pittsburgh, PA, USA, April 2-5, 2014, Proceedings},
    Volume = {8394},
    Year = {2014},
    Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-319-05269-4}}

@misc{beck2001agile,
    Added-At = {2007-12-30T11:39:05.000+0100},
    Author={Beck, Kent and Beedle, Mike and Van Bennekum, Arie and Cockburn, Alistair and Cunningham, Ward and Fowler, Martin and Grenning, James and Highsmith, Jim and Hunt, Andrew and Jeffries, Ron and others},
    Biburl = {http://www.bibsonomy.org/bibtex/28954248a545d88dd2c0e688d1c7e2f9d/juve},
    Booktitle = {Manifesto for Agile Software Development},
    Date-Added = {2014-09-24 12:44:59 +0000},
    Date-Modified = {2014-09-24 12:46:27 +0000},
    Description = {Manifesto for Agile Software Development},
    Interhash = {098cc7e185f10c3da390459a01e0d535},
    Intrahash = {8954248a545d88dd2c0e688d1c7e2f9d},
    Keywords = {imported},
    Timestamp = {2007-12-30T11:39:05.000+0100},
    Title = {Manifesto for Agile Software Development},
    Url = {http://www.agilemanifesto.org/},
    Year = 2001,
    Bdsk-Url-1 = {http://www.agilemanifesto.org/}}

@article{Earl:2011fv,
    Abstract = {Low-cost short read sequencing technology has revolutionized genomics, though it is only just becoming practical for the high-quality de novo assembly of a novel large genome. We describe the Assemblathon 1 competition, which aimed to comprehensively assess the state of the art in de novo assembly methods when applied to current sequencing technologies. In a collaborative effort, teams were asked to assemble a simulated Illumina HiSeq data set of an unknown, simulated diploid genome. A total of 41 assemblies from 17 different groups were received. Novel haplotype aware assessments of coverage, contiguity, structure, base calling, and copy number were made. We establish that within this benchmark: (1) It is possible to assemble the genome to a high level of coverage and accuracy, and that (2) large differences exist between the assemblies, suggesting room for further improvements in current methods. The simulated benchmark, including the correct answer, the assemblies, and the code that was used to evaluate the assemblies is now public and freely available from http://www.assemblathon.org/.},
    Author = {Earl, D and Bradnam, K and St John, J and Darling, A and Lin, D and Fass, J and Yu, HOK and Buffalo, V and Zerbino, DR and Diekhans, M and Nguyen, N and Ariyaratne, PN and Sung, W-K and Ning, Z and Haimel, M and Simpson, JT and Fonseca, NA and Birol, {\.I} and Docking, TR and Ho, IY and Rokhsar, DS and Chikhi, R and Lavenier, D and Chapuis, G and Naquin, D and Maillet, N and Schatz, MC and Kelley, DR and Phillippy, AM and Koren, S and Yang, S-P and Wu, W and Chou, W-C and Srivastava, A and Shaw, TI and Ruby, JG and Skewes-Cox, P and Betegon, M and Dimon, MT and Solovyev, V and Seledtsov, I and Kosarev, P and Vorobyev, D and Ramirez-Gonzalez, R and Leggett, R and MacLean, D and Xia, F and Luo, R and Li, Z and Xie, Y and Liu, B and Gnerre, S and MacCallum, I and Przybylski, D and Ribeiro, FJ and Yin, S and Sharpe, T and Hall, G and Kersey, PJ and Durbin, R and Jackman, SD and Chapman, JA and Huang, X and DeRisi, JL and Caccamo, M and Li, Y and Jaffe, DB and Green, RE and Haussler, D and Korf, I and Paten, B},
    Date-Added = {2014-09-23 20:38:14 +0000},
    Date-Modified = {2014-09-24 11:57:06 +0000},
    Doi = {10.1101/gr.126599.111},
    Journal = {Genome Res},
    Journal-Full = {Genome research},
    Mesh = {Genome; Genomics; Sequence Analysis, DNA},
    Month = {Dec},
    Number = {12},
    Pages = {2224-41},
    Pmc = {PMC3227110},
    Pmid = {21926179},
    Pst = {ppublish},
    Title = {Assemblathon 1: a competitive assessment of de novo short read assembly methods},
    Volume = {21},
    Year = {2011},
    Bdsk-Url-1 = {http://dx.doi.org/10.1101/gr.126599.111}}

@article{Schmieder:2011gd,
    Abstract = {SUMMARY: Here, we present PRINSEQ for easy and rapid quality control and data preprocessing of genomic and metagenomic datasets. Summary statistics of FASTA (and QUAL) or FASTQ files are generated in tabular and graphical form and sequences can be filtered, reformatted and trimmed by a variety of options to improve downstream analysis.
AVAILABILITY AND IMPLEMENTATION: This open-source application was implemented in Perl and can be used as a stand alone version or accessed online through a user-friendly web interface. The source code, user help and additional information are available at http://prinseq.sourceforge.net/.},
    Author={Schmieder, Robert and Edwards, Robert},
    Date-Added = {2014-09-23 20:32:49 +0000},
    Date-Modified = {2014-09-23 20:57:32 +0000},
    Doi = {10.1093/bioinformatics/btr026},
    Journal = {Bioinformatics},
    Journal-Full = {Bioinformatics (Oxford, England)},
    Mesh = {Computer Graphics; Information Storage and Retrieval; Internet; Metagenomics; Programming Languages; Quality Control; Sequence Analysis, DNA; Software},
    Month = {Mar},
    Number = {6},
    Pages = {863-4},
    Pmc = {PMC3051327},
    Pmid = {21278185},
    Pst = {ppublish},
    Title = {Quality control and preprocessing of metagenomic datasets},
    Volume = {27},
    Year = {2011},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btr026}}

@article{EJ200,
    Abstract = {When small RNA is sequenced on current sequencing machines, the resulting reads are usually longer than the RNA and therefore contain parts of the 3' adapter. That adapter must be found and removed error-tolerantly from each read before read mapping. Previous solutions are either hard to use or do not offer required features, in particular support for color space data. As an easy to use alternative, we developed the command-line tool cutadapt, which supports 454, Illumina and SOLiD (color space) data, offers two adapter trimming algorithms, and has other useful features. Cutadapt, including its MIT-licensed source code, is available for download at http://code.google.com/p/cutadapt/},
    Author = {Marcel Martin},
    Date-Added = {2014-09-23 20:30:35 +0000},
    Date-Modified = {2014-09-23 20:30:35 +0000},
    Issn = {2226-6089},
    Journal = {EMBnet.journal},
    Keywords = {next generation sequencing; small RNA; microRNA; adapter removal},
    Number = {1},
    Title = {Cutadapt removes adapter sequences from high-throughput sequencing reads},
    Url = {http://journal.embnet.org/index.php/embnetjournal/article/view/200},
    Volume = {17},
    Year = {2011},
    Bdsk-Url-1 = {http://journal.embnet.org/index.php/embnetjournal/article/view/200}}

@electronic{citeulike:11583827,
    Author = {Andrews, S.},
    Citeulike-Article-Id = {11583827},
    Citeulike-Linkout-0 = {http://www.bioinformatics.babraham.ac.uk/projects/fastqc/},
    Date-Added = {2014-09-23 20:28:40 +0000},
    Date-Modified = {2014-09-23 20:28:40 +0000},
    Journal = {http://www.bioinformatics.babraham.ac.uk/projects/fastqc/},
    Keywords = {bioinformatics, ngs, qc},
    Posted-At = {2012-10-30 18:10:53},
    Priority = {2},
    Title = {{FastQC} A Quality Control tool for High Throughput Sequence Data},
    Url = {http://www.bioinformatics.babraham.ac.uk/projects/fastqc/},
    Bdsk-Url-1 = {http://www.bioinformatics.babraham.ac.uk/projects/fastqc/}}

@book{hastie_09_elements-of.statistical-learning,
    Added-At = {2010-06-03T15:15:09.000+0200},
    Author={Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome and Franklin, James},
    Biburl = {http://www.bibsonomy.org/bibtex/200d858c0bd2826d4eb5f39450192d1f5/ukoethe},
    Date-Added = {2014-09-23 20:07:42 +0000},
    Date-Modified = {2014-09-23 20:10:46 +0000},
    Edition = 2,
    File = {:Books\\HastieTibshiraniFriedman-09-Elements-of-Statistical-Learning-2nd-edition\\hastie_09_elements-of.statistical-learning.pdf:PDF},
    Interhash = {52d1772f39be836e3b298d37b8c0cfa1},
    Intrahash = {00d858c0bd2826d4eb5f39450192d1f5},
    Keywords = {inference mathmatics dataanalysis method clutering statistics},
    Publisher = {Springer},
    Timestamp = {2010-06-03T15:15:09.000+0200},
    Title = {The elements of statistical learning: data mining, inference and prediction},
    Url = {http://www-stat.stanford.edu/~tibs/ElemStatLearn/},
    Year = 2009,
    Bdsk-Url-1 = {http://www-stat.stanford.edu/~tibs/ElemStatLearn/}}

@misc{seq-squeeze,
    Author = {Pistoia Alliance},
    Date-Added = {2013-05-31 17:12:30 +0000},
    Date-Modified = {2013-05-31 17:18:52 +0000},
    Note = {http://www.sequencesqueeze.org},
    Title = {{The Pistoia Alliance Sequence Squeeze Competition}},
    Urldate = {31 May 2013},
    Year = 2013}

@article{Nagarajan:2011fk,
    Abstract = {Reassortments in the influenza virus--a process where strains exchange genetic segments--have been implicated in two out of three pandemics of the 20th century as well as the 2009 H1N1 outbreak. While advances in sequencing have led to an explosion in the number of whole-genome sequences that are available, an understanding of the rate and distribution of reassortments and their role in viral evolution is still lacking. An important factor in this is the paucity of automated tools for confident identification of reassortments from sequence data due to the challenges of analyzing large, uncertain viral phylogenies. We describe here a novel computational method, called GiRaF (Graph-incompatibility-based Reassortment Finder), that robustly identifies reassortments in a fully automated fashion while accounting for uncertainties in the inferred phylogenies. The algorithms behind GiRaF search large collections of Markov chain Monte Carlo (MCMC)-sampled trees for groups of incompatible splits using a fast biclique enumeration algorithm coupled with several statistical tests to identify sets of taxa with differential phylogenetic placement. GiRaF correctly finds known reassortments in human, avian, and swine influenza populations, including the evolutionary events that led to the recent 'swine flu' outbreak. GiRaF also identifies several previously unreported reassortments via whole-genome studies to catalog events in H5N1 and swine influenza isolates.},
    Author={Nagarajan, Niranjan and Kingsford, Carl},
    Date-Added = {2013-05-29 16:13:42 +0000},
    Date-Modified = {2013-05-29 16:14:08 +0000},
    Doi = {10.1093/nar/gkq1232},
    Journal = {Nucleic Acids Res},
    Journal-Full = {Nucleic acids research},
    Mesh = {Algorithms; Computational Biology; Data Mining; Influenza A Virus, H1N1 Subtype; Influenza A Virus, H3N2 Subtype; Influenza A virus; Phylogeny; Reassortant Viruses},
    Month = {Mar},
    Number = {6},
    Pages = {e34},
    Pmc = {PMC3064795},
    Pmid = {21177643},
    Pst = {ppublish},
    Title = {{GiRaF}: robust, computational identification of influenza reassortments via graph mining},
    Volume = {39},
    Year = {2011},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/nar/gkq1232}}

@inproceedings{Traina00slim-trees:high,
    Author = {Traina, C and Traina, A and Seeger, B and Faloutsos, C},
    Booktitle = {7th International Conference on Extending Database Technology (EDBT 2000)},
    Date-Added = {2013-05-28 16:42:02 +0000},
    Date-Modified = {2013-05-28 16:44:19 +0000},
    Pages = {51--65},
    Publisher = {Springer-Verlag},
    Title = {Slim-trees: High performance metric trees minimizing overlap between nodes},
    Year = {2000}}

@article{Roberts:2013ly,
    Abstract = {We present eXpress, a software package for efficient probabilistic assignment of ambiguously mapping sequenced fragments. eXpress uses a streaming algorithm with linear run time and constant memory use. It can determine abundances of sequenced molecules in real time and can be applied to ChIP-seq, metagenomics and other large-scale sequencing data. We demonstrate its use on RNA-seq data and show that eXpress achieves greater efficiency than other quantification methods.},
    Author = {Roberts, A and Pachter, L},
    Date-Added = {2013-05-27 19:31:41 +0000},
    Date-Modified = {2013-05-27 19:32:00 +0000},
    Doi = {10.1038/nmeth.2251},
    Journal = {Nat Methods},
    Journal-Full = {Nature methods},
    Mesh = {Algorithms; Chromatin Immunoprecipitation; Computational Biology; Gene Expression Profiling; High-Throughput Nucleotide Sequencing; Humans; Oligonucleotide Array Sequence Analysis; RNA; Sequence Analysis, DNA; Sequence Analysis, RNA; Software},
    Month = {Jan},
    Number = {1},
    Pages = {71-3},
    Pmid = {23160280},
    Pst = {ppublish},
    Title = {Streaming fragment assignment for real-time analysis of sequencing experiments},
    Volume = {10},
    Year = {2013},
    Bdsk-Url-1 = {http://dx.doi.org/10.1038/nmeth.2251}}

@article{21816040,
    Abstract = {BACKGROUND:RNA-Seq is revolutionizing the way transcript abundances are measured. A key challenge in transcript quantification from RNA-Seq data is the handling of reads that map to multiple genes or isoforms. This issue is particularly important for quantification with de novo transcriptome assemblies in the absence of sequenced genomes, as it is difficult to determine which transcripts are isoforms of the same gene. A second significant issue is the design of RNA-Seq experiments, in terms of the number of reads, read length, and whether reads come from one or both ends of cDNA fragments.RESULTS:We present RSEM, an user-friendly software package for quantifying gene and isoform abundances from single-end or paired-end RNA-Seq data. RSEM outputs abundance estimates, 95% credibility intervals, and visualization files and can also simulate RNA-Seq data. In contrast to other existing tools, the software does not require a reference genome. Thus, in combination with a de novo transcriptome assembler, RSEM enables accurate transcript quantification for species without sequenced genomes. On simulated and real data sets, RSEM has superior or comparable performance to quantification methods that rely on a reference genome. Taking advantage of RSEM's ability to effectively use ambiguously-mapping reads, we show that accurate gene-level abundance estimates are best obtained with large numbers of short single-end reads. On the other hand, estimates of the relative frequencies of isoforms within single genes may be improved through the use of paired-end reads, depending on the number of possible splice forms for each gene.CONCLUSIONS:RSEM is an accurate and user-friendly software tool for quantifying transcript abundances from RNA-Seq data. As it does not rely on the existence of a reference genome, it is particularly useful for quantification with de novo transcriptome assemblies. In addition, RSEM has enabled valuable guidance for cost-efficient design of quantification experiments with RNA-Seq, which is currently relatively expensive.},
    Author = {Li, B and Dewey, C},
    Date-Added = {2013-05-27 19:26:28 +0000},
    Date-Modified = {2013-05-27 19:26:56 +0000},
    Doi = {10.1186/1471-2105-12-323},
    Issn = {1471-2105},
    Journal = {BMC Bioinformatics},
    Number = {1},
    Pages = {323},
    Pubmedid = {21816040},
    Title = {{RSEM}: accurate transcript quantification from {RNA-Seq} data with or without a reference genome},
    Url = {http://www.biomedcentral.com/1471-2105/12/323},
    Volume = {12},
    Year = {2011},
    Bdsk-Url-1 = {http://www.biomedcentral.com/1471-2105/12/323},
    Bdsk-Url-2 = {http://dx.doi.org/10.1186/1471-2105-12-323}}

@article{NCBI-Resource-Coordinators:2013zr,
    Abstract = {In addition to maintaining the GenBank{\textregistered} nucleic acid sequence database, the National Center for Biotechnology Information (NCBI, http://www.ncbi.nlm.nih.gov) provides analysis and retrieval resources for the data in GenBank and other biological data made available through the NCBI web site. NCBI resources include Entrez, the Entrez Programming Utilities, MyNCBI, PubMed, PubMed Central, Gene, the NCBI Taxonomy Browser, BLAST, BLAST Link (BLink), Primer-BLAST, COBALT, Splign, RefSeq, UniGene, HomoloGene, ProtEST, dbMHC, dbSNP, dbVar, Epigenomics, the Genetic Testing Registry, Genome and related tools, the Map Viewer, Model Maker, Evidence Viewer, Trace Archive, Sequence Read Archive, BioProject, BioSample, Retroviral Genotyping Tools, HIV-1/Human Protein Interaction Database, Gene Expression Omnibus, Probe, Online Mendelian Inheritance in Animals, the Molecular Modeling Database, the Conserved Domain Database, the Conserved Domain Architecture Retrieval Tool, Biosystems, Protein Clusters and the PubChem suite of small molecule databases. Augmenting many of the web applications are custom implementations of the BLAST program optimized to search specialized data sets. All of these resources can be accessed through the NCBI home page.},
    Author = {{NCBI Resource Coordinators}},
    Date-Added = {2013-05-27 18:03:26 +0000},
    Date-Modified = {2013-05-27 18:03:53 +0000},
    Doi = {10.1093/nar/gks1189},
    Journal = {Nucleic Acids Res},
    Journal-Full = {Nucleic acids research},
    Mesh = {Animals; Databases, Chemical; Databases, Genetic; Databases, Nucleic Acid; Databases, Protein; Disease; Gene Expression; Genetic Testing; Genomics; Humans; Internet; National Library of Medicine (U.S.); Protein Structure, Tertiary; PubMed; Registries; Sequence Alignment; United States},
    Month = {Jan},
    Number = {Database issue},
    Pages = {D8-D20},
    Pmc = {PMC3531099},
    Pmid = {23193264},
    Pst = {ppublish},
    Title = {Database resources of the {National Center for Biotechnology Information}},
    Volume = {41},
    Year = {2013},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/nar/gks1189}}

@article{Benson01012013,
    Abstract = {GenBank{\textregistered} (http://www.ncbi.nlm.nih.gov) is a comprehensive database that contains publicly available nucleotide sequences for almost 260 000 formally described species. These sequences are obtained primarily through submissions from individual laboratories and batch submissions from large-scale sequencing projects, including whole-genome shotgun (WGS) and environmental sampling projects. Most submissions are made using the web-based BankIt or standalone Sequin programs, and GenBank staff assigns accession numbers upon data receipt. Daily data exchange with the European Nucleotide Archive (ENA) and the DNA Data Bank of Japan (DDBJ) ensures worldwide coverage. GenBank is accessible through the NCBI Entrez retrieval system, which integrates data from the major DNA and protein sequence databases along with taxonomy, genome, mapping, protein structure and domain information, and the biomedical journal literature via PubMed. BLAST provides sequence similarity searches of GenBank and other sequence databases. Complete bimonthly releases and daily updates of the GenBank database are available by FTP. To access GenBank and its related retrieval and analysis services, begin at the NCBI home page: www.ncbi.nlm.nih.gov.},
    Author = {Benson, DA and Cavanaugh, M and Clark, K and Karsch-Mizrachi, I and Lipman, DJ. and Ostell, J and Sayers, EW},
    Date-Added = {2013-05-27 17:56:11 +0000},
    Date-Modified = {2013-05-27 17:57:40 +0000},
    Doi = {10.1093/nar/gks1195},
    Eprint = {http://nar.oxfordjournals.org/content/41/D1/D36.full.pdf+html},
    Journal = {Nucleic Acids Research},
    Number = {D1},
    Pages = {D36-D42},
    Title = {{GenBank}},
    Url = {http://nar.oxfordjournals.org/content/41/D1/D36.abstract},
    Volume = {41},
    Year = {2013},
    Bdsk-Url-1 = {http://nar.oxfordjournals.org/content/41/D1/D36.abstract},
    Bdsk-Url-2 = {http://dx.doi.org/10.1093/nar/gks1195}}

@article{Hach:2012ys,
    Abstract = {MOTIVATION: The high throughput sequencing (HTS) platforms generate unprecedented amounts of data that introduce challenges for the computational infrastructure. Data management, storage and analysis have become major logistical obstacles for those adopting the new platforms. The requirement for large investment for this purpose almost signalled the end of the Sequence Read Archive hosted at the National Center for Biotechnology Information (NCBI), which holds most of the sequence data generated world wide. Currently, most HTS data are compressed through general purpose algorithms such as gzip. These algorithms are not designed for compressing data generated by the HTS platforms; for example, they do not take advantage of the specific nature of genomic sequence data, that is, limited alphabet size and high similarity among reads. Fast and efficient compression algorithms designed specifically for HTS data should be able to address some of the issues in data management, storage and communication. Such algorithms would also help with analysis provided they offer additional capabilities such as random access to any read and indexing for efficient sequence similarity search. Here we present SCALCE, a 'boosting' scheme based on Locally Consistent Parsing technique, which reorganizes the reads in a way that results in a higher compression speed and compression rate, independent of the compression algorithm in use and without using a reference genome.
RESULTS: Our tests indicate that SCALCE can improve the compression rate achieved through gzip by a factor of 4.19-when the goal is to compress the reads alone. In fact, on SCALCE reordered reads, gzip running time can improve by a factor of 15.06 on a standard PC with a single core and 6 GB memory. Interestingly even the running time of SCALCE + gzip improves that of gzip alone by a factor of 2.09. When compared with the recently published BEETL, which aims to sort the (inverted) reads in lexicographic order for improving bzip2, SCALCE + gzip provides up to 2.01 times better compression while improving the running time by a factor of 5.17. SCALCE also provides the option to compress the quality scores as well as the read names, in addition to the reads themselves. This is achieved by compressing the quality scores through order-3 Arithmetic Coding (AC) and the read names through gzip through the reordering SCALCE provides on the reads. This way, in comparison with gzip compression of the unordered FASTQ files (including reads, read names and quality scores), SCALCE (together with gzip and arithmetic encoding) can provide up to 3.34 improvement in the compression rate and 1.26 improvement in running time.
AVAILABILITY: Our algorithm, SCALCE (Sequence Compression Algorithm using Locally Consistent Encoding), is implemented in C++ with both gzip and bzip2 compression options. It also supports multithreading when gzip option is selected, and the pigz binary is available. It is available at http://scalce.sourceforge.net.
CONTACT: fhach@cs.sfu.ca or cenk@cs.sfu.ca
SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
    Author={Hach, Faraz and Numanagi{\'c}, Ibrahim and Alkan, Can and Sahinalp, S Cenk},
    Date-Added = {2013-05-27 17:39:21 +0000},
    Date-Modified = {2013-05-27 17:39:57 +0000},
    Doi = {10.1093/bioinformatics/bts593},
    Journal = {Bioinformatics},
    Journal-Full = {Bioinformatics (Oxford, England)},
    Month = {Dec},
    Number = {23},
    Pages = {3051-7},
    Pmc = {PMC3509486},
    Pmid = {23047557},
    Pst = {ppublish},
    Title = {{SCALCE}: boosting sequence compression algorithms using locally consistent encoding},
    Volume = {28},
    Year = {2012},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/bts593}}

@article{Ayres:2012vn,
    Abstract = {Phylogenetic inference is fundamental to our understanding of most aspects of the origin and evolution of life, and in recent years, there has been a concentration of interest in statistical approaches such as Bayesian inference and maximum likelihood estimation. Yet, for large data sets and realistic or interesting models of evolution, these approaches remain computationally demanding. High-throughput sequencing can yield data for thousands of taxa, but scaling to such problems using serial computing often necessitates the use of nonstatistical or approximate approaches. The recent emergence of graphics processing units (GPUs) provides an opportunity to leverage their excellent floating-point computational performance to accelerate statistical phylogenetic inference. A specialized library for phylogenetic calculation would allow existing software packages to make more effective use of available computer hardware, including GPUs. Adoption of a common library would also make it easier for other emerging computing architectures, such as field programmable gate arrays, to be used in the future. We present BEAGLE, an application programming interface (API) and library for high-performance statistical phylogenetic inference. The API provides a uniform interface for performing phylogenetic likelihood calculations on a variety of compute hardware platforms. The library includes a set of efficient implementations and can currently exploit hardware including GPUs using NVIDIA CUDA, central processing units (CPUs) with Streaming SIMD Extensions and related processor supplementary instruction sets, and multicore CPUs via OpenMP. To demonstrate the advantages of a common API, we have incorporated the library into several popular phylogenetic software packages. The BEAGLE library is free open source software licensed under the Lesser GPL and available from http://beagle-lib.googlecode.com. An example client program is available as public domain software.},
    Author = {Ayres, DL and Darling, A and Zwickl, DJ and Beerli, P and Holder, MT and Lewis, PO and Huelsenbeck, JP and Ronquist, F and Swofford, DL and Cummings, MP and Rambaut, A and Suchard, MA},
    Date-Added = {2013-05-27 15:26:59 +0000},
    Date-Modified = {2013-05-27 15:28:18 +0000},
    Doi = {10.1093/sysbio/syr100},
    Journal = {Syst Biol},
    Journal-Full = {Systematic biology},
    Mesh = {Algorithms; Computational Biology; Computing Methodologies; Evolution, Molecular; Genome; Phylogeny; Software},
    Month = {Jan},
    Number = {1},
    Pages = {170-3},
    Pmc = {PMC3243739},
    Pmid = {21963610},
    Pst = {ppublish},
    Title = {{BEAGLE}: an application programming interface and high-performance computing library for statistical phylogenetics},
    Volume = {61},
    Year = {2012},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/sysbio/syr100}}

@article{Breiman:2001kx,
    Author = {Breiman, L},
    Journal = {Machine Learning},
    Number = {1},
    Pages = {5-32},
    Publisher = {Kluwer Academic Publishers},
    Title = {Random Forests},
    Volume = {45},
    Year = {2001}}

@article{Hansen:2010uq,
    Abstract = {Generation of cDNA using random hexamer priming induces biases in the nucleotide composition at the beginning of transcriptome sequencing reads from the Illumina Genome Analyzer. The bias is independent of organism and laboratory and impacts the uniformity of the reads along the transcriptome. We provide a read count reweighting scheme, based on the nucleotide frequencies of the reads, that mitigates the impact of the bias.},
    Author = {Hansen, KD and Brenner, SE and Dudoit, S},
    Date-Added = {2013-05-25 16:13:34 +0000},
    Date-Modified = {2013-05-25 16:14:03 +0000},
    Doi = {10.1093/nar/gkq224},
    Journal = {Nucleic Acids Res},
    Journal-Full = {Nucleic acids research},
    Mesh = {DNA Primers; Gene Expression Profiling; Nucleotides; Sequence Analysis, DNA},
    Month = {Jul},
    Number = {12},
    Pages = {e131},
    Pmc = {PMC2896536},
    Pmid = {20395217},
    Pst = {ppublish},
    Title = {Biases in {Illumina} transcriptome sequencing caused by random hexamer priming},
    Volume = {38},
    Year = {2010},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/nar/gkq224}}

@article{Gibbs:2004fk,
    Abstract = {The laboratory rat (Rattus norvegicus) is an indispensable tool in experimental medicine and drug development, having made inestimable contributions to human health. We report here the genome sequence of the Brown Norway (BN) rat strain. The sequence represents a high-quality 'draft' covering over 90% of the genome. The BN rat sequence is the third complete mammalian genome to be deciphered, and three-way comparisons with the human and mouse genomes resolve details of mammalian evolution. This first comprehensive analysis includes genes and proteins and their relation to human disease, repeated sequences, comparative genome-wide studies of mammalian orthologous chromosomal regions and rearrangement breakpoints, reconstruction of ancestral karyotypes and the events leading to existing species, rates of variation, and lineage-specific and lineage-independent evolutionary events such as expansion of gene families, orthology relations and protein evolution.},
    Author = {Gibbs, RA and Weinstock, GM and Metzker, ML and Muzny, DM and Sodergren, EJ and Scherer, S and Scott, G and Steffen, D and Worley, KC and Burch, PE and Okwuonu, G and Hines, S and Lewis, L and DeRamo, C and Delgado, O and Dugan-Rocha, S and Miner, G and Morgan, M and Hawes, A and Gill, R and Celera and Holt, RA and Adams, MD and Amanatides, PG and Baden-Tillson, H and Barnstead, M and Chin, S and Evans, CA and Ferriera, S and Fosler, C and Glodek, A and Gu, Z and Jennings, D and Kraft, CL and Nguyen, T and Pfannkoch, CM and Sitter, C and Sutton, GG and Venter, J and Woodage, T and Smith, D and Lee, H-M and Gustafson, E and Cahill, P and Kana, A and Doucette-Stamm, L and Weinstock, K and Fechtel, K and Weiss, RB and Dunn, DM and Green, ED and Blakesley, RW and Bouffard, GG and De Jong, PJ and Osoegawa, K and Zhu, B and Marra, M and Schein, J and Bosdet, I and Fjell, C and Jones, S and Krzywinski, M and Mathewson, C and Siddiqui, A and Wye, N and McPherson, J and Zhao, S and Fraser, CM and Shetty, J and Shatsman, S and Geer, K and Chen, Y and Abramzon, S and Nierman, WC and Havlak, PH and Chen, R and Durbin, KJ and Egan, A and Ren, Y and Song, X-Z and Li, B and Liu, Y and Qin, X and Cawley, S and Worley, KC and Cooney, A J and D'Souza, LM and Martin, K and Wu, JQ and Gonzalez-Garay, ML and Jackson, AR and Kalafus, KJ and McLeod, MP and Milosavljevic, A and Virk, D and Volkov, A and Wheeler, DA and Zhang, Z and Bailey, JA and Eichler, EE and Tuzun, E and Birney, E and Mongin, E and Ureta-Vidal, A and Woodwark, C and Zdobnov, E and Bork, P and Suyama, M and Torrents, D and Alexandersson, M and Trask, BJ and Young, JM and Huang, H and Wang, H and Xing, H and Daniels, S and Gietzen, D and Schmidt, J and Stevens, K and Vitt, U and Wingrove, J and Camara, F and Mar Alb\`{a}, M and Abril, JF and Guigo, R and Smit, A and Dubchak, I and Rubin, EM and Couronne, O and Poliakov, A and H\"{u}bner, N and Ganten, D and Goesele, C and Hummel, O and Kreitler, T and Lee, Y-A and Monti, J and Schulz, H and Zimdahl, H and Himmelbauer, H and Lehrach, H and Jacob, HJ and Bromberg, S and Gullings-Handley, J and Jensen-Seaman, MI and Kwitek, AE and Lazar, Jf and Pasko, D and Tonellato, PJ and Twigger, S and Ponting, CP and Duarte, JM and Rice, S and Goodstadt, L and Beatson, SA and Emes, RD and Winter, EE and Webber, C and Brandt, P and Nyakatura, G and Adetobi, M and Chiaromonte, F and Elnitski, L and Eswara, P and Hardison, RC and Hou, M and Kolbe, D and Makova, K and Miller, W and Nekrutenko, A and Riemer, C and Schwartz, S and Taylor, J and Yang, S and Zhang, Y and Lindpaintner, K and Andrews, TD and Caccamo, M and Clamp, M and Clarke, L and Curwen, V and Durbin, R and Eyras, E and Searle, SM and Cooper, GM and Batzoglou, S and Brudno, M and Sidow, A and Stone, EA and Venter, JC and Payseur, BA and Bourque, G and L\'{o}pez-Ot\'{i}n, C and Puente, XS and Chakrabarti, K and Chatterji, S and Dewey, C and Pachter, L and Bray, N and Yap, VB and Caspi, A and Tesler, G and Pevzner, PA and Haussler, D and Roskin, KM and Baertsch, R and Clawson, H and Furey, TS and Hinrichs, AS and Karolchik, D and Kent, WJ and Rosenbloom, KR and Trumbower, H and Weirauch, M and Cooper, DN and Stenson, PD and Ma, B and Brent, M and Arumugam, M and Shteynberg, D and Copley, RR and Taylor, MS and Riethman, H and Mudunuri, U and Peterson, J and Guyer, M and Felsenfeld, A and Old, S and Mockrin, S and Collins, F and {Rat Genome Sequencing Project Consortium}},
    Date-Added = {2013-05-25 14:55:41 +0000},
    Date-Modified = {2013-05-25 15:24:15 +0000},
    Doi = {10.1038/nature02426},
    Journal = {Nature},
    Journal-Full = {Nature},
    Mesh = {Animals; Base Composition; Centromere; Chromosomes, Mammalian; CpG Islands; DNA Transposable Elements; DNA, Mitochondrial; Evolution, Molecular; Gene Duplication; Genome; Genomics; Humans; Introns; Male; Mice; Models, Molecular; Mutagenesis; Polymorphism, Single Nucleotide; RNA Splice Sites; RNA, Untranslated; Rats; Rats, Inbred BN; Regulatory Sequences, Nucleic Acid; Retroelements; Sequence Analysis, DNA; Telomere},
    Month = {Apr},
    Number = {6982},
    Pages = {493-521},
    Pmid = {15057822},
    Pst = {ppublish},
    Title = {Genome sequence of the {Brown Norway} rat yields insights into mammalian evolution},
    Volume = {428},
    Year = {2004},
    Bdsk-Url-1 = {http://dx.doi.org/10.1038/nature02426}}

@article{Havlak01042004,
    Abstract = {Atlas is a suite of programs developed for assembly of genomes by a ``combined approach'' that uses DNA sequence reads from both BACs and whole-genome shotgun (WGS) libraries. The BAC clones afford advantages of localized assembly with reduced computational load, and provide a robust method for dealing with repeated sequences. Inclusion of WGS sequences facilitates use of different clone insert sizes and reduces data production costs. A core function of Atlas software is recruitment of WGS sequences into appropriate BACs based on sequence overlaps. Because construction of consensus sequences is from local assembly of these reads, only small (<0.1%) units of the genome are assembled at a time. Once assembled, each BAC is used to derive a genomic layout. This ``sequence-based'' growth of the genome map has greater precision than with non-sequence-based methods. Use of BACs allows correction of artifacts due to repeats at each stage of the process. This is aided by ancillary data such as BAC fingerprint, other genomic maps, and syntenic relations with other genomes. Atlas was used to assemble a draft DNA sequence of the rat genome; its major components including overlapper and split-scaffold are also being used in pure WGS projects.},
    Author = {Havlak, P and Chen, R and Durbin, KJ and Egan, A and Ren, Y and Song, X-Z and Weinstock, GM and Gibbs, RA},
    Date-Added = {2013-05-25 14:49:11 +0000},
    Date-Modified = {2013-05-25 14:53:24 +0000},
    Doi = {10.1101/gr.2264004},
    Eprint = {http://genome.cshlp.org/content/14/4/721.full.pdf+html},
    Journal = {Genome Research},
    Number = {4},
    Pages = {721-732},
    Title = {The {Atlas} Genome Assembly System},
    Url = {http://genome.cshlp.org/content/14/4/721.abstract},
    Volume = {14},
    Year = {2004},
    Bdsk-Url-1 = {http://genome.cshlp.org/content/14/4/721.abstract},
    Bdsk-Url-2 = {http://dx.doi.org/10.1101/gr.2264004}}

@article{VidalRuiz1986145,
    Author = {Ruiz, EV},
    Date-Added = {2013-05-25 14:02:54 +0000},
    Date-Modified = {2013-05-25 14:04:13 +0000},
    Doi = {10.1016/0167-8655(86)90013-9},
    Issn = {0167-8655},
    Journal = {Pattern Recognition Letters},
    Keywords = {pattern recognition},
    Number = {3},
    Pages = {145-157},
    Title = {An algorithm for finding nearest neighbours in (approximately) constant average time},
    Url = {http://www.sciencedirect.com/science/article/pii/0167865586900139},
    Volume = {4},
    Year = {1986},
    Bdsk-Url-1 = {http://www.sciencedirect.com/science/article/pii/0167865586900139},
    Bdsk-Url-2 = {http://dx.doi.org/10.1016/0167-8655(86)90013-9}}

@inproceedings{DBLP:conf/spire/NavarroPC02,
    Abstract = {A t-spanner, a subgraph that approximates graph distances within a precision factor t, is a well known concept in graph theory. In this paper we use it in a novel way, namely as a data structure for searching metric spaces. The key idea is to consider the t-spanner as an approximation of the complete graph of distances among the objects, and use it as a compact device to simulate the large matrix of distances required by successful search algorithms like AESA [Vidal 1986]. The t-spanner provides a time-space tradeoff where full AESA is just one extreme. We show that the resulting algorithm is competitive against current approaches, e.g., 1.5 times the time cost of AESA using only 3.21% of its space requirement, in a metric space of strings; and 1.09 times the time cost of AESA using only 3.83 % of its space requirement, in a metric space of documents. We also show that t-spanners provide better space-time tradeoffs than classical alternatives such as pivot-based indexes. Furthermore, we show that the concept of t-spanners has potential for large improvements.},
    Author = {Navarro, G and Paredes, R and Ch\'{a}vez, E},
    Bibsource = {DBLP, http://dblp.uni-trier.de},
    Booktitle = {SPIRE},
    Crossref = {DBLP:conf/spire/2002},
    Date-Added = {2013-05-25 13:49:35 +0000},
    Date-Modified = {2013-05-25 14:20:23 +0000},
    Pages = {298-309},
    Title = {$t$-{Spanners} as a Data Structure for Metric Space Searching},
    Year = {2002},
    Bdsk-Url-1 = {http://dx.doi.org/10.1007/3-540-45735-6_26}}

@proceedings{DBLP:conf/spire/2002,
    Bibsource = {DBLP, http://dblp.uni-trier.de},
    Booktitle = {SPIRE},
    Date-Added = {2013-05-25 13:49:35 +0000},
    Date-Modified = {2013-05-25 13:53:10 +0000},
    Editor = {Laender, AHF and Oliveira, AL},
    Isbn = {3-540-44158-1},
    Publisher = {Springer},
    Series = {Lecture Notes in Computer Science},
    Title = {String Processing and Information Retrieval, 9th International Symposium, SPIRE 2002, Lisbon, Portugal, September 11-13, 2002, Proceedings},
    Volume = {2476},
    Year = {2002}}

@article{Tatusov24101997,
    Abstract = {In order to extract the maximum amount of information from the rapidly accumulating genome sequences, all conserved genes need to be classified according to their homologous relationships. Comparison of proteins encoded in seven complete genomes from five major phylogenetic lineages and elucidation of consistent patterns of sequence similarities allowed the delineation of 720 clusters of orthologous groups (COGs). Each COG consists of individual orthologous proteins or orthologous sets of paralogs from at least three lineages. Orthologs typically have the same function, allowing transfer of functional information from one member to an entire COG. This relation automatically yields a number of functional predictions for poorly characterized genomes. The COGs comprise a framework for functional and evolutionary genome analysis.},
    Author = {Tatusov, RL and Koonin, EV and Lipman, DJ},
    Date-Added = {2013-05-24 14:18:02 +0000},
    Date-Modified = {2013-05-25 13:54:44 +0000},
    Doi = {10.1126/science.278.5338.631},
    Eprint = {http://www.sciencemag.org/content/278/5338/631.full.pdf},
    Journal = {Science},
    Number = {5338},
    Pages = {631-637},
    Title = {A Genomic Perspective on Protein Families},
    Url = {http://www.sciencemag.org/content/278/5338/631.abstract},
    Volume = {278},
    Year = {1997},
    Bdsk-Url-1 = {http://www.sciencemag.org/content/278/5338/631.abstract},
    Bdsk-Url-2 = {http://dx.doi.org/10.1126/science.278.5338.631}}

@article{Ostlund:2010ys,
    Abstract = {The InParanoid project gathers proteomes of completely sequenced eukaryotic species plus Escherichia coli and calculates pairwise ortholog relationships among them. The new release 7.0 of the database has grown by an order of magnitude over the previous version and now includes 100 species and their collective 1.3 million proteins organized into 42.7 million pairwise ortholog groups. The InParanoid algorithm itself has been revised and is now both more specific and sensitive. Based on results from our recent benchmarking of low-complexity filters in homology assignment, a two-pass BLAST approach was developed that makes use of high-precision compositional score matrix adjustment, but avoids the alignment truncation that sometimes follows. We have also updated the InParanoid web site (http://InParanoid.sbc.su.se). Several features have been added, the response times have been improved and the site now sports a new, clearer look. As the number of ortholog databases has grown, it has become difficult to compare among these resources due to a lack of standardized source data and incompatible representations of ortholog relationships. To facilitate data exchange and comparisons among ortholog databases, we have developed and are making available two XML schemas: SeqXML for the input sequences and OrthoXML for the output ortholog clusters.},
    Author = {Ostlund, G and Schmitt, T and Forslund, K and K\"{o}stler, T and Messina, DN and Roopra, S and Frings, O and Sonnhammer, ELL},
    Date-Added = {2013-05-24 14:13:19 +0000},
    Date-Modified = {2013-05-25 13:54:25 +0000},
    Doi = {10.1093/nar/gkp931},
    Journal = {Nucleic Acids Res},
    Journal-Full = {Nucleic acids research},
    Mesh = {Algorithms; Animals; Cluster Analysis; Computational Biology; Databases, Genetic; Databases, Nucleic Acid; Escherichia coli; Eukaryotic Cells; Genome, Bacterial; Humans; Information Storage and Retrieval; Internet; Protein Structure, Tertiary; Proteins; Proteomics; Software},
    Month = {Jan},
    Number = {Database issue},
    Pages = {D196-203},
    Pmc = {PMC2808972},
    Pmid = {19892828},
    Pst = {ppublish},
    Title = {{InParanoid} 7: new algorithms and tools for eukaryotic orthology analysis},
    Volume = {38},
    Year = {2010},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/nar/gkp931}}

@article{DBLP:journals/tit/ZivL78,
    Author={Ziv, Jacob and Lempel, Abraham},
    Bibsource = {DBLP, http://dblp.uni-trier.de},
    Date-Added = {2013-05-23 17:22:22 +0000},
    Date-Modified = {2013-05-25 17:04:21 +0000},
    Journal = {IEEE Transactions on Information Theory},
    Number = {5},
    Pages = {530-536},
    Title = {Compression of Individual Sequences via Variable-Rate Coding},
    Volume = {24},
    Year = {1978}}

@inproceedings{berger2,
    Author = {Daniels, N and Gallant, A and Peng, J and Cowen, L and Baym, M and Berger, B},
    Booktitle = {Proceedings of the International Symposium on Intelligent Systems for Molecular Biology},
    Title = {Compressive Genomics for Protein Databases},
    Year = 2013}

@article{Smith:1981uq,
    Author = {Smith, TF and Waterman, MS},
    Date-Added = {2013-05-23 16:31:35 +0000},
    Date-Modified = {2013-05-25 13:53:42 +0000},
    Journal = {J Mol Biol},
    Journal-Full = {Journal of molecular biology},
    Mesh = {Base Sequence; Models, Chemical},
    Month = {Mar},
    Number = {1},
    Pages = {195-7},
    Pmid = {7265238},
    Pst = {ppublish},
    Title = {Identification of common molecular subsequences},
    Volume = {147},
    Year = {1981}}

@article{Altschul:1990fk,
    Abstract = {A new approach to rapid sequence comparison, basic local alignment search tool (BLAST), directly approximates alignments that optimize a measure of local similarity, the maximal segment pair (MSP) score. Recent mathematical results on the stochastic properties of MSP scores allow an analysis of the performance of this method as well as the statistical significance of alignments it generates. The basic algorithm is simple and robust; it can be implemented in a number of ways and applied in a variety of contexts including straightforward DNA and protein sequence database searches, motif searches, gene identification searches, and in the analysis of multiple regions of similarity in long DNA sequences. In addition to its flexibility and tractability to mathematical analysis, BLAST is an order of magnitude faster than existing sequence comparison tools of comparable sensitivity.},
    Author = {Altschul, SF and Gish, W and Miller, W and Myers, EW and Lipman, DJ},
    Date-Added = {2013-05-23 16:27:13 +0000},
    Date-Modified = {2013-05-25 13:55:54 +0000},
    Doi = {10.1016/S0022-2836(05)80360-2},
    Journal = {J Mol Biol},
    Journal-Full = {Journal of molecular biology},
    Mesh = {Algorithms; Amino Acid Sequence; Base Sequence; Databases, Factual; Mutation; Sensitivity and Specificity; Sequence Homology, Nucleic Acid; Software},
    Month = {Oct},
    Number = {3},
    Pages = {403-10},
    Pmid = {2231712},
    Pst = {ppublish},
    Title = {Basic local alignment search tool},
    Volume = {215},
    Year = {1990},
    Bdsk-Url-1 = {http://dx.doi.org/10.1016/S0022-2836(05)80360-2}}

@article{Gnerre:2011kx,
    Abstract = {Massively parallel DNA sequencing technologies are revolutionizing genomics by making it possible to generate billions of relatively short (similar to 100-base) sequence reads at very low cost. Whereas such data can be readily used for a wide range of biomedical applications, it has proven difficult to use them to generate high-quality de novo genome assemblies of large, repeat-rich vertebrate genomes. To date, the genome assemblies generated from such data have fallen far short of those obtained with the older (but much more expensive) capillary-based sequencing approach. Here, we report the development of an algorithm for genome assembly, ALLPATHS-LG, and its application to massively parallel DNA sequence data from the human and mouse genomes, generated on the Illumina platform. The resulting draft genome assemblies have good accuracy, short-range contiguity, long-range connectivity, and coverage of the genome. In particular, the base accuracy is high (>= 99.95\%) and the scaffold sizes (N50 size = 11.5 Mb for human and 7.2 Mb for mouse) approach those obtained with capillary-based sequencing. The combination of improved sequencing technology and improved computational methods should now make it possible to increase dramatically the de novo sequencing of large genomes. The ALLPATHS-LG program is available at http://www.broadinstitute.org/science/programs/genome-biology/crd.},
    Author = {Gnerre, S and MacCallum, I and Przybylski, D and Ribeiro, FJ and Burton, JN and Walker, BJ and Sharpe, T and Hall, G and Shea, TP and Sykes, S and Berlin, AM and Aird, D and Costello, M and Daza, R and Williams, L and Nicol, R and Gnirke, A and Nusbaum, C and Lander, ES and Jaffe, DB},
    Date-Added = {2013-05-19 15:00:51 +0000},
    Date-Modified = {2013-05-19 15:04:19 +0000},
    Doi = {DOI 10.1073/pnas.1017351108},
    Isi = {000286594800058},
    Isi-Recid = {194444511},
    Isi-Ref-Recids = {185031160 192666104 136600289 194056605 173883755 162176337 181836664 54070268 104563109 184456089 178848194 157247471 186083492 186083480 147665707 185031159 176303533 188715254 121272145 186721557 171604615 180943931 162219845 127311022 162176339},
    Iso-Source-Abbreviation = {P Natl Acad Sci Usa},
    Journal = {Proc Natl Acad Sci USA},
    Pages = {1513--1518},
    Times-Cited = {91},
    Title = {High-quality draft assemblies of mammalian genomes from massively parallel sequence data},
    Volume = {108},
    Year = {2011},
    Bdsk-Url-1 = {http://ws.isiknowledge.com/cps/openurl/service?url_ver=Z39.88-2004&rft_id=info:ut/000286594800058},
    Bdsk-Url-2 = {http://dx.doi.org/10.1073/pnas.1017351108}}

@article{Bryant:2009uq,
    Abstract = {BACKGROUND: New rapid high-throughput sequencing technologies have sparked the creation of a new class of assembler. Since all high-throughput sequencing platforms incorporate errors in their output, short-read assemblers must be designed to account for this error while utilizing all available data.
RESULTS: We have designed and implemented an assembler, Quality-value guided Short Read Assembler, created to take advantage of quality-value scores as a further method of dealing with error. Compared to previous published algorithms, our assembler shows significant improvements not only in speed but also in output quality.
CONCLUSION: QSRA generally produced the highest genomic coverage, while being faster than VCAKE. QSRA is extremely competitive in its longest contig and N50/N80 contig lengths, producing results of similar quality to those of EDENA and VELVET. QSRA provides a step closer to the goal of de novo assembly of complex genomes, improving upon the original VCAKE algorithm by not only drastically reducing runtimes but also increasing the viability of the assembly algorithm through further error handling capabilities.},
    Author={Bryant, Douglas W and Wong, Weng-Keen and Mockler, Todd C},
    Date-Added = {2013-05-19 14:24:05 +0000},
    Date-Modified = {2013-05-19 14:26:27 +0000},
    Doi = {10.1186/1471-2105-10-69},
    Journal = {BMC Bioinformatics},
    Journal-Full = {BMC bioinformatics},
    Mesh = {Algorithms; Computational Biology; Programming Languages; Sequence Analysis, DNA},
    Pages = {69},
    Pmc = {PMC2653489},
    Pmid = {19239711},
    Pst = {epublish},
    Title = {{QSRA}: a quality-value guided de novo short read assembler},
    Volume = {10},
    Year = {2009},
    Bdsk-Url-1 = {http://dx.doi.org/10.1186/1471-2105-10-69}}

@article{Bonfield:2013fk,
    Abstract = {Storage and transmission of the data produced by modern DNA sequencing instruments has become a major concern, which prompted the Pistoia Alliance to pose the SequenceSqueeze contest for compression of FASTQ files. We present several compression entries from the competition, Fastqz and Samcomp/Fqzcomp, including the winning entry. These are compared against existing algorithms for both reference based compression (CRAM, Goby) and non-reference based compression (DSRC, BAM) and other recently published competition entries (Quip, SCALCE). The tools are shown to be the new Pareto frontier for FASTQ compression, offering state of the art ratios at affordable CPU costs. All programs are freely available on SourceForge. Fastqz: https://sourceforge.net/projects/fastqz/, fqzcomp: https://sourceforge.net/projects/fqzcomp/, and samcomp: https://sourceforge.net/projects/samcomp/.},
    Author = {Bonfield, JK and Mahoney, MV},
    Date-Added = {2013-05-18 15:32:37 +0000},
    Date-Modified = {2013-05-19 14:27:09 +0000},
    Doi = {10.1371/journal.pone.0059190},
    Journal = {PLoS One},
    Journal-Full = {PloS one},
    Number = {3},
    Pages = {e59190},
    Pmc = {PMC3606433},
    Pmid = {23533605},
    Pst = {ppublish},
    Title = {Compression of {FASTQ} and {SAM} format sequencing data},
    Volume = {8},
    Year = {2013},
    Bdsk-Url-1 = {http://dx.doi.org/10.1371/journal.pone.0059190}}

@techreport{bwt,
    Address = {130 Lytton Avenue, Palo Alto, CA 94301 USA},
    Author = {Burrows, M and Wheeler, DJ},
    Date-Added = {2013-05-11 14:49:15 +0000},
    Date-Modified = {2013-05-11 14:51:17 +0000},
    Institution = {Digital Systems Research Center},
    Month = {May},
    Number = {124},
    Title = {A block-sorting lossless data compression algorithm},
    Type = {SRC Research Report},
    Year = {1994}}

@article{mtf,
    Author = {Ryabko, BY},
    Date-Added = {2013-05-11 14:43:39 +0000},
    Date-Modified = {2013-05-11 14:47:04 +0000},
    Journal = {Prob Inf Transm},
    Number = {4},
    Pages = {265--269},
    Title = {Data compression by means of a `book stack'},
    Volume = {16},
    Year = {1980}}

@inproceedings{DBLP:conf/dcc/ArnoldB97,
    Author = {Arnold, R and Bell, TC},
    Bibsource = {DBLP, http://dblp.uni-trier.de},
    Booktitle = {Data Compression Conference},
    Crossref = {DBLP:conf/dcc/1997},
    Date-Added = {2013-05-07 16:02:13 +0000},
    Date-Modified = {2013-05-07 16:03:06 +0000},
    Ee = {http://doi.ieeecomputersociety.org/10.1109/DCC.1997.582019, http://www.corpus.canterbury.ac.nz/},
    Pages = {201--210},
    Title = {A Corpus for the Evaluation of Lossless Compression Algorithms},
    Year = {1997}}

@proceedings{DBLP:conf/dcc/1997,
    Bibsource = {DBLP, http://dblp.uni-trier.de},
    Booktitle = {DCC},
    Date-Added = {2013-05-07 16:02:13 +0000},
    Date-Modified = {2013-05-07 16:09:35 +0000},
    Editor = {Storer, JA and Cohn, M},
    Isbn = {0-8186-7761-9},
    Publisher = {IEEE Computer Society},
    Title = {Proceedings of the 7th Data Compression Conference (DCC '97), Snowbird, Utah, March 25-27, 1997},
    Year = {1997},
    Bdsk-Url-1 = {http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=4456}}

@article{GRUMBACH:1994kx,
    Abstract = {Universal data compression algorithms fail to compress genetic sequences.  It is due to the specificity of this particular kind of ''text.''  We analyze in some detail the properties of the sequences, which cause the failure of classical algorithms.  We then present a lossless algorithm, biocompress-2, to compress the information contained in DNA and RNA sequences, based on the detection of regularities, such as the presence of palindromes.  The algorithm combines substitutional and statistical methods, and to the best of our knowledge, leads to the highest compression of DNA.  The results, although not satisfactory, give insight to the necessary correlation between compression and comprehension of genetic sequences.},
    Author = {Grumbach, S and Tahi, F},
    Date-Added = {2013-05-06 17:39:11 +0000},
    Date-Modified = {2013-05-06 17:40:05 +0000},
    Isi = {A1994PR26800014},
    Isi-Recid = {89266663},
    Isi-Ref-Recids = {89266664 72123623 70365582 69825790 68748744 89266665 1861079 89266666 11531509 69434316 28339058 84522062 79487235 70743458 47266230 74258599 89266667 31691147},
    Iso-Source-Abbreviation = {Inform Process Manag},
    Journal = {Information Processing \& Management},
    Pages = {875--886},
    Times-Cited = {83},
    Title = {A new challenge for compression algorithms --- genetic sequences},
    Volume = {30},
    Year = {1994},
    Bdsk-Url-1 = {http://ws.isiknowledge.com/cps/openurl/service?url_ver=Z39.88-2004&rft_id=info:ut/A1994PR26800014}}

@inproceedings{DBLP:conf/dcc/GrumbachT93,
    Author = {Grumbach, S and Tahi, F},
    Bibsource = {DBLP, http://dblp.uni-trier.de},
    Booktitle = {Data Compression Conference},
    Crossref = {DBLP:conf/dcc/1993},
    Date-Added = {2013-05-06 17:37:08 +0000},
    Date-Modified = {2013-05-06 17:38:43 +0000},
    Pages = {340--350},
    Title = {Compression of {DNA} Sequences},
    Year = {1993},
    Bdsk-Url-1 = {http://dx.doi.org/10.1109/DCC.1993.253115}}

@proceedings{DBLP:conf/dcc/1993,
    Bibsource = {DBLP, http://dblp.uni-trier.de},
    Booktitle = {DCC},
    Date-Added = {2013-05-06 17:37:08 +0000},
    Date-Modified = {2013-05-07 16:10:37 +0000},
    Editor = {Storer, JA and Cohn, M},
    Isbn = {0-8186-3392-1},
    Publisher = {IEEE Computer Society},
    Title = {Proceedings of the IEEE Data Compression Conference, DCC 1993, Snowbird, Utah, March 30 - April 1, 1993},
    Year = {1993},
    Bdsk-Url-1 = {http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=452}}

@article{Nalbantoglu:2010uq,
    Abstract = {Data compression at its base is concerned with how information is organized in data. Understanding this organization can lead to efficient ways of representing the information and hence data compression. In this paper we review the ways in which ideas and approaches fundamental to the theory and practice of data compression have been used in the area of bioinformatics. We look at how basic theoretical ideas from data compression, such as the notions of entropy, mutual information, and complexity have been used for analyzing biological sequences in order to discover hidden patterns, infer phylogenetic relationships between organisms and study viral populations. Finally, we look at how inferred grammars for biological sequences have been used to uncover structure in biological sequences.},
    Author = {Nalbanto\~{g}lu, OU and Russell, DJ and Sayood, K},
    Date-Added = {2013-05-06 17:29:36 +0000},
    Date-Modified = {2013-05-25 13:55:07 +0000},
    Doi = {10.3390/e12010034},
    Journal = {Entropy (Basel)},
    Journal-Full = {Entropy (Basel, Switzerland)},
    Month = {Jan},
    Number = {1},
    Pages = {34--52},
    Pmc = {PMC2821113},
    Pmid = {20157640},
    Pst = {ppublish},
    Title = {Data Compression Concepts and Algorithms and their Applications to Bioinformatics},
    Volume = {12},
    Year = {2010},
    Bdsk-Url-1 = {http://dx.doi.org/10.3390/e12010034}}

@article{Giancarlo:2009fk,
    Abstract = {MOTIVATION: Textual data compression, and the associated techniques coming from information theory, are often perceived as being of interest for data communication and storage. However, they are also deeply related to classification and data mining and analysis. In recent years, a substantial effort has been made for the application of textual data compression techniques to various computational biology tasks, ranging from storage and indexing of large datasets to comparison and reverse engineering of biological networks.
RESULTS: The main focus of this review is on a systematic presentation of the key areas of bioinformatics and computational biology where compression has been used. When possible, a unifying organization of the main ideas and techniques is also provided.
AVAILABILITY: It goes without saying that most of the research results reviewed here offer software prototypes to the bioinformatics community. The Supplementary Material provides pointers to software and benchmark datasets for a range of applications of broad interest. In addition to provide reference to software, the Supplementary Material also gives a brief presentation of some fundamental results and techniques related to this paper. It is at: http://www.math.unipa.it/ approximately raffaele/suppMaterial/compReview/},
    Author = {Giancarlo, R and Scaturro, D and Utro, F},
    Date-Added = {2013-05-06 17:29:05 +0000},
    Date-Modified = {2013-05-06 17:30:07 +0000},
    Doi = {10.1093/bioinformatics/btp117},
    Journal = {Bioinformatics},
    Journal-Full = {Bioinformatics (Oxford, England)},
    Mesh = {Computational Biology; Databases, Factual; Information Storage and Retrieval; Software},
    Month = {Jul},
    Number = {13},
    Pages = {1575--86},
    Pmid = {19251772},
    Pst = {ppublish},
    Title = {Textual data compression in computational biology: a synopsis},
    Volume = {25},
    Year = {2009},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btp117}}

@inproceedings{DBLP:conf/spire/KuruppuPZ11,
    Author = {Shanika, K and Puglisi, SJ and Zobel, J},
    Bibsource = {DBLP, http://dblp.uni-trier.de},
    Booktitle = {SPIRE},
    Crossref = {DBLP:conf/spire/2011},
    Date-Added = {2013-05-05 18:27:19 +0000},
    Date-Modified = {2013-05-25 17:06:12 +0000},
    Pages = {420--425},
    Title = {Reference Sequence Construction for Relative Compression of Genomes},
    Year = {2011},
    Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-642-24583-1_41}}

@proceedings{DBLP:conf/spire/2011,
    Bibsource = {DBLP, http://dblp.uni-trier.de},
    Booktitle = {SPIRE},
    Date-Added = {2013-05-05 18:27:19 +0000},
    Date-Modified = {2013-05-05 18:27:19 +0000},
    Editor = {Roberto Grossi and Fabrizio Sebastiani and Fabrizio Silvestri},
    Isbn = {978-3-642-24582-4},
    Publisher = {Springer},
    Series = {Lecture Notes in Computer Science},
    Title = {String Processing and Information Retrieval, 18th International Symposium, SPIRE 2011, Pisa, Italy, October 17-21, 2011. Proceedings},
    Volume = {7024},
    Year = {2011},
    Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-642-24583-1}}

@article{Wang:2011vn,
    Abstract = {With the advent of DNA sequencing technologies, more and more reference genome sequences are available for many organisms. Analyzing sequence variation and understanding its biological importance are becoming a major research aim. However, how to store and process the huge amount of eukaryotic genome data, such as those of the human, mouse and rice, has become a challenge to biologists. Currently available bioinformatics tools used to compress genome sequence data have some limitations, such as the requirement of the reference single nucleotide polymorphisms (SNPs) map and information on deletions and insertions. Here, we present a novel compression tool for storing and analyzing Genome ReSequencing data, named GRS. GRS is able to process the genome sequence data without the use of the reference SNPs and other sequence variation information and automatically rebuild the individual genome sequence data using the reference genome sequence. When its performance was tested on the first Korean personal genome sequence data set, GRS was able to achieve ∼159-fold compression, reducing the size of the data from 2986.8 to 18.8 MB. While being tested against the sequencing data from rice and Arabidopsis thaliana, GRS compressed the 361.0 MB rice genome data to 4.4 MB, and the A. thaliana genome data from 115.1 MB to 6.5 KB. This de novo compression tool is available at http://gmdd.shgmo.org/Computational-Biology/GRS.},
    Author = {Wang, C and Zhang, D},
    Date-Added = {2013-05-05 18:19:47 +0000},
    Date-Modified = {2013-05-05 18:20:17 +0000},
    Doi = {10.1093/nar/gkr009},
    Journal = {Nucleic Acids Res},
    Journal-Full = {Nucleic acids research},
    Mesh = {Arabidopsis; Genome, Human; Genome, Plant; Genomics; Humans; Oryza sativa; Sequence Analysis, DNA; Software},
    Month = {Apr},
    Number = {7},
    Pages = {e45},
    Pmc = {PMC3074166},
    Pmid = {21266471},
    Pst = {ppublish},
    Title = {A novel compression tool for efficient storage of genome resequencing data},
    Volume = {39},
    Year = {2011},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/nar/gkr009}}

@article{Deorowicz:2011kx,
    Abstract = {MOTIVATION: Storing, transferring and maintaining genomic databases becomes a major challenge because of the rapid technology progress in DNA sequencing and correspondingly growing pace at which the sequencing data are being produced. Efficient compression, with support for extraction of arbitrary snippets of any sequence, is the key to maintaining those huge amounts of data.
RESULTS: We present an LZ77-style compression scheme for relative compression of multiple genomes of the same species. While the solution bears similarity to known algorithms, it offers significantly higher compression ratios at compression speed over an order of magnitude greater. In particular, 69 differentially encoded human genomes are compressed over 400 times at fast compression, or even 1000 times at slower compression (the reference genome itself needs much more space). Adding fast random access to text snippets decreases the ratio to ~300.
AVAILABILITY: GDC is available at http://sun.aei.polsl.pl/gdc.
CONTACT: sebastian.deorowicz@polsl.pl.
SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
    Author = {Deorowicz, S and Grabowski, S},
    Date-Added = {2013-05-05 18:18:42 +0000},
    Date-Modified = {2013-05-05 18:21:27 +0000},
    Doi = {10.1093/bioinformatics/btr505},
    Journal = {Bioinformatics},
    Journal-Full = {Bioinformatics (Oxford, England)},
    Mesh = {Algorithms; Data Compression; Databases, Nucleic Acid; Genome, Human; Genomics; Humans; Sequence Analysis, DNA},
    Month = {Nov},
    Number = {21},
    Pages = {2979-86},
    Pmid = {21896510},
    Pst = {ppublish},
    Title = {Robust relative compression of genomes with random access},
    Volume = {27},
    Year = {2011},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btr505}}

@article{Daily:2010uq,
    Abstract = {BACKGROUND: High-throughput sequencing (HTS) technologies play important roles in the life sciences by allowing the rapid parallel sequencing of very large numbers of relatively short nucleotide sequences, in applications ranging from genome sequencing and resequencing to digital microarrays and ChIP-Seq experiments. As experiments scale up, HTS technologies create new bioinformatics challenges for the storage and sharing of HTS data.
RESULTS: We develop data structures and compression algorithms for HTS data. A processing stage maps short sequences to a reference genome or a large table of sequences. Then the integers representing the short sequence absolute or relative addresses, their length, and the substitutions they may contain are compressed and stored using various entropy coding algorithms, including both old and new fixed codes (e.g Golomb, Elias Gamma, MOV) and variable codes (e.g. Huffman). The general methodology is illustrated and applied to several HTS data sets. Results show that the information contained in HTS files can be compressed by a factor of 10 or more, depending on the statistical properties of the data sets and various other choices and constraints. Our algorithms fair well against general purpose compression programs such as gzip, bzip2 and 7zip; timing results show that our algorithms are consistently faster than the best general purpose compression programs.
CONCLUSIONS: It is not likely that exactly one encoding strategy will be optimal for all types of HTS data. Different experimental conditions are going to generate various data distributions whereby one encoding strategy can be more effective than another. We have implemented some of our encoding algorithms into the software package GenCompress which is available upon request from the authors. With the advent of HTS technology and increasingly new experimental protocols for using the technology, sequence databases are expected to continue rising in size. The methodology we have proposed is general, and these advanced compression techniques should allow researchers to manage and share their HTS data in a more timely fashion.},
    Author = {Daily, K and Rigor, P and Christley, S and Xie, X and Baldi, P},
    Date-Added = {2013-05-05 18:15:41 +0000},
    Date-Modified = {2013-05-18 15:03:08 +0000},
    Doi = {10.1186/1471-2105-11-514},
    Journal = {BMC Bioinformatics},
    Journal-Full = {BMC bioinformatics},
    Mesh = {Algorithms; Computational Biology; Data Compression; Databases, Factual; Entropy; High-Throughput Nucleotide Sequencing},
    Pages = {514},
    Pmc = {PMC2964686},
    Pmid = {20946637},
    Pst = {epublish},
    Title = {Data structures and compression algorithms for high-throughput sequencing technologies},
    Volume = {11},
    Year = {2010},
    Bdsk-Url-1 = {http://dx.doi.org/10.1186/1471-2105-11-514}}

@article{Brandon:2009fk,
    Abstract = {MOTIVATION: The continuing exponential accumulation of full genome data, including full diploid human genomes, creates new challenges not only for understanding genomic structure, function and evolution, but also for the storage, navigation and privacy of genomic data. Here, we develop data structures and algorithms for the efficient storage of genomic and other sequence data that may also facilitate querying and protecting the data.
RESULTS: The general idea is to encode only the differences between a genome sequence and a reference sequence, using absolute or relative coordinates for the location of the differences. These locations and the corresponding differential variants can be encoded into binary strings using various entropy coding methods, from fixed codes such as Golomb and Elias codes, to variables codes, such as Huffman codes. We demonstrate the approach and various tradeoffs using highly variables human mitochondrial genome sequences as a testbed. With only a partial level of optimization, 3615 genome sequences occupying 56 MB in GenBank are compressed down to only 167 KB, achieving a 345-fold compression rate, using the revised Cambridge Reference Sequence as the reference sequence. Using the consensus sequence as the reference sequence, the data can be stored using only 133 KB, corresponding to a 433-fold level of compression, roughly a 23% improvement. Extensions to nuclear genomes and high-throughput sequencing data are discussed.
AVAILABILITY: Data are publicly available from GenBank, the HapMap web site, and the MITOMAP database. Supplementary materials with additional results, statistics, and software implementations are available from http://mammag.web.uci.edu/bin/view/Mitowiki/ProjectDNACompression.},
    Author = {Brandon, MC and Wallace, DC and Baldi, P},
    Date-Added = {2013-05-05 18:15:35 +0000},
    Date-Modified = {2013-05-05 18:16:45 +0000},
    Doi = {10.1093/bioinformatics/btp319},
    Journal = {Bioinformatics},
    Journal-Full = {Bioinformatics (Oxford, England)},
    Mesh = {Algorithms; Data Compression; Genome; Genomics; Humans; Sequence Analysis, DNA},
    Month = {Jul},
    Number = {14},
    Pages = {1731-8},
    Pmc = {PMC2705231},
    Pmid = {19447783},
    Pst = {ppublish},
    Title = {Data structures and compression algorithms for genomic sequence data},
    Volume = {25},
    Year = {2009},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btp319}}

@book{Press:1992:NRC:148286,
    Address = {New York, NY, USA},
    Author = {Press, WH and Teukolsky, A and Vetterling, WT and Flannery, BP},
    Date-Added = {2013-05-05 17:07:46 +0000},
    Date-Modified = {2013-05-05 17:08:32 +0000},
    Isbn = {0-521-43108-5},
    Publisher = {Cambridge University Press},
    Title = {Numerical recipes in {C} (2nd ed.): the art of scientific computing},
    Year = {1992}}

@book{Center:1976vn,
    Author = {Holborrow, C and McNemar, J and Stoneburner, P},
    Date-Added = {2013-05-05 15:26:11 +0000},
    Date-Modified = {2013-05-05 16:33:22 +0000},
    Publisher = {United States Defense Communications Agency Command \& Control Technical Center},
    Series = {AD/A-035},
    Title = {A review of data compression algorithms},
    Year = {1976},
    Bdsk-Url-1 = {http://books.google.com/books?id=yXA%5C_AQAAIAAJ}}

@article{Simpson:2012fk,
    Abstract = {De novo genome sequence assembly is important both to generate new sequence assemblies for previously uncharacterized genomes and to identify the genome sequence of individuals in a reference-unbiased way. We present memory efficient data structures and algorithms for assembly using the FM-index derived from the compressed Burrows-Wheeler transform, and a new assembler based on these called SGA (String Graph Assembler). We describe algorithms to error-correct, assemble, and scaffold large sets of sequence data. SGA uses the overlap-based string graph model of assembly, unlike most de novo assemblers that rely on de Bruijn graphs, and is simply parallelizable. We demonstrate the error correction and assembly performance of SGA on 1.2 billion sequence reads from a human genome, which we are able to assemble using 54 GB of memory. The resulting contigs are highly accurate and contiguous, while covering 95% of the reference genome (excluding contigs <200 bp in length). Because of the low memory requirements and parallelization without requiring inter-process communication, SGA provides the first practical assembler to our knowledge for a mammalian-sized genome on a low-end computing cluster.},
    Author = {Simpson, JT and Durbin, R},
    Date-Added = {2013-05-05 15:21:45 +0000},
    Date-Modified = {2013-05-05 15:22:09 +0000},
    Doi = {10.1101/gr.126953.111},
    Journal = {Genome Res},
    Journal-Full = {Genome research},
    Mesh = {Algorithms; Animals; Computational Biology; Data Compression; Genomics; Humans; Internet; Reproducibility of Results; Sequence Analysis, DNA; Software},
    Month = {Mar},
    Number = {3},
    Pages = {549--56},
    Pmc = {PMC3290790},
    Pmid = {22156294},
    Pst = {ppublish},
    Title = {Efficient de novo assembly of large genomes using compressed data structures},
    Volume = {22},
    Year = {2012},
    Bdsk-Url-1 = {http://dx.doi.org/10.1101/gr.126953.111}}

@article{Pevzner:2001kx,
    Abstract = {For the last 20 years, fragment assembly in DNA sequencing followed the "overlap-layout-consensus" paradigm that is used in all currently available assembly tools. Although this approach proved useful in assembling clones, it faces difficulties in genomic shotgun assembly. We abandon the classical "overlap-layout-consensus" approach in favor of a new euler algorithm that, for the first time, resolves the 20-year-old "repeat problem" in fragment assembly. Our main result is the reduction of the fragment assembly to a variation of the classical Eulerian path problem that allows one to generate accurate solutions of large-scale sequencing problems. euler, in contrast to the celera assembler, does not mask such repeats but uses them instead as a powerful fragment assembly tool.},
    Author = {Pevzner, PA and Tang, H and Waterman, MS},
    Date-Added = {2013-05-04 13:17:23 +0000},
    Date-Modified = {2013-05-04 13:19:43 +0000},
    Doi = {10.1073/pnas.171285098},
    Journal = {Proc Natl Acad Sci USA},
    Journal-Full = {Proceedings of the National Academy of Sciences of the United States of America},
    Mesh = {Algorithms; Campylobacter jejuni; Contig Mapping; DNA, Bacterial; Genome, Bacterial; Lactococcus lactis; Models, Theoretical; Neisseria meningitidis; Sequence Alignment; Sequence Analysis, DNA; Software},
    Month = {Aug},
    Number = {17},
    Pages = {9748-53},
    Pmc = {PMC55524},
    Pmid = {11504945},
    Pst = {ppublish},
    Title = {An {E}ulerian path approach to {DNA} fragment assembly},
    Volume = {98},
    Year = {2001},
    Bdsk-Url-1 = {http://dx.doi.org/10.1073/pnas.171285098}}

@article{Grabherr:2011uq,
    Abstract = {Massively parallel sequencing of cDNA has enabled deep and efficient probing of transcriptomes. Current approaches for transcript reconstruction from such data often rely on aligning reads to a reference genome, and are thus unsuitable for samples with a partial or missing reference genome. Here we present the Trinity method for de novo assembly of full-length transcripts and evaluate it on samples from fission yeast, mouse and whitefly, whose reference genome is not yet available. By efficiently constructing and analyzing sets of de Bruijn graphs, Trinity fully reconstructs a large fraction of transcripts, including alternatively spliced isoforms and transcripts from recently duplicated genes. Compared with other de novo transcriptome assemblers, Trinity recovers more full-length transcripts across a broad range of expression levels, with a sensitivity similar to methods that rely on genome alignments. Our approach provides a unified solution for transcriptome reconstruction in any sample, especially in the absence of a reference genome.},
    Author = {Grabherr, MG and Haas, BJ and Yassour, M and Levin, JZ and Thompson, DA and Amit, I and Adiconis, X and Fan, L and Raychowdhury, R and Zeng, Q and Chen, Z and Mauceli, E and Hacohen, N and Gnirke, A and Rhind, N and di Palma, F and Birren, BW and Nusbaum, C and Lindblad-Toh, K and Friedman, N and Regev, A},
    Date-Added = {2013-05-04 13:09:12 +0000},
    Date-Modified = {2013-05-05 18:28:31 +0000},
    Doi = {10.1038/nbt.1883},
    Journal = {Nat Biotechnol},
    Journal-Full = {Nature biotechnology},
    Mesh = {Algorithms; Base Sequence; Gene Expression Profiling; Molecular Sequence Data; RNA; Reference Values; Sequence Analysis, RNA; Transcriptome},
    Month = {Jul},
    Number = {7},
    Pages = {644--52},
    Pmc = {PMC3571712},
    Pmid = {21572440},
    Pst = {epublish},
    Title = {Full-length transcriptome assembly from {RNA-Seq} data without a reference genome},
    Volume = {29},
    Year = {2011},
    Bdsk-Url-1 = {http://dx.doi.org/10.1038/nbt.1883}}

@article{CICHELLI:1980fk,
    Author = {Cichelli, RJ},
    Date-Added = {2013-05-04 12:58:44 +0000},
    Date-Modified = {2013-05-04 13:04:13 +0000},
    Isi = {A1980JD03400003},
    Isi-Recid = {39176378},
    Isi-Ref-Recids = {32240320 35795477 32884715},
    Iso-Source-Abbreviation = {Commun Acm},
    Journal = {Commun ACM},
    Pages = {17--19},
    Times-Cited = {58},
    Title = {Minimal perfect hash functions made simple},
    Volume = {23},
    Year = {1980},
    Bdsk-Url-1 = {http://ws.isiknowledge.com/cps/openurl/service?url_ver=Z39.88-2004&rft_id=info:ut/A1980JD03400003}}

@article{HUFFMAN:1952nr,
    Author = {Huffman, DA},
    Date-Added = {2013-05-03 16:32:22 +0000},
    Date-Modified = {2013-05-04 13:18:08 +0000},
    Isi = {A1952UM76900017},
    Isi-Recid = {1861079},
    Isi-Ref-Recids = {859653 1862010 1862011},
    Iso-Source-Abbreviation = {P Ire},
    Journal = {Proceedings of the Institute of Radio Engineers},
    Pages = {1098--1101},
    Times-Cited = {1565},
    Title = {A method for the construction of minimum-redundancy codes},
    Volume = {40},
    Year = {1952},
    Bdsk-Url-1 = {http://ws.isiknowledge.com/cps/openurl/service?url_ver=Z39.88-2004&rft_id=info:ut/A1952UM76900017}}

@misc{fastg,
    Author = {The FASTG Format Specification Working Group},
    Date-Added = {2013-05-03 16:14:09 +0000},
    Date-Modified = {2013-05-03 16:19:38 +0000},
    Title = {The {FASTG} Format Specification (v1.00), An expressive representation for genome assemblies},
    Urldate = {http://fastg.sourceforge.net/FASTG_Spec_v1.00.pdf}}

@article{Marcais:2011fk,
    Abstract = {MOTIVATION: Counting the number of occurrences of every k-mer (substring of length k) in a long string is a central subproblem in many applications, including genome assembly, error correction of sequencing reads, fast multiple sequence alignment and repeat detection. Recently, the deep sequence coverage generated by next-generation sequencing technologies has caused the amount of sequence to be processed during a genome project to grow rapidly, and has rendered current k-mer counting tools too slow and memory intensive. At the same time, large multicore computers have become commonplace in research facilities allowing for a new parallel computational paradigm.
RESULTS: We propose a new k-mer counting algorithm and associated implementation, called Jellyfish, which is fast and memory efficient. It is based on a multithreaded, lock-free hash table optimized for counting k-mers up to 31 bases in length. Due to their flexibility, suffix arrays have been the data structure of choice for solving many string problems. For the task of k-mer counting, important in many biological applications, Jellyfish offers a much faster and more memory-efficient solution.
AVAILABILITY: The Jellyfish software is written in C++ and is GPL licensed. It is available for download at http://www.cbcb.umd.edu/software/jellyfish.},
    Author = {Mar\c{c}ais, G and Kingsford, C},
    Date-Added = {2013-05-03 16:05:34 +0000},
    Date-Modified = {2013-05-03 16:08:25 +0000},
    Doi = {10.1093/bioinformatics/btr011},
    Journal = {Bioinformatics},
    Journal-Full = {Bioinformatics (Oxford, England)},
    Mesh = {Algorithms; Animals; Base Sequence; Computational Biology; Genome; Humans; Sequence Alignment; Sequence Analysis, DNA; Software},
    Month = {Mar},
    Number = {6},
    Pages = {764--70},
    Pmc = {PMC3051319},
    Pmid = {21217122},
    Pst = {ppublish},
    Title = {A fast, lock-free approach for efficient parallel counting of occurrences of $k$-mers},
    Volume = {27},
    Year = {2011},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btr011}}

@article{Lipman:1985qf,
    Abstract = {An algorithm was developed which facilitates the search for similarities between newly determined amino acid sequences and sequences already available in databases. Because of the algorithm's efficiency on many microcomputers, sensitive protein database searches may now become a routine procedure for molecular biologists. The method efficiently identifies regions of similar sequence and then scores the aligned identical and differing residues in those regions by means of an amino acid replacability matrix. This matrix increases sensitivity by giving high scores to those amino acid replacements which occur frequently in evolution. The algorithm has been implemented in a computer program designed to search protein databases very rapidly. For example, comparison of a 200-amino-acid sequence to the 500,000 residues in the National Biomedical Research Foundation library would take less than 2 minutes on a minicomputer, and less than 10 minutes on a microcomputer (IBM PC).},
    Author = {Lipman, DJ and Pearson, WR},
    Date-Added = {2013-05-02 21:01:23 +0000},
    Date-Modified = {2013-05-02 21:01:42 +0000},
    Journal = {Science},
    Journal-Full = {Science (New York, N.Y.)},
    Mesh = {Amino Acid Sequence; Angiotensinogen; Animals; Biological Evolution; Bunyaviridae; Cattle; Computers; Cyclic AMP; Cytochrome c Group; Humans; Information Systems; Microcomputers; Nucleoproteins; Probability; Protein Kinases; Protein Precursors; Proteins; Rats; Software; Viral Proteins},
    Month = {Mar},
    Number = {4693},
    Pages = {1435--41},
    Pmid = {2983426},
    Pst = {ppublish},
    Title = {Rapid and sensitive protein similarity searches},
    Volume = {227},
    Year = {1985}}

@article{Cock:2010ve,
    Abstract = {FASTQ has emerged as a common file format for sharing sequencing read data combining both the sequence and an associated per base quality score, despite lacking any formal definition to date, and existing in at least three incompatible variants. This article defines the FASTQ format, covering the original Sanger standard, the Solexa/Illumina variants and conversion between them, based on publicly available information such as the MAQ documentation and conventions recently agreed by the Open Bioinformatics Foundation projects Biopython, BioPerl, BioRuby, BioJava and EMBOSS. Being an open access publication, it is hoped that this description, with the example files provided as Supplementary Data, will serve in future as a reference for this important file format.},
    Author={Cock, Peter JA and Fields, Christopher J and Goto, Naohisa and Heuer, Michael L and Rice, Peter M},
    Date-Added = {2013-05-02 20:47:09 +0000},
    Date-Modified = {2013-05-02 20:48:37 +0000},
    Doi = {10.1093/nar/gkp1137},
    Journal = {Nucleic Acids Res},
    Journal-Full = {Nucleic acids research},
    Mesh = {Computational Biology; History, 20th Century; History, 21st Century; Sequence Analysis, DNA; Software},
    Month = {Apr},
    Number = {6},
    Pages = {1767-71},
    Pmc = {PMC2847217},
    Pmid = {20015970},
    Pst = {ppublish},
    Title = {The {S}anger {FASTQ} file format for sequences with quality scores, and the {Solexa/Illumina FASTQ} variants},
    Volume = {38},
    Year = {2010},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/nar/gkp1137}}

@article{Ewing:1998ly,
    Abstract = {Elimination of the data processing bottleneck in high-throughput sequencing will require both improved accuracy of data processing software and reliable measures of that accuracy. We have developed and implemented in our base-calling program phred the ability to estimate a probability of error for each base-call, as a function of certain parameters computed from the trace data. These error probabilities are shown here to be valid (correspond to actual error rates) and to have high power to discriminate correct base-calls from incorrect ones, for read data collected under several different chemistries and electrophoretic conditions. They play a critical role in our assembly program phrap and our finishing program consed.},
    Author={Ewing, Brent and Green, Phil},
    Date-Added = {2013-05-02 20:44:01 +0000},
    Date-Modified = {2013-05-02 20:50:52 +0000},
    Journal = {Genome Research},
    Journal-Full = {Genome Research},
    Mesh = {Base Sequence; Chimera; Cloning, Molecular; Data Interpretation, Statistical; Discriminant Analysis; Genetic Vectors; Human Genome Project; Humans; Probability; Quality Control; Reproducibility of Results; Sequence Analysis, DNA; Software},
    Month = {Mar},
    Number = {3},
    Pages = {186--94},
    Pmid = {9521922},
    Pst = {ppublish},
    Title = {Base-calling of automated sequencer traces using {P}hred. {II}. {E}rror probabilities},
    Volume = {8},
    Year = {1998}}

@article{Ewing:1998zr,
    Abstract = {The availability of massive amounts of DNA sequence information has begun to revolutionize the practice of biology. As a result, current large-scale sequencing output, while impressive, is not adequate to keep pace with growing demand and, in particular, is far short of what will be required to obtain the 3-billion-base human genome sequence by the target date of 2005. To reach this goal, improved automation will be essential, and it is particularly important that human involvement in sequence data processing be significantly reduced or eliminated. Progress in this respect will require both improved accuracy of the data processing software and reliable accuracy measures to reduce the need for human involvement in error correction and make human review more efficient. Here, we describe one step toward that goal: a base-calling program for automated sequencer traces, phred, with improved accuracy. phred appears to be the first base-calling program to achieve a lower error rate than the ABI software, averaging 40%-50% fewer errors in the data sets examined independent of position in read, machine running conditions, or sequencing chemistry.},
    Author = {Ewing, B and Hillier, L and Wendl, MC and Green, P},
    Date-Added = {2013-05-02 20:42:38 +0000},
    Date-Modified = {2013-05-02 20:50:40 +0000},
    Journal = {Genome Res},
    Journal-Full = {Genome research},
    Mesh = {Algorithms; Base Sequence; Human Genome Project; Humans; Reproducibility of Results; Sensitivity and Specificity; Sequence Alignment; Sequence Analysis, DNA; Software},
    Month = {Mar},
    Number = {3},
    Pages = {175--85},
    Pmid = {9521921},
    Pst = {ppublish},
    Title = {Base-calling of automated sequencer traces using {P}hred. {I}. {A}ccuracy assessment},
    Volume = {8},
    Year = {1998}}

@article{Tembe:2010ys,
    Abstract = {SUMMARY: Large volumes of data generated by high-throughput sequencing instruments present non-trivial challenges in data storage, content access and transfer. We present G-SQZ, a Huffman coding-based sequencing-reads-specific representation scheme that compresses data without altering the relative order. G-SQZ has achieved from 65% to 81% compression on benchmark datasets, and it allows selective access without scanning and decoding from start. This article focuses on describing the underlying encoding scheme and its software implementation, and a more theoretical problem of optimal compression is out of scope. The immediate practical benefits include reduced infrastructure and informatics costs in managing and analyzing large sequencing data. AVAILABILITY: http://public.tgen.org/sqz. Academic/non-profit: Source: available at no cost under a non-open-source license by requesting from the web-site; Binary: available for direct download at no cost. For-Profit: Submit request for for-profit license from the web-site.},
    Author={Tembe, Waibhav and Lowey, James and Suh, Edward},Date-Added = {2013-05-02 20:37:33 +0000},
    Date-Modified = {2013-05-02 20:38:14 +0000},
    Doi = {10.1093/bioinformatics/btq346},
    Journal = {Bioinformatics},
    Journal-Full = {Bioinformatics (Oxford, England)},
    Mesh = {Algorithms; Computational Biology; Data Compression; Sequence Analysis, DNA; Software},
    Month = {Sep},
    Number = {17},
    Pages = {2192-4},
    Pmid = {20605925},
    Pst = {ppublish},
    Title = {{G-SQZ}: compact encoding of genomic sequence and quality data},
    Volume = {26},
    Year = {2010},
    Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btq346}}

@techreport{mahoney05,
    Address = {Melbourne, FL USA},
    Author = {Mahoney, M},
    Date-Added = {2013-05-02 18:33:29 +0000},
    Date-Modified = {2013-05-02 18:36:54 +0000},
    Institution = {Florida Institute of Technology},
    Number = {2005-16},
    Title = {Adaptive weighing of context models for lossless data compression},
    Type = {CS},
    Year = {2005}}

@misc{paq1,
    Author = {Mahoney, M},
    Date-Added = {2013-05-02 18:22:02 +0000},
    Date-Modified = {2013-05-02 18:23:42 +0000},
    Title = {The {PAQ1} data compression program},
    Year = {2002}}

@article{CLEARY:1984uq,
    Author = {Cleary, JG and Witten, IH},
    Date-Added = {2013-05-02 17:58:02 +0000},
    Date-Modified = {2013-05-02 18:01:46 +0000},
    Isi = {A1984SN57800008},
    Isi-Recid = {52120458},
    Isi-Ref-Recids = {48718371 52120459 52120460 52120461 52120462 22446365 39519438 5805636 40573628 43243513 34979915 43243313 52120463 52120464 42320561 22694466 13220396 42320562 42320556 52120465 13472140 36610958 49669218 35288488},
    Iso-Source-Abbreviation = {Ieee T Commun},
    Journal = {IEEE T Commun},
    Pages = {396--402},
    Times-Cited = {281},
    Title = {Data-Compression using adaptive coding and partial string matching},
    Volume = {32},
    Year = {1984},
    Bdsk-Url-1 = {http://ws.isiknowledge.com/cps/openurl/service?url_ver=Z39.88-2004&rft_id=info:ut/A1984SN57800008}}

@article{Loh:2012fk,
    Author = {Loh, P-R and Baym, M and Berger, B},
    Date-Added = {2013-05-02 17:47:16 +0000},
    Date-Modified = {2013-05-02 18:03:49 +0000},
    Doi = {10.1038/nbt.2241},
    Journal = {Nat Biotechnol},
    Journal-Full = {Nature biotechnology},
    Mesh = {Algorithms; Computational Biology; Data Mining; Genome, Human; High-Throughput Nucleotide Sequencing; Humans; Yeasts},
    Month = {Jul},
    Number = {7},
    Pages = {627-30},
    Pmid = {22781691},
    Pst = {epublish},
    Title = {Compressive genomics},
    Volume = {30},
    Year = {2012},
    Bdsk-Url-1 = {http://dx.doi.org/10.1038/nbt.2241}}