-
Notifications
You must be signed in to change notification settings - Fork 0
/
diff2.tex
3801 lines (3116 loc) · 246 KB
/
diff2.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
%DIF LATEXDIFF DIFFERENCE FILE
%DIF DEL origthesis.tex Wed Jul 29 12:50:33 2015
%DIF ADD completethesis.tex Wed Jul 29 17:58:50 2015
\newcommand{\mydriver}{pdftex}
\documentclass[12pt,\mydriver]{thesis}
\usepackage{titlesec}
\titleformat{\chapter}
{\normalfont\large}{Chapter \thechapter:}{1em}{}
\usepackage{color}
\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{cite}
\usepackage{lscape}
\usepackage{indentfirst}
\usepackage{latexsym}
\usepackage{multirow}
\usepackage{tabls}
\usepackage{wrapfig}
\usepackage{slashbox}
\usepackage{longtable}
\usepackage{supertabular}
\usepackage{subfigure}
\usepackage[cmex10]{amsmath}
\newcommand{\tbsp}{\rule{0pt}{18pt}} \renewcommand{\baselinestretch}{2}
\setlength{\textwidth}{5.9in}
\setlength{\textheight}{9in}
\setlength{\topmargin}{-.50in}
\setlength{\oddsidemargin}{.55in}
\setlength{\parindent}{.4in}
\pagestyle{empty}
%DIF PREAMBLE EXTENSION ADDED BY LATEXDIFF
%DIF UNDERLINE PREAMBLE %DIF PREAMBLE
\RequirePackage[normalem]{ulem} %DIF PREAMBLE
\RequirePackage{color}\definecolor{RED}{rgb}{1,0,0}\definecolor{BLUE}{rgb}{0,0,1} %DIF PREAMBLE
\providecommand{\DIFaddtex}[1]{{\protect\color{blue}\uwave{#1}}} %DIF PREAMBLE
\providecommand{\DIFdeltex}[1]{{\protect\color{red}\sout{#1}}} %DIF PREAMBLE
%DIF SAFE PREAMBLE %DIF PREAMBLE
\providecommand{\DIFaddbegin}{} %DIF PREAMBLE
\providecommand{\DIFaddend}{} %DIF PREAMBLE
\providecommand{\DIFdelbegin}{} %DIF PREAMBLE
\providecommand{\DIFdelend}{} %DIF PREAMBLE
%DIF FLOATSAFE PREAMBLE %DIF PREAMBLE
\providecommand{\DIFaddFL}[1]{\DIFadd{#1}} %DIF PREAMBLE
\providecommand{\DIFdelFL}[1]{\DIFdel{#1}} %DIF PREAMBLE
\providecommand{\DIFaddbeginFL}{} %DIF PREAMBLE
\providecommand{\DIFaddendFL}{} %DIF PREAMBLE
\providecommand{\DIFdelbeginFL}{} %DIF PREAMBLE
\providecommand{\DIFdelendFL}{} %DIF PREAMBLE
%DIF END PREAMBLE EXTENSION ADDED BY LATEXDIFF
%DIF PREAMBLE EXTENSION ADDED BY LATEXDIFF
%DIF HYPERREF PREAMBLE %DIF PREAMBLE
\providecommand{\DIFadd}[1]{\texorpdfstring{\DIFaddtex{#1}}{#1}} %DIF PREAMBLE
\providecommand{\DIFdel}[1]{\texorpdfstring{\DIFdeltex{#1}}{}} %DIF PREAMBLE
%DIF END PREAMBLE EXTENSION ADDED BY LATEXDIFF
\begin{document}
\clearpage{}
\hbox{\ }
\renewcommand{\baselinestretch}{1}
\small \normalsize
\begin{center}
\large{{ABSTRACT}}
\vspace{3em}
\end{center}
\hspace{-.15in}
\begin{tabular}{ll}
Title of dissertation: & {\large NOVEL METHODS FOR COMPARING}\\
& {\large AND EVALUATING SINGLE AND } \\
& {\large METAGENOMIC ASSEMBLIES} \\
\ \\
& {\large Christopher Michael Hill, Doctor of Philosophy, 2015} \\
\ \\
Dissertation directed by: & {\large Professor Mihai Pop} \\
& {\large Department of Computer Science } \\
\end{tabular}
\vspace{3em}
\renewcommand{\baselinestretch}{2}
\large \normalsize
The current revolution in genomics has been made possible by software tools called genome assemblers, which stitch together DNA fragments ``read'' by sequencing machines into complete or nearly complete genome sequences. Despite decades of research in this field and the development of dozens of genome assemblers, assessing and comparing the quality of assembled genome sequences still heavily relies on the availability of independently determined standards, such as manually curated genome sequences, or independently produced mapping data. The focus of this work is to develop reference-free computational methods to accurately compare and evaluate genome assemblies.
We introduce a reference-free likelihood-based measure of assembly quality which allows for an objective comparison of multiple assemblies generated from the same set of reads. We define the quality of a sequence produced by an assembler as the conditional probability of observing the sequenced reads from the assembled sequence. A key property of our metric is that the true genome sequence maximizes the score, unlike other commonly used metrics.
Despite the unresolved challenges of single genome assembly, the decreasing costs of sequencing technology has led to a sharp increase in metagenomics projects over the past decade.
These projects allow us to better understand the diversity and function of microbial communities found in the environment, including the ocean, Arctic regions, other living organisms, and the human body.
We extend our likelihood-based framework and show that we can accurately compare assemblies of these complex bacterial communities.
After an assembly has been produced, it is not an easy task determining what parts of the underlying genome are missing, what parts are mistakes, and what parts are due to experimental artifacts from the sequencing machine.
Here we introduce VALET, the first reference-free pipeline that flags regions in metagenomic assemblies that are statistically inconsistent with the data generation process.
VALET detects mis-assemblies in publicly available datasets and highlights the current shortcomings in available metagenomic assemblers.
By providing the computational methods for researchers to accurately evaluate their assemblies, we decrease the chance of incorrect biological conclusions and misguided future studies.
\clearpage{}
\clearpage{}
\thispagestyle{empty}
\hbox{\ }
\vspace{1in}
\renewcommand{\baselinestretch}{1}
\small\normalsize
\begin{center}
\large{{\large NOVEL METHODS FOR COMPARING AND EVALUATING}\\
{\large SINGLE AND METAGENOMIC ASSEMBLIES }}\\
\ \\
\ \\
\large{by} \\
\ \\
\large{Christopher Michael Hill}\ \\
\ \\
\ \\
\ \\
\normalsize
Dissertation submitted to the Faculty of the Graduate School of the \\
University of Maryland, College Park in partial fulfillment \\
of the requirements for the degree of \\
Doctor of Philosophy \\
2015
\end{center}
\vspace{7.5em}
\noindent Advisory Committee: \\
Professor Mihai Pop, Chair/Advisor \\
Professor Atif Memon \\
Professor H\'{e}ctor Corrada Bravo \\
Professor Michael Cummings \\
Professor Stephen Mount, Dean$\text{'}$s Representative
\clearpage{}
\clearpage{}
\thispagestyle{empty}
\hbox{\ }
\vfill
\renewcommand{\baselinestretch}{1}
\small\normalsize
\vspace{-.65in}
\begin{center}
\large{\copyright \hbox{ }Copyright by\\
Christopher Michael Hill \\
2015}
\end{center}
\vfill
\clearpage{}
\pagestyle{plain}
\pagenumbering{roman}
\setcounter{page}{2}
\clearpage{}
\renewcommand{\baselinestretch}{2}
\small\normalsize
\hbox{\ }
\vspace{-.65in}
\begin{center}
\large{Preface}
\end{center}
The algorithms, software, and results in this dissertation have either been published in peer-reviewed journals and conferences or are currently under preparation for submission.
At the time of this writing, Chapters 2, 3, 4, and 6 have already been published or submitted and are reformatted here. Chapter 5 is under preparation for submission.
I am indebted to my co-authors on these projects - their dedication and knowledge in the areas of computer science, statistics, and biology have resulted in much stronger scientific papers. \\
\noindent\textbf{Chapter 2}:
\noindent - Mohammadreza Ghodsi*, Christopher M. Hill*, Irina Astrovskaya, Henry Lin, Dan D. Sommer, Sergey Koren, and Mihai Pop. \textit{De novo likelihood-based measures for comparing genome assemblies}. BMC research notes 6, no. 1 (2013): 334.
\DIFaddbegin \DIFadd{My contributions to this work include: (1) aiding in the development of the underlying theory, specifically to the development of the theory incorporating paired-read information, (2) implementation of the alignment-based and sampling methods, (3) producing all of the results, and (4) aiding in drafting the manuscript.
}
\DIFaddend The authors would like to thank H\'{e}ctor Corrada Bravo and Bo Liu for their advice on the sampling procedure and associated statistics, Todd Treangen for advice on accessing the GAGE data, and the other members of the Pop lab for valuable discussions on all aspects of our work. This work was supported in part by the National Science Foundation
(grants IIS-0812111, IIS-1117247 to MP), and by the National
Institutes of Health (grant R01-HG-004885 to MP).\\
\noindent\textbf{Chapter 3}:
\noindent - Christopher M. Hill, Irina Astrovskaya, Heng Huang, Sergey Koren, Atif Memon, Todd J. Treangen, and \DIFdelbegin \DIFdel{Mihaela }\DIFdelend \DIFaddbegin \DIFadd{Mihai }\DIFaddend Pop. \textit{De novo likelihood-based measures for comparing metagenomic assemblies}. In Bioinformatics and Biomedicine (BIBM), 2013 IEEE International Conference on, pp. 94-98. IEEE, 2013.
\DIFaddbegin \DIFadd{My contributions to this work include: (1) developing the theory to handle the addition of organismal abundances, (2) drafting the manuscript, and (3) producing the majority of the results.
}
\DIFaddend The authors would like to thank the members of the Pop lab for valuable discussions on all aspects of our work. This work was supported in part by the NIH, grant R01-AI-100947 to MP, and the NSF, grant IIS-1117247 to MP. \\
\noindent - Koren, Sergey, Todd J. Treangen, Christopher M. Hill, Mihai Pop, and Adam M. Phillippy. \textit{Automated ensemble assembly and validation of microbial genomes}. BMC bioinformatics 15, no. 1 (2014): 126.
\DIFaddbegin \DIFadd{My contributions to this work include: (1) the development of the }\textsc{\DIFadd{lap}} \DIFadd{software used by MetAMOS.
}
\DIFaddend The authors would like to thank Magoc et al. and Comas et al. who submitted the raw data that was used in this study. We thank Lex Nederbragt and an anonymous reviewer for detailed comments on the manuscript and iMetAMOS software, usability, and documentation. MP and CMH were supported by NIH grant R01-AI-100947and the NSF grant IIS-1117247. \\
\noindent\textbf{Chapter 4}:
\noindent - Christopher M. Hill, Sergey Koren, Daniel Sommer, Bryan Dzung Ta, Atif Memon. and Mihai Pop. \textit{De novo genome assembly regression testing}. Under revision.
\DIFaddbegin \DIFadd{My contributions to this work include: (1) generating the software used by the assembler regression pipeline, (2) evaluating SOAPdenovo2's version history, (3) evaluating the effect of read permutation and multiple threads on assembly quality, and (4) drafting the manuscript.
}
\DIFaddend The authors would like to thank members of the Memon and Pop labs for their support and valuable discussions. This work was supported by NIH grant R01-AI-100947 to MP. \\
\noindent\textbf{Chapter 5}:
\noindent - Christopher M. Hill, Jonathan Gluck, Atif Memon, and Mihai Pop. \textit{VALET: a de novo pipeline for finding metagenomic mis-assemblies}. In preparation. \\
\DIFaddbegin \DIFadd{My contributions to this work include: (1) aiding in the development and theory of the overall pipeline, (2) developing the tool used for finding highly variable coverage regions, (3) producing all results, and (4) drafting the manuscript.
}
\DIFaddend \noindent\textbf{Chapter 6}:
\noindent - Christopher M. Hill, Carl H. Albach, Sebastian G. Angel, and Mihai Pop. \textit{K-mulus: Strategies for BLAST in the Cloud}. In Parallel Processing and Applied Mathematics, pp. 237-246. Springer Berlin Heidelberg, 2014.
\DIFaddbegin \DIFadd{My contributions to this work include: (1) implementing the query segmentation, database segmentation, and hybrid approaches, (2) producing the majority of the results, and (3) drafting the manuscript.
}
\DIFaddend The authors would like to thank Mohammadreza Ghodsi for advice on clustering, Daniel Sommer for advice on Hadoop, Lee Mendelowitz for manuscript feedback, Katherine Fenstermacher for the name K-mulus, and the other members of the Pop lab for valuable discussions on all aspects of our work. \\
\noindent - Christopher M. Hill, Andras Szolek, Mohamed El Hadidi, and Michael Cummings. \textit{Lossy compression of DNA sequencing quality data}. Under review.
\DIFaddbegin \DIFadd{My contributions to this work include: (1) developing the overall pipeline, (2) implementing the regression, profile, and binning approaches, and (3) aiding in the drafting of the manuscript.
}
\DIFaddend The authors would like to thank the 2014 Bioinformatics
Exchange for Students and Teachers (\textsc{best}) Summer School,
funded by the offices of the Dean of The Graduate School at University
of Maryland and the Rektor of University of T\"{u}bingen, where this
research was initiated.
\clearpage{}
\clearpage{}
\renewcommand{\baselinestretch}{2}
\small\normalsize
\hbox{\ }
\vspace{-.65in}
\begin{center}
\large{Dedication}
\end{center}
To those who inspired it and will not read it.
\clearpage{}
\clearpage{}
\renewcommand{\baselinestretch}{2}
\small\normalsize
\hbox{\ }
\vspace{-.65in}
\begin{center}
\large{Acknowledgments}
\end{center}
\vspace{1ex}
I have been truly fortunate to know and work with many outstanding people throughout my career, and I am grateful to all of you.
First and foremost, I would like to thank my advisor Mihai Pop.
It has been an absolute privilege to be under your guidance for the past 8 years.
I hope that you will continue to watch over my academic career and be proud.
I am eternally grateful for your advice, support, and encouragement and hope to have the opportunity to pay it forward someday.
I would also like to thank all of my other committee members. I will always enjoy the [often off-topic] discussions with Atif Memon who has introduced me to the world of software testing.
I am indebted to H\'{e}ctor Corrada Bravo for increasing the focus of statistics within the Center for Bioinformatics and Computational Biology.
I wish to thank Michael Cummings for his mentorship and allowing me to work abroad with him.
I have built up collaborations and friendships that were made possible thanks to him.
I am also very grateful to Stephen Mount for always being able to answer any biological question I come up with during our social hours.
I owe a great deal of thanks to countless past and present faculty and graduate students at the University of Maryland.
In particular, the majority of this dissertation was only made possible with the help of MohammadReza Ghodsi, Irina Astrovskaya, and Henry Lin.
In addition, I owe a great deal of thanks to Sergey Koren who has been my mentor and oracle for all things related to assembly.
Ted Gibbons has been a close friend and would always listen to me rant about the seemingly endless negative results and lack of progress.
I would also like to thank everyone (past and present) from the labs of Mihai Pop, H\'{e}ctor Corrada Bravo, Sridhar Hannenhalli, and Steven Salzberg.
Finally, I must thank my friends, family, and loved ones who have supported me unconditionally throughout this adventure.
It is impossible to remember everyone, so I apologize to anyone who may have been left out. If I forgot you, let me know and I will happily buy you a drink.
\clearpage{}
\renewcommand{\baselinestretch}{1}
\small\normalsize
\tableofcontents \newpage
\listoftables \listoffigures \newpage
\newpage
\setlength{\parskip}{0em}
\renewcommand{\baselinestretch}{2}
\small\normalsize
\setcounter{page}{1}
\pagenumbering{arabic}
\clearpage{}
\newcommand{\edit}[1]{\textcolor{black}{#1}}
\renewcommand{\thechapter}{1}
\chapter{Introduction}
\section{Genome assembly}
The genome sequence of an organism is a critical resource for
biologists trying to understand the organism's function and
evolution. Obtaining this sequence is difficult as modern sequencing
technologies can only ``read'' small pieces of the genome (called
\emph{reads}). The fact that these tiny \emph{reads} (under a few
thousands of basepairs/characters in length) can be glued together to reconstruct genomes
comprising millions to billions of basepairs is by no means evident
and was the subject of vigorous scientific debate during the early
days of sequencing technologies~\cite{green1997against,weber1997human}. The modern genomic revolution was in no small part made
possible by the development of algorithms and computational tools called
\emph{genome assemblers} able to reconstruct near-complete
representations of a genome's sequence from the fragmented data
generated by sequencing instruments. Despite tremendous advances made
over the past 30 years in both sequencing technologies and assembly
algorithms, genome assembly remains a highly difficult computational
problem. In all but the simplest cases, genome assemblers cannot
fully and correctly reconstruct an organism's genome. Instead, the
output of an assembler consists of a set of contiguous sequence
fragments (\emph{contigs}), which can be further ordered and oriented
into \emph{scaffolds}, representing the relative placement of the
contigs, with possible intervening gaps, along the genome.
\subsection{Computational challenges of assembly}
Theoretical analyses of the assembly problem, commonly formulated as
an optimization problem within an appropriately defined graph, have
shown that assembly is
NP-hard~\cite{myers1995,medvedev2007computability}, i.e., finding the
correct optimal solution may require an exhaustive search of an
exponential number of possible solutions. The difficulty of genome
assembly is due to the presence of repeated DNA
segments (\emph{repeats}) in most genomes. Repeats longer than the length of the sequenced reads lead to ambiguity in the reconstruction of the genome
-- many different genomes can be built from the same set of
reads~\cite{nagarajan2009complexity,kingsford2010assembly}.
As a result, practical implementations of assembly algorithms (such as
ABySS~\cite{ABySS}, Velvet~\cite{Velvet},
SOAPdenovo~\cite{li2010novo}, etc.) return just an approximate
solution that either contains errors, or is fragmented, or both.
Ideally, in a genomic experiment, assembly would be followed by the
scrupulous manual curation of the assembled sequence to correct the
hundreds to thousands of errors~\cite{salzberg2005misassemblies}, and
fill in the gaps between the assembled
contigs~\cite{nagarajan2010finishing}. Despite the value of fully
completed and verified genome sequences~\cite{fraser2002value}, the
substantial effort and associated cost necessary to conduct a
finishing experiment to its conclusion can only be justified for a
few high-priority genomes (such as reference strains or model
organisms). The majority of the genomes sequenced today are
automatically reconstructed in a ``draft'' state. Despite the fact
that valuable biological conclusions can be derived from draft
sequences~\cite{branscomb2002high}, these genomes are of uncertain
quality~\cite{chain2009genome}, possibly impacting the conclusions of
analyses and experiments that rely on their primary sequence.
\subsection{Assessing the quality of an assembly}
Assessing the quality of the sequence output by an assembler is
of critical importance, not just to inform downstream analyses, but
also to allow researchers to choose from among a rapidly increasing
collection of genome assemblers. Despite apparent incremental
improvements in the performance of genome assemblers, none of the
software tools available today outperforms the rest in all assembly
tasks. As highlighted by recent assembly
bake-offs~\cite{earl2011assemblathon,salzberg2011gage}, different
assemblers ``win the race'' depending on the specific characteristics
of the sequencing data, the structure of the genome being assembled,
or the specific needs of the downstream analysis process.
Furthermore, these recent competitions have highlighted the inherent
difficulty of assessing the quality of an assembly. More
specifically, all assemblers attempt to find a trade-off between
contiguity (the size of the contigs generated) and accuracy of the
resulting sequence. Evaluating this trade-off is difficult even when
a gold standard is available, e.g., when re-assembling a genome with
known sequence. In most practical settings, a reference genome
sequence is not available, and the validation process must rely on
other sources of information, such as independently derived data from
mapping experiments~\cite{zhou2007validation}, or from transcriptome
sequencing~\cite{adamidi2011novo}. Such data are, however, often not
generated due to their high cost relative to the rapidly decreasing
costs of sequencing. Most commonly, validation relies on \emph{de
novo} approaches based on the sequencing data alone, which include
global ``sanity checks'' (such as gene density, expected to be high in
bacterial genomes, measured, for example, through the fraction of the
assembled sequence that can be recognized by PFAM
profiles~\cite{genovo2011}) and internal consistency
measures~\cite{amosvalidate2008} that evaluate the placement of reads
and mate-pairs along the assembled sequence.
The validation approaches outlined above can highlight a number of
inconsistencies or errors in the assembled sequence, information
valuable as a guide for further validation and refinement experiments,
but difficult to use in a comparative setting where the goal is to
compare the quality of multiple assemblies of a same dataset. For
example, even if a reference genome sequence is available, while all
differences between the reassembled genome and the reference are, at
some level, assembly mistakes, it is unclear whether one should weigh
single nucleotide differences and short indels as much as larger
structural errors (e.g., translocation or large scale copy-number
changes)~\cite{earl2011assemblathon} when comparing different
assemblies. Furthermore, while recent advances in visualization
techniques, such as the FRCurve of Narzisi et
al.~\cite{FRC2011,vezzi2012feature}, have made it easier for
scientists to appropriately visualize the overall tradeoff between
assembly contiguity and correctness, there exist no established
approaches that allow one to appropriately weigh
the relative importance of the multitude of assembly quality measures,
many of which provide redundant information~\cite{vezzi2012feature}.
\section{Contributions of this dissertation}
In Chapter 2, we present our LAP framework, an objective and holistic approach for evaluating and
comparing the quality of assemblies derived from a same dataset. Our
approach defines the quality of an assembly as the likelihood that the
observed reads are generated from the given assembly, a value which can
be accurately estimated by appropriately modeling the sequencing
process. We show that our approach is able to automatically and accurately reproduce the
reference-based ranking of assembly tools produced by highly-cited assembly competitions: the
Assemblathon~\cite{earl2011assemblathon} and GAGE~\cite{salzberg2011gage}
competitions.
In Chapter 3, we extend our \emph{de novo} LAP framework to evaluate metagenomic assemblies.
We will show that by modifying our likelihood calculation to take into account abundances of assembled sequences, we can accurately and efficiently compare metagenomic assemblies.
We find that our extended LAP framework is able to reproduce results on data from the Human Microbiome Project (HMP)~\cite{mitreva2012structure,methe2012framework} that closely match the reference-based evaluation metrics and outperforms other \emph{de novo} metrics traditionally used to measure assembly quality.
Finally, we have integrated our LAP framework into the metagenomic analysis pipeline MetAMOS~\cite{treangen2013metamos}, allowing any user to reproduce quality assembly evaluations with relative ease.
In Chapter 4, we provide a novel regression testing framework for genome assemblers.
Our framework that uses two assembly evaluation mechanisms: \emph{assembly likelihood}, calculated using our LAP framework\cite{LAP}, and \emph{read-pair coverage}, calculated using REAPR\cite{hunt2013reapr}, to determine if code modifications result in non-trivial
changes in assembly quality.
We study assembler evolution in two contexts. First,
we examine how assembly quality changes
throughout the version history of the popular assembler SOAPdenovo~\cite{luo2012soapdenovo2}. Second,
we show that our framework can correctly evaluate decrease in assembly quality
using fault-seeded versions of another assembler Minimus~\cite{sommer2007minimus}.
Our results show that our framework accurately detects trivial
changes in assembly quality produced from permuted input reads and using
multi-core systems, which fail to be detected using traditional regression
testing methods.
In Chapter 5, we build on the pipeline described in Chapter 4 and introduce VALET, a \emph{de novo} pipeline for finding misassemblies within metagenomic assemblies.
We flag regions of the genome that are statistically inconsistent with the data generation process and underlying species abundances.
VALET is the first tool to accurately and efficiently find misassemblies in metagenomic datasets.
We run VALET on \DIFdelbegin \DIFdel{publicaly }\DIFdelend \DIFaddbegin \DIFadd{publicly }\DIFaddend available datasets and use the findings to suggest improvements for future metagenomic assemblers.
In Chapter 6, we discuss our other contributions to bioinformatics relating to the domains of clustering, compression, and cloud computing.
\clearpage{}
\clearpage{}
\renewcommand{\thechapter}{2}
\chapter{Comparing Whole-Genome Assemblies}
\section{Introduction}
Here we propose an objective and holistic approach for evaluating and
comparing the quality of assemblies derived from a same dataset. Our
approach defines the quality of an assembly as the likelihood that the
observed reads are generated from the given assembly, a value which can
be accurately estimated by appropriately modeling the sequencing
process. This basic idea was formulated in the 1990's in the
pioneering work of Gene Myers~\cite{myers1995}, where he suggested
the correct assembly of a set of reads must be consistent (in terms of
the Kolmogorov-Smirnoff test statistic) with the statistical
characteristics of the data generation process. The same basic idea
was further used in the arrival-rate statistic (A-statistic) in Celera
assembler~\cite{CeleraAssembler} to identify collapsed repeats, and
as an objective function in quasi-species (ShoRAH~\cite{SHORAH},
ViSpA~\cite{VISPA}), metagenomic (Genovo~\cite{genovo2011}),
general-purpose assemblers~\cite{medvedev2009maximum}, and recent assembly
evaluation frameworks (ALE ~\cite{clark2013ale}, CGAL~\cite{rahman2013cgal}).
In this chapter, we will describe in detail a mathematical model of the
sequencing process that takes into account sequencing errors and
mate-pair information, and show how this model can be computed in
practice. We will also show that this \emph{de novo} probabilistic framework is able to automatically and accurately reproduce the
reference-based ranking of assembly tools produced by the
Assemblathon~\cite{earl2011assemblathon} and GAGE~\cite{salzberg2011gage}
competitions.
Our work is similar in spirit to the recently published ALE~\cite{clark2013ale} and CGAL~\cite{rahman2013cgal}; however, we provide here several extensions of practical importance. First, we propose and evaluate a sampling-based protocol for computing the assembly score which allows the rapid approximation of assembly quality, enabling the application of our methods to large datasets. Second, we evaluate the effect of unassembled reads and contaminant DNA on the relative ranking of assemblies according to the likelihood score.
Finally, we will
demonstrate the use of our probabilistic quality measure as an
objective function in optimizing the parameters of assembly programs.
The software implementing our approach is made available, open-source
and free of charge, at: \url{http://assembly-eval.sourceforge.net/}.
\section{Methods}
\label{theory}
\subsection{Theoretical foundation for probabilistic evaluation}
In this section, we formalize the probabilistic formulation of
assembly quality and the model of the sequencing process that
allows us to compute the likelihood of any particular assembly of a
set of reads. We will show that the proposed probabilistic score is
correct in the sense that the score is maximized by the true genome
sequence.
\subsubsection{Likelihood of an assembly}
Let $A$ denote the event that a given assembly is the true genome
sequence, and let $R$ denote the event of observing a given set of
reads. In the following, we will use the same symbol to denote the
assembly sequence and the event of observing the assembly. We will
also use the same symbol to denote the set of reads and the event of
observing the set of reads.
According to Bayes' rule, given the observed set of reads, the probability of the assembly can be written as:
\begin{equation}
\Pr[A \vert R] = \frac{\Pr[R \vert A] \Pr[A]}{\Pr[R]}
\end{equation}
\noindent where $\Pr[A]$ is the \emph{prior probability} of observing the genome
sequence $A$. Any prior knowledge about the genome being assembled
(e.g., approximate length, presence of certain genes, etc.) can be
included in $\Pr[A]$; however, for the purpose of this paper, we will
assume that this prior probability is constant across the set of
``reasonable'' assemblies of a same set of reads.
Given commonly available information about the genomes, formulating a precise mathematical framework for defining $\Pr[A]$ is an extensive endeavor beyond the scope of this paper.
Similarly, $\Pr[R]$ is the prior probability of observing the set of
reads $R$. Since our primary goal is to compare multiple assemblies
of a same set of reads, rather than to obtain a universally accurate
measure of assembly quality, we can assume $\Pr[R]$ is a constant as
well. Thus, for the purpose of comparing assemblies, the values $\Pr[A
\vert R]$ and $\Pr[R \vert A]$ are equivalent. The latter, the
posterior probability of a set of reads, given a particular assembly of
the data, can be easily computed on the basis of an appropriately
defined model of the sequencing process and will be used in our paper
as a proxy for assembly quality.
Under the assumption that individual reads are independent of each
other (violations of this assumptions in the case of mate-pair
experiments will be discussed later in this section), $\Pr[R \vert A]
= \prod_{r \in R} \Pr[r \vert A]$. If the set of reads is unordered, we need to account for the different permutations that generate the same set of reads. As this value is a constant for any given set of reads, we ignore it in the rest of our paper.
$\Pr[r \vert A]$, hereafter referred to as $p_r$, can be computed using an appropriate
model for the sequencing process. Throughout the remainder of the
paper, we will discuss increasingly complex models and their impact on
the accuracy of the likelihood score.
\subsubsection{True genome obtains the maximum likelihood}
\label{maximizing_likelihood}
Any useful assembly quality metric must achieve its maximum value when
evaluating the true genome sequence; otherwise, incorrect assemblies of
the data would be preferred. We prove below that the likelihood
measure proposed in our paper satisfies this property.
\edit{Assuming that we have a set of reads $R$ from the true genome, produced by generating exactly one single-end read from each location in the genome without errors and with a fixed length. Given the set of reads $R$, the probability a particular read is generated from the true genome is precisely the number of times the read occurs in $R$ divided by the size of $R$ (note that multiple reads can have the same sequence, e.g., when generated from repeats). Let $N_s$ denote number of times that the sequence $s$ occurs in $R$, and $q_s = N_s/|R|$ denote the probability that sequence $s$ is generated from the true genome. To show that the true genome maximizes the likelihood score, let us assume that we have some assembly $A$ and $p_s$ is the probability that the sequence $s$ is generated from the assembly $A$.}
\edit{Given assembly $A$, our likelihood score is then the product of ${p_s}^{N_s}$ over all sequences $s$ in $S$, which can be rewritten as $\prod_{s \in S} {p_s}^{q_s|R|} = (\prod_{s \in S} {p_s}^{q_s})^{|R|}$. Now, note that since $|R|$ is a fixed constant, maximizing the likelihood score is equivalent to maximizing}
\begin{align*}
\prod_{s \in S} {p_s}^{q_s}
\end{align*}
The likelihood can be re-written as
\begin{align*}
\log (\prod_{s \in S} {p_s}^{q_s}) & = \sum_{s \in S} q_s \log p_s \\
& = \sum_{s \in S} q_s \log (\frac{p_s}{q_s}) + \sum_{s \in S} q_s \log q_s \\
& = - D_{KL}(Q||P) - H(Q),
\end{align*}
\noindent where $D_{KL}(Q||P)$ is the KL-divergence for the distributions $Q$ and
$P$, and $H(Q)$ is the Shannon entropy of $Q$. Since the
KL-divergence is always non-negative and only equal to 0 if and only if $Q = P$,
the average probability is maximized if the assembly is equal to the
true genome.
Even though the true genome does maximize the likelihood in this
model, there may be other assemblies that achieve the same optimal
score as long as these assemblies yield probabilities $p_s$ which are
equal to the probabilities $q_s$ for every sequence $s$. This can happen, for example,
in the case of a misassembly that is nonetheless consistent with the
generated reads. This situation highlights the loss of information
inherent in modern sequencing experiments -- without additional
long-range information, the information provided by the reads
themselves is insufficient to distinguish between multiple possible
reconstructions of a genome~\cite{kingsford2010assembly}.
\subsubsection{Error-free model for fragment sequencing}
\label{error_free_model}
The most basic model for the sequencing process is the
\emph{error-free model}. In this model, we assume reads of a given
fixed length (a more general read length distribution can be included
in the model but would not impact comparative analyses of assemblies
derived from a same set of reads). We further assume that reads are
uniformly sampled across the genome, i.e., that every position of the
genome is equally likely to be a starting point for a read. This
simplifying assumption is made by virtually all other theoretical
models of genome assembly, despite the biases inherent to all modern
sequencing technologies. A more accurate, technology-dependent, model
can be obtained by including additional factors that account, for
example, for DNA composition biases. For the purpose of generality,
we restrict our discussion to the uniform sampling model.
Furthermore, for the sake of simplicity, we assume (1) that the true
genome consists of a single circular contiguous sequence, (2) that our
assembly is also a single contig, and (3) that every read can be
mapped to the assembly. We will later discuss extensions of our model
that relax these assumptions.
Under these assumptions, we can compute the probability of a
read $r$ given the assembled sequence as:
\begin{equation}
\label{error_free_probability}
p_r = \frac{n_r}{2L}
\end{equation}
where $n_r$ represents the number of places where the read occurs in
the assembled sequence of length $L$. The factor $2$ is due to the
fact that reads are sampled with equal likelihood from both the
forward and reverse strands of a DNA molecule. This formulation was
previously used by Medvedev \emph{et al.}~\cite{medvedev2009maximum}
to define an objective function for genome assembly.
\subsection{A realistic model of the sequencing process}
The error-free model outlined above makes many simplifying assumptions
that are not representative of real datasets. Here we demonstrate
how the model can be extended to account for artifacts such as
sequencing errors, mate-pair information, assemblies consisting of
multiple contigs, and the presence of un-mappable reads.
\subsubsection{Sequencing errors}
\label{methods_errors}
All current technologies for sequencing DNA have a small but
significant probability of error. Here we focus on three common types
of errors: the insertion, deletion, and substitution of a nucleotide.
In the error-free model, the probability of a read having been
generated from a position $j$ in the sequence is one if the read
exactly matches the reference at that position and zero otherwise. We
now extend this model such that the probability of each read having
been generated from any position $j$ of the reference is a real value
between zero and one, representing the likelihood that a sequencing
instrument would have generated that specific read from that specific
position of the reference. This value clearly depends on the number of differences
between the sequence of the read and the sequence of the reference at
position $j$. Given the assembled sequence, the probability of a particular read will be the cumulative probability of the read
across all possible locations in the genome.
Specifically, let us denote the probability that read $r$ is observed
by sequencing the reference, \emph{ending} at position $j$ by
$p_{r,j}$. Then, the total probability of the read $r$ is
\begin{align}
\label{prob_error_sum}
p_r = \frac{{\sum_{j=1}^{L} p^{\text{forward}}_{r,j}} + {\sum_{j = 1}^{L} p^{\text{reverse}}_{r,j}}}{2L}
\end{align}
The individual probabilities $p_{r,j}$ can be computed if we
do not model insertion and
deletion errors and only allow substitution errors which occur with probability $\epsilon$. The per-base probability of a substitution error can be set individually for each based on the quality value
produced by the sequencing instrument. Then, $p_{r, j} = \epsilon^{s}(1 - \epsilon)^{l - s}$
, where $s$ is the number of substitutions needed to match read $r$ to
position $j$ of the reference sequence.
In the more general case,
$p_{r,j}$ values can be computed using dynamic programming.
\subsubsection{Exact probability calculation via dynamic programming}
\label{methods_dynamic}
For a model of the sequencing process that allows insertions, deletions,
and substitutions with specific probabilities, we can exactly compute
probability, $p_r = \Pr[r \vert A]$, of observing a read $r$ given an
assembly $A$ using a dynamic programming algorithm. In general, we
want to find the sum of the probabilities of all possible alignments of a
read to a position of the assembly.
\begin{figure}[h!]
\begin{center}
\includegraphics[]{figures/fig_1_optimal_alignments}
\end{center}
\renewcommand{\baselinestretch}{1}
\small\normalsize
\begin{quote}
\caption[Multiple optimal read alignments.]{Two different optimal alignments of the read \textbf{ACG} to the assembly \textbf{ACCG}. Our dynamic programming algorithm finds the sum of the probabilities of all possible alignments.}
\label{different_optimal_alignments}
\end{quote}
\end{figure}
\renewcommand{\baselinestretch}{2}
\small\normalsize
The number of such possible alignments grows exponentially as a
function of read length. Most of those alignments have a very
small probability. However, several alignments may have probabilities
that are equal or close to the optimal. For example, the two alignments
of the same pair of sequences in
Figure~\ref{different_optimal_alignments} have the same probability
and are both optimal alignments.
We use a dynamic programming algorithm (similar to the ``forward''
algorithm in Hidden Markov Models) to efficiently calculate the sum of the
probabilities of all alignments of a read to the assembly as follows.
In the formula~\eqref{prob_error_sum}, $p^{\text{forward}}_{r,j}$ and
$p^{\text{reverse}}_{r,j}$ are the sum of the probabilities
of \emph{all} possible alignments of the read $r$ to, respectively, the
reference and its reverse complement, ending at position $j$.
We define $T[x,y]$ as the probability of observing prefix $[1 \ldots
y]$ of the read $r$, if $y$ bases are sequenced from the reference, ending
at position $x$. Therefore, $p_{r, j} = T[j, l]$. $T[x, 0]$ represents the probability of observing an empty sequence if
we sequence zero bases and is set to 1. $T[0, y]$
represents the probability of observing prefix $[1 \ldots y]$ of the
read if $y$ bases are sequenced from the reference, ending at position
$0$ (before the beginning), and is set to 0.
For $x \geq 1$ and $y \geq 1$, $T[x, y]$ is recursively defined:
\begin{align}
\label{dp_single}
T[x, y] = & \quad T[x - 1, y - 1] \Pr[\operatorname{Substitute}(A[x], r[y])] \\
& + T[x, y - 1] \Pr[\operatorname{Insert}(r[y])] \notag\\
& + T[x - 1, y] \Pr[\operatorname{Delete}(A[x])], \notag
\end{align}
\noindent where $r[y]$ and $A[x]$ represent the nucleotides at positions $y$ and
$x$ of the read $r$ and the assembly $A$, respectively.
$\Pr[\operatorname{Substitute}(A[x], r[y])]$ is the probability of
observing the nucleotide $r[y]$ by sequencing the nucleotide $A[x]$.
In our experiments, we did not distinguish between
different types of errors and considered their probability to be
$\epsilon$ and the probability of observing the correct nucleotide to
be $1 - \epsilon$.
The dynamic programming algorithm outlined above has a running time of
$O(lL)$ per read. Even though the running time is polynomial, it is
slow in practice. However, we can speed it up by using alignment
seeds. The seeds would give us the regions of the assembly where a
read may align with high probability. We can apply the dynamic
programming only to those regions and get a very good approximate
value of the total probability. We use exact seeds ($k$-mers) of a
given length to build a hash index of the assembly sequence. Then,
each read is compared to the regions where it has a common $k$-mer
with the assembly sequence.
\subsubsection{Mate pairs}
any of the current sequencing technologies produce paired reads --
reads generated from the opposite ends of the same DNA fragment. This
information is extremely valuable in resolving genomic repeats and in
ordering the contigs along long-range scaffolds; however, the paired
reads violate the assumption that reads are sampled independently,
that we made in the discussion above. To address this issue, we can use the
pairs rather than the individual reads as the underlying objects from
which the assembly likelihood is computed. To address the possibility
that assembly errors may result in violations of the constraints imposed by
the paired reads, we only consider pairs for which both ends align to
a same contig or scaffold within the constraints imposed by the
parameters of the sequencing experiment. Any pairs that violate these
constraints get classified as unassembled. Note that in addition to
sequencing errors, we now also handle fragment sizing errors --
deviations of the estimated distance between paired reads from the
distance implied by the sequencing experiment. We model the
distribution of fragment sizes within a same library by a normal
distribution, using user-supplied parameters, and use this information
to appropriately scale the likelihood estimate for each possible
placement of a mate pair along the genome.
We modify the dynamic programming recurrence from formula~\eqref{dp_single} to handle the probability calculation for the paired reads as follows.
The probability of the first read in the pair is calculated as the same as in the formula~\eqref{dp_single}. For the second read, we adjust the dynamic programming to ensure that it is aligned within a certain distance downstream of the alignment of the first read.
We modify the first column of the dynamic programming table of the \emph{second} read in the pair to take into account the distance from the first read.
Formally, given a paired read, we define $T_2[x,y]$ as the probability of observing prefix $[1 \ldots y]$ of the $2$nd read in the pair, if $y$ bases are sequenced from the reference, ending at position $x$.
Assume that the second read occurs after the first read and is separated by a normally-distributed distance with mean $\mu$ and with a standard deviation $\sigma$.
Therefore,
\begin{align}
T_2[x, 0] = \sum_{i=1}^{x}{\Pr[\operatorname{insert}(x-i)|N(\mu,\sigma)))] + T_1[x-i,l]},
\end{align}
\noindent where $\Pr[\operatorname{insert}(n)|N(\mu,\sigma)))]$ is the probability of observing an insert size of length $n$ from a normal distribution with parameters $\mu$ and $\sigma$, and $l$ is the length of the first read in the pair.
Instead of using two tables, we can concatenate the read pair together with a special character ($M$), which will signal when the insert size should be taken into account.
For $x \geq 1$ and $y \geq 1$, $T[x, y]$ is recursively defined as follows:
\begin{equation}
\begin{aligned}
T[x, y] = \quad
\text{if }r[y] == M&\begin{cases}
\sum_{i=1}^{x}{\Pr[\operatorname{insert}(x-i)|N(\mu,\sigma)))] + T[x-i,y-1]}
\end{cases}
\\
\text{else }&\begin{cases} T[x - 1, y - 1] \Pr[\operatorname{Substitute}(A[x], r[y])] \\
\quad + T[x, y - 1]\Pr[\operatorname{Insert}(r[y])] \\
\quad + T[x - 1, y]\Pr[\operatorname{Delete}(A[x])]
\end{cases}
\end{aligned}
\end{equation}
\subsubsection{Assemblies containing more than one contig}
As we mentioned in the introduction, the output of an assembler
usually consists of a (large) set of contigs rather than one single
contig, representing the genome being assembled.
In the extreme case, an ``assembler'' may return the set of
unassembled input reads (\edit{or the set of all k-mers in De Bruijn-based assemblers}) as its output. Our likelihood score must be modified to account for such fragmented assemblies.
In practice, most assemblers join contigs only if they overlap by more than a certain number of bases; however, we only consider the case where contigs are non-overlapping substrings of the true genome.
In this case, the length of the original genome must be \emph{at least} the
sum of the lengths of the contigs, that is, $\sum L_j$, where $L_j$ is the
length of the $j$th contig. Therefore, the probability of every read
is at most:
\begin{equation}
\begin{aligned}
\frac{n_r}{2\sum L_j}
\end{aligned}
\end{equation}
Overlapping contigs can be handled by reducing the length of the contigs by a value representing the minimum overlap required by the assembler, as performed, for example, in Genovo~\cite{genovo2011}.
\subsubsection{Reads that do not align well}
\label{methods_practical_unassembled}
In practice, popular assemblers do not incorporate every read in the
assembly. Possible reasons include assembly errors (such as collapsed
tandem repeats), reads with high error rates, or contamination in the
DNA sample. These ``singleton'' or ``chaff'' reads cannot be modeled
by our likelihood approach as the likelihood of any assembly that
does not incorporate every read is zero.
When sequencing errors are modeled, every read obtains a non-zero likelihood, even if it does not align to the assembly. Since, in general, a non-trivial fraction of the total set of the reads cannot be mapped to the assembly, by their sheer number, the singleton reads would dominate the probability calculation.
To account for this factor, we argue that for any read that does not
align well, the overall probability of the assembly should not be
lower than the probability of the same assembly when the missing read
is appended to its sequence as a separate contig. The effect of such an addition on the
overall probability can be calculated as follows. First, the
probability of observing this read exactly,
$\left(\frac{\Pr[\text{exact match}]}{2L}\right)$, is multiplied to
the product of the probabilities of all mapped reads. Second, the
probabilities of the mapped reads are decreased slightly due to the
increase in the length of the assembled sequence.
For simplicity, let us assume an error-free model where each read maps
to exactly one position on the assembled sequence. Let $k$ denote the
number of the original reads. The ratio between the new probability
for all original reads divided by their probability before adding the
new read is:
\[
\frac{\frac{1}{(L + l)^k}}{\frac{1}{L^k}} = \left(\frac{L}{L + l}\right)^k = \left(1-\frac{l}{L + l}\right)^k \approx e^{-\frac{lk}{L}}
\]
Therefore, if the probability of observing a read is less than
\begin{equation}
\label{probability_threshold}
\frac{\Pr[\text{exact match}]}{2L}e^{-\frac{l\left\vert R\right\vert}{L}},
\end{equation}
we consider this read as ``unmapped'' and
use formula~\eqref{probability_threshold} as its probability. The probability
of an exact match $\Pr[\text{exact match}]$ is approximated by
$\left(1 - \epsilon\right)^{l}$, where $\epsilon$ is the probability of
an error (a mismatch, an insertion, or a deletion).
\subsection{Performance considerations}
\subsubsection{Estimating the average read likelihood by sampling}
\label{sampling_reads}
Depending on the specific characteristics of the chosen sequencing model, the computation of the probability $\Pr[R \vert A]$ can be expensive
for the dataset sizes commonly encountered in current projects (tens
to hundreds of millions of reads). In such cases, we can approximate
the likelihood of an assembly by using a random subset of the reads
$R^\prime \subseteq R$. To counteract the effect of the size of the
sample on the computed probability, we define the assembly quality as
the geometric mean of the individual read probabilities:
\begin{equation}
\label{average_probability}
\operatorname{AP}(R^\prime) =
\left(\prod_{r \in R^\prime} p_r\right)^{\frac{1}{\left|R^\prime\right|}}
\end{equation}
The logarithm of this value (Log Average Probability (LAP)) is
reported in the remainder of the paper as the assembly quality ``score'':
\begin{equation}
\label{average_log_probability}
\operatorname{LAP}(R^\prime) = \log_{10} \left( \operatorname{AP}(R^\prime) \right) = \frac{\sum_{r \in R^\prime} \log_{10} p_r}{\left|R^\prime\right|}
\end{equation}
In other words, we define the assembly quality as the average log
likelihood of the reads given an assembly. This formulation
also allows us to estimate the accuracy of the approximate likelihood
value produced by sub-sampling the set of reads. According to
sampling theory, the distribution of the scores across multiple
samples has the mean equal to the true likelihood of the assembly
(computed from all the reads) and a standard error proportional to
$\frac{1}{\sqrt{\left|R^\prime\right|}}$, i.e., the larger the sample is,
the more accurate our estimation is for the likelihood of the true assembly.
Since the probability of a read is bounded by formula (\ref{probability_threshold}), the variance of the sample can also be bounded by this value.
In practice, we increase the sample size until the assemblies can be unambiguously distinguished by the LAP value.
Specifically, we increase the sample size, by binary search, until the LAP values are separated by at least a single standard deviation. The level of subsampling required will, thus, be dependent on the extent of the differences between the assemblies –- for very different assemblies, low levels of subsampling are sufficient.
\subsubsection{Approximating the likelihood value using an aligner}
\label{methods_aligner}
Alternatively, when it is impractical to calculate exact probabilities
for large sets of reads, we can approximate these probabilities using
fast and memory-efficient alignment search programs, which internally
model the sequencing process. We use Bowtie 2~\cite{langmead2012fast} to align the reads to
the assembly. However, our programs are easy to adapt for any
read alignment tool that stores the alignment results in
SAM\cite{li2009sequence} format.
For each reported alignment, we use the number of substitutions $s$ to
compute the probability $p_{r}$. The probability of this alignment, $p_{r,j}$,
can be approximated by $\epsilon^{s}(1 - \epsilon)^{l - s}$ and
\begin{equation}
\label{}
p_{r} = \frac{\sum_{j \in S_r} p_{r,j}}{2L},
\end{equation}
where $S_r$ is the set of alignments in the SAM file for the read $r$.
We can further extend this equation to mated reads. A pair of mated
reads aligns if the distance and orientation of the alignment of the
pair are consistent with the experimental design parameters. Given
read $i_1$ and its mate $i_2$, we compute $p_{(i_1,i_2)}$ by
multiplying the probabilities of individually aligning each mate at
their respective positions with the probability that they are
separated by their distance from each other. That is,
\begin{equation}
\label{mate_pair_prob}
p_{(i_1,i_2)} = \frac{\sum_{(j_1,j_2) \in S_{(i_1,i_2)}} p_{i_1,j_1} p_{i_2,j_2} \Pr[\textrm{insert}(j_2 - (j_1 + l_1))]}{2(L - l)},
\end{equation}
where $p_{i_1, j_1} = \epsilon^{s_1}(1 - \epsilon)^{l_1 - s_1}$. Mate
pair insert sizes follow a normal distribution with mean and
standard deviation being estimated from the parameters of the sequencing process. Unless otherwise stated, the standard deviation is 10\%
of the insert size. If only one of the mates, $i_1$ or $i_2$, maps,
the probability $p_{(i_1,i_2)}$ is $0$. We use (\ref{probability_threshold}) to set the probability for this case.
In our experiments, Bowtie 2 was used to approximate the read
probabilities for the larger datasets; however, it could be substituted with any other aligner.
\subsection{Datasets}
The read data for \emph{Rhodobacter sphaeroides} 2.4.1 was downloaded from \url{
http://gage.cbcb.umd.edu/data/Rhodobacter_sphaeroides}, and the
corresponding reference sequence was obtained from the NCBI RefSeq database (NC\_007493.1, NC\_007494.1, NC\_009007.1, NC\_007488.1,
NC\_007489.1, NC\_007490.1, NC\_009008.1).
In addition, two more \emph{Rhodobacter} genomes were selected as
reference genomes, specifically \emph{R. sphaeroides} ATCC 17025 (NCBI
IDs NC\_009428.1, NC\_009429.1, NC\_009430.1, NC\_009431.1,
NC\_009432.1), and \emph{R. capsulatus} SB1003 (NC\_014034.1,
NC\_014035.1).
The read data for \emph{Stapylococcus aureus} USA300 was downloaded from \url{
http://http://gage.cbcb.umd.edu/data/Staphylococcus_aureus}, and the
corresponding reference sequence was obtained from the NCBI RefSeq database (NC\_010063.1, NC\_010079.1, NC\_012417.1).
In addition, two more \emph{Stapylococcus} genomes were selected as
reference genomes, specifically \emph{S. aureus} 04-02981 (CP001844), and \emph{S. epidermidis} ATCC 12228 (AE015929, AE015930,
AE015931, AE015932, AE015933, AE015934, AE015935).
The read data for human chromosome 14 was downloaded from \url{http://gage.cbcb.umd.edu/data/Hg_chr14/}, and the corresponding reference sequence was obtained from the NCBI RefSeq database (NC\_000014.8).
The Assemblathon~1 competition evaluates assemblies on the simulated
short read dataset generated from the simulated 110 Mbp diploid genome.
The competition provides sequence libraries with varying insert sizes (200-10,000 bp)
and coverage (20-40x).
Assemblathon~1 allowed teams to submit multiple entries, but for our
analyses, we only examine the top ranking assemblies from each
team. The raw reads and the consensus sequence of the top ranking
assemblies were downloaded from \url{http://korflab.ucdavis.edu/Datasets/Assemblathon/Assemblathon1/}.
Also used in our analyses is the \emph{E. coli} K12 MG1655 dataset, generated using Illumina MiSeq technology
(300 bp insert size, 370x coverage) ({\url{http://www.illumina.com/systems/miseq/scientific_data.ilmn}}).
\section{Results}
\subsection{Performance-related approximations do not significantly
affect the likelihood score}
The full and exact computation of the assembly likelihood score is