-
Notifications
You must be signed in to change notification settings - Fork 0
/
fusioncatcher.py
13449 lines (12071 loc) · 767 KB
/
fusioncatcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
==============================================================================
FusionCatcher
==============================================================================
FusionCatcher searches for novel somatic fusion genes in RNA-seq paired/single-end
reads data produced by the Illumina Solexa platforms (for example:
Solexa/HiSeq/NextSeq/MiSeq/MiniSeq).
Author: Daniel Nicorici, [email protected]
Copyright (c) 2009-2022 Daniel Nicorici
This file is part of FusionCatcher.
FusionCatcher is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
FusionCatcher is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with FusionCatcher (see file 'COPYING.txt'). If not, see
<http://www.gnu.org/licenses/>.
By default, FusionCatcher is running BLAT aligner
<http://users.soe.ucsc.edu/~kent/src/> but it offers also the option to disable
all its scripts which make use of BLAT aligner if you choose explicitly to do so.
BLAT's license does not allow it to be used for commercial activities. If the
BLAT license does not allow its use in your case then you may still use
FusionCatcher by forcing it not to use the BLAT aligner by specifying the option
'--skip-blat'. For more information regarding BLAT please see its license.
Please, note that FusionCatcher does not require BLAT in order to find
candidate fusion genes!
"""
import sys
# Refuse to run under Python 3: this script is written for Python 2.X.
# The message is written with sys.stderr.write() instead of a Python 2 `print`
# statement, because a `print` statement is a SyntaxError under Python 3 and
# would prevent this guard from ever reporting the problem.
if sys.version_info >= (3,):
    sys.stderr.write(
        "ERROR: Python 3 or newer detected! Python 2.X is needed! "
        "FIX: run '/some/python/2.7/python %s'\n" % (sys.argv[0],))
    sys.exit(1)
import os
import struct
import optparse
import multiprocessing
import subprocess
import shutil
import socket
import locale
import math
import configuration
# Switch this process to the "C" locale for every locale category
# (original note: "for sort in linux").
# NOTE(review): presumably needed so that sorting is byte-wise and
# reproducible -- confirm against the code that performs the sorting.
locale.setlocale(locale.LC_ALL, 'C')
# Fixed seed for Bowtie, stored as a string.
# NOTE(review): presumably handed to Bowtie so that alignments are
# reproducible between runs -- its use is not visible in this part of the file.
bowtie_seed = "123456"
def expand(*p):
    """
    Join the given path pieces, expand a leading '~' and return the
    resulting absolute path.
    """
    joined = os.path.join(*p)
    with_home = os.path.expanduser(joined)
    return os.path.abspath(with_home)
def islink(alink = None):
    """
    Wrapper for: os.path.islink()

    One trailing path separator (os.sep) is removed before the test.
    An empty or missing argument gives False.
    """
    if not alink:
        return False
    candidate = alink
    if candidate.endswith(os.sep):
        candidate = candidate[:-1]
    return os.path.islink(candidate)
# Directory of the script being executed: sys.argv[0] is made absolute
# (with '~' expanded) by expand() and its directory part is kept.
pipeline_path = os.path.dirname(expand(sys.argv[0]))
def outdir(*more_paths):
    """
    Return a path inside the output directory: the module-level variable
    'out_dir' joined with the given path pieces.

    'out_dir' is not assigned in this function; it has to exist as a
    module-level variable before the first call.
    """
    pieces = (out_dir,) + more_paths
    return os.path.join(*pieces)
def datadir(*more_paths):
    """
    Return a path inside the data directory: the module-level variable
    'data_dir' joined with the given path pieces.

    'data_dir' is not assigned in this function; it has to exist as a
    module-level variable before the first call.
    """
    pieces = (data_dir,) + more_paths
    return os.path.join(*pieces)
def tmpdir(*more_paths):
    """
    Return a path inside the temporary directory: the module-level variable
    'tmp_dir' joined with the given path pieces.

    'tmp_dir' is not assigned in this function; it has to exist as a
    module-level variable before the first call.
    """
    pieces = (tmp_dir,) + more_paths
    return os.path.join(*pieces)
# make sure that a directory ends with path separator such that workflow can
# recognize it as directory
def adir(a_dir):
    """
    Return 'a_dir' ending with a path separator.

    A value that already ends with '\\' or '/' is returned unchanged,
    otherwise os.sep is appended.
    """
    if a_dir.endswith(('\\', '/')):
        return a_dir
    return a_dir + os.sep
#
#
# test if a command line option has been passed
def is_optparse_provided(parser, dest):
    """
    Tell whether an option storing into 'dest' was given on the command line.

    parser -- an optparse.OptionParser; its private helpers
              '_get_all_options()', '_long_opts' and '_short_opts' are used.
    dest   -- the 'dest' name of the option(s) to look for.

    Returns True when any alias (long or short) of an option whose 'dest'
    equals the given one appears in sys.argv[1:], either alone or in the
    form 'alias=value'; False otherwise.

    Every alias of an option is checked (not only the first one) and the
    program name sys.argv[0] is never taken for an option.

    Limitations: abbreviated long options (e.g. '--thr' for '--threads') and
    short options with the value glued on (e.g. '-p8') are not recognized.
    """
    # option names as typed by the user, without any '=value' part
    given = set(arg.split("=")[0] for arg in sys.argv[1:])
    for opt in parser._get_all_options():
        if opt.dest != dest:
            continue
        if any(name in given for name in opt._long_opts):
            return True
        if any(name in given for name in opt._short_opts):
            return True
    return False
# if any (opt.dest == dest and (opt._long_opts[0] in sys.argv[1:] or (False if (not opt._short_opts) else opt._short_opts[0] in sys.argv[1:])) for opt in parser._get_all_options()):
# return True
# return False
#
#
#
def empty(a_file):
    """
    Tell whether 'a_file' is missing or holds nothing but line breaks.

    Returns True when:
      - 'a_file' is neither a regular file nor a symbolic link, or
      - it is smaller than 100 bytes and every line in it is blank
        (only '\\r'/'\\n' characters).
    Returns False for any file of 100 bytes or more (its content is not
    inspected) and for small files with at least one non-blank line.

    The file is opened with a 'with' block so that its handle is always
    closed.
    """
    if not (os.path.isfile(a_file) or islink(a_file)):
        return True
    if os.path.getsize(a_file) >= 100:
        # large enough: considered non-empty without reading it
        return False
    with open(a_file, 'r') as handle:
        return not any(line.rstrip('\r\n') for line in handle)
#
#
#
def delete_file(some_file):
    """
    Remove 'some_file' when it is a regular file or a symbolic link.

    One trailing path separator (os.sep) is dropped from the name first.
    Anything else (missing path, directory) is left alone.
    """
    target = some_file
    if target.endswith(os.sep):
        target = target[:-1]
    if not (os.path.isfile(target) or islink(target)):
        return
    os.remove(target)
#
#
#
def memory(unit='default', meminfo_path='/proc/meminfo'):
    """
    Read memory figures from a '/proc/meminfo'-style file.

    unit         -- 'GB' or 'MB' (case-insensitive) converts every numeric
                    value, but only when the unit found in the file is 'kB';
                    any other value leaves the numbers as read.
    meminfo_path -- file to parse; defaults to '/proc/meminfo'.

    Returns a dictionary holding every 'Name: value' entry of the file
    (name without the ':') plus:
      'free'  -- MemFree + Buffers + Cached
      'used'  -- MemTotal - 'free'
      'total' -- MemTotal
      'unit'  -- last field of the first line of the file, or 'GB'/'MB'
                 after a conversion.
    When the file does not exist (or has no usable line) a dictionary with
    zeros and unit 'kB' is returned.

    Raises KeyError when MemTotal, MemFree, Buffers or Cached is missing
    from the file.
    """
    meminfo = {'MemTotal':0,'free':0,'used':0,'unit':'kB','total':0}
    if not os.path.isfile(meminfo_path):
        return meminfo

    # 'with' guarantees that the handle is closed; lines without a value
    # (e.g. blank lines) are skipped instead of raising IndexError
    with open(meminfo_path, 'r') as handle:
        rows = [fields for fields in (line.split() for line in handle)
                if len(fields) >= 2]
    if not rows:
        return meminfo

    t = rows[0][-1].strip()
    meminfo = dict((fields[0].rstrip(':'), int(fields[1])) for fields in rows)
    # meminfo['MemTotal'] # e.g. 3921852
    meminfo['free'] = meminfo['MemFree'] + meminfo['Buffers'] + meminfo['Cached']
    meminfo['used'] = meminfo['MemTotal'] - meminfo['free']
    meminfo['unit'] = t
    meminfo['total'] = meminfo['MemTotal']

    divisors = {'GB': 1024*1024, 'MB': 1024}
    wanted = unit.upper()
    if t.upper() == 'KB' and wanted in divisors:
        for k in list(meminfo.keys()):
            if k != 'unit':
                meminfo[k] = float(meminfo[k])/divisors[wanted]
        meminfo['unit'] = wanted
    return meminfo
#
#
#
def info(ajob, fromfile, tofile , top = "\n\n\n", bottom = "\n\n\n" , temp_path = 'no'):
    """
    Append a header, the content of a file and a footer to 'tofile'.

    Nothing is done unless 'ajob.run()' is true.

    ajob      -- object offering 'run()' (tells whether to do the work) and
                 'write(text)' (receives the log messages).
    fromfile  -- file whose lines are copied to 'tofile'; skipped when empty
                 or None.
    tofile    -- file opened in append mode.
    top       -- header, a string (split into lines) or a sequence of lines.
    bottom    -- footer, same form as 'top'.
    temp_path -- not used by this function; kept for the callers.

    Every written line is stripped of its trailing '\\r'/'\\n' and ended
    with a single '\\n'. Header and footer lines are also sent to
    'ajob.write()' prefixed with '>'; the lines of 'fromfile' are not.

    Both files are handled with 'with' blocks, so they are closed even when
    an error occurs.
    """
    if not ajob.run():
        return

    def as_lines(text):
        # a plain string is split into lines, anything else is used as given
        return text.splitlines() if isinstance(text, str) else text

    def emit(lines, out):
        # write each line to the file and echo it to the job log
        for line in lines:
            t = line.rstrip('\r\n')+'\n'
            out.write(t)
            ajob.write(">%s" % (t,))

    with open(tofile,'a') as aux:
        ajob.write("APPENDING to file: '%s'.\n"% (tofile,))
        emit(as_lines(top), aux)
        if fromfile:
            ajob.write(">from file: '%s'\n"% (fromfile,))
            # the source is read completely before anything from it is
            # written, as the original code did
            with open(fromfile,'r') as source:
                copied = source.readlines()
            for line in copied:
                aux.write(line.rstrip('\r\n')+'\n')
        emit(as_lines(bottom), aux)
#
# command line parsing
#
class MyOptionParser(optparse.OptionParser):
    """
    OptionParser which prints its epilog exactly as given, i.e. the line
    breaks of the epilog are kept instead of the text being re-wrapped by
    the help formatter.
    """
    def format_epilog(self, formatter):
        # An empty string is returned when no epilog was given, because
        # format_help() joins the returned pieces as strings and would fail
        # on None.
        return self.epilog or ""
def is_known_extension(something):
    """
    Tell whether the name 'something' looks like an accepted input file.

    Returns True when the lower-cased name ends with one of the known
    suffixes and the name does not start with 'readme', 'index.',
    'checksum', 'md5' (case-insensitive) or with a dot; False otherwise.
    """
    # NOTE(review): the first suffix has no leading dot, unlike the others,
    # so e.g. 'xfastq.gz' is accepted too -- confirm whether this is intended.
    known_suffixes = ('fastq.gz','.fq.gz',
                      '.fastq.bz2','.fq.bz2',
                      '.fastq.zip','.fq.zip',
                      '.fastq.xz','.fq.xz',
                      '.fastq', '.fq',
                      '.sra',
                      '.bam')
    # names starting like this are never input data (readme files and such)
    skipped_prefixes = ('readme', 'index.', 'checksum', 'md5')

    lowered = something.lower()
    if something.startswith('.') or lowered.startswith(skipped_prefixes):
        return False
    return lowered.endswith(known_suffixes)
# Texts handed to the option parser: usage line, epilog, description and
# version string.
usage = "%prog [options]"

epilog = "".join([
    "\n",
    "Author: Daniel Nicorici \n",
    "Email: [email protected] \n",
    "Copyright (c) 2009-2022, Daniel Nicorici \n ",
    "\n",
])

description = "".join([
    "FusionCatcher searches for novel and known somatic gene fusions in RNA-seq \n",
    "paired-end/single-end reads data produced by the Illumina sequencing \n",
    "platforms (like for example: Illumina HiSeq 2500, \n",
    "Illumina HiSeq 2000, Illumina HiSeq X, Illumina NextSeq 500, \n",
    "Illumina GAIIx, Illumina GAII, Illumina MiSeq, Illumina MiniSeq). \n",
])

version = "%prog 1.35"
if __name__ == "__main__":
parser = MyOptionParser(
usage = usage,
epilog = epilog,
description = description,
version = version
)
parser.add_option("--input","-i",
action = "store",
type = "string",
dest = "input_filename",
help = "The input file(s) or directory. The files should be "+
"in FASTQ or SRA format and may be or not compressed "+
"using gzip or zip. "+
"A list of files can be specified by given the "+
"filenames separated by comma. If a directory is given "+
"then it will analyze all the files found with the "+
"following extensions: .sra, "+
".fastq, .fastq.zip, .fastq.gz, .fastq.bz2, fastq.xz, "+
".fq, .fq.zip, .fq.gz, .fq.bz2, fz.xz, "+
".txt, .txt.zip, .txt.gz, .txt.bz2 ."
)
parser.add_option("--batch",
action = "store_true",
dest = "batch_mode",
default = False,
help = "If this is used then batch mode is used "+
"and the input specified using '--input' or '-i' is: "+
"(i) a tab-separated text file containing a each line such "+
"that there is one sample per line and first column are the "+
"FASTQ files' full pathnames/URLs, separated by commas, corresponding to the "+
"sample and an optional second column containing the name for the sample, or "+
"(ii) a input directory which contains a several subdirectories such that each "+
"subdirectory corresponds to only one sample and it contains all the FASTQ files "+
"corresponding to that sample. This is useful when several samples needs to be analyzed."
)
parser.add_option("--single-end",
action = "store_true",
dest = "single_end",
default = False,
help = "If this is used then it is assumed that all the input reads are single-end reads "+
"which must be longer than 130 bp. "+
"Be default it is assumed that all input reads come from a paired-end reads."
)
parser.add_option("--normal","-I",
action = "store",
type = "string",
dest = "normal_matched_filename",
help = "The input file(s) or directory containing the "+
"healthy normal-matched data. They should be given in the same "+
"format as for '--input'. In case that this option is used "+
"then the files/directory given to '--input' is considered "+
"to be from the sample of a patient with disease. This is optional."
)
parser.add_option("--output","-o",
action = "store",
type = "string",
dest = "output_directory",
help = "The output directory where all the output files "+
"containing information about the found candidate fusion"+
"genes are written. Default is '%default'.")
parser.add_option("--data","-d",
action = "store",
type = "string",
dest = "data_directory",
help = "The data directory where all the annotations files "+
"from Ensembl database are placed, e.g. 'data/'. "+
"This directory should be built using 'fusioncatcher-build'. "+
"If it is not used then it is read from configuration file "+
"specified with '--config' from 'data = ...' line.")
parser.add_option("--tmp","-T",
action = "store",
type = "string",
dest = "tmp_directory",
default = "tmp",
help = "The temporary directory where all the outputs files "+
"and directories will be written. Default is directory "+
"'%default' in the output directory specified with '--output'. ")
parser.add_option("--threads","-p",
action = "store",
type = "int",
dest = "processes",
default = 0,
help = "Number or processes/threads to be used for running SORT, Bowtie, "+
"BLAT, STAR, BOWTIE2 and other tools/programs. "+
"If it is 0 (as it is by default) then the number of processes/threads will be "+
"read first from 'fusioncatcher/etc/configuration.cfg' file. If even there it is still set to 0 then "+
"'min(number-of-CPUs-found,32)' processes will be used. Setting number of threads in 'fusioncatcher/etc/configuration.cfg' "+
"might be usefull in situations where one server is shared between several users and in order to limit FusionCatcher using all the CPUs/resources. "+
"Default is '%default'. ")
parser.add_option("--config",
action = "store",
type = "string",
dest = "configuration_filename",
default = os.path.abspath(os.path.join(pipeline_path,"..","etc","configuration.cfg"))+','+os.path.abspath(os.path.join(pipeline_path,"configuration.cfg")),
help = "Configuration file containing the paths to external "+
"tools (e.g. Bowtie, Blat, fastq-dump.) in case that "+
"they are not specified in PATH variable! "+
"Default is '%default'.")
parser.add_option("--force-paths","-F",
action = "store_true",
dest = "force_paths",
default = False,
help = "If it is specified then all external tools and all Python tools "+
"will be executed by FusionCatcher by using their corresponding absolute paths, "+
"which will be obined from the fusioncatcher/bin/configuration.cfg file. "+
"By default no paths are specified when executing tools/scripts. "+
"Default is '%default'. ")
parser.add_option("--no-update-check","-Z",
action = "store_true",
dest = "skip_update_check",
default = False,
help = "Skips the automatic routine that contacts the "+
"FusionCatcher server to check for a more recent version. "+
"Default is '%default'. ")
parser.add_option("--5keep","-l",
action = "store",
type = "int",
dest = "trim_3end_keep",
default = 60, # 60 # 68
help = optparse.SUPPRESS_HELP
# "This may be seen as seed length. For Bowtie aligner the reads "+
# "longer than '%default' will be trimmed from "+
# "their 3-end such that to become exactly '%default' bp long. "+
# "Reads shorter than '%default' will not be trimmed. " +
# "The trimming priorities are '--5end','--3end','--5keep'. " +
# "if several trimming options are used simultaneously. "+
# "The trimming is done by default only to the reads used "+
# "for BOWTIE aligner but not for BLAT/STAR/BOWTIE2 aligners. In order "+
# "to apply the trimming also the reads used by BLAT/STAR/BOWTIE2 aligners "+
# "option '--trim-psl' should be used! The trimming of reads for "+
# "BLAT/STAR/BOWTIE2 aligners is done using the option '--trim-psl-5keep'. "+
# "Default is '%default'."
)
parser.add_option("--5keep2",
action = "store",
type = "int",
dest = "trim_3end_keep2",
default = 23, # 27
help = optparse.SUPPRESS_HELP
# "After trimming using '--5keep' then a second round of trimming will "+
# "be done on unmapped reads to try to come with extra candidate fusion genes. "+
# "If this set to 0 then the trimming is disabled. "
# "Default is '%default'."
)
parser.add_option("--5end","-5",
action = "store",
type = "int",
dest = "trim_5end",
default = 0,
help = optparse.SUPPRESS_HELP
# "It trims all the reads from their 5-end with the "+
# "given size. "+
# "The trimming priorities are '--5end','--3end','--5keep'. " +
# "if several trimming options are used simultaneously. "+
# "The trimming is done by default only to the reads used "+
# "for BOWTIE aligner but not for BLAT aligner. In order "+
# "to apply the trimming also the reads used by BLAT/STAR/BOWTIE2 aligners "+
# "option '--trim-psl' or '--trim-psl-5end' should be used! "+
# "Default is '%default'."
)
parser.add_option("--3end","-3",
action = "store",
type = "int",
dest = "trim_3end",
default = 0,
help = optparse.SUPPRESS_HELP
# "It trims all the reads from their 3-end with the "+
# "given size. "+
# "The trimming priorities are '--5end','--3end','--5keep'. " +
# "if several trimming options are used simultaneously. "+
# "The trimming is done by default only to the reads used "+
# "for BOWTIE aligner but not for BLAT aligner. In order "+
# "to apply the trimming also the reads used by BLAT/STAR/BOWTIE2 aligners "+
# "option '--trim-psl' should be used! "+
# "Default is '%default'."
)
parser.add_option("--trim-psl",
action = "store_true",
dest = "trim_psl",
default = False,
help = optparse.SUPPRESS_HELP
# "If it is specified then also the reads given as input "+
# "to BLAT/STAR/BOWTIE2 aligners are trimmed using the parameters given "+
# "by command line arguments '--5keep', '--5end', and '--3end'. "+
# "By default the trimming options "+
# "'--5keep', '--5end', '--3end' are trimming the reads only for "+
# "for the BOWTIE method but not when BLAT/STAR/BOWTIE2 are used. "+
# "Default is '%default'."
)
parser.add_option("--trim-psl-5keep","-x",
action = "store",
type = "int",
dest = "trim_psl_3end_keep",
default = 82, # 80
help = optparse.SUPPRESS_HELP
# "This may be seen as seed length. All reads given as input "+
# "to BLAT/STAR/BOWTIE2 aligners and which "+
# "are longer than '%default' will be trimmed from "+
# "their 3-end such that to become exactly '%default' bp long. "+
# "The reads given as input to Bowtie are not trimmed using this "+
# "option. It should be set to 0 if no trimming should be done "+
# "for BLAT/STAR/BOWTIE2. "+
# "Default is '%default'."
)
parser.add_option("--trim-psl-5end",
action = "store_true",
dest = "trim_psl_5end",
default = False,
help = optparse.SUPPRESS_HELP
# "If it is specified then also the reads given as input "+
# "to BLAT/STAR/BOWTIE2 aligners are trimmed using the parameters given "+
# "by command line argument '--5end'. "+
# "By default the trimming options "+
# "'--5keep', '--5end', '--3end' are trimming the reads only for "+
# "for the BOWTIE method but not when BLAT/STAR/BOWTIE2 are used. "+
# "Default is '%default'."
)
parser.add_option("--trim-quality","-Q",
action = "store",
dest = "trim_quality",
type = "int",
default = 5,
help = optparse.SUPPRESS_HELP
# "The input reads will be trimmed from their 3'end "+
# "when the quality scores are below the given threshold, e.g. 5 for Q5. "+
# "Default is '%default'."
)
parser.add_option("--trim-wiggle",
action = "store",
dest = "trim_wiggle",
type = "int",
default = 0, # it was 2
help = optparse.SUPPRESS_HELP
# "The input reads will be trimmed during the alignment "+
# "from their 5' and 3' ends for filtering only purposes. "+
# "Default is '%default'."
)
parser.add_option("--trimfq",
action = "store",
dest = "trimfq",
type = "float",
default = 1.00,
help = optparse.SUPPRESS_HELP
# "If this is set less than 1.00 the quality then the quality "+
# "trimming will be done using Phred algorithm in addition to "+
# "quality filtering which is already done by default. "+
# "For this the 'seqtk trimfq' tool is used and also the input "+
# "reads should have quality score in Sanger format. A recommended value "+
# "here for quality trimming is 0.05 (which is the default value of 'seqtk trimfq') or 0.10."
)
parser.add_option("--skip-trim-multiple-5",
action = "store_true",
dest = "skip_trim_multiple_5",
default = False,
help = optparse.SUPPRESS_HELP
# "It trims the 3' ends of the reads to multiple of 5, "+
# "for example 51bp to 50bp. It looks like for Illumina "+
# "reads the last 51 or 76 or 101 or 151 is really bad quality."
)
parser.add_option("--skip-filter-low-entropy",
action = "store_true",
dest = "skip_filter_low_entropy",
default = False,
help = optparse.SUPPRESS_HELP
# "It masks with Ns the low entropy regions in reads."+
)
parser.add_option("--skip-parsort",
action = "store_true",
dest = "skip_parsort",
default = False,
help = "It skips using GNU PARSORT and instead is using classic SORT."
)
parser.add_option("--skip-fastqtk",
action = "store_true",
dest = "skip_fastqtk",
default = False,
help = "It skips using FASTQTK."
)
mydefault = sorted([
"paralogs",
"pair_pseudo_genes",
"similar_reads",
"ambiguous",
"similar_symbols",
"ensembl_fully_overlapping",
"ensembl_same_strand_overlapping",
# 'ucsc_fully_overlapping',
# 'ucsc_same_strand_overlapping',
'refseq_fully_overlapping',
'refseq_same_strand_overlapping',
"dist1000bp",
"rrna",
"trna",
"mt",
"mirna",
"yrna",
"7skrna",
"snorna",
"snrna",
"cta",
"ctb",
"ctc",
"ctd",
"rp",
"rp11",
"banned",
"healthy",
"hla",
"conjoing",
"metazoa",
"bodymap2",
"hpa",
"1000genomes",
# "non_tumor_cells",
"multi",
"fragments",
"znf",
"removed"])
all_choices = sorted([
'paralogs',
'adjacent',
'ambiguous',
'dist1000bp',
'chimer2',
'chimer4kb',
'chimer4pub',
'chimer4seq',
'cacg',
'cgp',
'duplicates',
'bodymap2',
'hpa',
"1000genomes",
'gtex',
'metazoa',
'rt_circ_rna',
'similar_reads',
'similar_symbols',
'short_distance',
'yrna',
'7skrna',
'rrna',
'trna',
'mt',
'lncrna',
'mirna',
'mitelman',
'oncokb',
'pseudogene',
'snorna',
'snrna',
'pair_pseudo_genes',
'prostate_cancer',
'rp',
'rp11',
'ensembl_fully_overlapping',
'ensembl_partially_overlapping',
'ensembl_same_strand_overlapping',
'ribosomal',
'cta',
'ctb',
'ctc',
'ctd',
'conjoing',
'healthy',
'ucsc_fully_overlapping',
'ucsc_partially_overlapping',
'ucsc_same_strand_overlapping',
'refseq_fully_overlapping',
'refseq_partially_overlapping',
'refseq_same_strand_overlapping',
'gencode_fully_overlapping',
'gencode_partially_overlapping',
'gencode_same_strand_overlapping',
'dist10kbp',
'dist100kbp',
'fragments',
'banned',
'hla',
'non_tumor_cells',
'non_cancer_tissues',
'removed',
'tcga',
'tcga2',
'tcga-normal',
'tcga-cancer',
"znf"])
parser.add_option("--filter-fusion","-b",
action = "store",
type = "string",
dest = "biotypes",
default = ','.join(sorted(mydefault)),
help = optparse.SUPPRESS_HELP)
# help = "Candidate gene fusions to be skipped from further "+
# "analysis in case that one of "+
# "partner gene or both genes (which form a fusion) "+
# "are specified here. "+
# "All possible values are: ["+', '.join(sorted(all_choices))+"]. "+
# "'short_distance' is used for labeling the "+
# "candidate fusion genes which do meet the criteria "+
# "specified with '--min-dist-fusion'. "+
# "Several can be chosen but in this case they " +
# "should comma separated. "+
# "Default is '%default'.")
parser.add_option("--filter-fusion-add","-B",
action = "store",
type = "string",
dest = "biotypes_more",
help = optparse.SUPPRESS_HELP)
# help = "Any label of fusion genes specified here will be "+
# "appended to the list given to '--filter-fusion'. "+
# "This is just an easy way to add more to '--filter-fusion'. "+
# "For more read the description of '--filter-fusion'. "+
# "Default is '%default'.")
parser.add_option("--dist-fusion","-D",
action = "store",
type = "int",
dest = "min_dist",
default = 200000,
help = optparse.SUPPRESS_HELP)
# "The candidate fusion genes where the distance "+
# "between the genes is below this threshold will be marked "+
# "using the label 'custom_distance' "+
# "Default is '%default'.")
parser.add_option("--all-reads-fusion","-A",
action = "store_true",
dest = "all_reads_junction",
default = False,
help = optparse.SUPPRESS_HELP)
# "If it is specified then all reads (reads which form "+
# "a pair and single reads which do not have a mate "+
# "read because their mate has been removed due to "+
# "different reasons, like for example low quality), "+
# "will be used for finding the fusion point, which "+
# "is the exon-exon junction. If not specified then only "+
# "reads which form a pair will be used for "+
# "finding the exon-exon junction (one read maps on one "+
# "of the transcripts of the gene involved in the fusion "+
# "and its mate will map on the exon-exon junction). "+
# "Default is '%default'."
parser.add_option("--homolog-fusion","-H",
action = "store",
type = "float",
dest = "homolog",
default = float(1)/float(8*(10**4)),#float(1)/float(2*(10**5)), # float(1)/float(8*(10**4)),# float(1)/float(5*(10**4)),
help = optparse.SUPPRESS_HELP)
# "The minimum number of reads (as percentage [0..1]) "+
# "which map simultaneously "+
# "onto two genes in order to be considered homologous. "+
# "If set to 0 then no homology analysis is done. "+
# "This information is used for filtering out candidate "+
# "fusion genes which are homologous. "+
# "Default is '%default'."
parser.add_option("--filter-str",
action = "store",
type = "float",
dest = "filter_str",
default = 0, #1.4, # 2.1
help = optparse.SUPPRESS_HELP)
# help = "If specified to 0 then it skips filtering out the reads "+
# "which contain STR (short tandem repeats). "+
# "Default is '%default'."
parser.add_option("--visualization-psl",
action = "store_true",
dest = "psl_visualization",
default = False,
help = optparse.SUPPRESS_HELP)
# help = "If it is set then the pipeline will use the BLAT "+
# "aligner for aligning the reads which support the "+
# "newly found candidate fusion genes. Please, note "+
# "that BLAT license does not allow BLAT to be used for "+
# "commercial activities. Fore more information "+
# "regarding BLAT please see its license: "+
# "<http://users.soe.ucsc.edu/~kent/src/>. Also please, note "+
# "that this option is not actively supported anymore and "+
# "in the future will be deprecated. If one still wants "+
# "to use it, then one should run this 'faToTwoBit genome.fa genome.2bit -noMask') "+
# "in 'fusioncatcher/data/current/'. Instead it is recommended to use "+
# "'--visualization-sam'. This will be deprecated in the future. "+
# "Default is '%default'.")
parser.add_option("--visualization-sam",
action = "store_true",
dest = "sam_visualization",
default = False,
help = optparse.SUPPRESS_HELP)
# help = "If it is set then the pipeline will use the BOWTIE2 "+
# "aligner for aligning the reads which support the "+
# "newly found candidate fusion genes. "+
# "Default is '%default'.")
parser.add_option("--assembly","-M",
action = "store_true",
dest = "assembly",
default = False,
help = optparse.SUPPRESS_HELP)
# help = "If used then the reads found to support the newly "+
# "found candidate fusion genes are assembled using "+
# "VELVET <http://www.ebi.ac.uk/~zerbino/velvet/>. "+
# "This will be deprecated in the future. " +
# "Default is '%default'.")
parser.add_option("--sonication",
action = "store",
type = "int",
dest = "sonication",
default = 130,
help = optparse.SUPPRESS_HELP)
# help = "In case that the input reads are longer than the threshold set here "+
# "then they will be broken up bioinformatically in smaller reads. "+
# "If this is set to 0 then no break up will be done. "+
# "Default is '%default'.")
parser.add_option("--bridges",
action = "store",
type = "int",
dest = "bridges",
default = 0,
help = optparse.SUPPRESS_HELP)
# help = "Number of encompasses paired-reads to be generated for each input long read. "+
# "If it is set to 0 then the number will chosen automatically based on "+
# "the length of input reads, i.e. ceil(length_read/160). " +
# "Default is '%default'."
parser.add_option("--skip-deduplication",
action = "store_true",
dest = "skip_deduplication",
default = False,
help = optparse.SUPPRESS_HELP
# help = "If specified then it skips filtering out "+
# "the reads which are duplicates to each other. "+
# "Default is '%default'."
)
# parser.add_option("--skip-super-fast-prefilter",
# action = "store_true",
# dest = "skip_superfast",
# default = False,
# help = optparse.SUPPRESS_HELP
## help = "If specified then it skips filtering out "+
## "the reads which mapping on same transcript. "+
## "Default is '%default'."
# )
parser.add_option("--prefilter",
action = "store",
type = "string",
dest = "prefilter",
default = "1", #
help = optparse.SUPPRESS_HELP)
# help = " 0 - no pre-filtering "+
# " 1 - pre-filtering done in the very beginning "+
# " 2 - pre-filtering done later than 1 "+
# " 3 - pre-filtering done later than 2 "+
# "Default is '%default'."
# )
# parser.add_option("--skip-later-filter",
# action = "store_true",
# dest = "skip_later_filter",
# default = False,
# help = optparse.SUPPRESS_HELP
## help = "If specified then it skips filtering out "+
## "the reads which mapping on same transcript. "+
## "Default is '%default'."
# )
# parser.add_option("--fast-prefilter",
# action = "store_true",
# dest = "fast",
# default = False,
# help = optparse.SUPPRESS_HELP
## help = "If specified then it skips filtering out "+
## "the reads which mapping on same transcript BUT "+
## "before the FASTQ are trimmed and merged. "+
## "Default is '%default'."
# )
parser.add_option("--skip-filter-mt",
action = "store_true",
dest = "skip_mitochondrion_filtering",
default = False,
help = optparse.SUPPRESS_HELP
# help = "If specified then it skips filtering out the reads "+
# "which map on the mitochondrion. "+
# "Default is '%default'."
)
parser.add_option("--skip-filter-vir",
action = "store_true",
dest = "skip_viruses_filtering",
default = False,
help = optparse.SUPPRESS_HELP
# help = "If specified then it skips filtering out the reads "+
# "which map on known genomes of viruses. "+
# "Default is '%default'."
)
parser.add_option("--skip-filter-b",
action = "store_true",
dest = "skip_b_filtering",
default = False,
help = optparse.SUPPRESS_HELP
# help = "If specified then it skips filtering out the reads with "+
# "B quality scores (i.e. low quality) which are a special "+
# "indicator in "+
# "Fastq Illumina files. Default is '%default'."
)
parser.add_option("--filter-ambiguous","-Y",
action = "store_true",
dest = "ambiguous_filtering",
default = False,
help = optparse.SUPPRESS_HELP
# help = "If specified then it filters out the reads which "+
# "maps ambiguously (i.e. same read map simultaneously on two "+
# "locuses on genome/transcriptome within 0-3 mismatches. "
# "Default is '%default'."
)
parser.add_option("--skip-filter-genome","-G",
action = "store_true",
dest = "skip_genome_filtering",
default = False,
help = optparse.SUPPRESS_HELP
# help = "If specified then it skips filtering out the reads which "+
# "maps multiple times on genome. "+
# "Default is '%default'."
)
parser.add_option("--skip-filter-unmapped-pairs",
action = "store_true",
dest = "skip_unmapped_pairs_filtering",
default = False,
help = optparse.SUPPRESS_HELP
# help = "If specified then it skips filtering out the pair of reads which "+
# "are unmapped. "+
# "Default is '%default'."
)
parser.add_option("--skip-filter-genome-transcriptome",
action = "store_true",
dest = "skip_genome_transcriptome_filtering",
default = False,
help = optparse.SUPPRESS_HELP
# help = "If specified then it skips filtering out the reads which "+
# "map better on genome than on transcriptome. "+
# "Default is '%default'."
)
parser.add_option("--skip-filter-adapter",
action = "store_true",
dest = "skip_adapter_filtering",
default = False,
help = optparse.SUPPRESS_HELP
# help = "If specified then it skips filtering out the reads which "+
# "contains the adapters. "+
# "Default is '%default'."
)
parser.add_option("--skip-filter-psl",
action = "store_true",
dest = "skip_prefiltering_psl",
default = False,
help = optparse.SUPPRESS_HELP)
# help = "If it is set then the pipeline will not prefilter "+
# "the short reads which will be used for doing BLAT/STAR/BOWTIE2 alignment. "+
# "By default, the short reads are prefiltered before "+
# "being aligned using BLAT/STAR/BOWTIE2 in order to speed up the BLAT/STAR/BOWTIE2 "+
# "alignment which is time and computationally demanding. "+
# "The disadvantage of doing prefiltering is that the sensitivity "+
# "of BLAT/STAR/BOWTIE2 alignment is somewhat lowered. "+
# "Default is '%default'.")
parser.add_option("--skip-interleave",
action = "store_true",
dest = "skip_interleave_processing",
default = False,
help = optparse.SUPPRESS_HELP)
# help = "If specified then it skips interleaving the short reads "+
# "from the input FASTQ files. The program tries automatically "+
# "to pair the forward and reverse short reads based on file "+
# "names. In case that the pair is done wronlgy then this "+
# "argument should be used to remedy the problem. "+
# "Default is '%default'.")
parser.add_option("--skip-known-fusions",
action = "store_true",
dest = "skip_known_fusions",
default = False,
help = optparse.SUPPRESS_HELP)
# help = "If it is set then the pipeline will not use its own database "+
# "and COSMIC database of already known fusion genes! "+
# "Here skipping means that the known fusion genes will "+
# "treated as any other candidate fusion genes "+
# "and if there is enough evidence will be shown in the "+
# "final list of found fusion genes. By default, the known "+
# "fusion genes are treated preferentially and are pushed "+
# "directly to the very final step of finding the junction "+
# "point. " +