forked from AstraZeneca-NGS/VarDict
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sample2vardict.pl
executable file
·67 lines (58 loc) · 3.01 KB
/
sample2vardict.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/perl -w
# From a sample list to shell script for running variant detection using checkVar.pl
use Getopt::Std;
use strict;
our ($opt_p, $opt_b, $opt_H, $opt_a, $opt_A, $opt_o, $opt_f, $opt_G, $opt_O);
getopts("Hp:b:a:A:o:f:G:O:") || USAGE();
$opt_H && USAGE();
my $option = defined($opt_o) ? $opt_o : "-pe smp 8 -q batch.q";
my $other_opt = defined($opt_O) ? $opt_O : "";
my $prog = $opt_p ? $opt_p : "/group/cancer_informatics/tools_resources/NGS/bin/runVarDict_BAM.sh";
my $bed = $opt_b ? $opt_b : "/group/cancer_informatics/tools_resources/NGS/genomes/hg19/Annotation/human_hg19_5k_segs.txt";
my $genome = $opt_G ? $opt_G : "/ngs/reference_data/genomes/Hsapiens/hg19/seq/hg19.fa";
my $freq = $opt_f ? $opt_f : 0.01;
while( <> ) {
chomp;
my ($sample, $bam) = split(/\t/);
$bam =~ s/\@$//;
$sample .= $opt_a if ( $opt_a );
$sample = $opt_A . $sample if ( $opt_A );
print "if [ ! -d \"$sample\" ]; then\n";
print " mkdir $sample\n";
print "fi\n";
print "cd $sample\n";
my $jobname = "${sample}_vardict";
$jobname = "X-$jobname" if ( $jobname =~ /^\d/ );
print "qsub $option -cwd -V -N ${sample}_vardict -S /bin/bash $prog \"$bam\" $sample $bed $freq $genome \"$other_opt\"\n";
print "cd ..\n\n";
}
sub USAGE {
print <<USAGE;
Usage: $0 -H -p alignment_program -b bed_file sample2bam
The program will generate shell scripts for submitting vardict jobs to clusters. The input is sample2bam mapping. Each line should
contain two tab-delimited columns, the first is sample name, the second is its corresponding BAM file. It will run 8 concurrent jobs,
with total of < 2GB of memory available. Thus regular cluster nodes are fine.
-H Print this help page.
-p alignment_program
The ABSOLUTE path to the alignment script. Default: /group/cancer_informatics/tools_resources/NGS/bin/runVardict_BAM.sh,
which will perform alignment using BWA 0.7.4 and call variants using vardict.pl
-b bed_file
The ABSOLUTE path to the bed file. If not supplied, it'll make variant calls to whole gene using 1Mb segments.
Default: /group/cancer_informatics/tools_resources/NGS/genomes/hg19/Annotation/human_hg19_5k_segs.txt
-o options
Options to be appended to qsub, such as "-l huge_ram=1", "-l mem_free=4G", etc. Default: "-pe smp 8";
To submit to regular nodes, use -o "-pe smp 8";
-a string
A string to append to the sample name, e.g. -mm10 when running against mouse genome
-A string
A string to suffix to the sample name, e.g. dis- when running after mouse disambiguation
-f allele frequency
The minimum allele frequency to call variants. Default: 0.01
-G genome_fasta
The path to genome fasta. Default to: /ngs/reference_data/genomes/Hsapiens/hg19/seq/hg19.fa. For GRh37, use:
/ngs/reference_data/genomes/Hsapiens/GRCh37/seq/GRCh37.fa.
-O other_options
Any other options you want parse to vardict, such as "-t" to remove duplicates for exome sequencing
USAGE
exit(0);
}