-
Notifications
You must be signed in to change notification settings - Fork 2
/
convert_to_fasta.wdl
124 lines (115 loc) · 3.57 KB
/
convert_to_fasta.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
version 1.0
workflow ConvertToFasta {
input {
File vcf
File vcf_index
File ref
File ref_index
File ref_name
}
call get_regions {
input:
vcf=vcf,
vcf_index=vcf_index
}
scatter (region in get_regions.regions) {
call convert {
input:
region_file=region,
vcf=get_regions.filtered_vcf,
vcf_index=get_regions.filtered_vcf_index,
ref=ref,
ref_index=ref_index,
ref_name=ref_name
}
}
call combine {
input:
split_fastas = convert.fasta,
split_marker_positions = convert.marker_positions
}
output {
File marker_fasta = combine.fasta
File marker_positions = combine.marker_positions
}
}
task get_regions {
input {
File vcf
File vcf_index
}
command <<<
set -exo pipefail
BCFTOOLS=/opt/hall-lab/bcftools-1.9/bin/bcftools
TABIX=/opt/hall-lab/htslib-1.9/bin/tabix
PERL=/usr/bin/perl
PRINT_REGIONS=/opt/hall-lab/scripts/printRegions2.pl
$BCFTOOLS view -m2 -M2 -v snps ~{vcf} -o tmp.vcf.gz -O z
$TABIX -fp vcf tmp.vcf.gz
zcat tmp.vcf.gz | $BCFTOOLS query -f '%CHROM\t%POS\n' | $PERL $PRINT_REGIONS > regions.txt
split -l 5000 -a 5 -d regions.txt regions.sub
ls regions.sub* > split_regions_files.txt
>>>
runtime {
memory: "4G"
docker: "apregier/bcftools_samtools@sha256:49dc9b0b9419281b87df4a9faf9ca6681725317108bca66ba37f9bd1d86e9de2"
}
output {
Array[File] regions = read_lines("split_regions_files.txt")
File filtered_vcf = "tmp.vcf.gz"
File filtered_vcf_index = "tmp.vcf.gz.tbi"
}
}
task convert {
input {
File region_file
File vcf
File vcf_index
File ref
File ref_index
String ref_name
}
command <<<
set -exo pipefail
BCFTOOLS=/opt/hall-lab/bcftools-1.9/bin/bcftools
SAMTOOLS=/opt/hall-lab/samtools-1.9/bin/samtools
OUT="split.fasta"
MARKERS="split.marker_positions"
ln -s ~{ref} ref.fa
ln -s ~{ref_index} ref.fa.fai
regions=`cat ~{region_file}`
for region in $regions; do
$SAMTOOLS faidx ref.fa $region | $BCFTOOLS consensus ~{vcf} | sed 's/^>/>alt_~{ref_name}_/' >> $OUT
$SAMTOOLS faidx ref.fa $region | sed 's/^>/>ref_~{ref_name}_/' >> $OUT
echo "ref_~{ref_name}_$region 100 ref_~{ref_name}_$region ~{ref_name}_$region r" >> $MARKERS
echo "alt_~{ref_name}_$region 100 alt_~{ref_name}_$region ~{ref_name}_$region a" >> $MARKERS ##TODO double check 100 or 101?
done
>>>
runtime {
memory: "4G"
docker: "apregier/bcftools_samtools@sha256:49dc9b0b9419281b87df4a9faf9ca6681725317108bca66ba37f9bd1d86e9de2"
}
output {
File fasta = "split.fasta"
File marker_positions = "split.marker_positions"
}
}
task combine {
input {
Array[File] split_fastas
Array[File] split_marker_positions
}
command <<<
set -exo pipefail
cat ~{sep=" " split_fastas} > combined.fasta
cat ~{sep=" " split_marker_positions} > combined_marker_positions.txt
>>>
runtime {
memory: "4G"
docker: "apregier/analyze_assemblies@sha256:4cd67e009ae65820772265b572fc8cb9ce9e6e09228d1d73ee1f5d9118e91fca"
}
output {
File fasta = "combined.fasta"
File marker_positions = "combined_marker_positions.txt"
}
}