-
Notifications
You must be signed in to change notification settings - Fork 1
/
Snakefile_varcall_freebayes
1021 lines (852 loc) · 40.8 KB
/
Snakefile_varcall_freebayes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
configfile: "config.yaml"
popmapfile = config["popmapfile"]
windowsize = int( config["windowsize"] )
genomefile = config["genomefile"]
samples_reads_map = config["samples_reads_map"]
regions_for_plot_bed = config["regions_for_plot_bed"]
def read_popmap(popmapfile):
SAMPLES = []
with open(popmapfile, "r") as I:
for line in I:
if len(line) > 1:
fields = line.strip("\n").split("\t")
SAMPLES.append(fields[0])
return(SAMPLES)
def parse_samples_reads_map (infile):
samples_files_dict = {}
with open(infile, "r") as I:
I.readline() # this file has a header!
for line in I:
if len(line) > 1:
fields = line.strip("\n").split("\t")
s = fields[0]
fq1 = fields[1]
try:
fq2 = fields[2]
if len(fq2.strip()) == 0:
fq2 = None
except IndexError:
fq2 = None
try:
samples_files_dict[s].append( (fq1, fq2) )
except KeyError:
samples_files_dict[s] = [ (fq1, fq2) ]
return(samples_files_dict)
def get_fastq(wildcards):
"""Get fastq files of given sample -- DANGER: currently at most one pair of read files per sample is handled!"""
fqs = samples_files_dict[wildcards.sample][0]
if not fqs[1]:
return [fqs[0]]
return [fqs[0], fqs[1]]
SAMPLES = read_popmap(popmapfile)
print (SAMPLES)
samples_files_dict = parse_samples_reads_map(samples_reads_map)
print (samples_files_dict)
rule all:
input:
"results_raw/all.post_filter.vcf.gz",
"results_processed/windows.multicov.F.txt",
"results_processed/windows.multicov.M.txt",
"results_processed/M_F_norm_coverage_ratio_log2.bed.txt",
"results_processed/plink.assoc_results.significant.window.bed.txt",
"results_raw/plink.assoc_results.assoc.raw.txt.gz",
"results_processed/LD_r2_averaged_per_window.bed.txt",
"results_raw/F_specific_kmer.fa",
"results_raw/M_specific_kmer.fa",
"results_processed/kmers.F_specific.per_window.bed.txt",
"results_processed/kmers.M_specific.per_window.bed.txt",
"results_processed/gametolog_candidate_alleles.ZW_divergence.windows.bed.txt",
"results_processed/gametolog_candidate_alleles.XY_divergence.windows.bed.txt",
"results_processed/MvF.dxy.bed.txt",
"results_processed/MvF.fst.bed.txt",
"results_processed/F.pi.bed.txt",
"results_processed/M.pi.bed.txt",
"results_processed/MvF.netdiv_M.bed.txt",
"results_processed/MvF.netdiv_F.bed.txt",
"results_processed/het_males.bed.txt",
"results_processed/het_females.bed.txt",
"results_processed/allstats.txt",
"results_processed/allstats.plots.pdf",
"results_processed/region.stats.plots.pdf"
rule bwa_idx:
input:
genomefile
output:
"{genomefile}.bwt"
shell:
"""
if [[ ! $( grep ">" {input} ) =~ "|" ]]; then
bwa index {input}
else
echo "refusing to run, fasta headers contain pipe '|' character, dying"
fi
"""
rule bwa_map:
input:
fa=genomefile,
gidx=genomefile + ".bwt",
reads=get_fastq
output:
"mapped_reads/{sample}.bam"
threads: 14
run:
if len(input.reads) == 2: # paired-end!
shell("""
# filtering alignments to be primary (i.e. each read only once; if multiple locations equally possible than a random one is chosen):
# -F 256 == -F 0x0100 == NOT not primary alignment
# filtering alignments to be NOT supplementary (supplementary: sections of read map to discontinuous coordinates, e.g. across an inversion breakpoint..):
# -F 2048 == -F 0x800 == NOT supplementary alignment
# sum of the bit flags: 2304 => filters against BOTH non-primary and supplementary alignments; verified with samtools flagstat
# filtering alignments to be "properly paired": -f 2
bwa mem -t {threads} -a {input.fa} {input.reads[0]} {input.reads[1]} -R "@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\\tPL:Illumina" | samtools view -F 2304 -f 2 -b -@ 2 - > {output}
""")
else: # single-end
shell("""
# filtering alignments to be primary (i.e. each read only once; if multiple locations equally possible than a random one is chosen):
# -F 256 == -F 0x0100 == NOT not primary alignment
# filtering alignments to be NOT supplementary (supplementary: sections of read map to discontinuous coordinates, e.g. across an inversion breakpoint..):
# -F 2048 == -F 0x800 == NOT supplementary alignment
# -F 4 read unmapped (0x4)
# sum of the bit flags: 2308 => filters against non-primary and supplementary alignments and unmapped
bwa mem -t {threads} -a {input.fa} {input.reads[0]} -R "@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\\tPL:Illumina" | samtools view -F 2308 -b -@ 2 - > {output}
""")
rule samtools_sort:
input:
"mapped_reads/{sample}.bam"
output:
"mapped_reads/{sample}.sorted.bam"
shell:
"""
samtools sort -T mapped_reads/{wildcards.sample} -O bam {input} > {output}
rm {input}
"""
rule samtools_index:
input:
"mapped_reads/{sample}.sorted.bam"
output:
"mapped_reads/{sample}.sorted.bam.bai"
shell:
"samtools index {input}"
# A checkpoint that shall trigger re-evaluation of the DAG
# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#data-dependent-conditional-execution
# we need this because the number of output files (i.e. chunks / region files for Freebayes) is not known before execution!
checkpoint split_ref_for_freebayes:
input:
ref=genomefile,
indexes=expand("mapped_reads/{sample}.sorted.bam.bai", sample=SAMPLES),
samples=expand("mapped_reads/{sample}.sorted.bam", sample=SAMPLES)
output:
directory("FB_chunks")
params:
chunksize_Mb = config["varcall_chunksize"]
shell:
"""
samtools faidx {input.ref}
echo {input.samples} | sed 's/ /\\n/g' > bamlistforsplit
# wget https://gist.githubusercontent.com/travc/0c53df2c8eca81c3ebc36616869930ec/raw/eff3032ca7c955ca33bffd8758092e4006949c75/split_ref_by_bai_datasize.py
python scripts/split_ref_by_bai_datasize.py -r {input.ref}.fai -L bamlistforsplit --target-data-size {params.chunksize_Mb} > FB_regionsALL.bed
rm bamlistforsplit
split --lines=1 FB_regionsALL.bed FB_regions_ --numeric-suffixes --suffix-length=6 --additional-suffix=.bed
mkdir {output}
for i in FB_regions_* ; do mv $i {output} ; done
"""
rule freebayes:
input:
ref=genomefile,
regions="FB_chunks/{i}.bed"
output:
"FB_chunk_VCFs/{i}.bed.vcf.gz"
params:
skip_cov = config["freebayes_skip_cov"] # set this to 10 * N_samples * expected average coverage (= bases sequenced per sample / genomesize); e.g. 20*10*(15/0.444)
shell:
"""
freebayes -L <( ls mapped_reads/*.bam ) -t {input.regions} -f {input.ref} --min-coverage 6 --min-mapping-quality 5 \
--min-base-quality 10 --max-complex-gap 3 --min-repeat-entropy 1 --binomial-obs-priors-off --no-population-priors \
--use-best-n-alleles 4 --report-monomorphic --skip-coverage {params.skip_cov} | \
vcfallelicprimitives -kg | \
tr "|" "/" | \
bgzip -c > {output}
# unfortunately, the use of vcfallelicprimitives -kg can result in a known and unresolved bug:
# https://github.com/vcflib/vcflib/issues/225
# ULTRA-important freebayes options for efficiency:
# -g --skip-coverage N
# Skip processing of alignments overlapping positions with coverage >N.
# This filters sites above this coverage, but will also reduce data nearby.
# default: no limit
# https://groups.google.com/g/freebayes/c/5L657BepjpY
# Erik Garrison writes: "This will jump over sites that have total coverage in all samples >N. This is important when processing genomes with collapsed repeats,
# where coverage may go very high. In many genomes, it makes sense to set this to a multiple of your expected read coverage (e.g. 5-10x, or
# alternatively several standard deviations above the mean). Variant calls made in these regions of collapsed sequence are unlikely to be "correct"
# in that the actual local copy number can be very high, and so the genotype won't be the ploidy that you've set."
#
# WHAT EG DOES NOT SAY: skip-coverage evaluates the TOTAL SUM OVER ALL SAMPLES
#
# --limit-coverage N
# Downsample per-sample coverage to this level if greater than this coverage.
# default: no limit
#
# EG: "So this will pseudorandomly downsample each biosample until its max coverage is N. If you have a lot of samples, the coverage will still be higher.
# I think it may not adjust DP. Look at the per-sample DP to see if it changes."
#
# WHAT EG DOES NOT SAY: limit-coverage does not help or only small reduction in running time + memory load;
# apparently all the reads still get loaded to memory before they get downsampled!?
"""
def merge_vcfs_input(wildcards):
checkpoint_output = checkpoints.split_ref_for_freebayes.get(**wildcards).output[0]
# print(checkpoint_output)
vcfs_before_filter = sorted( expand("FB_chunk_VCFs/{i}.bed.vcf.gz", i=glob_wildcards(os.path.join(checkpoint_output, "{i}.bed")).i) )
# print (thef)
vcfs_after_filter = sorted( expand("FB_chunk_VCFs_filtered/{i}.bed.vcf.gz", i=glob_wildcards(os.path.join(checkpoint_output, "{i}.bed")).i) )
return vcfs_after_filter
def VCF_get_mean_depth_per_site_input(wildcards):
checkpoint_output = checkpoints.split_ref_for_freebayes.get(**wildcards).output[0]
# print(checkpoint_output)
vcfs_before_filter = sorted( expand("FB_chunk_VCFs/{i}.bed.vcf.gz", i=glob_wildcards(os.path.join(checkpoint_output, "{i}.bed")).i) )
return vcfs_before_filter
rule merge_filtered_vcfs:
input:
region_vcfs=merge_vcfs_input # refers to the function above which evaluates the checkpoint
output:
"results_raw/all.post_filter.vcf.gz"
threads: 3
shell:
"""
# MUST NOT USE bcftools concat: it cannot resolve POS that are non-monotonically icreasing (which ca happen at the interval boundaries)
# the code below is only slightly modified from freebayes-parallel script: https://github.com/freebayes/freebayes/blob/master/scripts/freebayes-parallel
# zcat input.region_vcfs | python $(which vcffirstheader) | vcfstreamsort -w 10000 | vcfuniq | bgzip -c > {output}
# zcat alone may complain about too many arguments, so better use find -exec :
find FB_chunk_VCFs_filtered/*.bed.vcf.gz -type f -exec zcat {{}} \\; | python $(which vcffirstheader) | vcfstreamsort -w 10000 | vcfuniq | bgzip -c > {output}
"""
rule VCF_get_mean_depth_per_site:
input:
region_vcfs=VCF_get_mean_depth_per_site_input # function which evaluates the same checkpoint as merge_filtered_vcfs
output:
"results_raw/meandepthpersite.txt"
threads: 16
shell:
"""
vcf_files=$(find FB_chunk_VCFs/*.bed.vcf.gz -type f)
parallel -j {threads} 'zcat {{}} | grep -v "#" | cut -f1,2 > {{}}.sites' ::: ${{vcf_files}}
find FB_chunk_VCFs/*.bed.vcf.gz.sites -type f | xargs cat > all_pos_in_vcf.txt
rm FB_chunk_VCFs/*.sites
# get a subset of 10k to 100k of the raw VCF records; iteratively keeping only 10% of lines in file
cp all_pos_in_vcf.txt tmp1
nlines=$( cat tmp1 | wc -l )
echo $nlines
while [ "$nlines " -gt 100000 ] ; do
awk 'NR == 1 || NR % 10 == 0' tmp1 > tmp2
mv tmp2 tmp1
nlines=$( cat tmp1 | wc -l )
echo $nlines
done
grep -v "#" tmp1 > postokeep.txt
#parallel -j {threads} 'tabix {{}} ; tabix -h -R postokeep.txt {{}} > {{}}.subset.vcf ; rm {{}}.tbi ; vcftools --vcf {{}}.subset.vcf --site-mean-depth --stdout | cut -f3 | tail -n +2 > {{}}.mdps ; rm {{}}.subset.vcf' ::: ${{vcf_files}}
parallel -j {threads} 'vcftools --gzvcf {{}} --positions postokeep.txt --site-mean-depth --stdout | cut -f3 | tail -n +2 > {{}}.mdps' ::: ${{vcf_files}}
find FB_chunk_VCFs/*.mdps -type f | xargs cat > {output}
rm postokeep.txt all_pos_in_vcf.txt tmp1 FB_chunk_VCFs/*.mdps
"""
rule VCF_filter_variants_and_invariants:
input:
mdps="results_raw/meandepthpersite.txt",
gzvcf="FB_chunk_VCFs/{i}.bed.vcf.gz",
popmap={popmapfile}
output:
temp( "FB_chunk_VCFs_filtered/{i}.bed.vcf.gz" )
params:
MISS=config["VCF_MISS"],
QUAL=config["VCF_QUAL"],
MIN_DEPTH=config["VCF_MIN_DEPTH"]
shell:
"""
## https://bcbio.wordpress.com/2013/10/21/updated-comparison-of-variant-detection-methods-ensemble-freebayes-and-minimal-bam-preparation-pipelines/#comment-1469
## => "We use ... a very vanilla filter with only depth and quality (QUAL < 20, DP < 5)"
## => impose a minimum of QUAL, and a minimum of DEPTH
## for variants, I want those two, AND a maximum DPETH cutoff
## for invariants, we can only impose a minimum DEPTH and a maximum DEPTH.
## use code from dDocent to calcualte a mean depth histogram, then find the 95th percentile: no. too complicated; there are easier ways to do this.
## also good info:
## https://speciationgenomics.github.io/filtering_vcfs/
# prepare
wd=DIR_{wildcards.i}
mkdir FB_chunk_VCFs_filtered/$wd
cd FB_chunk_VCFs_filtered/$wd
# set filters
# MAX_DEPTH is specific to the dataset; we take the 98th percentile of the mean depths per site:
MAX_DEPTH=$( sort --temporary-directory=./ -n ../../{input.mdps} | awk 'BEGIN{{c=0}} length($0){{a[c]=$0;c++}}END{{p2=(c/100*2); p2=p2%1?int(p2)+1:p2; print a[c-p2-1]}}' )
echo $MAX_DEPTH
# variants:
vcftools --gzvcf ../../{input.gzvcf} --mac 1 --minQ {params.QUAL} --min-meanDP {params.MIN_DEPTH} --minDP {params.MIN_DEPTH} \
--max-meanDP $MAX_DEPTH --recode --stdout | bgzip -c > tmp.1
tabix tmp.1
# invariants:
vcftools --gzvcf ../../{input.gzvcf} --max-maf 0 --min-meanDP {params.MIN_DEPTH} --minDP {params.MIN_DEPTH} \
--max-meanDP $MAX_DEPTH --recode --stdout | bgzip -c > tmp.2
tabix tmp.2
# combine the two VCFs using bcftools concat, also remove several VCF TAGs which can cause errors later for bcftools merge, or are just not relevant anymore:
bcftools concat --allow-overlaps tmp.1 tmp.2 | bcftools annotate --remove FORMAT/AD,FORMAT/AO,FORMAT/QA,FORMAT/GL,FORMAT/QR | bcftools view --exclude-uncalled --trim-alt-alleles | bgzip -c > tmp.3
# split by M and F populations, then filter for missingness in each pop. Thus we get sites that fulfill "MISS" in at least one of either M or F populations.
cat ../../{input.popmap} | awk '{{if($2==2) print $1}}' > fpop
cat ../../{input.popmap} | awk '{{if($2==1) print $1}}' > mpop
vcftools --gzvcf tmp.3 --keep fpop --max-missing {params.MISS} --recode --stdout | bgzip -c > tmp.4
vcftools --gzvcf tmp.3 --keep mpop --max-missing {params.MISS} --recode --stdout | bgzip -c > tmp.5
tabix tmp.4
tabix tmp.5
# merge M and F files again.
bcftools merge tmp.4 tmp.5 | bgzip -c > ../../{output}
# cleanup
cd ../
rm -r $wd
"""
rule make_variant_only_VCF:
input:
"results_raw/all.post_filter.vcf.gz"
output:
temp( "results_raw/variant_sites.vcf.gz" )
shell:
"""
vcftools --gzvcf {input} --mac 1 --recode --stdout | bgzip -c > {output}
"""
rule get_coverage_in_windows:
input:
fa=genomefile,
popmap={popmapfile},
bam=expand("mapped_reads/{sample}.sorted.bam", sample=SAMPLES),
bai=expand("mapped_reads/{sample}.sorted.bam.bai", sample=SAMPLES)
output:
fcov="results_processed/windows.multicov.F.txt",
mcov="results_processed/windows.multicov.M.txt",
log2ratio="results_processed/M_F_norm_coverage_ratio_log2.bed.txt"
threads: 2
shell:
"""
seqtk comp {input.fa} | awk '{{print $1"\t"$2}}' > genomefile.txt
bedtools makewindows -w {windowsize} -g genomefile.txt > windows.bed
rm genomefile.txt
# separate calls for M and F
cat {input.popmap} | awk 'NF {{if($2==1) print $1}}' | tr "\\n" "\\t" | awk 'BEGIN {{ OFS="\\t"}}; {{$1=$1; print "chr","start","stop",$0}}' > {output.mcov}
cat {input.popmap} | awk 'NF {{if($2==2) print $1}}' | tr "\\n" "\\t" | awk 'BEGIN {{ OFS="\\t"}}; {{$1=$1; print "chr","start","stop",$0}}' > {output.fcov}
( bedtools multicov -bams $( cat {input.popmap} | awk 'BEGIN {{ ORS=" "}}; NF {{if($2==1) print "mapped_reads/"$1".sorted.bam"}}' ) -bed windows.bed >> {output.mcov} )&
( bedtools multicov -bams $( cat {input.popmap} | awk 'BEGIN {{ ORS=" "}}; NF {{if($2==2) print "mapped_reads/"$1".sorted.bam"}}' ) -bed windows.bed >> {output.fcov} )&
wait
rm windows.bed
################ get M-F norm read coverage ratio log2 (we add 1e-6 to the read counts to avoid zero division)
awk '{{ for(i=4; i<=NF;i++) j+=$i; print j; j=0 }}' {output.mcov} > total.M
totm=$( awk '{{sum+=$1;}} END{{print sum;}}' total.M )
awk -v totalsum="$totm" '{{print ($1*1000000)/totalsum}}' total.M > total.norm.M
awk '{{ for(i=4; i<=NF;i++) j+=$i; print j; j=0 }}' {output.fcov} > total.F
totf=$( awk '{{sum+=$1;}} END{{print sum;}}' total.F )
awk -v totalsum="$totf" '{{print ($1*1000000)/totalsum}}' total.F > total.norm.F
paste total.norm.M total.norm.F | awk '{{print log((($1+1e-6)/($2+1e-6)))/log(2)}}' > log2_ratio
paste <(cut -f1,2,3 {output.fcov} ) log2_ratio | tail -n +2 > {output.log2ratio}
rm total.M total.F total.norm.M total.norm.F log2_ratio
"""
rule pi_rawstats_M:
input:
popmap={popmapfile},
gzvcf="results_raw/all.post_filter.vcf.gz"
output:
"results_raw/pi_raw.M.txt.gz"
shell:
"""
cat {input.popmap} | awk '{{if($2==1) print $1}}' > mpop_pi
vcftools --gzvcf {input.gzvcf} --keep mpop_pi --max-missing 0.01 --recode --stdout | python scripts/make_denominator_and_numerator_for_pi.py | gzip -c > {output}
rm mpop_pi
"""
rule pi_rawstats_F:
input:
popmap={popmapfile},
gzvcf="results_raw/all.post_filter.vcf.gz"
output:
"results_raw/pi_raw.F.txt.gz"
shell:
"""
cat {input.popmap} | awk '{{if($2==2) print $1}}' > fpop_pi
vcftools --gzvcf {input.gzvcf} --keep fpop_pi --max-missing 0.01 --recode --stdout | python scripts/make_denominator_and_numerator_for_pi.py | gzip -c > {output}
rm fpop_pi
"""
rule pi_windowed:
input:
pim="results_raw/pi_raw.M.txt.gz",
pif="results_raw/pi_raw.F.txt.gz",
fa=genomefile
output:
pi_f_bed="results_processed/F.pi.bed.txt",
pi_m_bed="results_processed/M.pi.bed.txt"
threads: 4
shell:
"""
# unzip raw
(gunzip -c {input.pim} > tmp.pi_raw.M.txt )&
(gunzip -c {input.pif} > tmp.pi_raw.F.txt )&
wait
seqtk comp {input.fa} | awk '{{print $1"\\t"$2}}' > genomefile.pi.txt
bedtools makewindows -w {windowsize} -g genomefile.pi.txt > windows.pi.bed
# ugly construct to handle the "bug" that chromosomes in the VCF are not sorted in
# same order as in the fasta genome file; its unclear how this can happen but it DOES
cut -f1 tmp.pi_raw.M.txt | uniq | awk '{{print $1"\\t0\\t9999999999999999"}}' > pi_sort_order_in_scores.txt
# find chroms NOT YET in the list
comm -13 <(cut -f1 pi_sort_order_in_scores.txt | sort | uniq ) <(cut -f1 genomefile.pi.txt | sort | uniq) | awk '{{print $1"\\t0\\t9999999999999999"}}' >> pi_sort_order_in_scores.txt
bedtools sort -g pi_sort_order_in_scores.txt -i windows.pi.bed > windows.pi.sort_order_in_scores.bed
# MALE
(bedtools map -a windows.pi.sort_order_in_scores.bed -b tmp.pi_raw.M.txt -c 4 -o mean > pi_num_M )&
(bedtools map -a windows.pi.sort_order_in_scores.bed -b tmp.pi_raw.M.txt -c 5 -o mean > pi_denom_M )&
# FEMALE
(bedtools map -a windows.pi.sort_order_in_scores.bed -b tmp.pi_raw.F.txt -c 4 -o mean > pi_num_F )&
(bedtools map -a windows.pi.sort_order_in_scores.bed -b tmp.pi_raw.F.txt -c 5 -o mean > pi_denom_F )&
wait
paste pi_num_M pi_denom_M > pi_both_M
awk '{{ if($8>0) print $1"\\t"$2"\\t"$3"\\t"$4/$8 ; else print $1"\\t"$2"\\t"$3"\\tNA" }}' pi_both_M > pi_tmp
bedtools sort -g genomefile.pi.txt -i pi_tmp > {output.pi_m_bed}
paste pi_num_F pi_denom_F > pi_both_F
awk '{{ if($8>0) print $1"\\t"$2"\\t"$3"\\t"$4/$8 ; else print $1"\\t"$2"\\t"$3"\\tNA" }}' pi_both_F > pi_tmp
bedtools sort -g genomefile.pi.txt -i pi_tmp > {output.pi_f_bed}
rm genomefile.pi.txt windows.pi.bed pi_num_M pi_denom_M pi_num_F pi_denom_F pi_both_M pi_both_F pi_tmp windows.pi.sort_order_in_scores.bed pi_sort_order_in_scores.txt tmp.pi_raw.M.txt tmp.pi_raw.F.txt
"""
rule dxy_rawstats:
input:
popmap={popmapfile},
gzvcf="results_raw/all.post_filter.vcf.gz"
output:
"results_raw/dxy_raw.txt.gz"
shell:
"""
zcat {input.gzvcf} | python scripts/make_denominator_and_numerator_for_dxy.py --popmap {input.popmap} | gzip -c > {output}
"""
rule dxy_windowed:
input:
raw="results_raw/dxy_raw.txt.gz",
fa=genomefile
output:
"results_processed/MvF.dxy.bed.txt"
threads: 2
shell:
"""
# unzip raw file
gunzip -c {input.raw} > tmp.dxy_raw.txt
seqtk comp {input.fa} | awk '{{print $1"\\t"$2}}' > genomefile.dxy.txt
bedtools makewindows -w {windowsize} -g genomefile.dxy.txt > windows.dxy.bed
# ugly construct to handle the "bug" that chromosomes in the VCF are not sorted in
# same order as in the fasta genome file; its unclear how this can happen but it DOES
cut -f1 tmp.dxy_raw.txt | uniq | awk '{{print $1"\\t0\\t9999999999999999"}}' > dxy_sort_order_in_scores.txt
# find chroms NOT YET in the list
comm -13 <(cut -f1 dxy_sort_order_in_scores.txt | sort | uniq ) <(cut -f1 genomefile.dxy.txt | sort | uniq) | awk '{{print $1"\\t0\\t9999999999999999"}}' >> dxy_sort_order_in_scores.txt
bedtools sort -g dxy_sort_order_in_scores.txt -i windows.dxy.bed > windows.dxy.sort_order_in_scores.bed
# correct
( bedtools map -a windows.dxy.sort_order_in_scores.bed -b tmp.dxy_raw.txt -c 4 -o mean > dxy_num )&
( bedtools map -a windows.dxy.sort_order_in_scores.bed -b tmp.dxy_raw.txt -c 5 -o mean > dxy_denom )&
wait
paste dxy_num dxy_denom > dxy_both
awk '{{ if($8>0) print $1"\\t"$2"\\t"$3"\\t"$4/$8 ; else print $1"\\t"$2"\\t"$3"\\tNA" }}' dxy_both > dxy_tmp
bedtools sort -g genomefile.dxy.txt -i dxy_tmp > {output}
rm genomefile.dxy.txt windows.dxy.bed dxy_num dxy_denom dxy_both dxy_tmp windows.dxy.sort_order_in_scores.bed dxy_sort_order_in_scores.txt tmp.dxy_raw.txt
"""
rule netdiv_windowed:
input:
dxy="results_processed/MvF.dxy.bed.txt",
pi_f_bed="results_processed/F.pi.bed.txt",
pi_m_bed="results_processed/M.pi.bed.txt",
fa=genomefile
output:
netdiv_M="results_processed/MvF.netdiv_M.bed.txt",
netdiv_F="results_processed/MvF.netdiv_F.bed.txt"
shell:
"""
seqtk comp {input.fa} | awk '{{print $1"\\t"$2}}' > genomefile.netdiv.txt
bedtools makewindows -w {windowsize} -g genomefile.netdiv.txt > windows.netdiv.bed
paste {input.dxy} {input.pi_m_bed} > netdiv_prep
awk '{{ if($8!="NA" && $4!="NA") print $1"\\t"$2"\\t"$3"\\t"$4-$8 ; else print $1"\\t"$2"\\t"$3"\\tNA" }}' netdiv_prep > {output.netdiv_M}
paste {input.dxy} {input.pi_f_bed} > netdiv_prep
awk '{{ if($8!="NA" && $4!="NA") print $1"\\t"$2"\\t"$3"\\t"$4-$8 ; else print $1"\\t"$2"\\t"$3"\\tNA" }}' netdiv_prep > {output.netdiv_F}
rm genomefile.netdiv.txt windows.netdiv.bed netdiv_prep
"""
rule fst_rawstats:
input:
popmap={popmapfile},
gzvcf="results_raw/variant_sites.vcf.gz"
output:
"results_raw/fst_raw.txt.gz"
shell:
"""
cat {input.popmap} | awk '{{if($2==1) print $1}}' > mpop_fst
cat {input.popmap} | awk '{{if($2==2) print $1}}' > fpop_fst
# exclude singletons..https://www.biostars.org/p/278646/
# also drop any "-nan" outputs immediately
vcftools --gzvcf {input.gzvcf} --mac 2 --weir-fst-pop mpop_fst --weir-fst-pop fpop_fst --stdout | awk '{{print $1"\\t"$2"\\t"$2"\\t"$3}}' | awk '{{if ( !($4=="-nan") ) print}}' | gzip -c > {output}
rm mpop_fst fpop_fst
"""
rule fst_windowed:
input:
fraw="results_raw/fst_raw.txt.gz",
fa=genomefile
output:
"results_processed/MvF.fst.bed.txt"
shell:
"""
## we get an "average of ratios" rather than the "ratio of averages"..
## https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3759727/
# unzip raw
gunzip -c {input.fraw} > tmp.fst_raw.txt
seqtk comp {input.fa} | awk '{{print $1"\\t"$2}}' > genomefile.fst.txt
bedtools makewindows -w {windowsize} -g genomefile.fst.txt > windows.fst.bed
fstlines=$(cat tmp.fst_raw.txt | wc -l)
if [ "$fstlines" -gt 2 ]; then
bedtools map -a windows.fst.bed -b tmp.fst_raw.txt -g genomefile.fst.txt -c 4 -o mean > {output}
else
# fst file was empty, because all values in genome were NA. Make an NA windowed-file to allow downstream to proceed anyway.
cat windows.fst.bed | awk '{{print $0"\\tNA"}}' > {output}
fi
rm genomefile.fst.txt windows.fst.bed tmp.fst_raw.txt
"""
rule GWAS_plink_raw:
input:
popmap={popmapfile},
gzvcf="results_raw/variant_sites.vcf.gz",
fa=genomefile
output:
"results_raw/plink.assoc_results.assoc.raw.txt.gz"
resources:
mem_mb=20000,
cpus=4
shell:
"""
# PLINK has a few problematic behaviours:
# - VCF gets converted to temporary plink format files with hard-coded names.. => RACE CONDITON when multiple instances of plink run
# - it will by default use ALL-1 CPUs => must use argument --threads
# - By default, PLINK 1.9 tries to reserve half of your system's RAM for its main workspace. => --memory <main workspace size, in MB>
DIR="tmpdir_plink_GWAS"
if [ -d "$DIR" ]; then
rm -r $DIR
fi
mkdir tmpdir_plink_GWAS
cd tmpdir_plink_GWAS
cat ../{input.popmap} | awk 'NF {{print $1"\\t"$1"\\t"$2}}' > thepheno
plink --vcf ../{input.gzvcf} --double-id --allow-extra-chr --pheno thepheno --allow-no-sex --assoc --out assoc_results --threads {resources.cpus} --memory {resources.mem_mb}
cat assoc_results.assoc | gzip -c > ../{output}
cd ../
rm -r tmpdir_plink_GWAS
"""
rule GWAS_plink_windows:
input:
fa=genomefile,
rawgwas="results_raw/plink.assoc_results.assoc.raw.txt.gz"
output:
"results_processed/plink.assoc_results.significant.window.bed.txt",
shell:
"""
zcat {input.rawgwas} | tr -s " " "\\t" > plink.assoc_results.raw.txt
# Benjamini-Hochberg FDR procedure
bash scripts/PtoFDR.sh plink.assoc_results.raw.txt 9 > assoc.fdr.txt
cat assoc.fdr.txt | grep -v "CHISQ" | awk '{{if($9!="NA") print }}' | awk '{{if($10<=0.1) print}}' > fdr10perc.txt
cat fdr10perc.txt | cut -f1,3,10 | awk '{{print $1"\\t"$2"\\t"$2}}' > plink.assoc_results.significant.bed
seqtk comp {input.fa} | awk '{{print $1"\\t"$2}}' > genomefile.assoc.txt
bedtools makewindows -w {windowsize} -g genomefile.assoc.txt > windows.assoc.bed
bedtools coverage -counts -a windows.assoc.bed -b plink.assoc_results.significant.bed > {output}
rm plink.assoc_results.raw.txt assoc.fdr.txt fdr10perc.txt plink.assoc_results.significant.bed genomefile.assoc.txt windows.assoc.bed
"""
rule LD_plink_raw:
input:
gzvcf="results_raw/variant_sites.vcf.gz",
fa=genomefile
output:
"results_raw/ld_clean.sorted.bed.gz"
# "results_processed/LD_r2_averaged_per_window.bed.txt"
resources:
mem_mb=20000,
cpus=4
shell:
"""
# PLINK has a few problematic behaviours:
# - VCF gets converted to temporary plink format files with hard-coded names.. => RACE CONDITON when multiple instances of plink run
# - it will by default use ALL-1 CPUs => must use argument --threads
# - By default, PLINK 1.9 tries to reserve half of your system's RAM for its main workspace. => --memory <main workspace size, in MB>
DIR="tmpdir_plink_LD"
if [ -d "$DIR" ]; then
rm -r $DIR
fi
mkdir tmpdir_plink_LD
cd tmpdir_plink_LD
# apply a MAF filter 0.2 before calculating LD as rare alleles imply very high LD by definition; they are thus useless for our purpose.
# also, thin variants to at least 1000 bp distance; (this in combination with ld-window 500 resulted in best "LD island" visibility in Leucadendron rubrum data)
vcftools --gzvcf ../{input.gzvcf} --thin 1000 --maf 0.2 --recode --stdout | bgzip -c > forplinkld.vcf.gz
# vcftools --gzvcf {input.gzvcf} --maf 0.1 --recode --stdout > forplinkld.vcf
# plink --vcf forplinkld.vcf --double-id --allow-extra-chr --r2 --ld-window-r2 0.0 --ld-window 5 --ld-window-kb 2
# rm forplinkld.vcf
plink --vcf forplinkld.vcf.gz --double-id --allow-extra-chr --maf 0.2 --r2 --ld-window-r2 0.0 --ld-window 500 --threads {resources.cpus} --memory {resources.mem_mb}
# --ld-window X compute LD only for pairs that are at most X SNPs apart (default 10)
# --ld-window-kb X compute LD only for pairs that are at most X kb apart (default 1000 kb)
# --ld-window-r2 X minimum r2 to report, else omit from output. Default = 0.2
#cat plink.ld | tr -s ' ' '\\t' | cut -f1,2,4,5,7 | tail -n +2 | awk '{{ print $1"\\t"$2"\\t"$2"\\t"$5"\\n"$3"\\t"$4"\\t"$4"\\t"$5 }}' | sort --parallel {resources.cpus} -S 2G -T . -k1,1 -k2,2n | gzip -c > ../{output}
# interpolate the position of the LD value as the midpoint between the two variants from which it is calculated.
cat plink.ld | tr -s ' ' '\\t' | cut -f1,2,4,5,7 | awk '{{if($1==$3) print $1"\\t"int( ($2+$4)/2 )"\\t"int( ($2+$4)/2 )"\\t"$5 }}' > tmp_plink_before_sort
sort --parallel {resources.cpus} -S 2G -T . -k1,1 -k2,2n tmp_plink_before_sort | gzip -c > ../results_raw/ld_clean.sorted.bed.gz
cd ../
rm -r tmpdir_plink_LD
"""
rule LD_plink_windows:
input:
ldraw="results_raw/ld_clean.sorted.bed.gz",
fa=genomefile
output:
"results_processed/LD_r2_averaged_per_window.bed.txt"
resources:
mem_mb=20000,
cpus=4
shell:
"""
# create windows, sort chroms lexicographically (same as the LD output)
seqtk comp {input.fa} | awk '{{print $1"\\t"$2}}' > genomefile.ld.txt
bedtools makewindows -w {windowsize} -g genomefile.ld.txt > tmpldwindows
sort --parallel {resources.cpus} -S 2G -T . -k1,1 -k2,2n tmpldwindows > windows.ld.bed
rm tmpldwindows
gunzip --stdout {input.ldraw} > ld_clean.sorted.bed
# get the mean LD per window
bedtools map -a windows.ld.bed -b ld_clean.sorted.bed -c 4 -o mean > tmpoutperwindow.txt
# sort chroms back to the same order as the genome file:
bedtools sort -g genomefile.ld.txt -i tmpoutperwindow.txt > {output}
rm ld_clean.sorted.bed genomefile.ld.txt tmpoutperwindow.txt windows.ld.bed
"""
rule kmerGO:
input:
popm={popmapfile},
SamReMap={samples_reads_map}
output:
fkmer="results_raw/F_specific_kmer.fa",
mkmer="results_raw/M_specific_kmer.fa",
fcont="results_raw/F_specific.fa.cap.contigs",
mcont="results_raw/M_specific.fa.cap.contigs"
threads: 14
shell:
"""
# for kmerGO to recognise input read files, there must be no "." characters in the filename except in the suffix;
# allowed suffixes are: FASTQ_suffix = ['.fq', '.fastq', '.fq.gz', '.fastq.gz']
# also, we must concatenate all read files to a single file per sample...
cat {input.popm} | awk 'NF {{if($2==1) print $1",M"; else print $1",F"}}' > traits_sex_for_kmerGO.txt
tail -n +2 {input.SamReMap} > nohead
sed -i -e :a -e '/^\\n*$/{{$d;N;ba' -e '}}' nohead # this removes eventual empty lines from the end of that file
if [ ! -d read_files ] # avoid xxx exists' error when re-running from failed previous attempt
then
mkdir read_files
fi
while read line; do
sid=$( echo $line | awk '{{print $1}}' )
f1=$( echo $line | awk '{{print $2}}')
f2=$( echo $line | awk '{{print $3}}' )
echo $sid $f1 $f2
if [ -z "$f2" ]
then
cat $f1 >> read_files/$sid.fastq.gz
else
cat $f1 >> read_files/$sid.fastq.gz
cat $f2 >> read_files/$sid.fastq.gz
fi
done < nohead
rm nohead
# 16-mers
# -ci / No minimal K-mer occurring times (default: 2)
scripts/KmerGO_for_linux_x64_cmd/KmerGO -n {threads} -k 16 -ci 3 -i read_files -t traits_sex_for_kmerGO.txt -p 0.01 -assl 0.8 -assn 0.8
rm -r kmer_features kmer_matrix read_files kmer_countings traits_sex_for_kmerGO.txt normalization_coefficients.txt
# if kmer result files not created by kmerGO (e.g. because there were no f-spec kmers), then make a dummy file in order to finish the job successfully.
if [ ! -f contig_result/F_specific_kmer.fa ]
then
touch {output.fkmer}
else
mv contig_result/F_specific_kmer.fa {output.fkmer}
fi
if [ ! -f contig_result/M_specific_kmer.fa ]
then
touch {output.mkmer}
else
mv contig_result/M_specific_kmer.fa {output.mkmer}
fi
# if contig_result files not created by kmerGO, then make a dummy file in order to finish the job successfully.
if [ ! -f contig_result/F_specific.fa.cap.contigs ]
then
touch contig_result/F_specific.fa.cap.contigs
fi
if [ ! -f contig_result/M_specific.fa.cap.contigs ]
then
touch contig_result/M_specific.fa.cap.contigs
fi
mv contig_result/F_specific.fa.cap.contigs {output.fcont}
mv contig_result/M_specific.fa.cap.contigs {output.mcont}
rm -r contig_result
"""
rule match_kmers_to_genome:
input:
popm={popmapfile},
fa=genomefile,
gidx=genomefile +".bwt",
fkmers="results_raw/F_specific_kmer.fa",
mkmers="results_raw/M_specific_kmer.fa"
output:
fsp="results_processed/kmers.F_specific.per_window.bed.txt",
msp="results_processed/kmers.M_specific.per_window.bed.txt"
shell:
"""
# searching for perfect matches of 21-mers: https://bioinformatics.stackexchange.com/questions/7298/mapping-heteryzygous-kmers-on-a-genome
# The -k 21 says to use a minimum seed length of 21, which forces exact matches. The -T 21 requires a minimum score of 21, which also enforces an exact match. The -a parameter reports all matches, since a "best" match doesn't make sense in this situation. The -c parameter limits how many matches are reported, which may need to be adjusted depending on how repetitive the 21-mer is.
# we match IMPERFECTLY = allowing 1 mismatch in 16 bp kmers. => 6.25% divergence
# use samtools view -F 4 : removes unmapped
bwa mem -t 1 -k 15 -T 15 -a -c 5000 {input.fa} {input.fkmers} | samtools view -F 4 -b - > F_specific_kmer.bam
samtools sort -T F_specific_kmer -O bam F_specific_kmer.bam > F_specific_kmer.sorted.bam
samtools index F_specific_kmer.sorted.bam
rm F_specific_kmer.bam
bwa mem -t 1 -k 15 -T 15 -a -c 5000 {input.fa} {input.mkmers} | samtools view -F 4 -b - > M_specific_kmer.bam
samtools sort -T M_specific_kmer -O bam M_specific_kmer.bam > M_specific_kmer.sorted.bam
samtools index M_specific_kmer.sorted.bam
rm M_specific_kmer.bam
seqtk comp {input.fa} | awk '{{print $1"\\t"$2}}' > genomefile.kmer.txt
bedtools makewindows -w {windowsize} -g genomefile.kmer.txt > windows.kmer.bed
rm genomefile.kmer.txt
bedtools multicov -bams F_specific_kmer.sorted.bam -bed windows.kmer.bed > {output.fsp}
bedtools multicov -bams M_specific_kmer.sorted.bam -bed windows.kmer.bed > {output.msp}
rm windows.kmer.bed M_specific_kmer.sorted.bam M_specific_kmer.sorted.bam.bai F_specific_kmer.sorted.bam F_specific_kmer.sorted.bam.bai
"""
rule calc_indiv_het:
input:
popm={popmapfile},
gzvcf="results_raw/variant_sites.vcf.gz"
output:
mhet="results_raw/het_males.txt.gz",
fhet="results_raw/het_females.txt.gz"
shell:
"""
# calculate heterozygosity for GLOBAL variants, i.e. M or F may be fixed for a variant but we calculate heterozygosity anyway.
# do not report het for sites where no genotypes are present.
vcftools --gzvcf {input.gzvcf} --keep <( cat {input.popm} | awk '{{if($2==1) print $1}}' ) --hardy --stdout | awk -F'[\\t/]' 'NR>1 {{if(($3+$4+$5)>0) print $1"\\t"$2"\\t"$2"\\t"$4/($3+$4+$5) }}' | gzip -c > {output.mhet}
vcftools --gzvcf {input.gzvcf} --keep <( cat {input.popm} | awk '{{if($2==2) print $1}}' ) --hardy --stdout | awk -F'[\\t/]' 'NR>1 {{if(($3+$4+$5)>0) print $1"\\t"$2"\\t"$2"\\t"$4/($3+$4+$5) }}' | gzip -c > {output.fhet}
"""
rule indiv_het_per_windows:
input:
fa=genomefile,
mhet="results_raw/het_males.txt.gz",
fhet="results_raw/het_females.txt.gz"
output:
mhetwindows="results_processed/het_males.bed.txt",
fhetwindows="results_processed/het_females.bed.txt"
shell:
"""
# unzip raw
(gunzip -c {input.mhet} > tmp.het_males.txt)&
(gunzip -c {input.fhet} > tmp.het_females.txt)&
wait
# create windows, sort chroms lexicographically (same as the LD output)
seqtk comp {input.fa} | awk '{{print $1"\\t"$2}}' > genomefile.het.txt
bedtools makewindows -w {windowsize} -g genomefile.het.txt > windows.het.bed
# get the mean HET per window
bedtools map -a windows.het.bed -b tmp.het_males.txt -c 4 -o mean > {output.mhetwindows}
bedtools map -a windows.het.bed -b tmp.het_females.txt -c 4 -o mean > {output.fhetwindows}
rm genomefile.het.txt windows.het.bed tmp.het_males.txt tmp.het_females.txt
"""
rule on:
input:
gzvcf="results_raw/all.post_filter.vcf.gz",
popm={popmapfile}
output:
"results_raw/gametolog_candidate_alleles.allsites.vcf.gz"
shell:
"""
python scripts/pseudo_phase_gametologs.py {input.popm} {input.gzvcf} | bgzip -c > {output}
"""
rule score_XY_gametolog_divergence:
input:
"results_raw/gametolog_candidate_alleles.allsites.vcf.gz"
output:
"results_raw/gametolog_candidate_alleles.XY_divergence.rawstats.txt.gz"
shell:
"""
echo -e "XY_Y_like\\tXY_Y_like" > xypopm
echo -e "XY_X_like\\tXY_X_like" >> xypopm
echo -e "XY_X_like" > xypop
echo -e "XY_Y_like" >> xypop
vcftools --gzvcf {input} --keep xypop --recode --stdout | python scripts/make_denominator_and_numerator_for_dxy.py --popmap xypopm | gzip -c > {output}
rm xypopm xypop
"""
rule score_ZW_gametolog_divergence:
input:
"results_raw/gametolog_candidate_alleles.allsites.vcf.gz"
output:
"results_raw/gametolog_candidate_alleles.ZW_divergence.rawstats.txt.gz"
shell:
"""
echo -e "ZW_W_like\\tZW_W_like" > zwpopm
echo -e "ZW_Z_like\\tZW_Z_like" >> zwpopm
echo -e "ZW_W_like" > zwpop
echo -e "ZW_Z_like" >> zwpop
vcftools --gzvcf {input} --keep zwpop --recode --stdout | python scripts/make_denominator_and_numerator_for_dxy.py --popmap zwpopm | gzip -c > {output}
rm zwpopm zwpop
"""
rule XY_div_windows:
input:
raw="results_raw/gametolog_candidate_alleles.XY_divergence.rawstats.txt.gz",
fa=genomefile
output:
"results_processed/gametolog_candidate_alleles.XY_divergence.windows.bed.txt"
threads: 2
shell:
"""
# unzip raw
gunzip -c {input.raw} > tmp.gametolog_candidate_alleles.XY_divergence.rawstats.txt
seqtk comp {input.fa} | awk '{{print $1"\\t"$2}}' > genomefile.xygametologs.txt
bedtools makewindows -w {windowsize} -g genomefile.xygametologs.txt > windows.xygametologs.bed
(bedtools map -a windows.xygametologs.bed -b tmp.gametolog_candidate_alleles.XY_divergence.rawstats.txt -g genomefile.xygametologs.txt -c 4 -o mean > XY_dxy_num )&
(bedtools map -a windows.xygametologs.bed -b tmp.gametolog_candidate_alleles.XY_divergence.rawstats.txt -g genomefile.xygametologs.txt -c 5 -o mean > XY_dxy_denom )&
wait
paste XY_dxy_num XY_dxy_denom > XY_dxy_both
awk '{{ if($8>0) print $1"\\t"$2"\\t"$3"\\t"$4/$8 ; else print $1"\\t"$2"\\t"$3"\\tNA" }}' XY_dxy_both > {output}
rm genomefile.xygametologs.txt windows.xygametologs.bed XY_dxy_denom XY_dxy_num XY_dxy_both tmp.gametolog_candidate_alleles.XY_divergence.rawstats.txt
"""
rule ZW_div_windows:
input:
raw="results_raw/gametolog_candidate_alleles.ZW_divergence.rawstats.txt.gz",
fa=genomefile
output:
"results_processed/gametolog_candidate_alleles.ZW_divergence.windows.bed.txt"
threads: 2
shell:
"""
# unzip raw
gunzip -c {input.raw} > tmp.gametolog_candidate_alleles.ZW_divergence.rawstats.txt
seqtk comp {input.fa} | awk '{{print $1"\\t"$2}}' > genomefile.zwgametologs.txt
bedtools makewindows -w {windowsize} -g genomefile.zwgametologs.txt > windows.zwgametologs.bed
( bedtools map -a windows.zwgametologs.bed -b tmp.gametolog_candidate_alleles.ZW_divergence.rawstats.txt -g genomefile.zwgametologs.txt -c 4 -o mean > ZW_dxy_num )&
( bedtools map -a windows.zwgametologs.bed -b tmp.gametolog_candidate_alleles.ZW_divergence.rawstats.txt -g genomefile.zwgametologs.txt -c 5 -o mean > ZW_dxy_denom )&
wait
paste ZW_dxy_num ZW_dxy_denom > ZW_dxy_both
awk '{{ if($8>0) print $1"\\t"$2"\\t"$3"\\t"$4/$8 ; else print $1"\\t"$2"\\t"$3"\\tNA" }}' ZW_dxy_both > {output}
rm genomefile.zwgametologs.txt windows.zwgametologs.bed ZW_dxy_denom ZW_dxy_num ZW_dxy_both tmp.gametolog_candidate_alleles.ZW_divergence.rawstats.txt
"""
rule plot_all:
input:
regions=regions_for_plot_bed,
a="results_processed/M_F_norm_coverage_ratio_log2.bed.txt",
b="results_processed/kmers.F_specific.per_window.bed.txt",
c="results_processed/kmers.M_specific.per_window.bed.txt",
d="results_processed/F.pi.bed.txt",
e="results_processed/M.pi.bed.txt",
f="results_processed/MvF.fst.bed.txt",
g="results_processed/MvF.dxy.bed.txt",
h="results_processed/LD_r2_averaged_per_window.bed.txt",
i="results_processed/plink.assoc_results.significant.window.bed.txt",
j="results_processed/gametolog_candidate_alleles.ZW_divergence.windows.bed.txt",
k="results_processed/gametolog_candidate_alleles.XY_divergence.windows.bed.txt",
l="results_processed/het_males.bed.txt",
m="results_processed/het_females.bed.txt",
n="results_processed/MvF.netdiv_F.bed.txt",
o="results_processed/MvF.netdiv_M.bed.txt"
output: