#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
cisVar: Find cis QTLs based on an experimental selection method
Ashley Tehranchi <[email protected]>
Stanford University
Version: 2.0.0b3
Created: 2015-12-12
Updated: 2018-05-16
Example usage:
cisVar prep -F test_new -i individuals.txt.gz --chrom-format chr
cisVar mpileup -F <SampleName> -f <fastaFile> -B <sortedBam>
cisVar post -F <SampleName> -r <readDepth>
cisVar geno -F <SampleName> -r <readDepth> -i <individualsFile>
cisVar qtls -F <SampleName> -r <readDepth> -n <numberIndividuals>
cisVar tidy -F <SampleName> -r <readDepth> -p out.dataframe.pandas -t out.dataframe.txt
Note:
The qtls regression step will use approximately 32GB of memory on an
average-sized dataset.
The geno step will use approximately 20GB of memory on the same dataset.
"""
from __future__ import print_function
import os
import sys
import bz2
import gzip
import argparse
import operator
import subprocess
import multiprocessing as mp
from time import sleep
from datetime import datetime
from collections import defaultdict
# psutil is used to print memory usage if installed, otherwise ignored
try:
import psutil
except ImportError:
psutil = None
CORES = mp.cpu_count()
__version__ = '2.0.0b3'
###########################################################################
# Create Necessary Input Files from VCFs #
###########################################################################
def prep_files(vcf_files, prefix_name, inds=None, limit_file=None,
alleles_file=None, chrom_format=0, skip_indels=True, cores=4):
"""Create genotype, bed, and allele files for pipeline.
Bed is used by mpileup to define SNPs to pileup.
Alleles file is used to set default ref and alt in POST.
    Note: these files only need to be created once per experiment, not once
    per group/pool. The same files can be used for every population/group/pool;
    they are further filtered in the genotype step.
This code first loads all locations from the limit file (if present) and
any alleles to override. These two lists are kept in memory. It then loops
through each vcf_file in parallel, line-by-line, writing temp files, before
concatenating the results into a single file.
Params
------
vcf_files : list
A list of VCF files to parse, should contain genotypes, ref and alt.
prefix_name : str
A name for the output files.
inds : list or path, optional
Either a list of individuals or a list of paths to individuals files.
Should include all individuals in this experiment, in all pools.
    limit_file : path, optional
        Path to a bed/vcf/simple file of SNPs to consider. Should be a subset
        of the SNPs in the VCF files. BED3 format is fine; the simple format
        is just chromosome and position (1-based) to include, tab separated.
    alleles_file : path, optional
        Path to a VCF/bed/txt file with alleles; if text, it should be
        chr position (base-1). Can be the same file as the limit file.
chrom_format : {0, 1, 2}, optional
0 = leave
1 = force chr#
2 = force #
skip_indels : bool, optional
Don't include rows with indels
cores : int, optional
Number of parallel cores to use on multiple vcf files.
Writes
------
    prefix.genotypes.txt.gz
        A tab-separated file, with a header naming the individuals, of:
        chrom, position (base-1), ref (upper), alt (upper), genotypes (0/1/2)
    prefix.locations.bed.gz
        A BED file of all of the locations to be tested, including names
    prefix.individuals.txt.gz
        A newline-separated file of individuals; replicates inds if provided,
        and is overwritten even if it already exists.
"""
if isinstance(vcf_files, str):
vcf_files = [vcf_files]
for fl in vcf_files:
if not os.path.isfile(fl):
raise OSError('Could not find file {}'.format(fl))
assert isinstance(prefix_name, str)
    par = len(vcf_files) > 1 and cores > 1
if limit_file:
sys.stderr.write(
"Loading test locations from {}\n".format(limit_file)
)
limit_locs = parse_loc_file(limit_file)
else:
limit_locs = None
if alleles_file:
sys.stderr.write(
"Loading alleles from {}\n".format(alleles_file)
)
alleles = parse_loc_file(alleles_file, mode='alleles')
else:
alleles = None
if par:
pool = mp.Pool(cores)
    # Individuals (guard against inds being None or already a list)
    if isinstance(inds, str) and os.path.isfile(inds):
with open_zipped(inds) as fl:
inds = fl.read().strip().split('\n')
if not inds:
inds = geno_file(vcf_files[0], get_inds=True)
final_inds = list(inds)
# Write the final individuals, order preserved
ind_output = '{}.individuals.txt.gz'.format(prefix_name)
with open_zipped(ind_output, 'w') as fout:
fout.write('\n'.join(final_inds))
# Primary parsing
sys.stderr.write("Begining genotype file parse\n")
# Header
header = geno_file(vcf_files[0], get_header=True)
count = 0
jobs = []
geno_output = '{}.genotypes.txt.gz'.format(prefix_name)
bed_output = '{}.locations.bed.gz'.format(prefix_name)
for fl in vcf_files:
ofl = '{}.{}.gz'.format(geno_output, count)
bfl = '{}.{}.gz'.format(bed_output, count)
args = (
fl, ofl, header, bfl, final_inds, limit_locs,
alleles, skip_indels, chrom_format
)
if par:
jobs.append([ofl, bfl, pool.apply_async(parse_file, args)])
else:
parse_file(*args)
jobs.append((ofl, bfl, None))
count += 1
ofls = [i[0] for i in jobs]
bfls = [i[1] for i in jobs]
jobs = [i[2] for i in jobs]
if par:
for job in jobs:
job.wait()
sleep(2)
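    # The .done marker files written by parse_file act as a completion
    # handshake; even after job.wait() returns, poll briefly in case the
    # filesystem (e.g. networked storage) lags behind the workers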
    for tmp_fl in ofls + bfls:
        done_file = tmp_fl + '.done'
        if not os.path.isfile(done_file):
            sleep(1)
        if not os.path.isfile(done_file):
            print('Cannot find {}, assuming failure'.format(done_file))
            raise Exception('Missing done file')
        os.remove(done_file)
# Merge
print('Merging files')
with open_zipped(geno_output, 'w') as fout:
fout.write(
'chrom\tposition\tref\talt\t{}\n'.format('\t'.join(final_inds))
)
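    # zcat streams the per-VCF temp chunks in order and re-gzips them: the
    # genotype stream is appended (>>) after the header written above,
    # while the bed stream starts a fresh file (>)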
subprocess.check_call(
'zcat {} | gzip >> {}'.format(' '.join(ofls), geno_output),
shell=True
)
subprocess.check_call(
'zcat {} | gzip > {}'.format(' '.join(bfls), bed_output), shell=True
)
print('Removing tmp files')
for temp_file in ofls + bfls:
if os.path.isfile(temp_file):
os.remove(temp_file)
def parse_file(geno, outfile, header, bedfile, inds=None, limit_locs=None,
alleles=None, skip_indels=False, chrom_format=0, log=None,
fail_if_too_few_inds=True):
"""File parser, see parse_files for information."""
if not log:
log = outfile + '.parse.log'
kept = 0
indels = 0
skipped = 0
superbad = []
print('Creating {} and {} from {}'.format(outfile, bedfile, geno))
if chrom_format == 0:
def chromit(x):
"""Return x."""
return x
elif chrom_format == 1:
def chromit(x):
"""Force x to start with chr."""
return x if x.startswith('chr') else 'chr' + x
elif chrom_format == 2:
def chromit(x):
"""Force x to not start with chr."""
return x if not x.startswith('chr') else x[3:]
else:
raise ValueError('chrom_format must be one of 0, 1, 2')
if not limit_locs:
limit_locs = dict()
if not alleles:
alleles = dict()
lg = open_zipped(log, 'a')
with open_zipped(outfile, 'w') as gn, open_zipped(bedfile, 'w') as bd:
lg.write('Parse initiated\n')
for f in geno_file(geno, check_header=header, ind_list=inds, log=lg):
# Write core records
if skip_indels and (len(f[2]) > 1 or len(f[3]) > 1):
indels += 1
continue
chrom = chromit(f[0])
pos = int(f[1])
# Drop excluded positions
try:
if pos not in limit_locs[chrom]:
skipped += 1
continue
except KeyError:
pass
# Override alleles
try:
if pos in alleles[chrom]:
ref, alt = alleles[chrom][pos]
else:
ref = f[2]
alt = f[3]
except KeyError:
ref = f[2]
alt = f[3]
name = f[4]
gt_inds = f[5:]
outstr = (
'{chr}\t{pos}\t{ref}\t{alt}'.format(
chr=chrom, pos=pos, ref=ref, alt=alt
)
)
# Write genotypes
if inds:
gt_ind_l = len(gt_inds)
ind_l = len(inds)
if gt_ind_l < ind_l:
lg.write(
'{chr}.{pos} had too few ({ind}) individuals\n'
.format(chr=chrom, pos=f[1], ind=len(gt_inds))
)
if fail_if_too_few_inds:
superbad.append(f)
continue
if gt_ind_l > ind_l:
lg.write(
'{chr}.{pos} had too many ({ind}) individuals\n'
.format(chr=chrom, pos=f[1], ind=len(gt_inds))
)
superbad.append(f)
continue
gn.write('{}\t{}\n'.format(outstr, '\t'.join(gt_inds)))
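            # BED is 0-based half-open, so the 1-based position becomes
            # the interval (pos - 1, pos)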
bd.write('{}\t{}\t{}\t{}\n'.format(chrom, pos-1, pos, name))
kept += 1
lg.write(
'Skipped: {}\nIndels: {}\nKept: {}\nParse Complete\n'
.format(skipped, indels, kept)
)
print(
'Finished {}. Kept {} SNPs. Dropped {} indels and skipped {}'
.format(outfile, kept, indels, skipped)
)
if superbad:
msg = 'Length failure despite length filtering:\n{}\n'.format(superbad)
lg.write(msg)
print(msg)
print('Writing done files')
with open(outfile + '.done', 'w') as fout:
fout.write('Done\n')
with open(bedfile + '.done', 'w') as fout:
fout.write('Done\n')
lg.close()
return
############################################################################
# mpileup BAM files against a masked hg19 genome (blacklist regions removed) #
############################################################################
def mpileup(allCHRfasta, sortedBam, prefix_name, mpileupBEDfile=None):
"""Run mpileup on the provided files.
Params
------
allCHRfasta : str
Path to genome wide fasta file
sortedBam : str
Path to BAM file sorted by position
prefix_name : str
Prefix to use for default file names
mpileupBEDfile : str, optional
Path to BED file for mpileup, defaults to file output by prep.
Writes
------
prefix_name.mpileup.txt.gz
"""
if not mpileupBEDfile:
mpileupBEDfile = '{}.locations.bed.gz'.format(prefix_name)
cmnd = (
"samtools mpileup -Q 25 -f {} -l {} {} | gzip > {}.mpileup.txt.gz"
.format(allCHRfasta, mpileupBEDfile, sortedBam, prefix_name)
)
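    # -Q 25 skips bases with base quality below 25; -l restricts the pileup
    # to the SNP positions in the BED file produced by the prep step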
sys.stderr.write('Running {}\n'.format(cmnd))
subprocess.check_call(cmnd, shell=True)
###########################################################################
# POST-ChIP Calculation and File Trimming #
###########################################################################
def postcalc(prefix_name, inalleles, trial_depths):
"""Calculate POST frequencies from mpileup results."""
print("Post-ChIP calculation for min reads {}".format(trial_depths))
infilename = prefix_name + ".mpileup.txt.gz"
outfilename = prefix_name + "." + str(trial_depths) + ".POST.txt"
depth = int(trial_depths)
pileup_dictionary = {}
count = 1
bad = 0
# mpileup format:
# chromosome, 1-based coordinate, reference base, # reads covering the
# site, read bases, and base qualities
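    # Example data line (illustrative only):
    #   chr1   10583   G   12   ...,,..A,.a.   IIIIIIIIIIII
    # '.' and ',' are reads matching the reference on the +/- strand;
    # 'A'/'a' are mismatching bases, so Acount below would be 2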
with open_zipped(infilename, 'r') as pileup:
for line in pileup:
count = count + 1
rows = line.rstrip().split("\t")
            if len(rows) != 6:
bad += 1
continue
# SNP name is just chr.position
snp = rows[0] + "." + rows[1]
# cov is the total coverage for the SNP
cov = int(rows[3])
# Normalize base (called read here) to lowercase to make counting
# easier
read = str(rows[4]).lower()
# Only work on those with coverage exceeding the minimum depth
if cov < depth:
Acount = 'NA'
Ccount = 'NA'
Gcount = 'NA'
Tcount = 'NA'
ALTallele = 'NA'
ALTdepth = 'NA'
REFdepth = 'NA'
RefFreq = 'NA'
AltFreq = 'NA'
else:
                # Count each base: 'A' is the plus strand, 'a' the minus
                # strand; the read string was lowercased above so one
                # count covers both strands
Acount = read.count('a')
Ccount = read.count('c')
Gcount = read.count('g')
Tcount = read.count('t')
# Make list of all counts
countslist = [Acount, Ccount, Gcount, Tcount]
# Find index of largest number and the value
# ALT is assumed to be most common of the 4 bases
index, ALTdepth = max(
enumerate(countslist), key=operator.itemgetter(1)
)
indels = rows[4].count('-') + rows[4].count('+')
REFdepth = cov - Acount - Ccount - Gcount - Tcount - indels
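                # Reads matching the reference appear as '.' or ',' rather
                # than base letters, so the coverage left after removing
                # mismatch and indel counts is the reference depth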
# Adjusts total_depth/cov to remove third alleles/sequencing
# errors
snp_depth = REFdepth + ALTdepth
rows[3] = str(snp_depth)
if index == 0:
ALTallele = 'A'
elif index == 1:
ALTallele = 'C'
elif index == 2:
ALTallele = 'G'
elif index == 3:
ALTallele = 'T'
else:
ALTallele = "NA"
RefFreq = "NA"
AltFreq = "NA"
if snp_depth == 0:
RefFreq = str(0)
print(
"Reference frequency is 0 for {}, probably a mistake"
.format(snp)
)
else:
postfreq = float(REFdepth) / float(snp_depth)
                    if snp_depth > depth and postfreq >= 0:
                        # Reference frequency is REFdepth / snp_depth;
                        # the alt frequency is its complement
                        RefFreq = str(postfreq)
                        AltFreq = str(1 - postfreq)
pileup_dictionary[snp] = (
rows[2:] + [
Acount, Ccount, Gcount, Tcount, ALTdepth,
REFdepth, ALTallele, RefFreq, AltFreq
]
)
assert len(pileup_dictionary[snp]) == 13
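            # The 13 stored fields, by index: 0 ref base, 1 depth,
            # 2 read bases, 3 base quals, 4-7 A/C/G/T counts, 8 ALT depth,
            # 9 REF depth, 10 ALT allele, 11 ref freq, 12 alt freq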
print('{} mpileup rows were too short and thus skipped'.format(bad))
linecounter = 1
with open_zipped(inalleles) as alleles, open_zipped(outfilename, 'w') as outfile:
for line in alleles:
row = line.rstrip().split('\t')[:4]
snp = row[0] + '.' + row[1]
if snp in pileup_dictionary:
matchingVector = [str(x) for x in pileup_dictionary[snp]]
else:
continue
genoRefAllele = row[2]
genoAltAllele = row[3]
# For alt alleles with 0 reads, force alt allele to be imputed geno
# alt allele
if matchingVector[8] == str(0):
matchingVector[10] = genoAltAllele
# For ref alleles with 0 reads, force ref allele to be imputed geno
# ref allele
if matchingVector[9] == str(0):
matchingVector[0] = genoRefAllele
postChipRefAllele = matchingVector[0].upper()
postChipRefFreq = matchingVector[11]
postChipAltAllele = matchingVector[10].upper()
postChipAltFreq = matchingVector[12]
if postChipRefAllele == genoRefAllele and postChipAltAllele == genoAltAllele:
outAllele = postChipRefAllele
outfreq = postChipRefFreq
elif postChipRefAllele == genoAltAllele and postChipAltAllele == genoRefAllele:
outAllele = postChipAltAllele
outfreq = postChipAltFreq
                linecounter += 1
else:
outAllele = 'NA'
outfreq = 'NA'
outfile.write(
'\t'.join(
[row[0], row[1], '\t'.join(matchingVector),
str(outAllele), str(outfreq)]
) + '\n'
)
print("post-frequency calculation DONE")
###########################################################################
# POST.txt file trimming, addition of header, BED file creation           #
###########################################################################
def postTrim(prefix_name, trial_depths):
"""Tidy the POST frequency outputs."""
with open_zipped("mpileup.header.txt", 'w') as headerOUT:
headerOUT.write(
'\t'.join(
['Chr', 'position', 'REFallele', 'Depth', 'Acount', 'Ccount',
'Gcount', 'Tcount', 'ALTdepth', 'REFDepth', 'ALTallele',
'REFfreq', 'ALTfreq', 'POSTallele', 'POSTfreq']
) + '\n'
)
# File names
prfx = "{}.{}".format(prefix_name, trial_depths)
post_fl = "{}.POSTth.txt".format(prfx)
bed_fl = "{}.bed".format(prfx)
print("File trimming")
# Filter out unneeded columns
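    # sed drops any row containing NA; cut keeps columns 1-4 and 7 onward,
    # removing the raw read-base and base-quality strings. The process
    # substitution <(...) is a bashism, hence the explicit bash -c below.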
script = (
"cat mpileup.header.txt <("
"sed '/NA/d' {prefix}.POST.txt | cut -f-4,7- "
") > {prefix}.POSTth.txt"
).format(prefix=prfx)
subprocess.check_call('bash -c "{}"'.format(script), shell=True)
print("File trimming DONE")
print("Writing bed file")
# Make bed file
with open_zipped(post_fl) as openfile, open_zipped(bed_fl, 'w') as bedOUT:
openfile.readline() # Discard header
for line in openfile:
ln = line.rstrip().split('\t')
snp = ln[1]
bedOUT.write('\t'.join([ln[0], str(int(snp) - 1), snp]) + '\n')
print("Bed file writing complete")
###########################################################################
# Genotype Extraction for POST-calc SNPs #
###########################################################################
def genoExtract(prefix_name, trial_depths, genosFile, individualslist=None):
"""Filter genotypes by SNP and individual.
First loops through genotypes and filters to a temp file, then transposes,
then filters transposed lines by individual.
    Takes approximately 10 minutes and uses 10GB of memory on a genotype file
    of 77 million SNPs and an individuals file of 66 individuals.
Params
------
prefix_name : str
trial_depths : int
genosFile : str
Path to genotype file to parse. Must be in the format:
chrom\\tpos\\tref\\talt\\tgenotypes...
Genotypes must be in 0/1/2 format, and the file must contain a header
with the individual names.
Note: Genotype file *must* be sorted by position.
individualslist : str, optional
Path to file of individuals to include separated by newlines
"""
new_prefix = prefix_name + "." + str(trial_depths)
postdata = new_prefix + ".POSTth.txt"
out_name = new_prefix + ".genotypes.txt"
print("File prefix used:", new_prefix)
# Track memory usage
process = show_mem()
# Get list of sites with POSTfreq data
print("Getting SNPs from POST data")
with open_zipped(postdata, 'r') as postin:
        # Drop the first line only if it is a header (position column not numeric)
header = postin.readline().strip().split('\t')
if header[1].isdigit():
postin.seek(0)
postlocs = [
rows[0] + '.' + rows[1] for rows in [
line.split('\t') for line in postin.readlines()
]
]
post_snps = frozenset(postlocs)
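    # frozenset gives O(1) membership tests in the filtering loop below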
    n_snps = len(postlocs)
    # The POST data must be unique per SNP; if it isn't, something went
    # badly wrong in the earlier steps
    assert n_snps == len(post_snps)
    print("Got {} SNPs".format(n_snps))
show_mem(proc=process)
done_snps = set()
add_to_done = done_snps.add
extras = 0
indels = 0
dups = 0
print("Filtering genotypes by SNP")
print("Time: {}".format(datetime.now()))
    # The genotype matrix will have one column per POST SNP and one row per
    # individual, *sorted to match individuals.txt*, and will contain only
    # numeric genotypes
with open_zipped(genosFile) as genos_in:
# Parse header
header = genos_in.readline().strip().split('\t')
if header[1].isdigit():
raise Exception(
'Genotype file {} has no header, cannot parse individuals'
.format(genosFile)
)
# Extract the individuals from the header
inds = header[4:]
# Parse the file
gt_dict = {}
for line in genos_in:
row = line.strip().split('\t')
# Filter by SNP
snp = row[0] + '.' + row[1]
if snp not in post_snps:
extras += 1
continue
if snp in done_snps:
dups += 1
continue
            # Filter out INDELs
            if len(row[2]) > 1 or len(row[3]) > 1:
                if len(row[2]) > 1:
                    raise Exception(
                        'Genotype file has more than one char for ref at '
                        'line:\n{}'.format(line)
                    )
                # A multi-character alt allele is an indel; a comma-separated
                # alt of single bases (e.g. 'A,T') is multi-allelic and kept
                if any(len(allele) > 1 for allele in row[3].split(',')):
                    indels += 1
                    continue
# Keep only the genotype data, not the position data
gt_dict[snp] = row[4:]
            # The SNP is only marked done here, at the end, so a position
            # whose first record was filtered out (e.g. an indel) can still
            # be picked up from a later record
            add_to_done(snp)
print("Genotype filtering complete")
show_mem(proc=process)
print("Time: {}".format(datetime.now()))
print("Dropped {} unneeded SNPs".format(extras))
print("Dropped {} indels".format(indels))
print("Dropped {} duplicates".format(dups))
print("Checking all POST SNPs matched")
if len(postlocs) != len(done_snps):
err_file = new_prefix + ".missing.snps"
with open_zipped(err_file, "w") as fout:
fout.write(
'\n'.join(sorted([i for i in postlocs if i not in done_snps]))
)
raise Exception(
"{} SNPs in POST were not in the genotype file, written to {}"
.format(len(postlocs)-len(done_snps), err_file)
)
print("Done")
# Get individual locations
if individualslist:
print(
"Getting individuals from individuals file ({})."
.format(individualslist)
)
with open_zipped(individualslist) as fin:
individuals = [i.strip() for i in fin.read().strip().split('\n')]
ind_locs = []
missing = []
for ind in individuals:
if ind in inds:
ind_locs.append(inds.index(ind))
else:
missing.append(ind)
if len(missing) > 0:
sys.stderr.write(
'The following individuals are missing '
'from the genotype file:\n'
)
            sys.stderr.write('\n'.join(missing) + '\n')
raise IndexError('Missing some individuals from genotypes')
assert len(ind_locs) == len(individuals)
else:
print('No individuals file provided, keeping all individuals')
ind_locs = None
print("Creating sorted matrix of only required individuals")
mat = []
for snp in postlocs:
if ind_locs:
gt = gt_dict[snp]
mat.append([gt[i] for i in ind_locs])
else:
mat.append(gt_dict[snp])
print("Done")
show_mem(proc=process)
del gt_dict
print("Number of individuals included: {}".format(len(mat[0])))
print("Transposing and writing")
    with open_zipped(out_name, 'w') as fout:
        fout.write(
            '\n'.join(
                ['\t'.join(i) for i in zip(*mat)]
            ) + '\n'  # end with a newline so 'wc -l' counts the final row
        )
print("Done")
show_mem(proc=process)
del mat
print("Genotype filtering complete")
show_mem(proc=process)
###########################################################################
# Regression with Z-score and P-value                                     #
###########################################################################
def regPVAL(prefix_name, trial_depths, numIndv=None):
"""Run the actual regression R code."""
new_prefix = prefix_name + "." + str(trial_depths)
postdata = new_prefix + ".POSTth.txt"
genosout_name = new_prefix + ".genotypes.txt"
    if not numIndv:
        wc = subprocess.check_output(['wc', '-l', genosout_name])
        numIndv = wc.decode().split()[0]
    numIndv = str(int(numIndv))
    # Make sure the location of the R script is correct
r_script = os.path.join(os.path.dirname(__file__), 'regression_qtls.R')
if not os.path.isfile(r_script):
raise OSError('File not found: {}'.format(r_script))
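    # regression_qtls.R receives the POST table, the transposed genotype
    # matrix, the output prefix, and the individual count as positional
    # arguments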
subprocess.check_call(
['Rscript', r_script, postdata, genosout_name, new_prefix, numIndv]
)
print("regression DONE")
###########################################################################
# Tidy Results #
###########################################################################
orig_headers = [
'Chr', 'position', 'REFallele', 'Depth', 'Acount', 'Ccount', 'Gcount',
'Tcount', 'ALTdepth', 'REFDepth', 'ALTallele', 'REFfreq', 'ALTfreq',
'POSTallele', 'POSTfreq', 'prechipfreq', 'pvalue', 'zvalue',
'prevar', 'postvar', 'SNPpostfreq', 'SNPprefreq'
]
new_headers = [
'chrom', 'position', 'ref', 'depth', 'a_count', 'c_count', 'g_count',
't_count', 'alt_depth', 'ref_depth', 'alt', 'ref_freq', 'alt_freq',
'post_allele', 'post_freq', 'pre_freq', 'p_value', 'z_value',
'pre_variance', 'post_variance', 'snp_postfreq', 'snp_prefreq'
]
final_headers = [
'chrom', 'position', 'rsid', 'open_allele', 'closed_allele',
'pre_freq', 'post_freq', 'pre_variance', 'post_variance', 'beta',
'p_value', 'z_value', 'ref', 'alt', 'depth', 'ref_depth',
'alt_depth', 'ref_freq', 'alt_freq', 'snp_postfreq', 'snp_prefreq',
'a_count', 'c_count', 'g_count', 't_count'
]
float_dtypes = {
'position': 'object',
'REFfreq': 'float128',
'ALTfreq': 'float128',
'POSTfreq': 'float128',
'prechipfreq': 'float128',
'pvalue': 'float128',
'zvalue': 'float128',
'prevar': 'float128',
'postvar': 'float128',
'SNPpostfreq': 'float128',
'SNPprefreq': 'float128',
}
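# Note: 'float128' is numpy's extended-precision longdouble and is not
# available on every platform (e.g. most Windows builds); on such systems
# these columns would need to fall back to 'float64'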
def parse_regression(regfile, textfile=None, pandasfile=None, bed=None):
    """Parse a regression file into a pandas DataFrame.
    Optionally write either a pickled DataFrame or a tab-delimited text file.
    For all files, gzipped files or open file handles are permissible.
Parameters
----------
regfile : file name
Tab delimited output from regression, header strictly enforced
textfile : file name, optional
Path to write tab delimited formatted output
pandasfile : file name, optional
Path to write pickled pandas dataframe
bed : file name, optional
Path to a bed file that contains rsids
Returns
-------
pandas.DataFrame
Parsed DataFrame
"""
import numpy as np
import pandas as pd
print('Loading regression file')
    df = pd.read_csv(regfile, sep='\t', low_memory=False, dtype=float_dtypes,
                     float_precision='round_trip')
assert df.columns.tolist() == orig_headers
df.columns = new_headers
# Set the index to chr.position
df['idx'] = df.chrom.astype(str) + '.' + df.position.astype(str)
df = df.set_index('idx', drop=True)
# Add rsID if possible
if bed:
print('Loading rsids')
df['rsid'] = bed_to_series(bed)
else:
print('No bed file, all rsids will be "Unknown"')
df['rsid'] = 'Unknown'
# Sort the data by position (default is by best pvalue)
print('Sorting')
df = df.sort_values(['chrom', 'position'])
# Force datatypes and uppercase
df['position'] = df.position.astype(int)
for i in ['ref', 'alt', 'post_allele']:
df[i] = df[i].str.upper()
# Add the beta
df['beta'] = df.post_freq - df.pre_freq
# Add open and closed
print('Calculating open/closed')
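    # The open allele is the one whose frequency increased after selection
    # (post_freq > pre_freq); when pre and post are equal the direction is
    # undefined, so both alleles are set to NA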
df['open_allele'] = np.where(df.post_freq > df.pre_freq, df.post_allele, df.alt)
df['closed_allele'] = np.where(df.post_freq > df.pre_freq, df.alt, df.post_allele)
df['open_allele'] = np.where(df.post_freq == df.pre_freq, 'NA', df.open_allele)
df['closed_allele'] = np.where(df.post_freq == df.pre_freq, 'NA', df.closed_allele)
# Sort columns
print('Adjusting columns')
df = df[final_headers]
# Write outputs
if pandasfile:
        print('Writing pickled DataFrame to', pandasfile)
df.to_pickle(pandasfile)
if textfile:
print('Writing parsed dataframe to', textfile)
df.to_csv(textfile, sep='\t')
return df
###########################################################################
# Helpers #
###########################################################################
def bed_to_series(bedfile):
"""Convert a bed file to a series of chr.pos->name."""
import pandas as pd
args = dict(sep='\t', usecols=[0, 2, 3])
with open_zipped(bedfile) as fin:
if not fin.readline().startswith('#'):
args['header'] = None
print('Reading in bed file')
bed = pd.read_csv(bedfile, **args)
bed.columns = ['chrom', 'pos', 'name']
bed['idx'] = bed.chrom.astype(str) + '.' + bed.pos.astype(str)
bed = bed.drop_duplicates('idx', keep=False)
bed = bed.set_index('idx', drop=True)
    print('Parsed {} SNPs'.format(len(bed)))
return bed.name
def show_mem(proc=None):
"""Print memory usage."""
if not psutil:
return None
if not proc:
proc = psutil.Process(os.getpid())
print(
"Memory: {:.2f}GB"
.format(proc.memory_info().rss/1024/1024/1024)
)
return proc
def open_zipped(infile, mode='r'):
    """Return a file handle, whether the file is compressed or not.
    Returns already-open file handles unchanged.
    Text mode is automatic for compatibility with python2.
    """
# Return already open files
if hasattr(infile, 'write'):
return infile
# Make text mode automatic
if len(mode) == 1:
mode = mode + 't'
# Refuse to handle non-strings that aren't files.
if not isinstance(infile, str):
raise ValueError("i cannot open a filename that isn't a string.")
# Treat '-' appropriately
if infile == '-':
if 'w' in mode:
return sys.stdout
return sys.stdin
# If possible open zipped files
if infile.endswith('.gz'):
return gzip.open(infile, mode)
if infile.endswith('.bz2'):
if hasattr(bz2, 'open'):
return bz2.open(infile, mode)
        return bz2.BZ2File(infile, mode)
# Fall back on regular open
return open(infile, mode)
# Simple location file handling
def parse_loc_file(infile, mode='locations'):
    """Return a dict of SNPs from a bed/vcf/text file of chr position (base-1)."""
if mode not in ['locations', 'alleles', 'names']:
raise ValueError(
'mode must be one of locations, alleles, names. Is {}'
.format(mode)
)
names = infile.split('.')
if 'vcf' in names:
fl_tp = 'vcf'
elif 'bed' in names:
fl_tp = 'bed'
else:
fl_tp = 'txt'
    # txt files may be separated by tabs or spaces; None splits on any
    # whitespace
    splt_char = None if fl_tp == 'txt' else '\t'
    with open_zipped(infile) as fl:
        # Use readline() instead of direct iteration so that tell()/seek()
        # stay legal (Python 3 disables tell() while iterating a text file)
        loc = fl.tell()
        while True:
            line = fl.readline()
            if not line:
                break
            # Skip comment lines
            if line.startswith('#'):
                loc = fl.tell()
                continue
            # First non-comment line: treat it as a header unless its second
            # column is numeric, in which case rewind so it gets parsed
            header = line.strip().split(splt_char)
            if header[1].isdigit():
                fl.seek(loc)
            break
        # All non-header lines
if mode == 'locations':
res = _get_locs(fl, fl_tp, splt_char)
elif mode == 'alleles':
res = _get_alleles(fl, fl_tp, splt_char)
elif mode == 'names':
res = _get_names(fl, fl_tp, splt_char)
else: