From 88dbe819eddd316bd80d726ec9d2450cc43cac14 Mon Sep 17 00:00:00 2001 From: Ali Pirani Date: Thu, 3 Oct 2019 11:59:52 -0400 Subject: [PATCH] Turn off unncessary core snp steps --- modules/core_prep_sanity_checks.py | 2 + modules/core_prep_sanity_checks.pyc | Bin 4050 -> 4114 bytes .../variant_diagnostics/PBS_generate_jobs.py | 255 + .../variant_diagnostics/PBS_generate_jobs.pyc | Bin 0 -> 8942 bytes modules/variant_diagnostics/core_pipeline.py | 93 +- .../core_pipeline_core_prep.pyc | Bin 0 -> 8766 bytes .../core_pipeline_core_prep_label.py | 293 ++ .../core_pipeline_core_prep_label.pyc | Bin 0 -> 8934 bytes .../core_pipeline_core_prep_main.py | 2439 +++++++++ .../core_pipeline_core_prep_main.pyc | Bin 0 -> 80696 bytes .../core_pipeline_modular.py | 4658 +++++++++++++++++ 11 files changed, 7696 insertions(+), 44 deletions(-) create mode 100644 modules/variant_diagnostics/PBS_generate_jobs.py create mode 100644 modules/variant_diagnostics/PBS_generate_jobs.pyc create mode 100644 modules/variant_diagnostics/core_pipeline_core_prep.pyc create mode 100644 modules/variant_diagnostics/core_pipeline_core_prep_label.py create mode 100644 modules/variant_diagnostics/core_pipeline_core_prep_label.pyc create mode 100644 modules/variant_diagnostics/core_pipeline_core_prep_main.py create mode 100644 modules/variant_diagnostics/core_pipeline_core_prep_main.pyc create mode 100755 modules/variant_diagnostics/core_pipeline_modular.py diff --git a/modules/core_prep_sanity_checks.py b/modules/core_prep_sanity_checks.py index d8685b0..20a59be 100755 --- a/modules/core_prep_sanity_checks.py +++ b/modules/core_prep_sanity_checks.py @@ -79,6 +79,8 @@ def make_sure_files_exists(vcf_file_array, Config, logger): for i in not_found_files: keep_logging('Error finding variant calling output files for: %s' % os.path.basename(i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), 'Error finding variant calling output files for: %s' % 
os.path.basename(i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), logger, 'exception') + keep_logging('File not found: %s' % i, + 'File not found: %s' % i, logger, 'debug') exit() def make_sure_label_files_exists(vcf_file_array, uniq_snp_positions, uniq_indel_positions, Config, logger): diff --git a/modules/core_prep_sanity_checks.pyc b/modules/core_prep_sanity_checks.pyc index f3887976bda0c6035c8ebac0126f7ebb50751414..15fefa654053c4d5154d0c24b4ba5bd4647f5604 100755 GIT binary patch delta 305 zcmca4KS_au`78D4dd-RGPl|E6aOEP9GqbfsKukck)LziOH<&nyi|PT#TZd_1WJr zGp0?J;SS=I1*!;=0}_+{*vcmJa`Q2!PyWMwTa*nbRKg4-ic51#Sb%hXL2BOQk37me zL3}`-28amapDfL5qACDl3W5lbt4cszu#4g|^OEy(3vyCRQiFsim+&SqN>9GZyPPp> aaxGsBBm3lAe12>)AZ?nH_4pSv+5!Lpbw@P- delta 218 zcmbQFa7mtn`7#UJ zmp7ka_F&v>$oig<(POeAyCZV3yJ&M!#K3t|H))ZtZT*Wd%Pg7_v!^O~sggO~y!LJ&xlfLLky zrFkjwnR&_ixdl0?C8Nhz6 diff --git a/modules/variant_diagnostics/PBS_generate_jobs.py b/modules/variant_diagnostics/PBS_generate_jobs.py new file mode 100644 index 0000000..34ae193 --- /dev/null +++ b/modules/variant_diagnostics/PBS_generate_jobs.py @@ -0,0 +1,255 @@ +# System wide imports +from __future__ import division +import sys +import argparse +import re +import os +import csv +import subprocess +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +""" Hacky way to append. 
Instead Add this path to PYTHONPATH Variable """ +from collections import OrderedDict +from collections import defaultdict +from joblib import Parallel, delayed +import multiprocessing +import thread +import glob +import readline +import errno +from datetime import datetime +import threading +import json +import ConfigParser +from config_settings import ConfigSectionMap +from logging_subprocess import * +from log_modules import * + + +def create_job(filter2_only_snp_vcf_dir, jobrun, vcf_filenames, unique_position_file, tmp_dir, Config): + + """ + This method takes the unique_position_file and list of final *_no_proximate_snp.vcf files and generates individual jobs/script. + Each of these jobs/scripts will generate a *label file. These label file for each sample contains a field description for each position in unique_position_file. + This field description denotes if the variant position made to the final variant list in a sample and if not then a reason/filter that caused it to filtered out from final list. + :param jobrun: + :param vcf_filenames: + :return: + """ + if jobrun == "parallel-cluster": + """ + Supports only PBS clusters for now. 
+ """ + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) + job_file_name = "%s.pbs" % (i) + f1=open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf.pbs" + pbs_scripts = glob.glob(pbs_dir) + for i in pbs_scripts: + keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') + call("qsub %s" % i, logger) + + elif jobrun == "parallel-local": + """ + Generate a Command list of each job and run it in parallel on different cores available on local system + """ + command_array = [] + command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir + f3 = open(command_file, 'w+') + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], 
ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) + job_file_name = "%s.pbs" % (i) + f1=open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + pbs_dir = filter2_only_snp_vcf_dir + "/*vcf.pbs" + pbs_scripts = glob.glob(pbs_dir) + for i in pbs_scripts: + f3.write("bash %s\n" % i) + f3.close() + with open(command_file, 'r') as fpp: + for lines in fpp: + lines = lines.strip() + command_array.append(lines) + fpp.close() + + num_cores = multiprocessing.cpu_count() + results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) + + elif jobrun == "cluster": + #command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir + #os.system("bash %s" % command_file) + command_array = [] + command_file = "%s/commands_list.sh" % filter2_only_snp_vcf_dir + f3 = open(command_file, 'w+') + + + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) + job_file_name = "%s.pbs" % (i) + f1=open(job_file_name, 
'w+') + f1.write(job_print_string) + f1.close() + #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + pbs_dir = filter2_only_snp_vcf_dir + "/*vcf.pbs" + pbs_scripts = glob.glob(pbs_dir) + for i in pbs_scripts: + f3.write("bash %s\n" % i) + f3.close() + with open(command_file, 'r') as fpp: + for lines in fpp: + lines = lines.strip() + command_array.append(lines) + fpp.close() + + num_cores = multiprocessing.cpu_count() + results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) + + elif jobrun == "local": + """ + Generate a Command list of each job and run it on local system one at a time + """ + + command_array = [] + command_file = "%s/commands_list.sh" % filter2_only_snp_vcf_dir + f3 = open(command_file, 'w+') + + + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) + job_file_name = "%s.pbs" % (i) + f1=open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + pbs_dir = filter2_only_snp_vcf_dir + "/*vcf.pbs" + pbs_scripts = glob.glob(pbs_dir) + + + for i in pbs_scripts: + f3.write("bash %s\n" % i) + f3.close() + with open(command_file, 'r') as fpp: + for lines in fpp: + lines = lines.strip() + command_array.append(lines) + fpp.close() + call("bash %s" % command_file, logger) + +def create_indel_job(filter2_only_snp_vcf_dir, jobrun, vcf_filenames, unique_position_file, tmp_dir, Config): + + """ + This method 
takes the unique_indel_position_file and list of final *_indel_final.vcf files and generates individual jobs/script. + Each of these jobs/scripts will generate a *label file. These label file for each sample contains a field description of each position in unique_indel_position_file. + This field description denotes if the variant position made to the final variant list in a sample and if not then a reason/filter that caused it to filtered out from final list. + :param jobrun: + :param vcf_filenames: + :return: + """ + if jobrun == "parallel-cluster": + """ + Supports only PBS clusters for now. + """ + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_indel_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) + job_file_name = "%s_indel.pbs" % (i) + f1=open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + pbs_dir = filter2_only_snp_vcf_dir + "/*vcf_indel.pbs" + pbs_scripts = glob.glob(pbs_dir) + for i in pbs_scripts: + keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') + # os.system("qsub %s" % i) + call("qsub %s" % i, logger) + + elif jobrun == "parallel-local" or jobrun == "cluster": + """ + Generate a Command list of each job and run it in parallel on different 
cores available on local system + """ + command_array = [] + command_file = "%s/commands_indel_list.sh" % filter2_only_snp_vcf_dir + f3 = open(command_file, 'w+') + + + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_indel_debug_gatk.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) + job_file_name = "%s_indel.pbs" % (i) + f1=open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + pbs_dir = filter2_only_snp_vcf_dir + "/*vcf_indel.pbs" + pbs_scripts = glob.glob(pbs_dir) + for i in pbs_scripts: + f3.write("bash %s\n" % i) + f3.close() + with open(command_file, 'r') as fpp: + for lines in fpp: + lines = lines.strip() + command_array.append(lines) + fpp.close() + + num_cores = multiprocessing.cpu_count() + results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) + + # elif jobrun == "cluster": + # command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir + # os.system("bash %s" % command_file) + elif jobrun == "local": + """ + Generate a Command list of each job and run it on local system one at a time + """ + + command_array = [] + command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir + f3 = open(command_file, 'w+') + + + for i in 
vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_indel_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) + job_file_name = "%s_indel.pbs" % (i) + f1=open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf_indel.pbs" + pbs_scripts = glob.glob(pbs_dir) + + + for i in pbs_scripts: + f3.write("bash %s\n" % i) + f3.close() + with open(command_file, 'r') as fpp: + for lines in fpp: + lines = lines.strip() + command_array.append(lines) + fpp.close() + call("bash %s" % command_file, logger) + +def run_command(i): + """Function to run each command and is run as a part of python Parallel mutiprocessing method. + + :param: + i: command variable to run + + :return: + done: string variable with completion status of command. 
+ """ + + #call("%s" % i, logger) + os.system("%s" % i) + done = "Completed: %s" % i + return done \ No newline at end of file diff --git a/modules/variant_diagnostics/PBS_generate_jobs.pyc b/modules/variant_diagnostics/PBS_generate_jobs.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dec5113c51c777391d5182c7f4ecd00ebf96b6e0 GIT binary patch literal 8942 zcmeHMOK%*<5w6(>AImo>zVsZ)R>a5=*OHZo3B^bvIXPKTMur(I;tYm6z009yXO`15 z5(z9%%Go*iALN=-faG8x{~*Z0$Ss!uIRrW7kQ{PMzN((tT|V@HWeYNFK5BcqtE+3O zs;m2}E&pe#_K){J`mjm)PZgg(u!(eL5&8JDC?x7w)Ue3n=Mse_YLsLQ?J_mWMY}?c zO3|)Tqgu2_s4-HsN2xJNszf_uv@=esDvjE)#ssNJdTb5;WbHDAQ`DKJ#x zvR0|FTJW8w#_6Jch8kz2JxbwO>YSs-IWuRB!t?Y#+YYS@XpPgah#EHeHECQVe?l6U z*t+-ECgFnMfB0F03n(1}@q>qf4&vyp>6$>-PZM7y%D)pd(?O@NT3!~WK6jRi$@jd( z3quvAqbx~<-oEmOM0=@9gN`yivtYR$N3EcJUo}&#_@37_Bdp8S<_PF1{Iia?ZG_1u z2$-J5Ve+6v$rWqwkJKyCoezIc2P2dc?O3!^qTUF5@?ex-us2F6LZ(c;Dm_LVRcQ6l zbM?sc$dTvKBhO=auFy`E7wJK2g<3UQIv69^8C^>U4<;xb=NRoxi0+fL_oqs460decUady{q{z=d(h2gX=y8RtbdvmO z8A2acw08eRo6$!VHq(rl=`W==(ZLk?vt$-rI+zwErwb)#=*w)#Us#(xgxbkgyg*{b zWwv5(hCSiWiM7}pyEjXl+-W?*euGKo*^0luP6xBJcZGVh@WBN-z@$0afyrS?w5MnX zMxPU(?k1(pO@D!Y_H!^7vLJO^MjF@WaZ=Q5{Jg&% zXuG4*?bx?d?}5^Gx~=Ri3U)K)c4Hmzf#9}+P}yGO+hL$nJ8s#nAo4={svE^_H;MOx z4o(zTN8R;@O&$y(huPaIQVE8%9YlPd`59*I#9O+qn@P}3*F}eSz2-Ix#mZU@4{Q5T z5Qcpzwr5`ry)6~W3hU6F`3F6AD^6_1!nD`vLgQu}rCtz07tBzhZ~Mxui+36@7jFq2 zPkeEsq!{YB6kkQLyq~CVKlGBoi&DEUv*Y>7PGhmEFFo4H6OYK$?chJLrVbMnGjV>+%9kywbMH^!6ybF#J zw?3k$nX_n+{ny&>so39bqnqj7~l>Q~I1h3uu(S7^cUHh`GndW4PAuZv~OtPU5Uv_aYC@^t~J0SMTm4UL*TS z%-x`?!XQ%hp9JZ4wpGt9?l$o*LDY7Oao1N5>z&xo5UzbDKk(X7tka;W>&E^rtOwSu zti9gdx33wa-*Dq7+~<&Y*^+*c;Dw$r$ooM?Pux~KE$wu35j9*uus@sIiq%R|zMHF# z2S@Yu3$YQjf~LotQWz7Z<1A?^EsWT+nG!~}rt3AEaTcYTFL0OjdUs1}z92q&L-QK- zt594z+0#xIMX=3=y{oe=D2qD^ZlqBw)_jA^JM}c${g?=27*sEy>$0vl<4y+wtX+vc zy{-8QyXEO^$f~7$kxcI5XvZtCIdzkJSeY{ZwT=IaXdoAVh&zJCmq^UP;f!n#`hm48 zfaAVgOYh1x*I@;b+8{WK+w7alm+A#RcT(|GbzS5lMQ|RHZN+VKT56e;Zzb1D5^rD3RP?YY@lxw8pgIWacEv)RgSD6- 
z$~rIsl9jB2Pnb!+3IUncpl=g-9@iIG6QEkHT_P z?-q{1a^n=_KDNiHr@(T!s1M8Gm0?)En8Wg1mf&_~1}vXp(4E8b69C`4Lx69kfaR-% z+_p1kkogfug$DEaPs4m>_Q1BsVBhoEcC5T>M*JHy%Sb)@@eBR{}36+Q?4Gkk(NzC_RuSW{qPAEgbD-w7}NHU1mmKO8P! zP5fu#;-#TLXAFDQ;oGCbHx!3)j>8Dh;rpBB)@ALAQp328ub_hKhK5c;cMb!&XCOO= zZ=iZrRvN%NV}}Cd1z(3l+iiR4gO^bmI0UI3cJ*C{o%@x-jDO~!WW+K;&LX#;RDU=M zBb_C1JIhQMB?@qK_(XL0((jyOa-PWrCN>i;NjR67yvF2pCYPC%nefgK95+C8&NsO4 z3KIr5&NrD{W%34-Z!x*XWSvQ!$(u~BGr7U!Ehaaae4EL4n0%MX_ds3=z$6i80F%iD zfK4AmIh}$Ou<0n!Ia69GtpGMXHxQ~$0tf+Q32Zvs2STRj2oORJ>X(vk+t7qP2yO5+ zB#tY-&!dE7@wbqCwU?UG{WW%>{u{`Tvq--5JV=VsUqlo)OS0klY>=i z9lY|Z5Oh+``Zu(aQYKBzAt2UxPYQf@-ByOWwQ^ktUHnBe81XuMgrX`0pP|0PDI-ym z1caffAIfA@JM(gl$qgcva|g;GQbgglx25})C#=Sm6IW^^%^1~j2_Nln7&^R6QP&BB zEt4s@Y$Vx3M$&EU9$#{esK`%TDUp(iN|GpcxGW`g6T{188Oj z9|Wk4X44MNXWc3XT*fZcrS#a^##NH>z^TiGuI@duMhft&9gz?pM_M9nN9v9^e6C2VBK3;aAzzhxwaAZ2eXPij zOMSe^Pe^^D$WKarQlg3+UXsHpiN-ZkyO5cdXht46gOAFtN^4fyb5ftPx{T@bywvCI z^SHDYq$)rwmABu9!`!ATSj4ISiB9=9V~vVbNpr%5y3-;EkM zBz`yO*c$5cjLkBlcBAgMUTaJtH-kTp{IZJ}dkW2Z4r6RBfbNd;^to4&-h@0FlhY~5 zkv~^hqv5?VSv{SWyvhN3liFxXdehRoB#$P<$;XB&YQq%>xd}O(l&~tf&0msUm65iJ zKl6^{Qxc*##zB;i%X&B=4=R$O0+P`W!Vu1=9)pe@YHbtZoxJ6xw$GDcyG@ zfuS>kS?!Gv^KE>;TK07tt-u)9j^M_| zHBs*7sf(7|?%gcv1X-NyyN~H$YM2g$Y0^yd%g8k2B=kE6!G7dt(LUnFP^Cu4yNNT zFx|@;$^8Ls7Z!3&q()Wm1o=Tef#=vzj9naYmIUp{V=<;wVXZLwv}>3i5{s?Hq5PKtH!XZy`ED6VAFbf!|4Iy>hKmb#6EfhUsgOg zp$z6^3C^yL$VZAv}u5wf-ZWvLVO!tW#d2Qy3l}$PpWb@PQ+KRvC6gTDg}4 zX@sVzb0%7=}_*PDtG2#&^?3`V9DvGD!N(3F=p%rd)~Ug zsm{Z6jBr@@`Rb5@r7T1@589tssRJ+?bQq|+s^N>%v$rX`AF{vxG^|ed*VX?!e}!Z3 z;jh1py|2Hpymz-&u&0@S0dmz`{JegEQygc-K^nUIINbmY+}%5O-D2;@VV|XKHwqdD zIQEmEPz2;wU=-hUqDI_|8-uY4NDZ;UophUCEo>UZTb#H> z|C@k;=DGK3TnONto#{nQcGD~;9JModQPg(LBMLLmAgN&j+~lae7iELVzS!j^6e384 zk>}!}4?JH$KUB;0miK8P9*+!*HfSB{-@;M)A;g{?J=p+GV1 zM2Sa0?=3M|W;rq7USJ}|mEtx)_$Ud%fB;?1LoEU%2G7%3HF=Qb=1m|hkPFNzmyYHd ztrXy$K<7@HMA$k8&{FXxLs4b0o5{1dqaxxY*KhPyQ3gJaLLDc__KkNHt!;xm({GI8 zl?Fw}alS=hCnp3N?)lB((8TpyYgtOYO->9(fJW4ngT2|Ja 
zxylk-IZx}jvz4n(t+MDWAsw6eKi<#tg3t;dX7BDCbxeA#GsK7X=1^CM)G8M8ITxv|Q#zB0dl!OH%)&#^D%!IrK zAMz6i#Dy$o8juhyM!8D}vO=OVOK=Dfjd9FdBsG93>!28Gr=SltBIn+m3hBNj_}pXJ zbf3k{lEpxneJ)tctg@K5s)ZS|ebs?==jHY{2^$ymo@rhH$gRqx801)>GMANBph7tc zX$^c-5C~e^eGZtu)Pln;4pSLmVo;eSWeZ((4Q?H+oMojl%T~?`dB~xagZyG4N6CrG z`4!}V?ewkdI^8a)4B-1B{c*ZV$^urx{F+jkE6P8iDlmC}GS-#HEI=Z7%m%?e=mtsF zbOrPU&c*eRWdL+JrIM{!gOwPlBGZ-bG}r_>SY5y&$bXdFk;h!t$179?kMzjZVIH%j z-u^&dxLODwc;VNOJn+IJ@EEwDBr0%$QWSC!umzA7D7LAZoOvLLSW?bIMnTeYUA4Mi zge~R_k;4PTXQXY0ggpxzja@~@a579Y) z9#*IO8<3Wd(m5U&%441L`?2?R4ruu9)++biw{f-%7)o(NS#p$b@kVcmU|eJ=_L535 zv=Ws6IX5}4fS%-M*~qyj_yvMWC`wh z4)ewL?@|c6z+|Y~Xm(p zzJ_Fr2xug^V1N{XrWfmH{bJo>p}0ryDAcuWsD$?-2mS_=mzccF^=cMG0I#)%W**kzRo={i|kbb z7o+wxM|-%-Yk*?v^@3fh<;o8BjK1=^gyxWeb>RE@N~@9L<*>d`G#*Ky$HA4a;*C-( z-t(#~1OXv%#y<|={c_xwLlVtbPx%2kw^G{u7GASa21}IRk|{@7mZTnYdJ|>avp0u5 zG-;x5+w5zryRwNS91QqWHp6v&H8Vb^kO6r_Tpo`x_Bv0 zJ!+M6dnw-{`6j@ttP{V%I~=1gGVi{$>lZhMJ{({?Qf}pt9;1r$G?aYQk*==(se;F8e*ki^asL1S literal 0 HcmV?d00001 diff --git a/modules/variant_diagnostics/core_pipeline_core_prep_label.py b/modules/variant_diagnostics/core_pipeline_core_prep_label.py new file mode 100644 index 0000000..470eed5 --- /dev/null +++ b/modules/variant_diagnostics/core_pipeline_core_prep_label.py @@ -0,0 +1,293 @@ +# System wide imports +from __future__ import division +import sys +import argparse +import re +import os +import csv +import subprocess +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +""" Hacky way to append. 
Instead Add this path to PYTHONPATH Variable """ +from collections import OrderedDict +from collections import defaultdict +from joblib import Parallel, delayed +import multiprocessing +import thread +import glob +import readline +import errno +from datetime import datetime +import threading +import json +import ConfigParser +from config_settings import ConfigSectionMap +from logging_subprocess import * +from log_modules import * +from tabix import * +from Bio import SeqIO +from core_prep_sanity_checks import * +from PBS_generate_jobs import * +from core_pipeline_core_prep_main import * + +def core_prep_label(vcf_filenames, filter2_only_snp_vcf_dir, outgroup, reference, log_unique_time, log_file_handle, logger, jobrun, Config): + # Create temporary Directory core_temp_dir/temp for storing temporary intermediate files. Check if core_temp_dir contains all the required files to run these pipeline. + global temp_dir + temp_dir = filter2_only_snp_vcf_dir + "/temp" + + # # Extract All the unique SNO and Indel position list from final filtered *_no_proximate_snp.vcf files. + unique_position_file = create_positions_filestep(vcf_filenames, filter2_only_snp_vcf_dir, outgroup, logger) + unique_indel_position_file = create_indel_positions_filestep(vcf_filenames, filter2_only_snp_vcf_dir, outgroup, logger) + + # bgzip and tabix all the vcf files in core_temp_dir. + files_for_tabix = glob.glob("%s/*.vcf" % filter2_only_snp_vcf_dir) + tabix(files_for_tabix, "vcf", logger, Config) + + # Get the cluster option; create and run jobs based on given parameter. The jobs will parse all the intermediate vcf file to extract information such as if any unique variant position was unmapped in a sample, if it was filtered out dur to DP,MQ, FQ, proximity to indel, proximity to other SNPs and other variant filter parameters set in config file. 
+ tmp_dir = "/tmp/temp_%s/" % log_unique_time + + create_job(filter2_only_snp_vcf_dir, jobrun, vcf_filenames, unique_position_file, tmp_dir, Config) + + create_indel_job(filter2_only_snp_vcf_dir, jobrun, vcf_filenames, unique_indel_position_file, tmp_dir, Config) + + # If Phaster Summary file doesn't exist in reference genome folder + if not os.path.isfile("%s/summary.txt" % os.path.dirname(reference)): + if ConfigSectionMap("functional_filters", Config)['apply_functional_filters'] == "yes": + keep_logging('Functional class filter is set to yes. Preparing Functional class filters\n', + 'Functional class filter is set to yes. Preparing Functional class filters\n', logger, + 'info') + if ConfigSectionMap("functional_filters", Config)['find_phage_region'] == "yes": + # Submit Phaster jobs to find ProPhage region in reference genome. + run_phaster(reference, filter2_only_snp_vcf_dir, logger, Config) + + call( + "cp %s %s/Logs/core_prep/" % (log_file_handle, os.path.dirname(os.path.dirname(filter2_only_snp_vcf_dir))), + logger) + + +"""core_prep methods + + This block contains methods that are respnsible for running the first part of core_All step of the pipeline. + This methods generates all the necessary intermediate files required for the second part of core_All step. + Example of intermediate files: various diagnostics files/matrices where it decides why a variant was filtered out. + +""" + +def create_positions_filestep(vcf_filenames, filter2_only_snp_vcf_dir, outgroup, logger): + + """ + This method gathers SNP positions from each final *_no_proximate_snp.vcf file (these are the positions that passed variant filter parameters + from variant calling pipeline) and write to *_no_proximate_snp.vcf_position files. Use these *_no_proximate_snp.vcf_position files to generate a list of unique_position_file + :param: list of final vcf filenames i.e *.vcf_no_proximate_snp.vcf . These files are the final output of variant calling step for each sample. 
+ :return: unique_position_file + """ + + filter2_only_snp_position_files_array = [] + for file in vcf_filenames: + with open(file, 'rU') as csv_file: + file_name = temp_dir + "/" + os.path.basename(file) + "_positions" + addpositionfilenametoarray = file_name + filter2_only_snp_position_files_array.append(addpositionfilenametoarray) + f1 = open(file_name, 'w+') + csv_reader = csv.reader(csv_file, delimiter='\t') + for row in csv_reader: + position = row[0] + if not position.startswith('#'): + p_string = row[1] + "\n" + f1.write(p_string) + f1.close() + csv_file.close() + + """ Get Positions Specific to Outgroup Sample name """ + if outgroup is not None: + outgroup_position_file_name = temp_dir + "/" + outgroup_vcf_filename + "_positions" + outgroup_position_array = [] + f1 = open(outgroup_position_file_name, 'r+') + for lines in f1: + lines = lines.strip() + outgroup_position_array.append(int(lines)) + f1.close() + + + position_array_excluding_outgroup = [] + for filess in filter2_only_snp_position_files_array: + if outgroup not in filess: + f = open(filess, 'r+') + for line in f: + line = line.strip() + position_array_excluding_outgroup.append(int(line)) + f.close() + position_array_unique_excluding_outgroup = set(position_array_excluding_outgroup) + position_array_sort_excluding_outgroup = sorted(position_array_unique_excluding_outgroup) + #print len(position_array_sort_excluding_outgroup) + outgroup_specific_positions = [] + f_outgroup = open("%s/outgroup_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'w+') + for i in outgroup_position_array: + if i not in position_array_sort_excluding_outgroup: + f_outgroup.write(str(i) + '\n') + outgroup_specific_positions.append(int(i)) + # outgroup_indel_specific_positions.append(int(i)) + f_outgroup.close() + print "No. of variant positions in outgroup: %s" % len(outgroup_position_array) + print "No. 
of variant positions specific to outgroup: %s" % len(outgroup_specific_positions) + + position_array = [] + for filess in filter2_only_snp_position_files_array: + f = open(filess, 'r+') + for line in f: + line = line.strip() + # Changed variable to suit sorting: 25-07-2018 + position_array.append(int(line)) + f.close() + # Check why python sorting is not working + keep_logging('Sorting unique variant positions.\n', 'Sorting unique variant positions.\n', logger, 'info') + position_array_unique = set(position_array) + position_array_sort = sorted(position_array_unique) + keep_logging('\nThe number of unique variant positions:%s' % len(position_array_sort), '\nThe number of unique variant positions:%s' % len(position_array_sort), logger, 'info') + unique_position_file = "%s/unique_positions_file" % filter2_only_snp_vcf_dir + f=open(unique_position_file, 'w+') + for i in position_array_sort: + # Changed variable to suit sorting: 25-07-2018 + f.write(str(i) + "\n") + f.close() + + if len(position_array_sort) == 0: + keep_logging('ERROR: No unique positions found. Check if vcf files are empty?', 'ERROR: No unique positions found. 
Check if vcf files are empty?', logger, 'info') + exit() + + return unique_position_file + + else: + + """ Create position array containing unique positiones from positions file """ + + position_array = [] + for filess in filter2_only_snp_position_files_array: + f = open(filess, 'r+') + for line in f: + line = line.strip() + # Changed variable to suit sorting: 25-07-2018 + position_array.append(int(line)) + f.close() + # Check why python sorting is not working + keep_logging('Sorting unique variant positions.\n', 'Sorting unique variant positions.\n', logger, 'info') + position_array_unique = set(position_array) + position_array_sort = sorted(position_array_unique) + keep_logging('\nThe number of unique variant positions:%s' % len(position_array_sort), '\nThe number of unique variant positions:%s' % len(position_array_sort), logger, 'info') + unique_position_file = "%s/unique_positions_file" % filter2_only_snp_vcf_dir + f=open(unique_position_file, 'w+') + for i in position_array_sort: + # Changed variable to suit sorting: 25-07-2018 + f.write(str(i) + "\n") + f.close() + + if len(position_array_sort) == 0: + keep_logging('ERROR: No unique positions found. Check if vcf files are empty?', 'ERROR: No unique positions found. Check if vcf files are empty?', logger, 'info') + exit() + return unique_position_file + +def create_indel_positions_filestep(vcf_filenames, filter2_only_snp_vcf_dir, outgroup, logger): + + """ + This function gathers Indel positions from each final *_indel_final.vcf (these are the positions that passed variant filter parameters + from variant calling pipeline) and write to *_indel_final.vcf files. Use these *_indel_final.vcf_position files to generate a list of unique_position_file + :param: list of final vcf filenames i.e *_indel_final.vcf . These files are the final output of variant calling step for each sample. 
+ :return: unique_indel_position_file + """ + + filter2_only_indel_position_files_array = [] + for file in vcf_filenames: + indel_file = file.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf') + with open(indel_file, 'rU') as csv_file: + file_name = temp_dir + "/" + os.path.basename(indel_file) + "_positions" + addpositionfilenametoarray = file_name + filter2_only_indel_position_files_array.append(addpositionfilenametoarray) + f1 = open(file_name, 'w+') + csv_reader = csv.reader(csv_file, delimiter='\t') + for row in csv_reader: + position = row[0] + if not position.startswith('#'): + p_string = row[1] + "\n" + f1.write(p_string) + f1.close() + csv_file.close() + + """ Get Positions Specific to Outgroup Sample name """ + if outgroup is not None: + outgroup_position_indel_file_name = temp_dir + "/" + outgroup_indel_vcf_filename + "_positions" + print outgroup_position_indel_file_name + outgroup_position_indel_array = [] + f1 = open(outgroup_position_indel_file_name, 'r+') + for lines in f1: + lines = lines.strip() + outgroup_position_indel_array.append(int(lines)) + f1.close() + #print len(outgroup_position_indel_array) + + position_array_indel_excluding_outgroup = [] + for filess in filter2_only_indel_position_files_array: + if outgroup not in filess: + f = open(filess, 'r+') + for line in f: + line = line.strip() + position_array_indel_excluding_outgroup.append(int(line)) + f.close() + position_array_indel_unique_excluding_outgroup = set(position_array_indel_excluding_outgroup) + position_array_sort_indel_excluding_outgroup = sorted(position_array_indel_unique_excluding_outgroup) + outgroup_indel_specific_positions = [] + f_outgroup = open("%s/outgroup_indel_specific_positions.txt" % filter2_only_snp_vcf_dir, 'w+') + for i in outgroup_position_indel_array: + if i not in position_array_sort_indel_excluding_outgroup: + f_outgroup.write(str(i) + '\n') + outgroup_indel_specific_positions.append(int(i)) + f_outgroup.close() + print "No. 
of indel variant positions in outgroup: %s" % len(outgroup_position_indel_array) + print "No. of indel variant positions specific to outgroup: %s" % len(outgroup_indel_specific_positions) + + position_array = [] + for filess in filter2_only_indel_position_files_array: + f = open(filess, 'r+') + for line in f: + line = line.strip() + # Changed variable to suit sorting: 25-07-2018 + position_array.append(int(line)) + f.close() + position_array_unique = set(position_array) + position_array_sort = sorted(position_array_unique) + keep_logging('\nThe number of unique indel positions:%s' % len(position_array_sort), '\nThe number of unique indel positions:%s' % len(position_array_sort), logger, 'info') + unique_indel_position_file = "%s/unique_indel_positions_file" % filter2_only_snp_vcf_dir + f=open(unique_indel_position_file, 'w+') + for i in position_array_sort: + # Changed variable to suit sorting: 25-07-2018 + f.write(str(i) + "\n") + f.close() + if len(position_array_sort) == 0: + keep_logging('ERROR: No unique positions found. Check if vcf files are empty?', 'ERROR: No unique positions found. 
Check if vcf files are empty?', logger, 'info') + exit() + + return unique_indel_position_file + + + else: + + """ Create position array containing unique positiones from positions file """ + position_array = [] + for filess in filter2_only_indel_position_files_array: + f = open(filess, 'r+') + for line in f: + line = line.strip() + # Changed variable to suit sorting: 25-07-2018 + position_array.append(int(line)) + f.close() + position_array_unique = set(position_array) + position_array_sort = sorted(position_array_unique) + keep_logging('\nThe number of unique indel positions:%s' % len(position_array_sort), '\nThe number of unique indel positions:%s' % len(position_array_sort), logger, 'info') + unique_indel_position_file = "%s/unique_indel_positions_file" % filter2_only_snp_vcf_dir + f=open(unique_indel_position_file, 'w+') + for i in position_array_sort: + # Changed variable to suit sorting: 25-07-2018 + f.write(str(i) + "\n") + f.close() + if len(position_array_sort) == 0: + keep_logging('ERROR: No unique positions found. Check if vcf files are empty?', 'ERROR: No unique positions found. Check if vcf files are empty?', logger, 'info') + exit() + return unique_indel_position_file \ No newline at end of file diff --git a/modules/variant_diagnostics/core_pipeline_core_prep_label.pyc b/modules/variant_diagnostics/core_pipeline_core_prep_label.pyc new file mode 100644 index 0000000000000000000000000000000000000000..992c5d8334a5a1472f3bf1dd16763857990aedb8 GIT binary patch literal 8934 zcmd5>O>i8=74F&9kG1;$Tm4yPFc^d#q(C4^IS>do7zmNg2x78T&eUjURvLMCW_Nnl zNL8x?A-7aPRXF8D4ml<{({Se ze_v1WUz4T3H9opiRo%Z~{Jv}}rT<_l<>O5~GRV-!kxuEI=RVhdv@-Zt%&E#;uJOF8%x4-bsLDc?UsRRFEMHcYa+Y6Gl_klKsQR*M zEUU_LKGuq=tYqz0Rb@5Huc^wK#H-hXdOFIHg1UDU_$P+;(jW~Rfhn0m zlWTEQ3-@+{Y6^)z@tUTFx?DC{`qoa+dgEqw3^}UHk+0eCwztr%W6|4|Jm_wzjy!h? 
zsxzWK98$+)Dn-7pu%yzWI)MM7%2H+c%kj8Mhg4@=`9=2ZoMUIpN70ZfA5W-sm@PY_ zGSryrOh}UvWu<7GSCmdxjCOt1A6Bpg{T9_ldL{+x6)S}_BMQ4>RIOl{?-o>oiV5Y9 z@dHh;c+})n+wr_@*-9rlC{@U{Zk4UpiWR{eQ(WoJ3Hd!mvMo|8s4`7e5HB*$u&rtF ztu$yfHND}I-nzVbuUdoAl)|s(5^SZ7rZjO;t*1;g-EK6zD4)2Ng^ zL36l%n4~Lgo&@k?w;Ah@UaL91K&L@7eWX_lK|y^`8|t%Hk9Xl!7~I81N{Jxoheve{~35_yflVKH1tsk=eYbnEfn9;^cgQYvXjoH_JN zWRxoQsce6Ek*ustgM*lLG8TW9(V=)ec;4^}LL#y8B+_Lt|=5-JlAym>;y0tceHd0!q7sWM&ri($!wYX$zZbDj=)j z*5br9)@8hazf$)NWR)cP1LU@%n%)X@6sC8>Xlpl&T$$8XGi(O+FbcMAgz0{Jck3Q} z%!^W&CWATMtlss5ds~gzZ)5rMMtSUG&grFx9;r*_08sy7^kn;hGP10 zB&t{{E?SGld27B{Dy$S%iWjXFYXtwJ)=J@gp=e#Qs`RD{_;MX2DE<{CdrBP-sl$IT zU|qlScXf;?fGCKw6+6G6KK`RR9$~yh2wZ|=As8MY*bOs0Eg=Rj9V66@s^fDEfCmVE z2#7eyImD+Sa-ru*EQL@i?GxVk&0l3y3IX-srg zpOV>6%Y-Ea8f_1srkv|{E~(=g)tONT5Cx2R{=Bzrb(HPh7 z{9GN+DV{KM>h7pYDq@v6hFZ#*SClhv?~!vNU_ZM3#;`7)IkShcYjJ@7h+h{mT$c7!=iXx>ORsatLpH!G0IIEh3@f{ z46&LC?-}A(g_DMe3%64stQmfAzWFc*PeB-FRgAGFhFO#RdBvuX4&}~^a;F}8-F+AV zGdW%tMb|Sw#)@5H&+B*A#duhb0S0S5S3IF$E(_kxL;Is5bpSqu4n1|t;=VYEJL_ua zYqr;)PpZ@UwD{23%OCm@d;MkTf%d}o-q~10Ow<1c$Sc|SduJbqKhBf=*thp^0s>Fi zJKMMH?0f)5NaBVac-4KNg~+Shm)$7FK^`B54W8FH!w6&uB?5Z`1L(l^lE6mp#31Ru zm)cE0j=;COai2YKYN9m3=|~_W3W>O^N%v^5v!mfD8z-IpeiEh-5Z_0Y93F^Hz%%bc zC8OA2-TOLlvY5=_tb?0TqEJFeM-4o=bds?6YO-q4sSvxyx2cDNbX0{8|SZxic4 z!^t6=o6igcP|$3P2K`ncII?jfo7K@?qgf9&jY>(7wv*`U{goEvle~*KPuxkUlv_b- zqN4Y&2nqq*3L7wYqS^RKDssH=V2YcAQD>O|BNVBY)GrwjWgQ+99Uj-8oto z9dI~3dTC<`V+fdxn?d9dd^u$%Qcj$>>*;`gCD;zgKZtxG5>Lsau zABankv1}d5RqHV@KY`A690dp+J@_e@l&&Z;2&i?MgiR3m`YIBx6xQVG)?M zfvH4mEOkY13iW(H>z~gjHbcL%8+g$lHwV12cQ;;Db1j_4IM%-zjen3tAja@3^P)Xqsk~|zl!Xan;F~0hD zNYvPfHEGRR7r+%Jt!Zl!X$ikm)`B%-O%*3lH-hI$YrM@sp8s_pjnG1Ny z0Z^~76=G~*E~Fi}PBC0&l-E1}!%Nu4ATneuX<4 z?o5f0)+Y*|+bkRJa+yhT83;4a8JC$9F7rt-o5o~Ubs*kpb>nq{#~HcbnpS}2Mr9HW zGA$6ASs@pw5T-(M0~=*Dg0pQs17x3T!Qtjk5*i?55Sn@63@tGY?mdm11tByGM$RJH z$bpfA{9GnSD2mAW4dj6HbZ6JHS}hP7fcQDu?C0Ggaq$yv;Vg&v3^H;%CZ`u7p!M44>1w0 z#7o4^1w&oFM&o>TQk~XsL0$fz#&N(??rWUi4?WO0Amck5ggp9loG?9}l3j1+Oyvz; 
z2cDoAXSs^G&JrXoH06KJQ2G^+^ZYa?>1&c7w_Q5$P0R!)K`5V2)y>XI9lYE0IU8w z$O^Zq+?C^MzwT87=i4agBP|X&obxP`?=X3e$@5IU%Y=4uzK`TIxHc-&5$Gwz^v+eb z`~i~}n7qh@lW<;ULd5U<2+4+Uia`{Eo|ZY9T*05Vi!qA<EaXDinw3r3~0Sl_vYw$9s3 z9CoFb4(xh=w!rPv*G1UE{Y0Lgt7(gO}?F95#}{P#nD(1`)ie{d^n<0)t>-s<8lATECb*QEz^$@q~L z77L1w%PMepM#dyC*9zQ{(o?s0aj#be-IS{avtjvaPWHFAtK}8hITSz!c7yB6E3j&e zSJU!NQag;Y4hI+RiZ@X8aMvNJc6ieuFJ>BeyB&7rkWlkgRk~09?eP7k=*cJHxn#^i zl0>n?oZLv6w(Q9v4|E*Kn>q7}>lSC1h@7gt19y2*fyUar!=r@m>pQnMU6}^ouyx-$ zIV7oad+|ybJDhMzTHkx6q&>XdI&!PL#?kT$^wujoZgy$tV%|DEpZ7TXrmEqEk;7{z zVVDlRt+xdpqJ1EX@q)3+54o(Gad$?1kVKXL&lP4sf6FJzvicc#Isd!Ag|RDRt^WdI CeVcLs literal 0 HcmV?d00001 diff --git a/modules/variant_diagnostics/core_pipeline_core_prep_main.py b/modules/variant_diagnostics/core_pipeline_core_prep_main.py new file mode 100644 index 0000000..1a54436 --- /dev/null +++ b/modules/variant_diagnostics/core_pipeline_core_prep_main.py @@ -0,0 +1,2439 @@ +# System wide imports +from __future__ import division +import sys +import argparse +import re +import os +import csv +import subprocess +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +""" Hacky way to append. Instead Add this path to PYTHONPATH Variable """ +from collections import OrderedDict +from collections import defaultdict +from joblib import Parallel, delayed +import multiprocessing +import thread +import glob +import readline +import errno +from datetime import datetime +import threading +import json +import ConfigParser +from config_settings import ConfigSectionMap +from modules.logging_subprocess import * +from modules.log_modules import * +from modules.tabix import * +from Bio import SeqIO +from modules.core_prep_sanity_checks import * +from PBS_generate_jobs import * + + +"""core methods + + This block contains methods that are respnsible for running the second part of core_All step of the pipeline. + It uses intermediate files generated during the first step, finds core SNPs and annotates variants using snpEff. 
+ It will generate all types of SNP matrices that is required for downstream pathways / Association analysis. + Output: + - + +""" + + +def generate_paste_command(): + """ + This Function will take all the *label file and generate/paste it column wise to generate a matrix. These matrix will be used in downstream analysis. + :param: null + :return: null + """ + + """ Paste/Generate and sort SNP Filter Label Matrix """ + paste_file = args.filter2_only_snp_vcf_dir + "/paste_label_files.sh" + f4 = open(paste_file, 'w+') + paste_command = "paste %s/unique_positions_file" % args.filter2_only_snp_vcf_dir + for i in vcf_filenames: + label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', + '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') + paste_command = paste_command + " " + label_file + header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % ( + args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) + sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir + sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir + + call("%s" % header_awk_cmd, logger) + call("%s" % sed_header, logger) + call("%s" % sed_header_2, logger) + + temp_paste_command = paste_command + " > %s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir + paste_command = paste_command + " > %s/All_label_final_raw" % args.filter2_only_snp_vcf_dir + f4.write(paste_command) + f4.close() + sort_All_label_cmd = "sort -n -k1,1 %s/All_label_final_raw > %s/All_label_final_sorted.txt" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) + paste_command_header = "cat %s/header.txt %s/All_label_final_sorted.txt > %s/All_label_final_sorted_header.txt" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) + + ls = [] + for i in vcf_filenames: + label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', + 
'_filter2_final.vcf_no_proximate_snp.vcf_positions_label') + ls.append(label_file) + ls.insert(0, "%s/unique_positions_file" % args.filter2_only_snp_vcf_dir) + + with open('%s/All_label_final_raw.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: + outfile.write(paste_command) + outfile.close() + + with open('%s/temp_label_final_raw.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: + outfile.write(temp_paste_command) + outfile.close() + + call("bash %s/All_label_final_raw.sh" % args.filter2_only_snp_vcf_dir, logger) + call("bash %s/temp_label_final_raw.txt.sh" % args.filter2_only_snp_vcf_dir, logger) + call("%s" % sort_All_label_cmd, logger) + call("%s" % paste_command_header, logger) + + """ Assign numeric code to each variant filter reason""" + subprocess.call([ + "sed -i 's/reference_unmapped_position/0/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/reference_allele/1/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call(["sed -i 's/VARIANT/1TRUE/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_QUAL_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_DP_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/LowFQ_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/LowFQ_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/LowFQ_QUAL_DP/2/g' 
%s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/LowFQ_DP_QUAL/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call(["sed -i 's/LowFQ_QUAL/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call(["sed -i 's/LowFQ_DP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/HighFQ_proximate_SNP/7/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/HighFQ_QUAL_DP/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/HighFQ_DP_QUAL/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call(["sed -i 's/HighFQ_QUAL/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call(["sed -i 's/LowFQ/5/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call(["sed -i 's/HighFQ/6/g' %s/All_label_final_sorted_header.txt" % 
args.filter2_only_snp_vcf_dir], + shell=True) + remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir + call("%s" % remove_unwanted_text, logger) + +def generate_paste_command_outgroup(): + """ + This Function will take all the *label file and generate/paste it column wise to generate a matrix. These matrix will be used in downstream analysis. + :param: null + :return: null + """ + + if args.outgroup: + """ Paste/Generate and sort SNP Filter Label Matrix """ + paste_file = args.filter2_only_snp_vcf_dir + "/paste_label_files_outgroup.sh" + f4 = open(paste_file, 'w+') + paste_command = "paste %s/unique_positions_file" % args.filter2_only_snp_vcf_dir + for i in vcf_filenames: + if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: + label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', + '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') + paste_command = paste_command + " " + label_file + + """Exclude outgroup sample name in header + + header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) + sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir + sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir + + """ + + header_awk_cmd = "grep -v \'%s\' %s | awk \'{ORS=\"\t\";}{print $1}\' > %s/header_outgroup.txt" % ( + outgroup, args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) + sed_header = "sed -i \'s/^/\t/\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir + sed_header_2 = "sed -i -e \'$a\\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir + + call("%s" % header_awk_cmd, logger) + call("%s" % sed_header, logger) + call("%s" % sed_header_2, logger) + + temp_paste_command = paste_command + " > %s/temp_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir + paste_command 
= paste_command + " > %s/All_label_final_raw_outgroup" % args.filter2_only_snp_vcf_dir + f4.write(paste_command) + f4.close() + sort_All_label_cmd = "sort -n -k1,1 %s/All_label_final_raw_outgroup > %s/All_label_final_sorted_outgroup.txt" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) + paste_command_header = "cat %s/header_outgroup.txt %s/All_label_final_sorted_outgroup.txt > %s/All_label_final_sorted_header_outgroup.txt" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) + + ls = [] + for i in vcf_filenames: + label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', + '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') + ls.append(label_file) + ls.insert(0, "%s/unique_positions_file" % args.filter2_only_snp_vcf_dir) + + with open('%s/All_label_final_raw_outgroup.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: + outfile.write(paste_command) + outfile.close() + + with open('%s/temp_label_final_raw_outgroup.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: + outfile.write(temp_paste_command) + outfile.close() + call("bash %s/All_label_final_raw_outgroup.sh" % args.filter2_only_snp_vcf_dir, logger) + call("bash %s/temp_label_final_raw_outgroup.txt.sh" % args.filter2_only_snp_vcf_dir, logger) + + """ + remove this lines + #subprocess.call(["%s" % paste_command], shell=True) + #subprocess.call(["%s" % temp_paste_command], shell=True) + #subprocess.check_call('%s' % paste_command) + #subprocess.check_call('%s' % temp_paste_command) + #os.system(paste_command) change + #os.system(temp_paste_command) change + """ + + call("%s" % sort_All_label_cmd, logger) + call("%s" % paste_command_header, logger) + + """ Assign numeric code to each variant filter reason""" + subprocess.call([ + "sed -i 's/reference_unmapped_position/0/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/reference_allele/1/g' 
%s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/VARIANT/1TRUE/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_QUAL_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_DP_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_QUAL_DP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_DP_QUAL/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/LowFQ_QUAL/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/LowFQ_DP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' 
%s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_proximate_SNP/7/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_QUAL_DP/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_DP_QUAL/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_QUAL/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/HighFQ_DP/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/LowFQ/5/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/HighFQ/6/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir + call("%s" % remove_unwanted_text, logger) + + else: + print "Skip generating seperate intermediate files for outgroup" + + +def generate_indel_paste_command(): + """ + This Function will take all the *label file and generate/paste it column wise to generate a matrix. These matrix will be used in downstream analysis. 
+ :param: null + :return: null + """ + + """ Paste/Generate and sort SNP Filter Label Matrix """ + paste_file = args.filter2_only_snp_vcf_dir + "/paste_indel_label_files.sh" + f4 = open(paste_file, 'w+') + paste_command = "paste %s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir + for i in vcf_filenames: + label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', + '_filter2_indel_final.vcf_indel_positions_label') + paste_command = paste_command + " " + label_file + header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % ( + args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) + sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir + sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir + + # os.system(header_awk_cmd) + # os.system(sed_header) + # os.system(sed_header_2) + + call("%s" % header_awk_cmd, logger) + call("%s" % sed_header, logger) + call("%s" % sed_header_2, logger) + + temp_paste_command = paste_command + " > %s/temp_indel_label_final_raw.txt" % args.filter2_only_snp_vcf_dir + paste_command = paste_command + " > %s/All_indel_label_final_raw" % args.filter2_only_snp_vcf_dir + f4.write(paste_command) + f4.close() + + call("bash %s" % paste_file, logger) + + sort_All_label_cmd = "sort -n -k1,1 %s/All_indel_label_final_raw > %s/All_indel_label_final_sorted.txt" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) + paste_command_header = "cat %s/header.txt %s/All_indel_label_final_sorted.txt > %s/All_indel_label_final_sorted_header.txt" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) + + ls = [] + for i in vcf_filenames: + label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', + '_filter2_indel_final.vcf_indel_positions_label') + ls.append(label_file) + ls.insert(0, "%s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir) + + with 
open('%s/All_indel_label_final_raw.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: + outfile2.write(paste_command) + outfile2.close() + + with open('%s/temp_indel_label_final_raw.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: + outfile2.write(temp_paste_command) + outfile2.close() + + # Why is this not working? + call("bash %s/All_indel_label_final_raw.sh" % args.filter2_only_snp_vcf_dir, logger) + call("bash %s/temp_indel_label_final_raw.txt.sh" % args.filter2_only_snp_vcf_dir, logger) + keep_logging('Finished pasting...DONE', 'Finished pasting...DONE', logger, 'info') + + """ + remove this lines + #subprocess.call(["%s" % paste_command], shell=True) + #subprocess.call(["%s" % temp_paste_command], shell=True) + #subprocess.check_call('%s' % paste_command) + #subprocess.check_call('%s' % temp_paste_command) + #os.system(paste_command) change + #os.system(temp_paste_command) change + """ + + call("%s" % sort_All_label_cmd, logger) + call("%s" % paste_command_header, logger) + + """ Assign numeric code to each variant filter reason""" + subprocess.call([ + "sed -i 's/reference_unmapped_position/0/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/reference_allele/1/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/VARIANT/1TRUE/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_QUAL_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_DP_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + 
subprocess.call([ + "sed -i 's/LowAF_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/LowAF_QUAL_DP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/LowAF_DP_QUAL/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/LowAF_QUAL/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/LowAF_DP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_proximate_SNP/7/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/HighAF_QUAL_DP/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/HighAF_DP_QUAL/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/HighAF_QUAL/3/g' 
%s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/HighAF_DP/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call(["sed -i 's/LowAF/5/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call( + ["sed -i 's/HighAF/6/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir + call("%s" % remove_unwanted_text, logger) + + +def generate_indel_paste_command_outgroup(): + """ + This Function will take all the *label file and generate/paste it column wise to generate a matrix. These matrix will be used in downstream analysis. + :param: null + :return: null + """ + + if args.outgroup: + """ Paste/Generate and sort SNP Filter Label Matrix """ + # define a file name where the paste commands will be saved. 
+ paste_file = args.filter2_only_snp_vcf_dir + "/paste_indel_label_files_outgroup.sh" + f4 = open(paste_file, 'w+') + + # initiate paste command string + paste_command = "paste %s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir + + # Generate paste command + for i in vcf_filenames: + if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: + label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', + '_filter2_indel_final.vcf_indel_positions_label') + paste_command = paste_command + " " + label_file + # Change header awk command to exclude outgroup + # header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) + header_awk_cmd = "grep -v \'%s\' %s | awk \'{ORS=\"\t\";}{print $1}\' > %s/header_outgroup.txt" % ( + outgroup, args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) + sed_header = "sed -i \'s/^/\t/\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir + sed_header_2 = "sed -i -e \'$a\\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir + + call("%s" % header_awk_cmd, logger) + call("%s" % sed_header, logger) + call("%s" % sed_header_2, logger) + + temp_paste_command = paste_command + " > %s/temp_indel_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir + paste_command = paste_command + " > %s/All_indel_label_final_raw_outgroup" % args.filter2_only_snp_vcf_dir + f4.write(paste_command) + f4.close() + + call("bash %s" % paste_file, logger) + + sort_All_label_cmd = "sort -n -k1,1 %s/All_indel_label_final_raw_outgroup > %s/All_indel_label_final_sorted_outgroup.txt" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) + paste_command_header = "cat %s/header_outgroup.txt %s/All_indel_label_final_sorted_outgroup.txt > %s/All_indel_label_final_sorted_header_outgroup.txt" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) + + ls = [] + for i in 
vcf_filenames: + label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', + '_filter2_indel_final.vcf_indel_positions_label') + ls.append(label_file) + ls.insert(0, "%s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir) + + with open('%s/All_indel_label_final_raw_outgroup.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: + outfile2.write(paste_command) + outfile2.close() + + with open('%s/temp_indel_label_final_raw_outgroup.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: + outfile2.write(temp_paste_command) + outfile2.close() + + # Why is this not working? + call("bash %s/All_indel_label_final_raw_outgroup.sh" % args.filter2_only_snp_vcf_dir, logger) + call("bash %s/temp_indel_label_final_raw_outgroup.txt.sh" % args.filter2_only_snp_vcf_dir, logger) + keep_logging('Finished pasting...DONE', 'Finished pasting...DONE', logger, 'info') + + """ + remove this lines + #subprocess.call(["%s" % paste_command], shell=True) + #subprocess.call(["%s" % temp_paste_command], shell=True) + #subprocess.check_call('%s' % paste_command) + #subprocess.check_call('%s' % temp_paste_command) + #os.system(paste_command) change + #os.system(temp_paste_command) change + """ + + call("%s" % sort_All_label_cmd, logger) + call("%s" % paste_command_header, logger) + + """ Assign numeric code to each variant filter reason""" + subprocess.call([ + "sed -i 's/reference_unmapped_position/0/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/reference_allele/1/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/VARIANT/1TRUE/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_QUAL_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + 
subprocess.call([ + "sed -i 's/LowAF_DP_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_QUAL_DP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_DP_QUAL/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_QUAL/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_DP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_proximate_SNP/7/g' 
%s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_QUAL_DP/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_DP_QUAL/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_QUAL/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_DP/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF/5/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF/6/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir + call("%s" % remove_unwanted_text, logger) + else: + print "Skip generating seperate intermediate files for outgroup" + + +def generate_position_label_data_matrix(): + """ + Generate different list of Positions using the matrix All_label_final_sorted_header.txt. + + (Defining Core Variant Position: Variant Position which was not filtered out in any of the other samples due to variant filter parameter and also this position was present in all the samples(not unmapped)). + + Filtered Position label matrix: + List of non-core positions. These positions didn't make it to the final core list because it was filtered out in one of the samples. + + Only_ref_variant_positions_for_closely_matrix.txt : + Those Positions where the variant was either reference allele or a variant that passed all the variant filter parameters. 
+ + :param: null + :return: null + + """ + + def generate_position_label_data_matrix_All_label(): + position_label = OrderedDict() + f1 = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'w+') + f2 = open("%s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f3 = open("%s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f4 = open( + "%s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir, + 'w+') + if args.outgroup: + with open("%s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir, + 'rU') as csv_file: + keep_logging( + 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, + 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + position_label[row[0]] = row[1:] + keep_logging('Generating different list of Positions and heatmap data matrix... \n', + 'Generating different list of Positions and heatmap data matrix... 
\n', logger, 'info') + print_string_header = "\t" + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + f2.write('\t' + print_string_header.strip() + '\n') + f3.write('\t' + print_string_header.strip() + '\n') + f4.write('\t' + print_string_header.strip() + '\n') + for value in position_label: + lll = ['0', '2', '3', '4', '5', '6', '7'] + ref_var = ['1', '1TRUE'] + if set(ref_var) & set(position_label[value]): + if set(lll) & set(position_label[value]): + if int(value) not in outgroup_specific_positions: + print_string = "" + for i in position_label[value]: + print_string = print_string + "\t" + i + STRR2 = value + print_string + "\n" + f3.write(STRR2) + if position_label[value].count('1TRUE') >= 2: + f4.write('1\n') + else: + f4.write('0\n') + else: + if int(value) not in outgroup_specific_positions: + strr = value + "\n" + f1.write(strr) + STRR3 = value + "\t" + str(position_label[value]) + "\n" + f2.write(STRR3) + csv_file.close() + f1.close() + f2.close() + f3.close() + f4.close() + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/1TRUE/-1/g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + + else: + with open("%s/All_label_final_sorted_header.txt" % 
args.filter2_only_snp_vcf_dir, 'rU') as csv_file: + keep_logging( + 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, + 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + position_label[row[0]] = row[1:] + keep_logging('Generating different list of Positions and heatmap data matrix... \n', + 'Generating different list of Positions and heatmap data matrix... \n', logger, 'info') + print_string_header = "\t" + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + f2.write('\t' + print_string_header.strip() + '\n') + f3.write('\t' + print_string_header.strip() + '\n') + f4.write('\t' + print_string_header.strip() + '\n') + for value in position_label: + lll = ['0', '2', '3', '4', '5', '6', '7'] + ref_var = ['1', '1TRUE'] + if set(ref_var) & set(position_label[value]): + if set(lll) & set(position_label[value]): + + print_string = "" + for i in position_label[value]: + print_string = print_string + "\t" + i + STRR2 = value + print_string + "\n" + f3.write(STRR2) + if position_label[value].count('1TRUE') >= 2: + f4.write('1\n') + else: + f4.write('0\n') + else: + + strr = value + "\n" + f1.write(strr) + STRR3 = value + "\t" + str(position_label[value]) + "\n" + f2.write(STRR3) + csv_file.close() + f1.close() + f2.close() + f3.close() + f4.close() + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' 
%s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/1TRUE/-1/g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + + def temp_generate_position_label_data_matrix_All_label(): + + """ + Read temp_label_final_raw.txt SNP position label data matrix for generating barplot statistics. + """ + temp_position_label = OrderedDict() + f33 = open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') + print_string_header = "\t" + + if args.outgroup: + for i in vcf_filenames: + if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: + print_string_header = print_string_header + os.path.basename(i) + "\t" + else: + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + + f33.write('\t' + print_string_header.strip() + '\n') + keep_logging( + 'Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, + 'Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, + logger, 'info') + lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', + 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', + 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', + 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', + 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] + ref_var = ['reference_allele', 'VARIANT'] + + if args.outgroup: + print "here" + with 
open("%s/temp_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + if set(ref_var) & set(row[1:]): + if set(lll) & set(row[1:]): + if int(row[0]) not in outgroup_specific_positions: + + print_string = "" + for i in row[1:]: + print_string = print_string + "\t" + i + STRR2 = row[0] + print_string + "\n" + f33.write(STRR2) + csv_file.close() + f33.close() + + else: + with open("%s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + if set(ref_var) & set(row[1:]): + if set(lll) & set(row[1:]): + + print_string = "" + for i in row[1:]: + print_string = print_string + "\t" + i + STRR2 = row[0] + print_string + "\n" + f33.write(STRR2) + csv_file.close() + f33.close() + """ + Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of FQ + """ + temp_position_label_FQ = OrderedDict() + f44 = open("%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir, 'w+') + with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, + 'rU') as csv_file: + keep_logging( + 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, + 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + + for row in csv_reader: + temp_position_label_FQ[row[0]] = row[1:] + print_string_header = "\t" + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + f44.write('\t' + 
print_string_header.strip() + '\n') + for value in temp_position_label_FQ: + lll = ['LowFQ'] + if set(lll) & set(temp_position_label_FQ[value]): + + print_string = "" + for i in temp_position_label_FQ[value]: + print_string = print_string + "\t" + i + STRR2 = value + print_string + "\n" + f44.write(STRR2) + f44.close() + csv_file.close() + f44.close() + + """ + Perform Sed on temp files. Find a faster way to do this. + """ + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_QUAL_DP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_DP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + + """ + Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp + """ + temp_position_label_DP = OrderedDict() + f44 = open("%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') + with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, + 'rU') as csv_file: + keep_logging( + 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, + 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + temp_position_label_DP[row[0]] = row[1:] + print_string_header = "\t" + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + f44.write('\t' + print_string_header.strip() + '\n') + for value in temp_position_label_DP: + lll = ['HighFQ_DP'] + ref_var = ['reference_allele', 'VARIANT'] + if set(lll) & set(temp_position_label_FQ[value]): + + print_string = "" + for i in temp_position_label_FQ[value]: + print_string = print_string + "\t" + i + STRR2 = value + print_string + "\n" + f44.write(STRR2) + f44.close() + csv_file.close() + + """ + Perform Sed on temp files. Find a faster way to do this. 
+ """ + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + 
shell=True) + subprocess.call([ + "sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ_DP/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + + def barplot_stats(): + keep_logging( + 
'\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n', + '\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n', + logger, 'info') + """ + Read each Sample columns and calculate the percentage of each label to generate barplot statistics. + This will give a visual explanation of how many positions in each samples were filtered out because of different reason + """ + + c_reader = csv.reader( + open('%s/temp_Only_filtered_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, 'r'), + delimiter='\t') + columns = list(zip(*c_reader)) + keep_logging('Finished reading columns...', 'Finished reading columns...', logger, 'info') + counts = 1 + + if args.outgroup: + end = len(vcf_filenames) + 1 + end = end - 1 + else: + end = len(vcf_filenames) + 1 + + f_bar_count = open("%s/bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f_bar_perc = open("%s/bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f_bar_count.write( + "Sample\tunmapped_positions\treference_allele\ttrue_variant\tOnly_low_FQ\tOnly_DP\tOnly_low_MQ\tother\n") + f_bar_perc.write( + "Sample\tunmapped_positions_perc\ttrue_variant_perc\tOnly_low_FQ_perc\tOnly_DP_perc\tOnly_low_MQ_perc\tother_perc\n") + + for i in xrange(1, end, 1): + """ Bar Count Statistics: Variant Position Count Statistics """ + true_variant = columns[i].count('VARIANT') + unmapped_positions = columns[i].count('reference_unmapped_position') + reference_allele = columns[i].count('reference_allele') + Only_low_FQ = columns[i].count('LowFQ') + Only_DP = columns[i].count('HighFQ_DP') + Only_low_MQ = columns[i].count('HighFQ') + low_FQ_other_parameters = columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count( + 'LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count( + 'LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count( + 
'LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + columns[ + i].count('LowFQ_DP') + high_FQ_other_parameters = columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count( + 'HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count( + 'HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count( + 'HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL') + other = low_FQ_other_parameters + high_FQ_other_parameters + + total = true_variant + unmapped_positions + reference_allele + Only_low_FQ + Only_DP + low_FQ_other_parameters + high_FQ_other_parameters + Only_low_MQ + + filename_count = i - 1 + + if args.outgroup: + bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( + vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + unmapped_positions, reference_allele, true_variant, + Only_low_FQ, Only_DP, Only_low_MQ, other) + f_bar_count.write(bar_string) + else: + bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( + vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + unmapped_positions, reference_allele, true_variant, + Only_low_FQ, Only_DP, Only_low_MQ, other) + # f_bar_count.write(bar_string) + """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ + try: + true_variant_perc = float((columns[i].count('VARIANT') * 100) / total) + except ZeroDivisionError: + true_variant_perc = 0 + try: + unmapped_positions_perc = float((columns[i].count('reference_unmapped_position') * 100) / total) + except ZeroDivisionError: + unmapped_positions_perc = 0 + try: + reference_allele_perc = float((columns[i].count('reference_allele') * 100) / total) + except ZeroDivisionError: + reference_allele_perc = 0 + try: + Only_low_FQ_perc = float((columns[i].count('LowFQ') * 100) / total) + except ZeroDivisionError: 
+ Only_low_FQ_perc = 0 + try: + Only_DP_perc = float((columns[i].count('HighFQ_DP') * 100) / total) + except ZeroDivisionError: + Only_DP_perc = 0 + try: + Only_low_MQ_perc = float((columns[i].count('HighFQ') * 100) / total) + except ZeroDivisionError: + Only_low_MQ_perc = 0 + try: + low_FQ_other_parameters_perc = float(((columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[ + i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[ + i].count('LowFQ_DP_proximate_SNP') + columns[i].count( + 'LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + + columns[i].count('LowFQ_QUAL') + columns[i].count( + 'LowFQ_DP')) * 100) / total) + except ZeroDivisionError: + low_FQ_other_parameters_perc = 0 + try: + high_FQ_other_parameters_perc = float(((columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[ + i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[ + i].count('HighFQ_DP_proximate_SNP') + columns[i].count( + 'HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + + columns[i].count('HighFQ_QUAL')) * 100) / total) + except ZeroDivisionError: + high_FQ_other_parameters_perc = 0 + + other_perc = float(low_FQ_other_parameters_perc + high_FQ_other_parameters_perc) + if args.outgroup: + bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( + vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + unmapped_positions_perc, true_variant_perc, + Only_low_FQ_perc, Only_DP_perc, Only_low_MQ_perc, + other_perc) + else: + bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( + vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + unmapped_positions_perc, reference_allele_perc, + true_variant_perc, + Only_low_FQ_perc, Only_DP_perc, + Only_low_MQ_perc, other_perc) + 
f_bar_count.write(bar_string) + f_bar_perc.write(bar_perc_string) + f_bar_count.close() + f_bar_perc.close() + bargraph_R_script = "library(ggplot2)\nlibrary(reshape)\nx1 <- read.table(\"bargraph_percentage.txt\", header=TRUE)\nx1$Sample <- reorder(x1$Sample, rowSums(x1[-1]))\nmdf1=melt(x1,id.vars=\"Sample\")\npdf(\"%s/%s_barplot.pdf\", width = 30, height = 30)\nggplot(mdf1, aes(Sample, value, fill=variable)) + geom_bar(stat=\"identity\") + ylab(\"Percentage of Filtered Positions\") + xlab(\"Samples\") + theme(text = element_text(size=9)) + scale_fill_manual(name=\"Reason for filtered out positions\", values=c(\"#08306b\", \"black\", \"orange\", \"darkgrey\", \"#fdd0a2\", \"#7f2704\")) + ggtitle(\"Title Here\") + ylim(0, 100) + theme(text = element_text(size=10), panel.background = element_rect(fill = 'white', colour = 'white'), plot.title = element_text(size=20, face=\"bold\", margin = margin(10, 0, 10, 0)), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_text(colour = \"black\", face= \"bold.italic\", angle = 90)) + theme(legend.position = c(0.6, 0.7), legend.direction = \"horizontal\")\ndev.off()" % ( + args.filter2_only_snp_vcf_dir, os.path.basename(os.path.normpath(args.results_dir))) + barplot_R_file = open("%s/bargraph.R" % args.filter2_only_snp_vcf_dir, 'w+') + barplot_R_file.write(bargraph_R_script) + keep_logging('Run this R script to generate bargraph plot: %s/bargraph.R' % args.filter2_only_snp_vcf_dir, + 'Run this R script to generate bargraph plot: %s/bargraph.R' % args.filter2_only_snp_vcf_dir, + logger, 'info') + + """ Methods Steps""" + keep_logging('Running: Generating data matrices...', 'Running: Generating data matrices...', logger, 'info') + generate_position_label_data_matrix_All_label() + keep_logging('Running: Changing variables in data matrices to codes for faster processing...', + 'Running: Changing variables in data matrices to codes for faster processing...', logger, 'info') + 
temp_generate_position_label_data_matrix_All_label() + keep_logging('Running: Generating Barplot statistics data matrices...', + 'Running: Generating Barplot statistics data matrices...', logger, 'info') + barplot_stats() + + +def generate_indel_position_label_data_matrix(): + """ + Generate different list of Positions using the matrix All_label_final_sorted_header.txt. + + (Defining Core Variant Position: Variant Position which was not filtered out in any of the other samples due to variant filter parameter and also this position was present in all the samples(not unmapped)). + + Filtered Position label matrix: + List of non-core positions. These positions didn't make it to the final core list because it was filtered out in one of the samples. + + Only_ref_variant_positions_for_closely_matrix.txt : + Those Positions where the variant was either reference allele or a variant that passed all the variant filter parameters. + + :param: null + :return: null + + """ + + def generate_indel_position_label_data_matrix_All_label(): + position_label = OrderedDict() + print "Generating Only_ref_indel_positions_for_closely" + f1 = open("%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'w+') + f2 = open("%s/Only_ref_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f3 = open("%s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f4 = open( + "%s/Only_filtered_indel_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir, + 'w+') + + if args.outgroup: + with open("%s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir, + 'rU') as csv_file: + keep_logging( + 'Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, + 'Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = 
csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + position_label[row[0]] = row[1:] + keep_logging('Generating different list of Positions and heatmap data matrix...', + 'Generating different list of Positions and heatmap data matrix...', logger, 'info') + print_string_header = "\t" + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + # f.write('\t' + print_string_header.strip() + '\n') + f2.write('\t' + print_string_header.strip() + '\n') + f3.write('\t' + print_string_header.strip() + '\n') + f4.write('\t' + print_string_header.strip() + '\n') + for value in position_label: + lll = ['0', '2', '3', '4', '5', '6', '7'] + ref_var = ['1', '1TRUE'] + if set(ref_var) & set(position_label[value]): + if set(lll) & set(position_label[value]): + if int(value) not in outgroup_indel_specific_positions: + print_string = "" + for i in position_label[value]: + print_string = print_string + "\t" + i + STRR2 = value + print_string + "\n" + f3.write(STRR2) + if position_label[value].count('1TRUE') >= 2: + f4.write('1\n') + else: + f4.write('0\n') + else: + if int(value) not in outgroup_indel_specific_positions: + strr = value + "\n" + f1.write(strr) + STRR3 = value + "\t" + str(position_label[value]) + "\n" + f2.write(STRR3) + csv_file.close() + f1.close() + f2.close() + f3.close() + f4.close() + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' 
%s/Only_filtered_indel_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/1TRUE/-1/g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + else: + with open("%s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: + keep_logging( + 'Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, + 'Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + position_label[row[0]] = row[1:] + keep_logging('Generating different list of Positions and heatmap data matrix...', + 'Generating different list of Positions and heatmap data matrix...', logger, 'info') + print_string_header = "\t" + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + # f.write('\t' + print_string_header.strip() + '\n') + f2.write('\t' + print_string_header.strip() + '\n') + f3.write('\t' + print_string_header.strip() + '\n') + f4.write('\t' + print_string_header.strip() + '\n') + for value in position_label: + + lll = ['0', '2', '3', '4', '5', '6', '7'] + ref_var = ['1', '1TRUE'] + if set(ref_var) & set(position_label[value]): + if set(lll) & set(position_label[value]): + print_string = "" + for i in position_label[value]: + print_string = print_string + "\t" + i + STRR2 = value + print_string + "\n" + f3.write(STRR2) + if position_label[value].count('1TRUE') >= 2: + f4.write('1\n') + else: + f4.write('0\n') + else: + strr = value + "\n" + f1.write(strr) + STRR3 = value + "\t" + str(position_label[value]) + "\n" + f2.write(STRR3) + csv_file.close() + f1.close() + f2.close() + f3.close() + f4.close() + subprocess.call([ + "sed -i 
's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_indel_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/1TRUE/-1/g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + + def temp_generate_indel_position_label_data_matrix_All_label(): + + """ + Read **temp_label_final_raw.txt** SNP position label data matrix for generating barplot statistics. 
+ """ + temp_position_label = OrderedDict() + f33 = open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') + print_string_header = "\t" + if args.outgroup: + for i in vcf_filenames: + + if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: + print_string_header = print_string_header + os.path.basename(i) + "\t" + else: + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + + f33.write('\t' + print_string_header.strip() + '\n') + keep_logging( + 'Reading temporary label positions file: %s/temp_label_final_raw.txt' % args.filter2_only_snp_vcf_dir, + 'Reading temporary label positions file: %s/temp_label_final_raw.txt' % args.filter2_only_snp_vcf_dir, + logger, 'info') + # lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] + lll = ['reference_unmapped_position', 'LowAF', 'LowAF_DP', 'LowAF_QUAL', 'LowAF_DP_QUAL', 'LowAF_QUAL_DP', + 'HighAF_DP', 'HighAF_QUAL', 'HighAF_DP_QUAL', 'HighAF_QUAL_DP', 'HighAF', 'LowAF_proximate_SNP', + 'LowAF_DP_proximate_SNP', 'LowAF_QUAL_proximate_SNP', 'LowAF_DP_QUAL_proximate_SNP', + 'LowAF_QUAL_DP_proximate_SNP', 'HighAF_DP_proximate_SNP', 'HighAF_QUAL_proximate_SNP', + 'HighAF_DP_QUAL_proximate_SNP', 'HighAF_QUAL_DP_proximate_SNP', 'HighAF_proximate_SNP', '_proximate_SNP'] + ref_var = ['reference_allele', 'VARIANT'] + + if args.outgroup: + with open("%s/temp_indel_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: + csv_reader = csv.reader(csv_file, delimiter='\t') + 
next(csv_reader, None) + for row in csv_reader: + if set(ref_var) & set(row[1:]): + if set(lll) & set(row[1:]): + if int(row[0]) not in outgroup_indel_specific_positions: + print_string = "" + for i in row[1:]: + print_string = print_string + "\t" + i + STRR2 = row[0] + print_string + "\n" + f33.write(STRR2) + csv_file.close() + f33.close() + else: + with open("%s/temp_indel_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + if set(ref_var) & set(row[1:]): + if set(lll) & set(row[1:]): + + print_string = "" + for i in row[1:]: + print_string = print_string + "\t" + i + STRR2 = row[0] + print_string + "\n" + f33.write(STRR2) + csv_file.close() + f33.close() + """ + Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of AF + """ + temp_position_label_AF = OrderedDict() + f44 = open("%s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir, + 'w+') + with open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, + 'rU') as csv_file: + keep_logging( + 'Reading temporary Only_filtered_indel_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, + 'Reading temporary Only_filtered_indel_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + + for row in csv_reader: + temp_position_label_AF[row[0]] = row[1:] + print_string_header = "\t" + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + f44.write('\t' + print_string_header.strip() + '\n') + for value in temp_position_label_AF: + lll = ['LowAF'] + if set(lll) & 
set(temp_position_label_AF[value]): + + print_string = "" + for i in temp_position_label_AF[value]: + print_string = print_string + "\t" + i + STRR2 = value + print_string + "\n" + f44.write(STRR2) + f44.close() + csv_file.close() + f44.close() + + """ + Perform Sed on temp files. Find a faster way to do this. + """ + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_QUAL_DP/4/g' 
%s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ 
+ "sed -i 's/HighAF_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF/3/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + + """ + Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp + """ + temp_position_label_DP = OrderedDict() + f44 = open("%s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, + 'w+') + with open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, + 'rU') as csv_file: + keep_logging( + 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, + 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + temp_position_label_DP[row[0]] = row[1:] + print_string_header = "\t" + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + f44.write('\t' + print_string_header.strip() + '\n') + for value in temp_position_label_DP: + lll = ['HighAF_DP'] + ref_var = ['reference_allele', 'VARIANT'] + if set(lll) & set(temp_position_label_AF[value]): + print_string = "" + for i in temp_position_label_AF[value]: + print_string = print_string + "\t" + i + STRR2 = value + print_string + "\n" + f44.write(STRR2) + f44.close() + csv_file.close() + + """ + Perform Sed on temp files. Find a faster way to do this. 
+ """ + subprocess.call([ + "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_QUAL/4/g' 
%s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/HighAF_DP/3/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 's/LowAF/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call([ + "sed -i 
's/HighAF/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + + def barplot_indel_stats(): + keep_logging( + 'Read each Sample columns and calculate the percentage of each label to generate barplot statistics.', + 'Read each Sample columns and calculate the percentage of each label to generate barplot statistics.', + logger, 'info') + """ + Read each Sample columns and calculate the percentage of each label to generate barplot statistics. + This will give a visual explanation of how many positions in each samples were filtered out because of different reason + """ + + c_reader = csv.reader( + open('%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, + 'r'), delimiter='\t') + columns = list(zip(*c_reader)) + print len(columns) + keep_logging('Finished reading columns...', 'Finished reading columns...', logger, 'info') + counts = 1 + + if args.outgroup: + end = len(vcf_filenames) + 1 + end = end - 1 + else: + end = len(vcf_filenames) + 1 + print end + + f_bar_count = open("%s/bargraph_indel_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f_bar_perc = open("%s/bargraph_indel_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f_bar_count.write( + "Sample\tunmapped_positions\treference_allele\ttrue_variant\tOnly_low_AF\tOnly_DP\tOnly_low_MQ\tother\n") + f_bar_perc.write( + "Sample\tunmapped_positions_perc\ttrue_variant_perc\tOnly_low_AF_perc\tOnly_DP_perc\tOnly_low_MQ_perc\tother_perc\n") + for i in xrange(1, end, 1): + """ Bar Count Statistics: Variant Position Count Statistics """ + print i + true_variant = columns[i].count('VARIANT') + unmapped_positions = columns[i].count('reference_unmapped_position') + reference_allele = columns[i].count('reference_allele') + Only_low_AF = columns[i].count('LowAF') + Only_DP = columns[i].count('HighAF_DP') + Only_low_MQ = columns[i].count('HighAF') + low_AF_other_parameters = 
columns[i].count('LowAF_QUAL_DP_proximate_SNP') + columns[i].count( + 'LowAF_DP_QUAL_proximate_SNP') + columns[i].count('LowAF_QUAL_proximate_SNP') + columns[i].count( + 'LowAF_DP_proximate_SNP') + columns[i].count('LowAF_proximate_SNP') + columns[i].count( + 'LowAF_QUAL_DP') + columns[i].count('LowAF_DP_QUAL') + columns[i].count('LowAF_QUAL') + columns[ + i].count('LowAF_DP') + high_AF_other_parameters = columns[i].count('HighAF_QUAL_DP_proximate_SNP') + columns[i].count( + 'HighAF_DP_QUAL_proximate_SNP') + columns[i].count('HighAF_QUAL_proximate_SNP') + columns[i].count( + 'HighAF_DP_proximate_SNP') + columns[i].count('HighAF_proximate_SNP') + columns[i].count( + 'HighAF_QUAL_DP') + columns[i].count('HighAF_DP_QUAL') + columns[i].count('HighAF_QUAL') + other = low_AF_other_parameters + high_AF_other_parameters + total = true_variant + unmapped_positions + reference_allele + Only_low_AF + Only_DP + low_AF_other_parameters + high_AF_other_parameters + Only_low_MQ + filename_count = i - 1 + # bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions, reference_allele, true_variant, Only_low_AF, Only_DP, Only_low_MQ, other) + if args.outgroup: + ### + + bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( + vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + unmapped_positions, reference_allele, true_variant, + Only_low_AF, Only_DP, Only_low_MQ, other) + else: + bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( + vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + unmapped_positions, reference_allele, true_variant, + Only_low_AF, Only_DP, Only_low_MQ, other) + + f_bar_count.write(bar_string) + + """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ + try: + true_variant_perc = 
float((columns[i].count('VARIANT') * 100) / total) + except ZeroDivisionError: + true_variant_perc = 0 + try: + unmapped_positions_perc = float((columns[i].count('reference_unmapped_position') * 100) / total) + except ZeroDivisionError: + unmapped_positions_perc = 0 + try: + reference_allele_perc = float((columns[i].count('reference_allele') * 100) / total) + except ZeroDivisionError: + reference_allele_perc = 0 + try: + Only_low_AF_perc = float((columns[i].count('LowAF') * 100) / total) + except ZeroDivisionError: + Only_low_AF_perc = 0 + try: + Only_DP_perc = float((columns[i].count('HighAF_DP') * 100) / total) + except ZeroDivisionError: + Only_DP_perc = 0 + try: + Only_low_MQ_perc = float((columns[i].count('HighAF') * 100) / total) + except ZeroDivisionError: + Only_low_MQ_perc = 0 + try: + low_AF_other_parameters_perc = float(((columns[i].count('LowAF_QUAL_DP_proximate_SNP') + columns[ + i].count('LowAF_DP_QUAL_proximate_SNP') + columns[i].count('LowAF_QUAL_proximate_SNP') + columns[ + i].count('LowAF_DP_proximate_SNP') + columns[i].count( + 'LowAF_proximate_SNP') + columns[i].count('LowAF_QUAL_DP') + columns[i].count('LowAF_DP_QUAL') + + columns[i].count('LowAF_QUAL') + columns[i].count( + 'LowAF_DP')) * 100) / total) + except ZeroDivisionError: + low_AF_other_parameters_perc = 0 + try: + high_AF_other_parameters_perc = float(((columns[i].count('HighAF_QUAL_DP_proximate_SNP') + columns[ + i].count('HighAF_DP_QUAL_proximate_SNP') + columns[i].count('HighAF_QUAL_proximate_SNP') + columns[ + i].count('HighAF_DP_proximate_SNP') + columns[i].count( + 'HighAF_proximate_SNP') + columns[i].count('HighAF_QUAL_DP') + columns[i].count('HighAF_DP_QUAL') + + columns[i].count('HighAF_QUAL')) * 100) / total) + except ZeroDivisionError: + high_AF_other_parameters_perc = 0 + + other_perc = float(low_AF_other_parameters_perc + high_AF_other_parameters_perc) + if args.outgroup: + ### + bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( + os.path.basename( + 
vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + unmapped_positions_perc, true_variant_perc, Only_low_AF_perc, Only_DP_perc, Only_low_MQ_perc, + other_perc) + f_bar_perc.write(bar_perc_string) + else: + bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( + os.path.basename( + vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + unmapped_positions_perc, true_variant_perc, Only_low_AF_perc, Only_DP_perc, Only_low_MQ_perc, + other_perc) + f_bar_perc.write(bar_perc_string) + + f_bar_count.close() + f_bar_perc.close() + bargraph_R_script = "library(ggplot2)\nlibrary(reshape)\nx1 <- read.table(\"bargraph_indel_percentage.txt\", header=TRUE)\nx1$Sample <- reorder(x1$Sample, rowSums(x1[-1]))\nmdf1=melt(x1,id.vars=\"Sample\")\npdf(\"%s/%s_barplot_indel.pdf\", width = 30, height = 30)\nggplot(mdf1, aes(Sample, value, fill=variable)) + geom_bar(stat=\"identity\") + ylab(\"Percentage of Filtered Positions\") + xlab(\"Samples\") + theme(text = element_text(size=9)) + scale_fill_manual(name=\"Reason for filtered out positions\", values=c(\"#08306b\", \"black\", \"orange\", \"darkgrey\", \"#fdd0a2\", \"#7f2704\")) + ggtitle(\"Title Here\") + ylim(0, 100) + theme(text = element_text(size=10), panel.background = element_rect(fill = 'white', colour = 'white'), plot.title = element_text(size=20, face=\"bold\", margin = margin(10, 0, 10, 0)), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_text(colour = \"black\", face= \"bold.italic\", angle = 90)) + theme(legend.position = c(0.6, 0.7), legend.direction = \"horizontal\")\ndev.off()" % ( + args.filter2_only_snp_vcf_dir, os.path.basename(os.path.normpath(args.results_dir))) + barplot_R_file = open("%s/bargraph_indel.R" % args.filter2_only_snp_vcf_dir, 'w+') + barplot_R_file.write(bargraph_R_script) + keep_logging('Run this R script to generate bargraph plot: %s/bargraph_indel.R' % args.filter2_only_snp_vcf_dir, + 
'Run this R script to generate bargraph plot: %s/bargraph_indel.R' % args.filter2_only_snp_vcf_dir, + logger, 'info') + + """ Methods Steps""" + keep_logging('Running: Generating data matrices...', 'Running: Generating data matrices...', logger, 'info') + # if args.outgroup: + # f_outgroup = open("%s/outgroup_indel_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') + # global outgroup_indel_specific_positions + # outgroup_indel_specific_positions = [] + # for i in f_outgroup: + # outgroup_indel_specific_positions.append(i) + # f_outgroup.close() + # + # f_outgroup = open("%s/outgroup_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') + # global outgroup_specific_positions + # outgroup_specific_positions = [] + # for i in f_outgroup: + # outgroup_specific_positions.append(i) + # f_outgroup.close() + # else: + # global outgroup_specific_positions + # global outgroup_indel_specific_positions + # outgroup_indel_specific_positions = [] + # outgroup_specific_positions = [] + generate_indel_position_label_data_matrix_All_label() + keep_logging('Running: Changing variables in data matrices to codes for faster processing...', + 'Running: Changing variables in data matrices to codes for faster processing...', logger, 'info') + temp_generate_indel_position_label_data_matrix_All_label() + keep_logging('Running: Generating Barplot statistics data matrices...', + 'Running: Generating Barplot statistics data matrices...', logger, 'info') + barplot_indel_stats() + + +def create_job_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filter): + """ Generate jobs/scripts that creates core consensus fasta file. + + This function will generate and run scripts/jobs to create core consensus fasta file of only core variant positions. + Input for Fasttree, Beast and pairwise variant analysis. 
+ + :param jobrun: Based on this value all the job/scripts will run on "cluster": either on single cluster, "parallel-local": run in parallel on local system, "local": run on local system, "parallel-cluster": submit parallel jobs on cluster. + :param vcf_filenames: list of final vcf filenames i.e *_no_proximate_snp.vcf. These files are the final output of variant calling step for each sample. + :return: + :raises: + """ + if jobrun == "parallel-cluster": + """ + Supports only PBS clusters for now. + """ + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % ( + job_name, ConfigSectionMap("scheduler", Config)['email'], + ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], + ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], + args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) + job_file_name = "%s_fasta.pbs" % (i) + f1 = open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + # os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" + pbs_scripts = glob.glob(pbs_dir) + for i in pbs_scripts: + keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') + # os.system("qsub %s" % i) + call("qsub %s" % i, logger) + + + elif jobrun == "parallel-local" or jobrun == "cluster": + """ + Generate a Command list of each job and run it in parallel on different cores available on local system 
+ """ + command_array = [] + command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir + f3 = open(command_file, 'w+') + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % ( + job_name, ConfigSectionMap("scheduler", Config)['email'], + ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], + ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], + args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) + job_file_name = "%s_fasta.pbs" % (i) + f1 = open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" + pbs_scripts = glob.glob(pbs_dir) + for i in pbs_scripts: + f3.write("bash %s\n" % i) + f3.close() + with open(command_file, 'r') as fpp: + for lines in fpp: + lines = lines.strip() + command_array.append(lines) + fpp.close() + if args.numcores: + num_cores = int(num_cores) + else: + num_cores = multiprocessing.cpu_count() + results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) + + # elif jobrun == "cluster": + # command_array = [] + # command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir + # f3 = open(command_file, 'w+') + # for i in vcf_filenames: + # job_name = os.path.basename(i) + # job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python 
/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'],args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir) + # job_file_name = "%s_fasta.pbs" % (i) + # f1=open(job_file_name, 'w+') + # f1.write(job_print_string) + # f1.close() + # pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" + # pbs_scripts = glob.glob(pbs_dir) + # for i in pbs_scripts: + # f3.write("bash %s\n" % i) + # f3.close() + # with open(command_file, 'r') as fpp: + # for lines in fpp: + # lines = lines.strip() + # command_array.append(lines) + # fpp.close() + # os.system("bash %s/command_file" % args.filter2_only_snp_vcf_dir) + else: + """ + Generate a Command list of each job and run it on local system one at a time + """ + command_array = [] + command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir + f3 = open(command_file, 'w+') + + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % ( + job_name, ConfigSectionMap("scheduler", Config)['email'], + ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], + ConfigSectionMap("scheduler", Config)['queue'], 
ConfigSectionMap("scheduler", Config)['flux_account'], + args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) + job_file_name = "%s_fasta.pbs" % (i) + f1 = open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + # os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" + pbs_scripts = glob.glob(pbs_dir) + + for i in pbs_scripts: + f3.write("bash %s\n" % i) + f3.close() + with open(command_file, 'r') as fpp: + for lines in fpp: + lines = lines.strip() + command_array.append(lines) + fpp.close() + # os.system("bash command_file") + call("bash %s" % command_file, logger) + + +def create_job_allele_variant_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, config_file): + """ Generate jobs/scripts that creates core consensus fasta file. + + This function will generate and run scripts/jobs to create core consensus fasta file of only core variant positions. + Input for Fasttree, Beast and pairwise variant analysis. + + :param jobrun: Based on this value all the job/scripts will run on "cluster": either on single cluster, "parallel-local": run in parallel on local system, "local": run on local system, "parallel-cluster": submit parallel jobs on cluster. + :param vcf_filenames: list of final vcf filenames i.e *_no_proximate_snp.vcf. These files are the final output of variant calling step for each sample. + :return: + :raises: + """ + if jobrun == "parallel-cluster": + """ + Supports only PBS clusters for now. 
+ """ + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % ( + job_name, ConfigSectionMap("scheduler", Config)['email'], + ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], + ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], + args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) + job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) + f1 = open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + # os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" + pbs_scripts = glob.glob(pbs_dir) + for i in pbs_scripts: + keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') + # os.system("qsub %s" % i) + call("qsub %s" % i, logger) + + + elif jobrun == "parallel-local" or jobrun == "cluster": + """ + Generate a Command list of each job and run it in parallel on different cores available on local system + """ + command_array = [] + command_file = "%s/commands_list_ref_allele_variants_fasta.sh" % args.filter2_only_snp_vcf_dir + f3 = open(command_file, 'w+') + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python 
/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % ( + job_name, ConfigSectionMap("scheduler", Config)['email'], + ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], + ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], + args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) + job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) + f1 = open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + pbs_dir = args.filter2_only_snp_vcf_dir + "/*_ref_allele_variants_fasta.pbs" + pbs_scripts = glob.glob(pbs_dir) + for i in pbs_scripts: + f3.write("bash %s\n" % i) + f3.close() + with open(command_file, 'r') as fpp: + for lines in fpp: + lines = lines.strip() + command_array.append(lines) + fpp.close() + if args.numcores: + num_cores = int(num_cores) + else: + num_cores = multiprocessing.cpu_count() + results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) + + # elif jobrun == "cluster": + # command_array = [] + # command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir + # f3 = open(command_file, 'w+') + # for i in vcf_filenames: + # job_name = os.path.basename(i) + # job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], 
ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'],args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir) + # job_file_name = "%s_fasta.pbs" % (i) + # f1=open(job_file_name, 'w+') + # f1.write(job_print_string) + # f1.close() + # pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" + # pbs_scripts = glob.glob(pbs_dir) + # for i in pbs_scripts: + # f3.write("bash %s\n" % i) + # f3.close() + # with open(command_file, 'r') as fpp: + # for lines in fpp: + # lines = lines.strip() + # command_array.append(lines) + # fpp.close() + # os.system("bash %s/command_file" % args.filter2_only_snp_vcf_dir) + else: + """ + Generate a Command list of each job and run it on local system one at a time + """ + command_array = [] + command_file = "%s/commands_list_ref_allele_variants_fasta.sh" % args.filter2_only_snp_vcf_dir + f3 = open(command_file, 'w+') + + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % ( + job_name, ConfigSectionMap("scheduler", Config)['email'], + ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], + ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], + args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) + job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) + f1 = open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + # os.system("mv %s/*.pbs %s/temp" % 
(args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + pbs_dir = args.filter2_only_snp_vcf_dir + "/*_ref_allele_variants_fasta.pbs" + pbs_scripts = glob.glob(pbs_dir) + + for i in pbs_scripts: + f3.write("bash %s\n" % i) + f3.close() + with open(command_file, 'r') as fpp: + for lines in fpp: + lines = lines.strip() + command_array.append(lines) + fpp.close() + # os.system("bash command_file") + call("bash %s" % command_file, logger) + + +def create_job_DP(jobrun, vcf_filenames): + """ + Based on type of jobrun; generate jobs and run accordingly. + :param jobrun: Based on this value all the job/scripts will run on "cluster": either on single cluster, "parallel-local": run in parallel on local system, "local": run on local system, "parallel-cluster": submit parallel jobs on cluster. + :param vcf_filenames: + :return: + """ + + if jobrun == "parallel-cluster": + """ + Supports only PBS clusters for now. + """ + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % ( + job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) + job_file_name = "%s_DP.pbs" % (i) + f1 = open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + # os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" + pbs_scripts = glob.glob(pbs_dir) + for i in pbs_scripts: + keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') + # os.system("qsub %s" % i) + call("qsub %s" % i, logger) + + + elif jobrun == "parallel-local" 
or jobrun == "cluster": + """ + Generate a Command list of each job and run it in parallel on different cores available on local system + """ + command_array = [] + command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir + f3 = open(command_file, 'w+') + + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % ( + job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) + job_file_name = "%s_DP.pbs" % (i) + f1 = open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + # os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" + pbs_scripts = glob.glob(pbs_dir) + + for i in pbs_scripts: + f3.write("bash %s\n" % i) + f3.close() + with open(command_file, 'r') as fpp: + for lines in fpp: + lines = lines.strip() + command_array.append(lines) + fpp.close() + print len(command_array) + if args.numcores: + num_cores = int(num_cores) + else: + num_cores = multiprocessing.cpu_count() + results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) + + # elif jobrun == "cluster": + # """ Test pending """ + # command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir + # f3 = open(command_file, 'w+') + # for i in vcf_filenames: + # job_name = os.path.basename(i) + # job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd 
%s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) + # job_file_name = "%s_DP.pbs" % (i) + # f1=open(job_file_name, 'w+') + # f1.write(job_print_string) + # f1.close() + # pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" + # pbs_scripts = glob.glob(pbs_dir) + # for i in pbs_scripts: + # f3.write("bash %s\n" % i) + # f3.close() + # os.system("bash %s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir) + + else: + """ + Generate a Command list of each job and run it on local system one at a time + """ + command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir + f3 = open(command_file, 'w+') + for i in vcf_filenames: + job_name = os.path.basename(i) + job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % ( + job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) + job_file_name = "%s_DP.pbs" % (i) + f1 = open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" + pbs_scripts = glob.glob(pbs_dir) + for i in pbs_scripts: + f3.write("bash %s\n" % i) + f3.close() + # os.system("bash %s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir) + call("bash %s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir, logger) + + +def generate_vcf_files(): + if ConfigSectionMap("functional_filters", Config)['apply_functional_filters'] == "yes": + keep_logging( + 
'Removing Variants falling in Functional filters positions file: %s\n' % functional_class_filter_positions, + 'Removing Variants falling in Functional filters positions file: %s\n' % functional_class_filter_positions, + logger, + 'info') + # phage_positions = [] + # phage_region_positions = "%s/phage_region_positions.txt" % args.filter2_only_snp_vcf_dir + # with open(phage_region_positions, 'rU') as fp: + # for line in fp: + # phage_positions.append(line.strip()) + # fp.close() + + functional_filter_pos_array = [] + with open(functional_class_filter_positions, 'rU') as f_functional: + for line_func in f_functional: + functional_filter_pos_array.append(line_func.strip()) + + ref_variant_position_array = [] + ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') + for line in ffp: + line = line.strip() + if line not in functional_filter_pos_array: + ref_variant_position_array.append(line) + ffp.close() + + # Adding core indel support: 2018-07-24 + ref_indel_variant_position_array = [] + ffp = open("%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') + for line in ffp: + line = line.strip() + if line not in functional_filter_pos_array: + ref_indel_variant_position_array.append(line) + ffp.close() + + else: + functional_filter_pos_array = [] + ref_variant_position_array = [] + ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') + for line in ffp: + line = line.strip() + ref_variant_position_array.append(line) + ffp.close() + + # Adding core indel support: 2018-07-24 + ref_indel_variant_position_array = [] + ffp = open("%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') + for line in ffp: + line = line.strip() + if line not in functional_filter_pos_array: + ref_indel_variant_position_array.append(line) + ffp.close() + + print "No. of core SNPs: %s" % len(ref_variant_position_array) + print "No. 
of core INDELs: %s" % len(ref_indel_variant_position_array) + + f_file = open( + "%s/Only_ref_variant_positions_for_closely_without_functional_filtered_positions" % args.filter2_only_snp_vcf_dir, + 'w+') + for pos in ref_variant_position_array: + f_file.write(pos + '\n') + f_file.close() + + # Adding core indel support: 2018-07-24 + f_file = open( + "%s/Only_ref_indel_variant_positions_for_closely_without_functional_filtered_positions" % args.filter2_only_snp_vcf_dir, + 'w+') + for pos in ref_indel_variant_position_array: + f_file.write(pos + '\n') + f_file.close() + + base_vcftools_bin = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("vcftools", Config)[ + 'vcftools_bin'] + filter2_files_array = [] + for i in vcf_filenames: + filter2_file = i.replace('_no_proximate_snp.vcf', '') + filter2_files_array.append(filter2_file) + + filtered_out_vcf_files = [] + for i in filter2_files_array: + print_array = [] + with open(i) as file_open: + for line in file_open: + line = line.strip() + if line.startswith("#"): + print_array.append(line) + else: + split_array = re.split(r'\t+', line) + if split_array[1] in ref_variant_position_array and 'INDEL' not in split_array[7]: + print_array.append(line) + file_open.close() + file_name = i + "_core.vcf" + keep_logging('Generating %s' % file_name, 'Generating %s' % file_name, logger, 'info') + filtered_out_vcf_files.append(file_name) + f1 = open(file_name, 'w+') + for ios in print_array: + print_string = str(ios) + "\n" + f1.write(print_string) + f1.close() + + filename = "%s/consensus.sh" % args.filter2_only_snp_vcf_dir + keep_logging('Generating Consensus...', 'Generating Consensus...', logger, 'info') + for file in filtered_out_vcf_files: + f1 = open(filename, 'a+') + bgzip_cmd = "%s/%s/bgzip -f %s\n" % ( + ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], file) + f1.write(bgzip_cmd) + subprocess.call([bgzip_cmd], shell=True) + tabix_cmd = "%s/%s/tabix -f 
-p vcf %s.gz\n" % ( + ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], file) + f1.write(tabix_cmd) + subprocess.call([tabix_cmd], shell=True) + fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s.fa\n" % ( + args.reference, base_vcftools_bin, file, file.replace('_filter2_final.vcf_core.vcf', '')) + f1.write(fasta_cmd) + subprocess.call([fasta_cmd], shell=True) + base = os.path.basename(file) + header = base.replace('_filter2_final.vcf_core.vcf', '') + sed_command = "sed -i 's/>.*/>%s/g' %s.fa\n" % (header, file.replace('_filter2_final.vcf_core.vcf', '')) + subprocess.call([sed_command], shell=True) + f1.write(sed_command) + keep_logging('The consensus commands are in : %s' % filename, 'The consensus commands are in : %s' % filename, + logger, 'info') + sequence_lgth_cmd = "for i in %s/*.fa; do %s/%s/bioawk -c fastx \'{ print $name, length($seq) }\' < $i; done" % ( + args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], + ConfigSectionMap("bioawk", Config)['bioawk_bin']) + # os.system(sequence_lgth_cmd) + call("%s" % sequence_lgth_cmd, logger) + + +def gatk_filter2(final_raw_vcf, out_path, analysis, reference): + gatk_filter2_parameter_expression = "MQ > 50 && QUAL > 100 && DP > 9" + gatk_filter2_command = "java -jar %s/%s/GenomeAnalysisTK.jar -T VariantFiltration -R %s -o %s/%s_filter2_gatk.vcf --variant %s --filterExpression \"%s\" --filterName PASS_filter2" % ( + ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("gatk", Config)['gatk_bin'], reference, out_path, + analysis, final_raw_vcf, gatk_filter2_parameter_expression) + keep_logging('Running Command: [%s]' % gatk_filter2_command, 'Running Command: [%s]' % gatk_filter2_command, logger, + 'info') + # os.system(gatk_filter2_command) + call("%s" % gatk_filter2_command, logger) + filter_flag_command = "grep '#\|PASS_filter2' %s/%s_filter2_gatk.vcf > %s/%s_filter2_final.vcf" % ( + out_path, analysis, out_path, analysis) + 
call("%s" % filter_flag_command, logger) + gatk_filter2_final_vcf = "%s/%s_filter2_final.vcf" % (out_path, analysis) + return gatk_filter2_final_vcf + + +def remove_proximate_snps(gatk_filter2_final_vcf_file, out_path, analysis, reference): + all_position = [] + remove_proximate_position_array = [] + gatk_filter2_final_vcf_file_no_proximate_snp = gatk_filter2_final_vcf_file + "_no_proximate_snp.vcf" + with open(gatk_filter2_final_vcf_file, 'rU') as csv_file: + for line in csv_file: + if not line.startswith('#'): + line_array = line.split('\t') + all_position.append(line_array[1]) + for position in all_position: + position_index = all_position.index(position) + next_position_index = position_index + 1 + + if next_position_index < len(all_position): + diff = int(all_position[next_position_index]) - int(position) + if diff < 10: + # print position + " " + all_position[next_position_index] + if position not in remove_proximate_position_array and all_position[ + next_position_index] not in remove_proximate_position_array: + remove_proximate_position_array.append(int(position)) + remove_proximate_position_array.append(int(all_position[next_position_index])) + f1 = open(gatk_filter2_final_vcf_file_no_proximate_snp, 'w+') + with open(gatk_filter2_final_vcf_file, 'rU') as csv_file2: + for line in csv_file2: + if line.startswith('gi') or line.startswith('MRSA_8058'): ##change this! 
+ line_array = line.split('\t') + if int(line_array[1]) not in remove_proximate_position_array: + print_string = line + f1.write(print_string) + else: + print_string = line + f1.write(print_string) + gatk_filter2_final_vcf_file_no_proximate_snp_positions = gatk_filter2_final_vcf_file + "_no_proximate_snp.vcf_positions_array" + f2 = open(gatk_filter2_final_vcf_file_no_proximate_snp_positions, 'w+') + for i in remove_proximate_position_array: + position_print_string = str(i) + "\n" + f2.write(position_print_string) + return gatk_filter2_final_vcf_file_no_proximate_snp + + +def FQ_analysis(): + for i in vcf_filenames: + filename_base = os.path.basename(i) + aln_mpileup_vcf_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', + '_aln_mpileup_raw.vcf_5bp_indel_removed.vcf') + analysis = filename_base.replace('_filter2_final.vcf_no_proximate_snp.vcf', '') + # print aln_mpileup_vcf_file + grep_reference_file = "grep \'^##reference\' %s" % aln_mpileup_vcf_file + proc = subprocess.Popen([grep_reference_file], stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + out = out.strip() + reference_file = out.split(':') + # Change it to multiprocessing + gatk_filter2_final_vcf_file = gatk_filter2(aln_mpileup_vcf_file, temp_dir, analysis, reference_file[1]) + # print gatk_filter2_final_vcf_file + gatk_filter2_final_vcf_file_no_proximate_snp = remove_proximate_snps(gatk_filter2_final_vcf_file, temp_dir, + analysis, reference_file[1]) + grep_fq_field = "awk -F\'\\t\' \'{print $8}\' %s | grep -o \'FQ=.*\' | sed \'s/FQ=//g\' | awk -F\';\' \'{print $1}\' > %s/%s_FQ_values" % ( + gatk_filter2_final_vcf_file_no_proximate_snp, os.path.dirname(i), analysis) + # os.system(grep_fq_field) + call("%s" % grep_fq_field, logger) + # print grep_fq_field + + +def DP_analysis(): + create_job_DP(args.jobrun, vcf_filenames) + paste_command = "paste %s/extract_DP_positions.txt" % args.filter2_only_snp_vcf_dir + for i in vcf_filenames: + label_file = 
i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_DP_values') + paste_command = paste_command + " " + label_file + + paste_file = args.filter2_only_snp_vcf_dir + "/paste_DP_files.sh" + f2 = open(paste_file, 'w+') + paste_command = paste_command + " > %s/filtered_DP_values_temp.txt" % args.filter2_only_snp_vcf_dir + # os.system(paste_command) + f2.write(paste_command + '\n') + cat_header = "cat %s/header.txt %s/filtered_DP_values_temp.txt > %s/filtered_DP_values.txt" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) + # os.system(cat_header) + f2.write(cat_header + '\n') + sed_command = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/filtered_DP_values.txt" % ( + args.filter2_only_snp_vcf_dir) + # os.system(sed_command) + f2.write(sed_command + '\n') + cmd = "bash %s" % paste_file + # os.system("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir) + + +def DP_analysis_barplot(): + # os.system("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir) + call("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir, logger) + keep_logging('Generating DP barplots data...', 'Generating DP barplots data...', logger, 'info') + c_reader = csv.reader(open('%s/filtered_DP_values.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') + columns = list(zip(*c_reader)) + counts = 1 + end = len(vcf_filenames) + 1 + f_bar_count = open("%s/DP_bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f_bar_perc = open("%s/DP_bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f_bar_count.write("Sample\treference_position\toneto5\tsixto10\televento14\tfifteenorabove\n") + f_bar_perc.write("Sample\treference_position\toneto5\tsixto10\televento14\tfifteenorabove\n") + for i in xrange(1, end, 1): + """ Bar Count Statistics: Variant Position Count Statistics """ + reference_position = columns[i].count('NA') + oneto5 = 0 + for k in list(columns[i][1:]): + if k != "": + if k != "NA": + if 
int(k) < 5: + oneto5 += 1 + sixto10 = 0 + for k in list(columns[i][1:]): + if k != "": + if k != "NA": + if int(k) >= 5 and int(k) <= 10: + sixto10 += 1 + elevento14 = 0 + for k in list(columns[i][1:]): + if k != "": + if k != "NA": + if int(k) >= 11 and int(k) <= 14: + elevento14 += 1 + fifteenorabove = 0 + for k in list(columns[i][1:]): + if k != "": + if k != "NA": + if int(k) >= 15: + fifteenorabove += 1 + total = reference_position + oneto5 + sixto10 + elevento14 + fifteenorabove + filename_count = i - 1 + bar_string = "%s\t%s\t%s\t%s\t%s\t%s\n" % ( + os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + reference_position, oneto5, sixto10, elevento14, fifteenorabove) + f_bar_count.write(bar_string) + + """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ + try: + reference_position_perc = float(reference_position * 100 / total) + except ZeroDivisionError: + reference_position_perc = 0 + try: + oneto5_perc = float(oneto5 * 100 / total) + except ZeroDivisionError: + oneto5_perc = 0 + try: + sixto10_perc = float(sixto10 * 100 / total) + except ZeroDivisionError: + sixto10_perc = 0 + try: + elevento14_perc = float(elevento14 * 100 / total) + except ZeroDivisionError: + elevento14_perc = 0 + try: + fifteenorabove_perc = float(fifteenorabove * 100 / total) + except ZeroDivisionError: + fifteenorabove_perc = 0 + bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\n" % ( + os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + reference_position_perc, oneto5_perc, sixto10_perc, elevento14_perc, fifteenorabove_perc) + f_bar_perc.write(bar_perc_string) diff --git a/modules/variant_diagnostics/core_pipeline_core_prep_main.pyc b/modules/variant_diagnostics/core_pipeline_core_prep_main.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba3029d19a157813b9af888aef318ec7a646dd0d GIT binary patch literal 80696 
zcmeHwX>=UfbzXI2ApsI3K!N}<Ky6@e$zq+(9w&05#i zTi1Flw@WU1n=Uq3?ndj)O!zB#cUYx9Yi^Tud6Ozjr`&J0E^k)%T~?{znj5e#4_I?s ztjk-15^uFGZ}m$sXk8xkueVv3xB1sY*5x7pdf2)=EZ5yuX}dMI-MYLz;5A}h9`WPd zV_m+-zusY8-XYiPtkOo`-3Z6UG-R{2dueK>+)z2dsh%= zH?DfE4_MaaJyv0ZTs&wMHp<0btI#JGV^(34TX&Q(wP5#@BHJYAf*;7&D=<4gIfqOdIc)g?#o1-JIx%QBuf;f|b7+ZfM((uqth z>sp3GdiXYOveu%E=Qp#~Z9MJJTgc3F(uX)R*- zcU!}YonVx;YKN^$g>{m;OY@k{T0^%n{?`XQF#&pl8$`x^ueI1^4WaH}n$Zus;}6ya z518o&pT+R)^-_wB;R94B!`Xy*m^c~U^jL3N7*%~blB&Lxt%qYq-ABXe4wSpsnz5|g z8?1#xsuA|IXoL;cHB_Nfa;rU(xwMD}8#n|p_!QG2sTs;W;6K19#IuhNTDLb^iyI@w zy@X+T{xv>Qg+}jSd}=lcsn$!jO{vhV+x%3KmK{oXhO+krjjNl#H++DSfZ4F0a0Vk} zyVdAJwwX*r5l7O9_oNYbq!D+f5${bS-j_zaKaFUo5g$k+j;0ZJr4e_h5%;7KA50_e zO(Tw_5%;AL_oopLq!ACM5i!SPQ5s4bU|59`h=mFN@omg(wUG1$0~hn?!q{P~dfw*{ z^xF7$akl8$XXeYYY}z-ArIOvqUw3V+y>?^PwfC3uSKX35T`b{QxnR$@Ww(ySccPm2 z8m?Vz*i)6#{9GAHJ=bnjf(Ulrp367t#f5R?;ew{_R2ElVd){*kcCl<1DmTksqwePC zz&~HQ5Y4{9^WH34|wxz8!1|YuOiz!gfM^ey1n;(FFO}bjP{Ja=&Suf4~!pF}V*>*n#I z9B(W%ynB&?ZFQ(<@AW3$Gto1_Xf0wKa_zml^RLHaur>C0(i)0?!=0-JO@=Dx>iL_J z8(-rp%BfN*9>betSt|90eW+|7x_;!~5tbw&VloG&bPJNrt6(&hZSe?WJGt&o zLrN-7xw-lBT)tX$3&CibI6N^Uy&G?J)liX!Xi(tsX5CUrbz3lmLhhL4ZpocE(k@Hy zoklTVJ>@)i>g>gdBNv@lo^G3mS=ARSH_x2Uoqy%ji@C{jkr{X4?74}fE8+=u!Zw`6 zOXK;XQP)U0m&@?^1coc9^D_zTmRqz_3A}WlETI>YxaeM30+%OSlw}Dljwi5~T*`Pl zXE1oSI5XRFkUre@AWdYNJWAVV`cli1nq#4TjxQu}Y#OKS^LsXlpINxa+HS+Al9>4; z^`Vuq@rP>rY|P~_GE64g=W!ybMi{K^b9lnwAPd#RBW*XuF(b9A!^ER)C%}8;Qml*hc~@11M3G0ObWmhbn1`jr>giNvZVqC$2q8sb70R^@Hii;(au6YN zQ*#BC8N>ti7`ms35&AZFw6O&bMD8%z^Dza6Tqs45+aMkr5q-WaXGyhVWxm0dlDKtu zu5yDy%FTQkl3lLhE;Pmv)sn#bdR$GEr@aZ+D;FEri{*){#d2<@UYV~>REt%&R4ls_ z&lDT8^H(QsDp zhrZ6u<%{L<>aE5Q3gIg)Iiqq1K=FPBfwjIbvpLg;pC0^d$#i74@-91&!Hv#j1;p_-v_UdpjGsyTI-I@N(x{L~kf~)A$R@aQZycB-QzDzT?wHX6esfzXP zE?xjV(I^-2t7epo@Dxz4Q=E5`SDqli1AGf3atL78^>tSL2|>MpVBxnrf_kkBQLiVg z+7kf20C8m`?`{dRmS9Lorr<8-Br2*Z)zEEVU+V~D3HAjPhuD{TKQ&g6LbGcxr1G(^SSqxl!nu$t(vS+m 
zGM}OOiG3NZP3%iQ=(DbY8Q?C}zJw-#b!}4Emm(^73tW?k3f^i#RPa^{qJp7hu!rwf*HKo?L#;0y?Z>t$`9EnRH!T%7;K6fSQ|K8z=JSd60oIlrYM)x?6kldjo$`D$CgQk=)wYevkt}Sv(n@( zpt>+Ns>7+k?n%My!uW`LFcLQjybB|1=C2_^IIu#TZJ8*Z!QbKbsb{MDVGfX*+eY#X zww|e;FB#G^nVAg`!OWWQgSN1}J2Tfh`*MIolldz~*F{QhA{EOh_Jst_tI-@LoTN&; z;=(3jls>$&jJhYW(_PjcoM(1Y66DF!Er%ChC*fRG&DSnQXqM_-2B!!L)Qo7ig_^a& zJ$cwKgS)(1f>&X*?B-Bf2J4GS&C)dRm%;da5@WNtZ2^jAJ14@P{?Km=jBA6RKlYcw z&s?V?J=GpkXcq2FQWLfdI-0DWGFZtxY7c!h*_<}mECq%%`4ImIr}b*$k!C)RBcBV` zi&cNW8<>^txmDR4$F?@4=Q$TJ6}HHUmU;GcrEdE&(AZ`qc8JwMH?b`Lu|wd}A;{-Q zgnnKPFd$s#Y-KRWfVFH9&VE&Ih7lj20&v#%OV(VT=|h8pddG zqC)G?xWpFYM1|J1AR5MK@sx%!TAXMYqs57aF4Xc(i#iH0#+oM;%M z#fgS7TAXMYqs55|t!vS?hA~<^Wnhw))MW~@Yr*!20D5s^;i&&O(Z-GV-a~vBd=9}^ z0lvpqxyJX3Wf(#SMp>q%8(-qGy+zPQTe_+EE3kJP2h%`?Qrtodxj1*(sZ?0qA*4$| z=GNBAjV%#D;LJvg+PYDw2(hcgc-}h1(Rh(38g~*7mzXq-vjWk?67DK8Fc7#X@Yc*Y zOnGPi5o0g~xls(f!{}g}XyC0mb7j<~Q8zZTkGdsC)u6Ylo9rZ)jzw&a`1JXFl?Flml(ZWcU_!kfT#Jc3_*+$_j2K@x4@V!v~dGYMzgl{M8E_vQ|3fzhH;z3Fb?2HXcz~8Y_l?qQ$)jp zElxBn*y2RPf-O!oEZE{i!-6eNG%VQSM8kqDPBbjo;zYxOElxBn*y2RPf-O!oEZE{i z!-6eNG%VQSM8kqDPBbjo;zYxOElxBn*y2RPf-O!A?AQpH#fb_N)3$GMqC&>v#KJ@V zploFr_b?6PwgEVMgm3(>65x!uLo47c*foln$1#3e2pVyUwK0-gF5X5N_A9_PjW3Z^xe;=g(gWTID!`zjSY3Fq|fmzLeyNlbNzE(HKIM@bTR+GNJp zymS~Cyvflt-ts$b`6bL5gDipxH6zDMz}Cz%C+3knS(cs0>n-bH4!UJ$U>^U}4b?PV zmd*ZBQu)oIF9C=&+dmOH^(V{{K*Tn<`ZH$PT+O#ZWIQefTQm#$Vp7vC4U07ynh3AV z!KI*9Zt(Cz4xDqp_ElCdNe1a;fX&sAx~TbOt0<4PY^*Xdw<07q3WIEaU=e95aIDcoxL zhBuN=wGJm(aPvg=+P9YGI>sXSdda?hx=v^OGWqReOH1gcIVSa{1Fsi#GnOmmLp=Q+ zBwsYXd3u&cKuDVlsc;{jaHc-5ADrK~aY8+hw zUE-JNkpUV|&Ogf^WSoQbb549FAcpn|hTgNoT3nokLxrHjJoFFxLrUkjHH+fHR{&Z$ z(6ZQ)IQC@2wmKhlN@Es&rGVpb!S`F*XQJv7b{JfZNkiVj*OKVwYW%yy~+?j@LeKU1Bbq#4}{T7rfwVasjaH2BftuoSt7q_Hcj* z<56rfAa^(#KrEihuk~OL0A+*=NSrW-hu3~K+%UEuhQ%c%9BH;9b+#leT;Mu2dF6!+ zJX5L@_AtMN1ugajfM>!yc4i&jRRZ>h3G58I1fBv0=v@+d@&a}2wMKArvi`vgAFz4i zl!Yw(XdoUbTg3&!=d;)y0GP^lQ!FO0z;7k$-^c0=qw2#T*Lt#-kPZ)}p108^Qq4u0 z(aI9b+^?am+xY-{4cEJ@x|@ab05--BiB+5T?{}*E&u44D-c+Jr&w>>j^Y&)(^@)&M 
z56DFNe7LdAFDV|X()80M0`gfzPaqpAkf zT`QgrKhs(Jna;1Cxqvx+VQe>EU1!D461Z-A?`z+a=EF$ZcB05j-a8UW=*tp;J43h{ zNE8h`s;7Q-ZKcZ53861~_#O=4lL#W6AoWyXz%W&$473HNz?xxM^;hJ8v;%jX6fcol zi(!FmL;1)(X5Wc5IoiE~P2ne#a29nqa$psc{jwh8r%cJ9j=&lm*Hbf zF3v&`k%N*lQQ-}&6A7yM#;h1^VNXe%zsZS78P#wyT&xOLxNeibIHuX)=?Z^GFXvU= zsp52TDqNl<_)yc?@GZ+@K(p|Ppy85O^IcLU3m_vfqyj(ZNtPX|1< z(VJUeW+TkmdgW&ukAQ+o4{rQ_UuK8=^da1sxd%U6@O%K_CQwH*{n@SAElAmy*_pXl z^B|`v+!DX;?HFl4KlqZ1=GNb#^17$Zyrx)V9u?+UV9`i_0!~8ippilyg-bMyMSQoQ zUa=O6)E_6WJmmX%>kRz7Jwp8e(+~c>`00QSY6-(gF5fkH#Hu}FT`C_C^^Xjo{=uDG z>jXEfOX{-&;6S}Y%+ZDcDT(?A4KkwK25K(GF=ck>u+VU!%_;>J+9}jhYNJq+koqv?VL)g~i$tk+TleGUWc>}%FXfX6j1r;j z5N!?*_J}H~bBm9)haEuPBtiXVb#=-fkm_1 zo~`Z3*51hA7Xs~GDtG8s7~5gOFs8aH|A{~lQ0KL_q}t$LwAlvVs@mYgX>IWSEUTW> z2HhR))%F*mQQx7q-5s^}XYrd-RXs;GNQGfJrYo#a8$bjY3g|!mbiWJ*M?ez{Q<#fs zLU0uh(>BHmAOnsSKrNV5s$+pV=t>>0eWK$MdY2v;(Gl%TtY7Q6q&|&-WX%(6VS$*& zSVwCc==eH4%n%t%)$CYfXe9U27uD=~@#X zP9r{&Mtn4l_*fe8@igKSX~bh`#CN0-kEan&q!FJ?Bfc|@cruN6DvkJ58u4@*aWa{x zVBbx$R6OlR(Xemw0&;RK7Z4zeYE?X5`{Q*>TtNEQr!F8LumD$1URjg{WRqS%jMr#% zNslZhudZ)zG5KEPpR$;|y1w@6dVehsmXGiSrdNqGekUX?Ak}2i$8`j`(r99v@QgyGIS4R- zwq@x%;jk&Z*Csy#3PhqX6?P4ED$(#&mWEN2I4uu~Vv>kFgoQQ@HwDcI(n*=k#B8WY z;<%v}4y7vmRT`pD9su1iC{2tm{1yp|?Mia%avnwR0G$ zy)uI43DJU?o+ttEyo@Ofe7M4VmkX~hF-yO$oG@O&$T;jq-vlB+--2%)EiX6kT`e+F z<0bg)(K5N4rm?<-wr^>>Or~brq)gQb-==aEGfJGeE6pfxbV8&?#w1^9F6O30umrAZ z9I%obYj8E+%;rmGO>$b3 zDR#AGomO>tzBefwRA^N*xssI0s^-vaQ@(oBERpvr&9jS1EkjXir32MTV!ZO1cGh4k z>q20yzRZ`_%LYp|c~@HdWeRVT=S1{-ONA(%rzs<>h5V#5S|j;Mb6;KgDY0$Bmr;{C zv+QA$lF{le?mDA2mY;MkYbrnKJl0Nr(itQPIa=3TX?G{FUg^57OTFsyljgh1@{{KJ z7RXO&IjzYQTWk4ASLnS7Z?9Efm)1~z($z?maaL)XCGuXSdA9oUlWve!@{@nsZ>hqL zW}75GEm!%`ELT;2Qas1R5{lqiqB+-Q5>7HW#o#Fh1p1uQ3<6^>=QwXqFnE%|I~hze zc$xt}SCD9TMfL)sN*kM|LSh#+{m?@W1r6l}pFQPpy>(t%3mSaQ)w3h7`c1tS;B`Y5)`VR!-t z;1nDhRGeVhMT0c(=&|bGY++Xh`znZy=fgB9!v`k@@!o`NP@?CwySt zf~PR;(WO#PyJ&WF4TfWK^e4l^En-DR`>2J{Zfx}+7nRqW{0_jTZdiW7HVXU9BRI4C&hoizra=wN}JmDibNyB46;iCabC>B{huPyQjJLO?4eEt*cH%1@9bO!V=$~N8) 
zM;_qNjro99Ge!(KRTNFaqc}zkcsDa2$eE*P5;;)8!%UH!?`5jU{e5vR_s1XD@dppY zAHdkmEXl6;gWd9ghp14PO&AV%cU^jbC zDY=ZvX!1cL`Ir3UgUQKd{6>>+Gm;N=>fE;_Czr7vO+I8Kf5lHel$>1JG@5+aNd6su z^5Nvf>a83FGD9m}7V_lD44omI=$O3J))ncrME?=CJtG#HmQU zmSE^835H-&$!U9-Kk6vKRruNk_IF`y9Pj3{9yRe^u@-UjILm%P zj;P}pcWHjE?8!-Vd`o?5zQk`(@t8NhO*MrB==m8rzAb6giErrIq zZ-Tejmu_*E%c&!&?5t~x5$>l4p8Mgs`dF4en{w=UxB!SdUb&RJjSUUtLe03!lAo zzDK+Q^m^CRvdFPK23z$c!aMQ+O0L8Ys4tyY4~*Th&mk)7R2b0tjzi?mq1fik zGTRQ*G1mM@%V-ERH(mwyg!u`Gc&YbI=Zjde*lY)*?iUA*Sm1U zK7PpN;2Ce^ua?}g(H7%i^dOJO;q=f69uQ>0-Fm7>`br(7vEboB8wYPL%+Gmv^zK7P zuI$^_J6D)Ka$?RcHSp+Qu`mu_0^W&H#bb0|Z?!NzHi}^ghmhLbY8;P{>&;@JF>9Z& zA3Dquz-Fi+H~V^3rN+qjpq+QUF~#d3JQ(0O&p{q2ES(UK1*q4)ef9xN&&nLRkI|6p z#AvaAd0cGV8YS%(Ch^$lIb(94iTjT5BqN?B~qKnxvd_-eR^Mvy9bC#J?mcO8EG zp~H_}#k0|?rTo-&Ucl_DJmd1Jkgs2d1It^y-Zfn)9L^t=%g3gVK6d!wQPm1Fs5E=} zBLCRW!n{YfKyhvijeO+rVK&Ge>U8ArzJqW8QFcq?S5XXpBO9kE!`j!~sm2&9i)VY` zCZyr+J;+tHGG7m$kOcb6CiVWyDn%V@D7{3XMPe zz%Rd0#OhX@y6%nNif}|*m9LMP^o1n4W`MpMkJ6TdzzOZxlT=%G33FMI9Uz1H^KrJ|_niX%2-IZNAj- za%jOZD&Fd3pnnvQXZrM;QHrPDf>L($iWneHb7fGkN3m#B_k5YCgMw^_=*tZ%k7YH# z5#^O441-q?FfZmW_uASqpuKcnj0M#@GN)fsz8@T~!Pmwt);zpGn~7{FNg++vs9>Oo zog`+RoY{<2hE-FCAEi;F^^kX>6r#B@ipFRv=>Rdp4E*h=_XhwMlfsCA)5E9;N+oRd>CLmCNBx%A*6o9k`^8#&+n67pr` zZNlX9eictGV21qZ?d}F9)&tydtDuVgnZ9g)mWMXGGeq##Wk<4GLDN5--;x=?|K99q zwigKFMoGIBM>F*me0S{u%GL|c?f&iP{5If}Dosbnjm$1yXXy)!JiGC?Pq=r3Cy$Bt z;gBYeeRgCHfX@5gjM$Z8>7vuAWA-!3yOkW*502qaxgLaBB|Hn#?8iPmi-neP{dc@a zr%<68HxhCUoT?P`cVnmVaVxkzf)xx%^pOR=-eYLeqyjwEWbBh)t>YZP#dl@2$68quFn}Mx$Sn6}d+HbksE(W|ul%4VYgYoW!O9(GRtu0KndO zidaDZyJAn3f3)CaxaOeJ;w1Zzh^Hs)H)vM!ag)$L8a4?uA-IA(MG(gDkA~yD+#k5J zf3&{9KiVexNBe^EkA~yun<++WzTk!Jfjjy~BTxnd7W0SphJUXxwAvT4wJ$c6=!@bXjhSHA07u>thW^n|BH-)Trvl#3 zgV4k9Dj=H&7&C~d{iA&`TlaA1!J8V?(di)F7VQ zYStM3(N;EVKHJhi+V^#}H%E?!{?Xc>BOiG%a@odWsY~63ojXkN%g6wKiBB{zizR8=IsYZO!zj3 zf4lzG5>4(1Ma5vxh`HV*sZS$lJb)nZnx?AJ>^n_?sLwJHkvQd#p5UWH;C754sq-9z z=NY`fK-L0o55^7j&hZ(MHL<>PxQ;19zKgtig#k^p9U73{ZO{c5i2%AV`@9*z2%VaJ 
zipbetAT6Dmbpd(ekGM4xTf7vvX0XrW&#JmL%ci(B^Yg=BBifhCcYVTQfSiR(dlu;>WU7qXT^czRf7x$2^*i(teK$Z{X4FK}O3gq zH+eMkDMP+Tv)xgTW?Qo`mI^(ZA#$>Jsozu{&5$ez$n9?kap%8PD;m`AggZII^C%&+OKCXZ%5B|}5AHfsCx(1FDbSnJwmI%>DF_)V#* zo*poJM5l)?+>$+-?USJpcr=5}jRr@8t0a$RE!!#X(F`r1J({5lkJo;1sUFRKX%#)1 z^>>E7pnMZ0kGbd5J(~4**81=4(Tw9zyCZ^SP^jd?>c{YC2E(4#M8n-#Jf-38EKW4s zoyCcUyR$gaaCa6b8t%^GM8n-#oM^Z^ixUlZXK|w8?kr9;+?~aVhP$&k(QtPbCmQa~ z;zYyUS)6FNJBt$ycV}^;;qEL>Jef93PNfl_N+TNX&cZyS9?d5G^esG^sRbnPX!djv zJ-Ax&0-`*c$pZ3+OZ8~BVU0YRonPPHV)C8o9?j0LubrnyGkI;b>Cr6gC*?8m8NbO1 z=*5YKN3%HbxwMqerx9OBBfgkMd?}4+cr=Tb*6?T+CmJ5j;zYxvS)6EiG>a1rk7jYA z;n6HkG(4KciH1kBIMMKE7AG1W&EiDEqgkA2cr=R>4UcAVqT$gjPBc841w;&J?a^!= zT8rt?Z2x{agOuPtY5#tE6}_4rN6t`|B{Km?mI@|laE>RRdY0we>@-sPcovJXOy6c= zI3UKJVkLv4?5EE7?#oV{p&v1tk;r4iS((t{PxSH3z${;9v-72npG)D$#^EXb74>8j_Se>Z} zW`;1P_DV(!#vkd4jKm81osn2eoe#NFXF{0v77LvTVaonF)s&ka`4Dv^_2; z%LWyfwHWx7=Opp28QF(6Wlff6X)ep|TvwfE>=PSdc6s^SS(@Xr_VXZI7Pe>`?#tR| zZMIfqw#O%BnpcLEWtTXKS0_7K*oO@lshu(HS0^twA+NP*#H$H8twDuePspYX-vR9p zhNKMiU|5|lc~^@XtPXeL*YUxX?ux1XrnzqLPJ5ZHPF)i7Tf>QI?s7%zw*~{5M{L55 z3Pp!C$ciS5Oa>DQ9juU6}DD=RF!yjap4 zrnNuuxjhy#Rvo)K?Xe>1QadH_P`Wz_mVwR1SGGv0Yi!C#p?J62(xlEVXLNs<-Kt2F zI*+xGCKFpLB5$p-Ae55bTD-J$c5jI^sq=aZq)DCAT1%5Un>#geKOw3_bXXjc; zle&poNs}p#np8!8OI*qQAf?T142VQKp*fnEzKJXfQwnQ! 
zoW`9xCV0`q$uN8AtKm^fw)&Uvkh zk2vw41~V@l8cTB*2X|QCVPQ)v#a&!85jz$Nia~e0h%J4*0&;mghL`nxA}m zj^lpZPrf|Iaev|`|H^h8_fubyj^o@rJC4KL{H2vRjx$}oWiy5kx@3=zt$fhMS>1hJ zk{uzUR#NZz?_;CbZ1}}2I-?2h3a#|raMgx3gy~V!d)OpIc zJ<54SIv0D}qukpb<)R+tXn%wa2yuqVsXG_*Mqr#^Bo-{A~u`!Qk&8Xs{~s zj@KE81G?`d`MXHwOSwhX$|)LaGE0wf3HeJ5=kJl(_cHkV489M+-G--eK>F^_)C)wM zpgq@(ci2y4ghXLnazNo`Xko^ILi4|?+ERO&`-`!Gx%8qciRx-`qDO9`~5saOrRU?(B6bQ zwBeqQY#pSRy}%tBB3N7SzYqUU+lc*l@dS$&fV}Hjt9~f6@HzU+nY{AP<;x0|bq(h* zv-Fh>fDVUEaX<)44;)hB94U}BoE^nyAV!S)uGl@ks9@07#qQVRUOZqU6JEd#KQigC zrZ>P%H=HX09_+*wPOJhf!N(+&Cv9BAX%cAs_=J_=4G5TVHF(fh{NSi&7zZ-3Kwg7d z*CRFPwHAJ;1E)$SuN;*M$`>NYpZ=8ahaZOMQ(mIvD-ik_&&byPCtRGn9c--4Y%K5T z46pm8;kr9!a9SCjany$g@Ocb;-GK#02CBe))>nShmNV&1dp{5CPF@*i)8fMn=$?Xn zL19P^@xF|cuXw1sNw*E0_~VrF5esPD!d`ynVe-oT7JW9su{sesM z-{P^@L7o8K6!5~R8L;ZVC8v`{6CEq*Kpu%*c?X`Q7ok`OVy?lx|3;aAoNJ(8ko`gLhqIRDJKS{b zYn7|sgz}CGrtFpzOJ2Z{CA1X1s~%THDor?Jm;i@br9Hc`d$4 zhtK}9D$eukkRLVU3Faa$IfZv?)^=suu9Qo+R19(NS#H>Y@!Gf?iF&SFoo|TO>@z4+ zqwd1x_fv4C-H>dn`C|QM(Q}n^P^8Y6^QBuJd_(t2>SI+p*=4;^z+?7Pc^dY??XtK~ z7H``&?7HX#n{k4sk@~Sph&nn|nx{k2(POq-M8rBD(yb_*KWh@aa5EnsH#k(P!2dVm zGIg;G*V*bh6A8)oZh_kzl0;)RQ~iX8!tA~It8;L+=u@OO$ydiv?WJpMcwRnc)1Nb& zd%B4Bu|br+6}QZGaon}{=gO7H2_3Io9Y>^b`{JyN#^pG`Hxe*H*iecG{F$(0z|-$I z)VoBlM5ta>I!rJ#Pd>Du%T{+A^Y!vE)iw1zI?hu!9_3-bVt$pq_hY+s*SV)I*oV&A za2PLxvR6HL2@mw;TzL7aepCt{)xyhD;bqCLRlF0^rTK;4-ih+GH{p8aV&i(TJaM&H z&e6+xbpk^P6QYnm%4ZYRTa8(?b*sqLV%0_M-HB%~M&_?h=tjxuM#=f~92|U4%vB2W zXwe{2p_rd3S3I~gM?n@E_54&L$B6;A*3Wtn|6$M?DV`sx5HmcWx z$PN^0V9qFa=tDSIOm4ncX-{Eh_!il-j#@K=UjeDVTw%YcjkEYs^?*aqJJ+wpV;qL6VFB4^SuDq zF!)g*1(dEsRQo8o3Y3)g}Fo{j$x-~uuWU(oVxE;x_C>kslC!dEjjNOkmz#$?)OFvUet zFtHk-rKJqwO&AUhENlF*Dyn{tCrW(wz8+U=KG4|l;jmP9b+?LhWW_Runb(6t9OoLp zP^~}tFq&LCs@w9|`nu7^hH&kiWN?bXQw*TJDd+H~E9L3pOvJN1Sq39eeE?Ckek$_f zc7CQ*xf(iJR^}$B$&>lWanADm9E<8H$4Uea=HL&$2>X8R>{$llRCQi?MR(YDW0csX zW7T^=@CNbgF141WlJ7gtA@Q5;zvqIfpj^Xqw17j7%cdysys6fU_y~k{x=yu#4>{i? 
znmB8cUdBil{u>a5*ZO4%ftfDrWn|>?^?Lr6%ozU(8Ht-G@nJt*t;)>eLgPtA(Ht^^ zJ;_5m$G_WjFivD8XK-Gr+4n!F<9`eUYs1D)>h>Mb*=_vngvPuX3fCqWtnlxS?DmfB zom9bimk>`=wjtdXTyM+_Z>uoge&Kld5iydt$O$AC8~GMudrIw*yWR9 z&gXeWOu+0zVuox#6j5a?>o7kEqM&iM@lqCu;{Dw>tr z#{DA6zscZB44!B3TMT}i!GB=zI}BcA@VgBDBZJ>#@ShOeWgXJEA7zSYN#^)gi$jIo zbQda31T(1Q5g9mbqm1X!mE7D>yJBCm7D{SYtRP-zgIzJW+r;hxU=!J}f{J?$TUZie z3u~IYRxm!tH_fxS!mb%UN{$^g*)`+W#=H3$bqLRtoq4m4TNP^mN_gO#D8p*oiyvM9 z6?x(NBA?D;vrG1`@Z}Wjuz&~n>p$=kg%7UFS5B}Qg=8S%88P+R_hJ8PKFezL$o|!T zmn!zk{*{}Z!8QrF!K0{TK6JZR3JiX#z#HDuM%a#>rnE+YBEm)hkcFnl{t)&Fv4Mxm zvN3Bv43DySs2Y^bj#^+>PbnfBWv6MQ+G)b4#j&G-kHDbzxbCs8$rIu72lO?&iHZoE z_Gw;lQz#pWI9Gt;Z5Fq?*qc&T|C?mfX~4RMFI$7(CT>7suaH!1Y4nRu(i3btZMMpP zeAl%1hqEhb@2&9Q8n(CE1y!R+k3BTw1wpaA_dsG2_&nUoxK));6a!iT%J`0OTLupR z+T&}T+=?LjTIWXtO1F%C0+s4(cV|RzMMU-`lD8s0Xh^ETCPcnk#1}A&C+FP4`21XP zYIYn*d9W{$*H_`TM44ZyJ8|S#wOT%LCap`;jS0|yk{xr3 zf$S870_2ipM=>01DL_i+hXSOS{C%~_in3`SyX%6ePoDd4lnm==SZloxr;oG-(gAxD z5I17Pp>UYmX%KyT2)zd>LeM=s#a!>h|6cqT>8}^qenX@`ZjiymDQk@sIz&!-EQ6gd zNJM!159vQVVAv%h7ey$fRQ6G@OM<*{fIDpq4=CwzM5Gl+k0P=ukwPSm3l<~|3z7yo zz_1EZqL@3ufM!U2qzi)N!lM)xC$Egt4vQyGL?lty5bCb4v+CbXyD4%e z-Wtsy6(2!MwTCGAnHD9Ua(fgnbqP zU<@1-0RnSBKR`Hw7QEhV)pvB@(`bCO!y4f*-qFONwxdJO5E_Q5Rz=AQJE0-L;tmaP zueCUU$)f&*QiZUQ|Wpz*v)d5?qg#*gqY?pN#?Fd6P7^C4xGrI6L4t#B)TCfQ8 zn~n5eA>iVH%DDD_vX=_@l9}qy+uJ1n`z?OkFIWl|W;4io^2&QNw}+$|(f+nnev5_A zQn2q}lcxF({b}^&C>mzSn#Kxq8%9y89MZHIWze>W&p4u-_{QHhlrEDg>~5PMxk9Ze2rMj6CSB z=;M0~R=0Op1*`@;nzEq2yi=R+E$%eA>I&>s9dicX6h!ZjS&Q4y`|3~F3x$2Umy+5W zAHPuTf4~}%#=h5T4AQy!;=RzBMd{68QNA!n2=ARypt1qADt5w*#BoI50@)o|~ zd>Vfpcdl}S=0&e6xRf?(%H9Z8R%Zg+71}ECbF85mnOH9!6O{7+`5Z=|JynO_pYzRi z0^_sj5jzoa>Iate(Yit8a<(!~`zRV2&}TPL`aPaU=`u=q?(F2#FDlZ@AVKhyV*!LO zQwx>52}2_q@HDlXTL{r(-?#?ai`DDB63al@+^<45n9Gp=(TwxMfqVf7Hn0N=_5ZDPCP4rz}updA?4MXH;Pr*R&f-G-9;mR zwYb2-t1rgw3}Om_F%MP6%x;f2KJ!MeN3aowQb=zvBA7rq4*}8C)|L{3pw8 zQCizz9mQYhO{`Z~LDEA9g-?#}pLh~1XZHH(M?r_-9tNJDfp7jKHhh>bG7>o&rYm*3 zNIOL^*pJNLVHYaA#_!c)C4cj}eP~LIK^N@3@53k6Ve4Y=rv1x7yX2N<8na`&J-4>c 
zzP;BzZtpHKdD#_H8NmsLF_ju}~YqS$mKr&Hc`AAx1EKim49<~=`HizR` zVacK`a3VRfGK6XbN0K^M#QCQXWaBzj45ks3D_3R$-NSfX{H3$0ibQLQok7S}XZ5=;( zw(3-DOymi3A&AznFoHGV0RD0g(mYUR9(w?lo$)M8u3s`Ew~ zPUc>2YOWBOc>bN*GUFYS$b!R$P^V5+8BlIL`lUfITIb#Jl$$GI@RBQM=3TTeBkxJy zvN~+-SFynABT@Xa+&cUqT;DAwPxofKarlU)Pa89Xncj}*T_5iJ`+*p(H@g`oQ1XpD zq=q3BjzrOPO0KhZX1$zz*^(Im|2|yP0Ba+{et0Rem9HK7ZXTcaX9mP%Yb&ldqip-} zm*3y>|BlK%k8th8^8x(uVAN1%Seaa*dKUkmwo%Vd-~w9*1pfii;~FZR5;b^M2KUeq z)jiI}KnAAA6u{Wna_F?C*Qvb%2sJne(*>bM{lToDCdkj1gDnIzknjxRbO!2+<{*4T zK!cX2qTwWp9}T*Iia_xST-QOi;e9tuei$`-a z`Nnm=8|*`e^jln_X$#?}VYvzE3D&vx=pJtrkua$s9G%4!w9lQoaKY!}{4omW{0Rb? zN8~CKk{7`nS_5kKX=SW<%zpPC?~1a=Rja!93>eJ7gw)=<>-EJbf4-SeUq8FB*nH(h z3Bk)wUe7IPV|@FIm1+3)2=^GrIE=)qQkdoVi3l!$e9Q$*BoyBGc%-cJ^7fJHswO&$ zcsMxVqP21~fK|;1`4%-~qxiA>r6pWdfy?G+f+dG9DIA-j_>j-SPbxg* z6V%)p6-TF_XJYvf-CZ^Tp@;&3K!Jqib-aETXb+5AJSR)uNTTiB#lji3Zi}(*qehRF zLX_%^jvn4aYsbpuCuPY%(~knyQs5n#+I^YYf`#8pYzu6+0eylh4~!t%G#?U};I@JZ z<|9pmr7L53WjUvZmXbZR`cKGhq@`GvpqX?zf0mI5830tkq9w<>GPR%0)SgcyJkL!# zLTT*0UIh=Jgi!lcftA|dg`QJtKRQpT{X8tF)PCfm)&4&PO4J>!k{m0^Vm?(e=g$yC zKS52&QL8T^ugw%i0C>r{a4Prs;YS|#_O#*`ii=A4P@kY4<2Y4D3V+F~|H**!Q^Y_D zR|`sXe42zo2G1fGW3ff#i_Q)#09dty_p>tyJ;&L`fTj?l;y^nM3L=~Pb!xD%>Zg8!0k18qcQ9N2o5rYgo^}%9-@jzX|(i!ISciWQ4f;M#;&f9o@%C&J4 zR10fkHzXMvo zV_A~rj`)%oQeY{-8i$wSo0+V}k+G&g!^WD_&Gi}cR>5`J(gI*1769bEo}(Ll@rtGB z1vv6r(gq)~eTr;Chj|P^-Ll#C^bl4=c%u_8aM3Fdod`I+FG_?G6 z>vdVU%(%{dD46pAOUK-W+jI?V+|pgA$8qSQ|9mNP0IR8I4LU>I|D7?EqcA%dv2g7#5LfWcxR`)yb&_}<6nnPScal&P>D;PHC3Im68_AxA@* z$|7MgG78{FR`Z^=T(Sa@3hBqELPjG_kJX6NovYwkIb~PHc`P1F430y>fWUuQdE#ko zFXFbjD9Gy(wgGxFIef7RxHlo$kivoN5f`PoxYOh(J|e zM5wU62?acm#tNlwZw zS-!;wM8_GCuot1cpD8DDQp$vU5>pcs5^rD50K8)Lq{LFjd53o+<`ZTg!3KHrVlCxn zw~CAxFy3*PZ}LLTV1{Awg=mWz_EG=!j`;z@JhU#BZM<|jT{u(0M$J9si8m${2(ZOd zyhAl}^;Atnan6C{qLu_V3%5J4dV(cp9>O=dauho_J0Vs8P~mPTw)pU>2jfj*GKp`d zBDx|Y$^9TsUOCNIKGh9)^XRbZnGCP{4#zL zu+<5{{yLnCj&*QP72OISA{py7odUfdNikrgxa_BZACWKx`Yw`Ui;?1^eu^#86lnQK 
z3iz>!lEd+51@>9bUxg&zE9(8(Q$=vIL?SN|;VV+9EppW^@@-V)8%l4<_3ZJYzr9P*5Q>mA zEOl;Oq*}@x>cuC=6b|zCID-=m>Cd#!On_T6j?%6^*BlUm{5ayAtPx3obD&zPG)_?> zV7FM=LFny!AQ5y!C?CQP<%5yzR^0E9U6w7-;Qi~}Ev^}7NOn2yCkO1UYzG(adfe$R zI#eaKK8$=7FqQ+uxSSGaLbGLW$=JDD;PDY!UCD3(zI z%wP2a{5vN?Z!V`bLpZMy-#(th;pI>cmN5qd@lx@soXv3{_o(waLTSvx0y8s>=U9b0UapWElrIU2 zo7!c{c&Rco14<6=K(x;XQ7j|moW2$n(SI)6^3rrq6)O%)E0CN|7C#U``Kvh(*4&L- zIe0vox~{fR;eRF--0E5pm_ %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) +# sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir +# sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir +# +# call("%s" % header_awk_cmd, logger) +# call("%s" % sed_header, logger) +# call("%s" % sed_header_2, logger) +# +# temp_paste_command = paste_command + " > %s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir +# paste_command = paste_command + " > %s/All_label_final_raw" % args.filter2_only_snp_vcf_dir +# f4.write(paste_command) +# f4.close() +# sort_All_label_cmd = "sort -n -k1,1 %s/All_label_final_raw > %s/All_label_final_sorted.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) +# paste_command_header = "cat %s/header.txt %s/All_label_final_sorted.txt > %s/All_label_final_sorted_header.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) +# +# ls = [] +# for i in vcf_filenames: +# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') +# ls.append(label_file) +# ls.insert(0, "%s/unique_positions_file" % args.filter2_only_snp_vcf_dir) +# +# with open('%s/All_label_final_raw.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: +# outfile.write(paste_command) +# outfile.close() +# +# with open('%s/temp_label_final_raw.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: +# outfile.write(temp_paste_command) +# outfile.close() +# +# call("bash 
%s/All_label_final_raw.sh" % args.filter2_only_snp_vcf_dir, logger) +# call("bash %s/temp_label_final_raw.txt.sh" % args.filter2_only_snp_vcf_dir, logger) +# call("%s" % sort_All_label_cmd, logger) +# call("%s" % paste_command_header, logger) +# +# """ Assign numeric code to each variant filter reason""" +# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/reference_allele/1/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/VARIANT/1TRUE/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL_DP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP_QUAL/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed 
-i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_proximate_SNP/7/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_QUAL_DP/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP_QUAL/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_QUAL/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ/5/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ/6/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir +# call("%s" % remove_unwanted_text, logger) +# +# def generate_paste_command_outgroup(): +# """ +# This Function will take all the *label file and generate/paste it column wise to generate a matrix. These matrix will be used in downstream analysis. 
+# :param: null +# :return: null +# """ +# +# if args.outgroup: +# """ Paste/Generate and sort SNP Filter Label Matrix """ +# paste_file = args.filter2_only_snp_vcf_dir + "/paste_label_files_outgroup.sh" +# f4=open(paste_file, 'w+') +# paste_command = "paste %s/unique_positions_file" % args.filter2_only_snp_vcf_dir +# for i in vcf_filenames: +# if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: +# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') +# paste_command = paste_command + " " + label_file +# +# +# """Exclude outgroup sample name in header +# +# header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) +# sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir +# sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir +# +# """ +# +# header_awk_cmd = "grep -v \'%s\' %s | awk \'{ORS=\"\t\";}{print $1}\' > %s/header_outgroup.txt" % (outgroup, args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) +# sed_header = "sed -i \'s/^/\t/\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir +# sed_header_2 = "sed -i -e \'$a\\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir +# +# call("%s" % header_awk_cmd, logger) +# call("%s" % sed_header, logger) +# call("%s" % sed_header_2, logger) +# +# temp_paste_command = paste_command + " > %s/temp_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir +# paste_command = paste_command + " > %s/All_label_final_raw_outgroup" % args.filter2_only_snp_vcf_dir +# f4.write(paste_command) +# f4.close() +# sort_All_label_cmd = "sort -n -k1,1 %s/All_label_final_raw_outgroup > %s/All_label_final_sorted_outgroup.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) +# paste_command_header = "cat %s/header_outgroup.txt %s/All_label_final_sorted_outgroup.txt > 
%s/All_label_final_sorted_header_outgroup.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) +# +# ls = [] +# for i in vcf_filenames: +# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') +# ls.append(label_file) +# ls.insert(0, "%s/unique_positions_file" % args.filter2_only_snp_vcf_dir) +# +# with open('%s/All_label_final_raw_outgroup.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: +# outfile.write(paste_command) +# outfile.close() +# +# with open('%s/temp_label_final_raw_outgroup.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: +# outfile.write(temp_paste_command) +# outfile.close() +# call("bash %s/All_label_final_raw_outgroup.sh" % args.filter2_only_snp_vcf_dir, logger) +# call("bash %s/temp_label_final_raw_outgroup.txt.sh" % args.filter2_only_snp_vcf_dir, logger) +# +# +# """ +# remove this lines +# #subprocess.call(["%s" % paste_command], shell=True) +# #subprocess.call(["%s" % temp_paste_command], shell=True) +# #subprocess.check_call('%s' % paste_command) +# #subprocess.check_call('%s' % temp_paste_command) +# #os.system(paste_command) change +# #os.system(temp_paste_command) change +# """ +# +# call("%s" % sort_All_label_cmd, logger) +# call("%s" % paste_command_header, logger) +# +# """ Assign numeric code to each variant filter reason""" +# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/reference_allele/1/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/VARIANT/1TRUE/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], 
shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL_DP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP_QUAL/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_proximate_SNP/7/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 
's/HighFQ_QUAL_DP/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP_QUAL/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_QUAL/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ/5/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ/6/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir +# call("%s" % remove_unwanted_text, logger) +# +# else: +# print "Skip generating seperate intermediate files for outgroup" +# +# def generate_indel_paste_command(): +# """ +# This Function will take all the *label file and generate/paste it column wise to generate a matrix. These matrix will be used in downstream analysis. 
+# :param: null +# :return: null +# """ +# +# """ Paste/Generate and sort SNP Filter Label Matrix """ +# paste_file = args.filter2_only_snp_vcf_dir + "/paste_indel_label_files.sh" +# f4=open(paste_file, 'w+') +# paste_command = "paste %s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir +# for i in vcf_filenames: +# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf_indel_positions_label') +# paste_command = paste_command + " " + label_file +# header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) +# sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir +# sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir +# +# #os.system(header_awk_cmd) +# #os.system(sed_header) +# #os.system(sed_header_2) +# +# call("%s" % header_awk_cmd, logger) +# call("%s" % sed_header, logger) +# call("%s" % sed_header_2, logger) +# +# +# +# temp_paste_command = paste_command + " > %s/temp_indel_label_final_raw.txt" % args.filter2_only_snp_vcf_dir +# paste_command = paste_command + " > %s/All_indel_label_final_raw" % args.filter2_only_snp_vcf_dir +# f4.write(paste_command) +# f4.close() +# +# call("bash %s" % paste_file, logger) +# +# sort_All_label_cmd = "sort -n -k1,1 %s/All_indel_label_final_raw > %s/All_indel_label_final_sorted.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) +# paste_command_header = "cat %s/header.txt %s/All_indel_label_final_sorted.txt > %s/All_indel_label_final_sorted_header.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) +# +# ls = [] +# for i in vcf_filenames: +# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf_indel_positions_label') +# ls.append(label_file) +# ls.insert(0, "%s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir) +# +# with 
open('%s/All_indel_label_final_raw.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: +# outfile2.write(paste_command) +# outfile2.close() +# +# with open('%s/temp_indel_label_final_raw.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: +# outfile2.write(temp_paste_command) +# outfile2.close() +# +# # Why is this not working? +# call("bash %s/All_indel_label_final_raw.sh" % args.filter2_only_snp_vcf_dir, logger) +# call("bash %s/temp_indel_label_final_raw.txt.sh" % args.filter2_only_snp_vcf_dir, logger) +# keep_logging('Finished pasting...DONE', 'Finished pasting...DONE', logger, 'info') +# +# """ +# remove this lines +# #subprocess.call(["%s" % paste_command], shell=True) +# #subprocess.call(["%s" % temp_paste_command], shell=True) +# #subprocess.check_call('%s' % paste_command) +# #subprocess.check_call('%s' % temp_paste_command) +# #os.system(paste_command) change +# #os.system(temp_paste_command) change +# """ +# +# call("%s" % sort_All_label_cmd, logger) +# call("%s" % paste_command_header, logger) +# +# """ Assign numeric code to each variant filter reason""" +# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/reference_allele/1/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/VARIANT/1TRUE/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# 
subprocess.call(["sed -i 's/LowAF_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL_DP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP_QUAL/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_proximate_SNP/7/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL_DP/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP_QUAL/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL/3/g' %s/All_indel_label_final_sorted_header.txt" % 
args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF/5/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF/6/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir +# call("%s" % remove_unwanted_text, logger) +# +# def generate_indel_paste_command_outgroup(): +# """ +# This Function will take all the *label file and generate/paste it column wise to generate a matrix. These matrix will be used in downstream analysis. +# :param: null +# :return: null +# """ +# +# if args.outgroup: +# """ Paste/Generate and sort SNP Filter Label Matrix """ +# # define a file name where the paste commands will be saved. 
+# paste_file = args.filter2_only_snp_vcf_dir + "/paste_indel_label_files_outgroup.sh" +# f4=open(paste_file, 'w+') +# +# # initiate paste command string +# paste_command = "paste %s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir +# +# +# # Generate paste command +# for i in vcf_filenames: +# if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: +# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf_indel_positions_label') +# paste_command = paste_command + " " + label_file +# # Change header awk command to exclude outgroup +# #header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) +# header_awk_cmd = "grep -v \'%s\' %s | awk \'{ORS=\"\t\";}{print $1}\' > %s/header_outgroup.txt" % (outgroup, args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) +# sed_header = "sed -i \'s/^/\t/\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir +# sed_header_2 = "sed -i -e \'$a\\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir +# +# +# +# call("%s" % header_awk_cmd, logger) +# call("%s" % sed_header, logger) +# call("%s" % sed_header_2, logger) +# +# +# +# temp_paste_command = paste_command + " > %s/temp_indel_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir +# paste_command = paste_command + " > %s/All_indel_label_final_raw_outgroup" % args.filter2_only_snp_vcf_dir +# f4.write(paste_command) +# f4.close() +# +# call("bash %s" % paste_file, logger) +# +# sort_All_label_cmd = "sort -n -k1,1 %s/All_indel_label_final_raw_outgroup > %s/All_indel_label_final_sorted_outgroup.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) +# paste_command_header = "cat %s/header_outgroup.txt %s/All_indel_label_final_sorted_outgroup.txt > %s/All_indel_label_final_sorted_header_outgroup.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) +# +# 
ls = [] +# for i in vcf_filenames: +# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf_indel_positions_label') +# ls.append(label_file) +# ls.insert(0, "%s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir) +# +# with open('%s/All_indel_label_final_raw_outgroup.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: +# outfile2.write(paste_command) +# outfile2.close() +# +# with open('%s/temp_indel_label_final_raw_outgroup.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: +# outfile2.write(temp_paste_command) +# outfile2.close() +# +# # Why is this not working? +# call("bash %s/All_indel_label_final_raw_outgroup.sh" % args.filter2_only_snp_vcf_dir, logger) +# call("bash %s/temp_indel_label_final_raw_outgroup.txt.sh" % args.filter2_only_snp_vcf_dir, logger) +# keep_logging('Finished pasting...DONE', 'Finished pasting...DONE', logger, 'info') +# +# """ +# remove this lines +# #subprocess.call(["%s" % paste_command], shell=True) +# #subprocess.call(["%s" % temp_paste_command], shell=True) +# #subprocess.check_call('%s' % paste_command) +# #subprocess.check_call('%s' % temp_paste_command) +# #os.system(paste_command) change +# #os.system(temp_paste_command) change +# """ +# +# call("%s" % sort_All_label_cmd, logger) +# call("%s" % paste_command_header, logger) +# +# """ Assign numeric code to each variant filter reason""" +# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/reference_allele/1/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/VARIANT/1TRUE/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % 
args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL_DP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP_QUAL/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_proximate_SNP/7/g' 
%s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL_DP/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP_QUAL/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF/5/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF/6/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir +# call("%s" % remove_unwanted_text, logger) +# else: +# print "Skip generating seperate intermediate files for outgroup" +# +# def generate_position_label_data_matrix(): +# +# """ +# Generate different list of Positions using the matrix All_label_final_sorted_header.txt. +# +# (Defining Core Variant Position: Variant Position which was not filtered out in any of the other samples due to variant filter parameter and also this position was present in all the samples(not unmapped)). +# +# Filtered Position label matrix: +# List of non-core positions. These positions didn't make it to the final core list because it was filtered out in one of the samples. +# +# Only_ref_variant_positions_for_closely_matrix.txt : +# Those Positions where the variant was either reference allele or a variant that passed all the variant filter parameters. 
+# +# :param: null +# :return: null +# +# """ +# def generate_position_label_data_matrix_All_label(): +# position_label = OrderedDict() +# f1 = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'w+') +# f2 = open("%s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# f3 = open("%s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# f4 = open( +# "%s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir, +# 'w+') +# if args.outgroup: +# with open("%s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: +# keep_logging( +# 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, +# 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, +# logger, 'info') +# csv_reader = csv.reader(csv_file, delimiter='\t') +# next(csv_reader, None) +# for row in csv_reader: +# position_label[row[0]] = row[1:] +# keep_logging('Generating different list of Positions and heatmap data matrix... \n', +# 'Generating different list of Positions and heatmap data matrix... 
\n', logger, 'info') +# print_string_header = "\t" +# for i in vcf_filenames: +# print_string_header = print_string_header + os.path.basename(i) + "\t" +# f2.write('\t' + print_string_header.strip() + '\n') +# f3.write('\t' + print_string_header.strip() + '\n') +# f4.write('\t' + print_string_header.strip() + '\n') +# for value in position_label: +# lll = ['0', '2', '3', '4', '5', '6', '7'] +# ref_var = ['1', '1TRUE'] +# if set(ref_var) & set(position_label[value]): +# if set(lll) & set(position_label[value]): +# if int(value) not in outgroup_specific_positions: +# print_string = "" +# for i in position_label[value]: +# print_string = print_string + "\t" + i +# STRR2 = value + print_string + "\n" +# f3.write(STRR2) +# if position_label[value].count('1TRUE') >= 2: +# f4.write('1\n') +# else: +# f4.write('0\n') +# else: +# if int(value) not in outgroup_specific_positions: +# strr = value + "\n" +# f1.write(strr) +# STRR3 = value + "\t" + str(position_label[value]) + "\n" +# f2.write(STRR3) +# csv_file.close() +# f1.close() +# f2.close() +# f3.close() +# f4.close() +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/1TRUE/-1/g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# +# else: +# with open("%s/All_label_final_sorted_header.txt" % 
args.filter2_only_snp_vcf_dir, 'rU') as csv_file: +# keep_logging( +# 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, +# 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, +# logger, 'info') +# csv_reader = csv.reader(csv_file, delimiter='\t') +# next(csv_reader, None) +# for row in csv_reader: +# position_label[row[0]] = row[1:] +# keep_logging('Generating different list of Positions and heatmap data matrix... \n', +# 'Generating different list of Positions and heatmap data matrix... \n', logger, 'info') +# print_string_header = "\t" +# for i in vcf_filenames: +# print_string_header = print_string_header + os.path.basename(i) + "\t" +# f2.write('\t' + print_string_header.strip() + '\n') +# f3.write('\t' + print_string_header.strip() + '\n') +# f4.write('\t' + print_string_header.strip() + '\n') +# for value in position_label: +# lll = ['0', '2', '3', '4', '5', '6', '7'] +# ref_var = ['1', '1TRUE'] +# if set(ref_var) & set(position_label[value]): +# if set(lll) & set(position_label[value]): +# +# print_string = "" +# for i in position_label[value]: +# print_string = print_string + "\t" + i +# STRR2 = value + print_string + "\n" +# f3.write(STRR2) +# if position_label[value].count('1TRUE') >= 2: +# f4.write('1\n') +# else: +# f4.write('0\n') +# else: +# +# strr = value + "\n" +# f1.write(strr) +# STRR3 = value + "\t" + str(position_label[value]) + "\n" +# f2.write(STRR3) +# csv_file.close() +# f1.close() +# f2.close() +# f3.close() +# f4.close() +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir], +# shell=True) +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], +# shell=True) +# subprocess.call(["sed -i 
's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], +# shell=True) +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], +# shell=True) +# subprocess.call(["sed -i 's/1TRUE/-1/g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], +# shell=True) +# +# def temp_generate_position_label_data_matrix_All_label(): +# +# """ +# Read temp_label_final_raw.txt SNP position label data matrix for generating barplot statistics. +# """ +# temp_position_label = OrderedDict() +# f33=open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# print_string_header = "\t" +# +# if args.outgroup: +# for i in vcf_filenames: +# if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: +# print_string_header = print_string_header + os.path.basename(i) + "\t" +# else: +# for i in vcf_filenames: +# print_string_header = print_string_header + os.path.basename(i) + "\t" +# +# f33.write('\t' + print_string_header.strip() + '\n') +# keep_logging('Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, 'Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, logger, 'info') +# lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] +# ref_var = ['reference_allele', 'VARIANT'] +# +# if 
args.outgroup: +# print "here" +# with open("%s/temp_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: +# csv_reader = csv.reader(csv_file, delimiter='\t') +# next(csv_reader, None) +# for row in csv_reader: +# if set(ref_var) & set(row[1:]): +# if set(lll) & set(row[1:]): +# if int(row[0]) not in outgroup_specific_positions: +# +# print_string = "" +# for i in row[1:]: +# print_string = print_string + "\t" + i +# STRR2 = row[0] + print_string + "\n" +# f33.write(STRR2) +# csv_file.close() +# f33.close() +# +# else: +# with open("%s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: +# csv_reader = csv.reader(csv_file, delimiter='\t') +# next(csv_reader, None) +# for row in csv_reader: +# if set(ref_var) & set(row[1:]): +# if set(lll) & set(row[1:]): +# +# print_string = "" +# for i in row[1:]: +# print_string = print_string + "\t" + i +# STRR2 = row[0] + print_string + "\n" +# f33.write(STRR2) +# csv_file.close() +# f33.close() +# """ +# Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of FQ +# """ +# temp_position_label_FQ = OrderedDict() +# f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: +# keep_logging('Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, logger, 'info') +# csv_reader = csv.reader(csv_file, delimiter='\t') +# next(csv_reader, None) +# +# for row in csv_reader: +# temp_position_label_FQ[row[0]] = row[1:] +# print_string_header = "\t" +# for i in vcf_filenames: +# print_string_header = 
print_string_header + os.path.basename(i) + "\t" +# f44.write('\t' + print_string_header.strip() + '\n') +# for value in temp_position_label_FQ: +# lll = ['LowFQ'] +# if set(lll) & set(temp_position_label_FQ[value]): +# +# print_string = "" +# for i in temp_position_label_FQ[value]: +# print_string = print_string + "\t" + i +# STRR2 = value + print_string + "\n" +# f44.write(STRR2) +# f44.close() +# csv_file.close() +# f44.close() +# +# """ +# Perform Sed on temp files. Find a faster way to do this. +# """ +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# 
subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# +# +# """ +# Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp +# """ +# temp_position_label_DP = OrderedDict() +# f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: +# keep_logging('Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, logger, 'info') +# csv_reader = csv.reader(csv_file, delimiter='\t') +# next(csv_reader, None) +# for row in csv_reader: +# temp_position_label_DP[row[0]] = row[1:] +# print_string_header = "\t" +# for i in vcf_filenames: +# print_string_header = print_string_header + os.path.basename(i) + "\t" +# f44.write('\t' + print_string_header.strip() + '\n') +# for value in temp_position_label_DP: +# lll = ['HighFQ_DP'] +# ref_var = ['reference_allele', 'VARIANT'] +# if set(lll) & set(temp_position_label_FQ[value]): +# +# print_string = "" +# for i in temp_position_label_FQ[value]: +# print_string = print_string + "\t" + i +# STRR2 = value + print_string + "\n" +# f44.write(STRR2) +# f44.close() +# csv_file.close() +# +# """ +# Perform Sed on temp files. Find a faster way to do this. 
+# """ +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ_DP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# +# def barplot_stats(): +# keep_logging('\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n', 
'\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n', logger, 'info') +# """ +# Read each Sample columns and calculate the percentage of each label to generate barplot statistics. +# This will give a visual explanation of how many positions in each samples were filtered out because of different reason +# """ +# +# c_reader = csv.reader(open('%s/temp_Only_filtered_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') +# columns = list(zip(*c_reader)) +# keep_logging('Finished reading columns...', 'Finished reading columns...', logger, 'info') +# counts = 1 +# +# if args.outgroup: +# end = len(vcf_filenames) + 1 +# end = end - 1 +# else: +# end = len(vcf_filenames) + 1 +# +# f_bar_count = open("%s/bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# f_bar_perc = open("%s/bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# f_bar_count.write("Sample\tunmapped_positions\treference_allele\ttrue_variant\tOnly_low_FQ\tOnly_DP\tOnly_low_MQ\tother\n") +# f_bar_perc.write("Sample\tunmapped_positions_perc\ttrue_variant_perc\tOnly_low_FQ_perc\tOnly_DP_perc\tOnly_low_MQ_perc\tother_perc\n") +# +# for i in xrange(1, end, 1): +# """ Bar Count Statistics: Variant Position Count Statistics """ +# true_variant = columns[i].count('VARIANT') +# unmapped_positions = columns[i].count('reference_unmapped_position') +# reference_allele = columns[i].count('reference_allele') +# Only_low_FQ = columns[i].count('LowFQ') +# Only_DP = columns[i].count('HighFQ_DP') +# Only_low_MQ = columns[i].count('HighFQ') +# low_FQ_other_parameters = columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count('LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + 
columns[i].count('LowFQ_DP') +# high_FQ_other_parameters = columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count('HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL') +# other = low_FQ_other_parameters + high_FQ_other_parameters +# +# total = true_variant + unmapped_positions + reference_allele + Only_low_FQ + Only_DP + low_FQ_other_parameters + high_FQ_other_parameters + Only_low_MQ +# +# filename_count = i - 1 +# +# if args.outgroup: +# bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions, reference_allele, true_variant, Only_low_FQ, Only_DP, Only_low_MQ, other) +# f_bar_count.write(bar_string) +# else: +# bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( +# vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), +# unmapped_positions, reference_allele, true_variant, +# Only_low_FQ, Only_DP, Only_low_MQ, other) +# #f_bar_count.write(bar_string) +# """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ +# try: +# true_variant_perc = float((columns[i].count('VARIANT') * 100) / total) +# except ZeroDivisionError: +# true_variant_perc = 0 +# try: +# unmapped_positions_perc = float((columns[i].count('reference_unmapped_position') * 100) / total) +# except ZeroDivisionError: +# unmapped_positions_perc = 0 +# try: +# reference_allele_perc = float((columns[i].count('reference_allele') * 100) / total) +# except ZeroDivisionError: +# reference_allele_perc = 0 +# try: +# Only_low_FQ_perc = float((columns[i].count('LowFQ') * 100) / total) +# except ZeroDivisionError: +# Only_low_FQ_perc = 0 +# try: +# Only_DP_perc = 
float((columns[i].count('HighFQ_DP') * 100) / total) +# except ZeroDivisionError: +# Only_DP_perc = 0 +# try: +# Only_low_MQ_perc = float((columns[i].count('HighFQ') * 100) / total) +# except ZeroDivisionError: +# Only_low_MQ_perc = 0 +# try: +# low_FQ_other_parameters_perc = float(((columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count('LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + columns[i].count('LowFQ_DP')) * 100) / total) +# except ZeroDivisionError: +# low_FQ_other_parameters_perc = 0 +# try: +# high_FQ_other_parameters_perc = float(((columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count('HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL')) * 100) / total) +# except ZeroDivisionError: +# high_FQ_other_parameters_perc = 0 +# +# other_perc = float(low_FQ_other_parameters_perc + high_FQ_other_parameters_perc) +# if args.outgroup: +# bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions_perc, true_variant_perc, Only_low_FQ_perc, Only_DP_perc, Only_low_MQ_perc, other_perc) +# else: +# bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( +# vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), +# unmapped_positions_perc, reference_allele_perc, true_variant_perc, +# Only_low_FQ_perc, Only_DP_perc, Only_low_MQ_perc, other_perc) +# f_bar_count.write(bar_string) +# f_bar_perc.write(bar_perc_string) +# f_bar_count.close() 
+# f_bar_perc.close() +# bargraph_R_script = "library(ggplot2)\nlibrary(reshape)\nx1 <- read.table(\"bargraph_percentage.txt\", header=TRUE)\nx1$Sample <- reorder(x1$Sample, rowSums(x1[-1]))\nmdf1=melt(x1,id.vars=\"Sample\")\npdf(\"%s/%s_barplot.pdf\", width = 30, height = 30)\nggplot(mdf1, aes(Sample, value, fill=variable)) + geom_bar(stat=\"identity\") + ylab(\"Percentage of Filtered Positions\") + xlab(\"Samples\") + theme(text = element_text(size=9)) + scale_fill_manual(name=\"Reason for filtered out positions\", values=c(\"#08306b\", \"black\", \"orange\", \"darkgrey\", \"#fdd0a2\", \"#7f2704\")) + ggtitle(\"Title Here\") + ylim(0, 100) + theme(text = element_text(size=10), panel.background = element_rect(fill = 'white', colour = 'white'), plot.title = element_text(size=20, face=\"bold\", margin = margin(10, 0, 10, 0)), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_text(colour = \"black\", face= \"bold.italic\", angle = 90)) + theme(legend.position = c(0.6, 0.7), legend.direction = \"horizontal\")\ndev.off()" % (args.filter2_only_snp_vcf_dir, os.path.basename(os.path.normpath(args.results_dir))) +# barplot_R_file = open("%s/bargraph.R" % args.filter2_only_snp_vcf_dir, 'w+') +# barplot_R_file.write(bargraph_R_script) +# keep_logging('Run this R script to generate bargraph plot: %s/bargraph.R' % args.filter2_only_snp_vcf_dir, 'Run this R script to generate bargraph plot: %s/bargraph.R' % args.filter2_only_snp_vcf_dir, logger, 'info') +# +# """ Methods Steps""" +# keep_logging('Running: Generating data matrices...', 'Running: Generating data matrices...', logger, 'info') +# generate_position_label_data_matrix_All_label() +# keep_logging('Running: Changing variables in data matrices to codes for faster processing...', 'Running: Changing variables in data matrices to codes for faster processing...', logger, 'info') +# temp_generate_position_label_data_matrix_All_label() +# keep_logging('Running: Generating Barplot statistics 
data matrices...', 'Running: Generating Barplot statistics data matrices...', logger, 'info') +# barplot_stats() +# +# def generate_indel_position_label_data_matrix(): +# +# """ +# Generate different list of Positions using the matrix All_label_final_sorted_header.txt. +# +# (Defining Core Variant Position: Variant Position which was not filtered out in any of the other samples due to variant filter parameter and also this position was present in all the samples(not unmapped)). +# +# Filtered Position label matrix: +# List of non-core positions. These positions didn't make it to the final core list because it was filtered out in one of the samples. +# +# Only_ref_variant_positions_for_closely_matrix.txt : +# Those Positions where the variant was either reference allele or a variant that passed all the variant filter parameters. +# +# :param: null +# :return: null +# +# """ +# def generate_indel_position_label_data_matrix_All_label(): +# position_label = OrderedDict() +# print "Generating Only_ref_indel_positions_for_closely" +# f1=open("%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'w+') +# f2=open("%s/Only_ref_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# f3=open("%s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# f4=open("%s/Only_filtered_indel_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# +# if args.outgroup: +# with open("%s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: +# keep_logging( +# 'Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, +# 'Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, +# logger, 'info') +# csv_reader = csv.reader(csv_file, delimiter='\t') +# next(csv_reader, None) +# for row in csv_reader: +# 
position_label[row[0]] = row[1:] +# keep_logging('Generating different list of Positions and heatmap data matrix...', +# 'Generating different list of Positions and heatmap data matrix...', logger, 'info') +# print_string_header = "\t" +# for i in vcf_filenames: +# print_string_header = print_string_header + os.path.basename(i) + "\t" +# # f.write('\t' + print_string_header.strip() + '\n') +# f2.write('\t' + print_string_header.strip() + '\n') +# f3.write('\t' + print_string_header.strip() + '\n') +# f4.write('\t' + print_string_header.strip() + '\n') +# for value in position_label: +# lll = ['0', '2', '3', '4', '5', '6', '7'] +# ref_var = ['1', '1TRUE'] +# if set(ref_var) & set(position_label[value]): +# if set(lll) & set(position_label[value]): +# if int(value) not in outgroup_indel_specific_positions: +# print_string = "" +# for i in position_label[value]: +# print_string = print_string + "\t" + i +# STRR2 = value + print_string + "\n" +# f3.write(STRR2) +# if position_label[value].count('1TRUE') >= 2: +# f4.write('1\n') +# else: +# f4.write('0\n') +# else: +# if int(value) not in outgroup_indel_specific_positions: +# strr = value + "\n" +# f1.write(strr) +# STRR3 = value + "\t" + str(position_label[value]) + "\n" +# f2.write(STRR3) +# csv_file.close() +# f1.close() +# f2.close() +# f3.close() +# f4.close() +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir], +# shell=True) +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], +# shell=True) +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], +# shell=True) +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' 
%s/Only_filtered_indel_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], +# shell=True) +# subprocess.call(["sed -i 's/1TRUE/-1/g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], +# shell=True) +# else: +# with open("%s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: +# keep_logging('Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, 'Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, logger, 'info') +# csv_reader = csv.reader(csv_file, delimiter='\t') +# next(csv_reader, None) +# for row in csv_reader: +# position_label[row[0]] = row[1:] +# keep_logging('Generating different list of Positions and heatmap data matrix...', 'Generating different list of Positions and heatmap data matrix...', logger, 'info') +# print_string_header = "\t" +# for i in vcf_filenames: +# print_string_header = print_string_header + os.path.basename(i) + "\t" +# #f.write('\t' + print_string_header.strip() + '\n') +# f2.write('\t' + print_string_header.strip() + '\n') +# f3.write('\t' + print_string_header.strip() + '\n') +# f4.write('\t' + print_string_header.strip() + '\n') +# for value in position_label: +# +# lll = ['0', '2', '3', '4', '5', '6', '7'] +# ref_var = ['1', '1TRUE'] +# if set(ref_var) & set(position_label[value]): +# if set(lll) & set(position_label[value]): +# print_string = "" +# for i in position_label[value]: +# print_string = print_string + "\t" + i +# STRR2 = value + print_string + "\n" +# f3.write(STRR2) +# if position_label[value].count('1TRUE') >= 2: +# f4.write('1\n') +# else: +# f4.write('0\n') +# else: +# strr = value + "\n" +# f1.write(strr) +# STRR3 = value + "\t" + str(position_label[value]) + "\n" +# f2.write(STRR3) +# csv_file.close() +# f1.close() +# f2.close() +# f3.close() +# f4.close() +# 
subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_indel_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/1TRUE/-1/g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# +# def temp_generate_indel_position_label_data_matrix_All_label(): +# +# """ +# Read **temp_label_final_raw.txt** SNP position label data matrix for generating barplot statistics. 
+# """ +# temp_position_label = OrderedDict() +# f33=open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# print_string_header = "\t" +# if args.outgroup: +# for i in vcf_filenames: +# +# if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: +# print_string_header = print_string_header + os.path.basename(i) + "\t" +# else: +# for i in vcf_filenames: +# print_string_header = print_string_header + os.path.basename(i) + "\t" +# +# f33.write('\t' + print_string_header.strip() + '\n') +# keep_logging('Reading temporary label positions file: %s/temp_label_final_raw.txt' % args.filter2_only_snp_vcf_dir, 'Reading temporary label positions file: %s/temp_label_final_raw.txt' % args.filter2_only_snp_vcf_dir, logger, 'info') +# # lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] +# lll = ['reference_unmapped_position', 'LowAF', 'LowAF_DP', 'LowAF_QUAL', 'LowAF_DP_QUAL', 'LowAF_QUAL_DP', +# 'HighAF_DP', 'HighAF_QUAL', 'HighAF_DP_QUAL', 'HighAF_QUAL_DP', 'HighAF', 'LowAF_proximate_SNP', +# 'LowAF_DP_proximate_SNP', 'LowAF_QUAL_proximate_SNP', 'LowAF_DP_QUAL_proximate_SNP', +# 'LowAF_QUAL_DP_proximate_SNP', 'HighAF_DP_proximate_SNP', 'HighAF_QUAL_proximate_SNP', +# 'HighAF_DP_QUAL_proximate_SNP', 'HighAF_QUAL_DP_proximate_SNP', 'HighAF_proximate_SNP', '_proximate_SNP'] +# ref_var = ['reference_allele', 'VARIANT'] +# +# if args.outgroup: +# with open("%s/temp_indel_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: +# csv_reader = csv.reader(csv_file, 
delimiter='\t') +# next(csv_reader, None) +# for row in csv_reader: +# if set(ref_var) & set(row[1:]): +# if set(lll) & set(row[1:]): +# if int(row[0]) not in outgroup_indel_specific_positions: +# print_string = "" +# for i in row[1:]: +# print_string = print_string + "\t" + i +# STRR2 = row[0] + print_string + "\n" +# f33.write(STRR2) +# csv_file.close() +# f33.close() +# else: +# with open("%s/temp_indel_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: +# csv_reader = csv.reader(csv_file, delimiter='\t') +# next(csv_reader, None) +# for row in csv_reader: +# if set(ref_var) & set(row[1:]): +# if set(lll) & set(row[1:]): +# +# print_string = "" +# for i in row[1:]: +# print_string = print_string + "\t" + i +# STRR2 = row[0] + print_string + "\n" +# f33.write(STRR2) +# csv_file.close() +# f33.close() +# """ +# Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of AF +# """ +# temp_position_label_AF = OrderedDict() +# f44=open("%s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# with open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: +# keep_logging('Reading temporary Only_filtered_indel_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, 'Reading temporary Only_filtered_indel_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, logger, 'info') +# csv_reader = csv.reader(csv_file, delimiter='\t') +# next(csv_reader, None) +# +# for row in csv_reader: +# temp_position_label_AF[row[0]] = row[1:] +# print_string_header = "\t" +# for i in vcf_filenames: +# print_string_header = print_string_header + os.path.basename(i) + "\t" +# f44.write('\t' + print_string_header.strip() + '\n') +# for value in temp_position_label_AF: 
+# lll = ['LowAF'] +# if set(lll) & set(temp_position_label_AF[value]): +# +# print_string = "" +# for i in temp_position_label_AF[value]: +# print_string = print_string + "\t" + i +# STRR2 = value + print_string + "\n" +# f44.write(STRR2) +# f44.close() +# csv_file.close() +# f44.close() +# +# """ +# Perform Sed on temp files. Find a faster way to do this. +# """ +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL_DP/4/g' 
%s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP/4/g' 
%s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF/3/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# +# +# """ +# Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp +# """ +# temp_position_label_DP = OrderedDict() +# f44=open("%s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# with open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: +# keep_logging('Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, logger, 'info') +# csv_reader = csv.reader(csv_file, delimiter='\t') +# next(csv_reader, None) +# for row in csv_reader: +# temp_position_label_DP[row[0]] = row[1:] +# print_string_header = "\t" +# for i in vcf_filenames: +# print_string_header = print_string_header + os.path.basename(i) + "\t" +# f44.write('\t' + print_string_header.strip() + '\n') +# for value in temp_position_label_DP: +# lll = ['HighAF_DP'] +# ref_var = ['reference_allele', 'VARIANT'] +# if set(lll) & set(temp_position_label_AF[value]): +# print_string = "" +# for i in temp_position_label_AF[value]: +# print_string = print_string + "\t" + i +# STRR2 = value + print_string + "\n" +# f44.write(STRR2) +# f44.close() +# csv_file.close() +# +# """ +# Perform Sed on temp files. Find a faster way to do this. 
+# """ +# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % 
args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF_DP/3/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/LowAF/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) +# subprocess.call(["sed -i 's/HighAF/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], 
shell=True) +# +# +# def barplot_indel_stats(): +# keep_logging('Read each Sample columns and calculate the percentage of each label to generate barplot statistics.', 'Read each Sample columns and calculate the percentage of each label to generate barplot statistics.', logger, 'info') +# """ +# Read each Sample columns and calculate the percentage of each label to generate barplot statistics. +# This will give a visual explanation of how many positions in each samples were filtered out because of different reason +# """ +# +# c_reader = csv.reader( +# open('%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, +# 'r'), delimiter='\t') +# columns = list(zip(*c_reader)) +# print len(columns) +# keep_logging('Finished reading columns...', 'Finished reading columns...', logger, 'info') +# counts = 1 +# +# if args.outgroup: +# end = len(vcf_filenames) + 1 +# end = end - 1 +# else: +# end = len(vcf_filenames) + 1 +# print end +# +# f_bar_count = open("%s/bargraph_indel_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# f_bar_perc = open("%s/bargraph_indel_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# f_bar_count.write("Sample\tunmapped_positions\treference_allele\ttrue_variant\tOnly_low_AF\tOnly_DP\tOnly_low_MQ\tother\n") +# f_bar_perc.write("Sample\tunmapped_positions_perc\ttrue_variant_perc\tOnly_low_AF_perc\tOnly_DP_perc\tOnly_low_MQ_perc\tother_perc\n") +# for i in xrange(1, end, 1): +# """ Bar Count Statistics: Variant Position Count Statistics """ +# print i +# true_variant = columns[i].count('VARIANT') +# unmapped_positions = columns[i].count('reference_unmapped_position') +# reference_allele = columns[i].count('reference_allele') +# Only_low_AF = columns[i].count('LowAF') +# Only_DP = columns[i].count('HighAF_DP') +# Only_low_MQ = columns[i].count('HighAF') +# low_AF_other_parameters = columns[i].count('LowAF_QUAL_DP_proximate_SNP') + columns[i].count('LowAF_DP_QUAL_proximate_SNP') + 
columns[i].count('LowAF_QUAL_proximate_SNP') + columns[i].count('LowAF_DP_proximate_SNP') + columns[i].count('LowAF_proximate_SNP') + columns[i].count('LowAF_QUAL_DP') + columns[i].count('LowAF_DP_QUAL') + columns[i].count('LowAF_QUAL') + columns[i].count('LowAF_DP') +# high_AF_other_parameters = columns[i].count('HighAF_QUAL_DP_proximate_SNP') + columns[i].count('HighAF_DP_QUAL_proximate_SNP') + columns[i].count('HighAF_QUAL_proximate_SNP') + columns[i].count('HighAF_DP_proximate_SNP') + columns[i].count('HighAF_proximate_SNP') + columns[i].count('HighAF_QUAL_DP') + columns[i].count('HighAF_DP_QUAL') + columns[i].count('HighAF_QUAL') +# other = low_AF_other_parameters + high_AF_other_parameters +# total = true_variant + unmapped_positions + reference_allele + Only_low_AF + Only_DP + low_AF_other_parameters + high_AF_other_parameters + Only_low_MQ +# filename_count = i - 1 +# # bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions, reference_allele, true_variant, Only_low_AF, Only_DP, Only_low_MQ, other) +# if args.outgroup: +# ### +# +# bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( +# vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), +# unmapped_positions, reference_allele, true_variant, +# Only_low_AF, Only_DP, Only_low_MQ, other) +# else: +# bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( +# vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), +# unmapped_positions, reference_allele, true_variant, +# Only_low_AF, Only_DP, Only_low_MQ, other) +# +# f_bar_count.write(bar_string) +# +# """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ +# try: +# true_variant_perc = float((columns[i].count('VARIANT') * 100) / total) +# except ZeroDivisionError: +# true_variant_perc = 0 +# try: +# 
unmapped_positions_perc = float((columns[i].count('reference_unmapped_position') * 100) / total) +# except ZeroDivisionError: +# unmapped_positions_perc = 0 +# try: +# reference_allele_perc = float((columns[i].count('reference_allele') * 100) / total) +# except ZeroDivisionError: +# reference_allele_perc = 0 +# try: +# Only_low_AF_perc = float((columns[i].count('LowAF') * 100) / total) +# except ZeroDivisionError: +# Only_low_AF_perc = 0 +# try: +# Only_DP_perc = float((columns[i].count('HighAF_DP') * 100) / total) +# except ZeroDivisionError: +# Only_DP_perc = 0 +# try: +# Only_low_MQ_perc = float((columns[i].count('HighAF') * 100) / total) +# except ZeroDivisionError: +# Only_low_MQ_perc = 0 +# try: +# low_AF_other_parameters_perc = float(((columns[i].count('LowAF_QUAL_DP_proximate_SNP') + columns[i].count('LowAF_DP_QUAL_proximate_SNP') + columns[i].count('LowAF_QUAL_proximate_SNP') + columns[i].count('LowAF_DP_proximate_SNP') + columns[i].count('LowAF_proximate_SNP') + columns[i].count('LowAF_QUAL_DP') + columns[i].count('LowAF_DP_QUAL') + columns[i].count('LowAF_QUAL') + columns[i].count('LowAF_DP')) * 100) / total) +# except ZeroDivisionError: +# low_AF_other_parameters_perc = 0 +# try: +# high_AF_other_parameters_perc = float(((columns[i].count('HighAF_QUAL_DP_proximate_SNP') + columns[i].count('HighAF_DP_QUAL_proximate_SNP') + columns[i].count('HighAF_QUAL_proximate_SNP') + columns[i].count('HighAF_DP_proximate_SNP') + columns[i].count('HighAF_proximate_SNP') + columns[i].count('HighAF_QUAL_DP') + columns[i].count('HighAF_DP_QUAL') + columns[i].count('HighAF_QUAL')) * 100) / total) +# except ZeroDivisionError: +# high_AF_other_parameters_perc = 0 +# +# other_perc = float(low_AF_other_parameters_perc + high_AF_other_parameters_perc) +# if args.outgroup: +# ### +# bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( +# os.path.basename(vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), +# unmapped_positions_perc, 
true_variant_perc, Only_low_AF_perc, Only_DP_perc, Only_low_MQ_perc, +# other_perc) +# f_bar_perc.write(bar_perc_string) +# else: +# bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( +# os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), +# unmapped_positions_perc, true_variant_perc, Only_low_AF_perc, Only_DP_perc, Only_low_MQ_perc, +# other_perc) +# f_bar_perc.write(bar_perc_string) +# +# f_bar_count.close() +# f_bar_perc.close() +# bargraph_R_script = "library(ggplot2)\nlibrary(reshape)\nx1 <- read.table(\"bargraph_indel_percentage.txt\", header=TRUE)\nx1$Sample <- reorder(x1$Sample, rowSums(x1[-1]))\nmdf1=melt(x1,id.vars=\"Sample\")\npdf(\"%s/%s_barplot_indel.pdf\", width = 30, height = 30)\nggplot(mdf1, aes(Sample, value, fill=variable)) + geom_bar(stat=\"identity\") + ylab(\"Percentage of Filtered Positions\") + xlab(\"Samples\") + theme(text = element_text(size=9)) + scale_fill_manual(name=\"Reason for filtered out positions\", values=c(\"#08306b\", \"black\", \"orange\", \"darkgrey\", \"#fdd0a2\", \"#7f2704\")) + ggtitle(\"Title Here\") + ylim(0, 100) + theme(text = element_text(size=10), panel.background = element_rect(fill = 'white', colour = 'white'), plot.title = element_text(size=20, face=\"bold\", margin = margin(10, 0, 10, 0)), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_text(colour = \"black\", face= \"bold.italic\", angle = 90)) + theme(legend.position = c(0.6, 0.7), legend.direction = \"horizontal\")\ndev.off()" % (args.filter2_only_snp_vcf_dir, os.path.basename(os.path.normpath(args.results_dir))) +# barplot_R_file = open("%s/bargraph_indel.R" % args.filter2_only_snp_vcf_dir, 'w+') +# barplot_R_file.write(bargraph_R_script) +# keep_logging('Run this R script to generate bargraph plot: %s/bargraph_indel.R' % args.filter2_only_snp_vcf_dir, 'Run this R script to generate bargraph plot: %s/bargraph_indel.R' % args.filter2_only_snp_vcf_dir, logger, 'info') 
+# +# +# """ Methods Steps""" +# keep_logging('Running: Generating data matrices...', 'Running: Generating data matrices...', logger, 'info') +# # if args.outgroup: +# # f_outgroup = open("%s/outgroup_indel_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') +# # global outgroup_indel_specific_positions +# # outgroup_indel_specific_positions = [] +# # for i in f_outgroup: +# # outgroup_indel_specific_positions.append(i) +# # f_outgroup.close() +# # +# # f_outgroup = open("%s/outgroup_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') +# # global outgroup_specific_positions +# # outgroup_specific_positions = [] +# # for i in f_outgroup: +# # outgroup_specific_positions.append(i) +# # f_outgroup.close() +# # else: +# # global outgroup_specific_positions +# # global outgroup_indel_specific_positions +# # outgroup_indel_specific_positions = [] +# # outgroup_specific_positions = [] +# generate_indel_position_label_data_matrix_All_label() +# keep_logging('Running: Changing variables in data matrices to codes for faster processing...', 'Running: Changing variables in data matrices to codes for faster processing...', logger, 'info') +# temp_generate_indel_position_label_data_matrix_All_label() +# keep_logging('Running: Generating Barplot statistics data matrices...', 'Running: Generating Barplot statistics data matrices...', logger, 'info') +# barplot_indel_stats() +# +# def create_job_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filter): +# +# """ Generate jobs/scripts that creates core consensus fasta file. +# +# This function will generate and run scripts/jobs to create core consensus fasta file of only core variant positions. +# Input for Fasttree, Beast and pairwise variant analysis. +# +# :param jobrun: Based on this value all the job/scripts will run on "cluster": either on single cluster, "parallel-local": run in parallel on local system, "local": run on local system, "parallel-cluster": submit parallel jobs on cluster. 
+# :param vcf_filenames: list of final vcf filenames i.e *_no_proximate_snp.vcf. These files are the final output of variant calling step for each sample. +# :return: +# :raises: +# """ +# if jobrun == "parallel-cluster": +# """ +# Supports only PBS clusters for now. +# """ +# for i in vcf_filenames: +# job_name = os.path.basename(i) +# job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) +# job_file_name = "%s_fasta.pbs" % (i) +# f1=open(job_file_name, 'w+') +# f1.write(job_print_string) +# f1.close() +# #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) +# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" +# pbs_scripts = glob.glob(pbs_dir) +# for i in pbs_scripts: +# keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') +# #os.system("qsub %s" % i) +# call("qsub %s" % i, logger) +# +# +# elif jobrun == "parallel-local" or jobrun == "cluster": +# """ +# Generate a Command list of each job and run it in parallel on different cores available on local system +# """ +# command_array = [] +# command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir +# f3 = open(command_file, 'w+') +# for i in vcf_filenames: +# job_name = os.path.basename(i) +# job_print_string = "#PBS 
-N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) +# job_file_name = "%s_fasta.pbs" % (i) +# f1=open(job_file_name, 'w+') +# f1.write(job_print_string) +# f1.close() +# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" +# pbs_scripts = glob.glob(pbs_dir) +# for i in pbs_scripts: +# f3.write("bash %s\n" % i) +# f3.close() +# with open(command_file, 'r') as fpp: +# for lines in fpp: +# lines = lines.strip() +# command_array.append(lines) +# fpp.close() +# if args.numcores: +# num_cores = int(num_cores) +# else: +# num_cores = multiprocessing.cpu_count() +# results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) +# +# # elif jobrun == "cluster": +# # command_array = [] +# # command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir +# # f3 = open(command_file, 'w+') +# # for i in vcf_filenames: +# # job_name = os.path.basename(i) +# # job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s\n" 
% (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'],args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir) +# # job_file_name = "%s_fasta.pbs" % (i) +# # f1=open(job_file_name, 'w+') +# # f1.write(job_print_string) +# # f1.close() +# # pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" +# # pbs_scripts = glob.glob(pbs_dir) +# # for i in pbs_scripts: +# # f3.write("bash %s\n" % i) +# # f3.close() +# # with open(command_file, 'r') as fpp: +# # for lines in fpp: +# # lines = lines.strip() +# # command_array.append(lines) +# # fpp.close() +# # os.system("bash %s/command_file" % args.filter2_only_snp_vcf_dir) +# else: +# """ +# Generate a Command list of each job and run it on local system one at a time +# """ +# command_array = [] +# command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir +# f3 = open(command_file, 'w+') +# +# +# for i in vcf_filenames: +# job_name = os.path.basename(i) +# job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) +# job_file_name = "%s_fasta.pbs" % (i) +# f1=open(job_file_name, 
'w+') +# f1.write(job_print_string) +# f1.close() +# #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) +# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" +# pbs_scripts = glob.glob(pbs_dir) +# +# +# for i in pbs_scripts: +# f3.write("bash %s\n" % i) +# f3.close() +# with open(command_file, 'r') as fpp: +# for lines in fpp: +# lines = lines.strip() +# command_array.append(lines) +# fpp.close() +# #os.system("bash command_file") +# call("bash %s" % command_file, logger) +# +# def create_job_allele_variant_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, config_file): +# +# """ Generate jobs/scripts that creates core consensus fasta file. +# +# This function will generate and run scripts/jobs to create core consensus fasta file of only core variant positions. +# Input for Fasttree, Beast and pairwise variant analysis. +# +# :param jobrun: Based on this value all the job/scripts will run on "cluster": either on single cluster, "parallel-local": run in parallel on local system, "local": run on local system, "parallel-cluster": submit parallel jobs on cluster. +# :param vcf_filenames: list of final vcf filenames i.e *_no_proximate_snp.vcf. These files are the final output of variant calling step for each sample. +# :return: +# :raises: +# """ +# if jobrun == "parallel-cluster": +# """ +# Supports only PBS clusters for now. 
+# """ +# for i in vcf_filenames: +# job_name = os.path.basename(i) +# job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) +# job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) +# f1=open(job_file_name, 'w+') +# f1.write(job_print_string) +# f1.close() +# #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) +# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" +# pbs_scripts = glob.glob(pbs_dir) +# for i in pbs_scripts: +# keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') +# #os.system("qsub %s" % i) +# call("qsub %s" % i, logger) +# +# +# elif jobrun == "parallel-local" or jobrun == "cluster": +# """ +# Generate a Command list of each job and run it in parallel on different cores available on local system +# """ +# command_array = [] +# command_file = "%s/commands_list_ref_allele_variants_fasta.sh" % args.filter2_only_snp_vcf_dir +# f3 = open(command_file, 'w+') +# for i in vcf_filenames: +# job_name = os.path.basename(i) +# job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python 
/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) +# job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) +# f1=open(job_file_name, 'w+') +# f1.write(job_print_string) +# f1.close() +# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_ref_allele_variants_fasta.pbs" +# pbs_scripts = glob.glob(pbs_dir) +# for i in pbs_scripts: +# f3.write("bash %s\n" % i) +# f3.close() +# with open(command_file, 'r') as fpp: +# for lines in fpp: +# lines = lines.strip() +# command_array.append(lines) +# fpp.close() +# if args.numcores: +# num_cores = int(num_cores) +# else: +# num_cores = multiprocessing.cpu_count() +# results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) +# +# # elif jobrun == "cluster": +# # command_array = [] +# # command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir +# # f3 = open(command_file, 'w+') +# # for i in vcf_filenames: +# # job_name = os.path.basename(i) +# # job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", 
Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'],args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir) +# # job_file_name = "%s_fasta.pbs" % (i) +# # f1=open(job_file_name, 'w+') +# # f1.write(job_print_string) +# # f1.close() +# # pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" +# # pbs_scripts = glob.glob(pbs_dir) +# # for i in pbs_scripts: +# # f3.write("bash %s\n" % i) +# # f3.close() +# # with open(command_file, 'r') as fpp: +# # for lines in fpp: +# # lines = lines.strip() +# # command_array.append(lines) +# # fpp.close() +# # os.system("bash %s/command_file" % args.filter2_only_snp_vcf_dir) +# else: +# """ +# Generate a Command list of each job and run it on local system one at a time +# """ +# command_array = [] +# command_file = "%s/commands_list_ref_allele_variants_fasta.sh" % args.filter2_only_snp_vcf_dir +# f3 = open(command_file, 'w+') +# +# +# for i in vcf_filenames: +# job_name = os.path.basename(i) +# job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) +# job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) +# f1=open(job_file_name, 'w+') +# f1.write(job_print_string) +# f1.close() +# 
#os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) +# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_ref_allele_variants_fasta.pbs" +# pbs_scripts = glob.glob(pbs_dir) +# +# +# for i in pbs_scripts: +# f3.write("bash %s\n" % i) +# f3.close() +# with open(command_file, 'r') as fpp: +# for lines in fpp: +# lines = lines.strip() +# command_array.append(lines) +# fpp.close() +# #os.system("bash command_file") +# call("bash %s" % command_file, logger) +# +# def create_job_DP(jobrun, vcf_filenames): +# """ +# Based on type of jobrun; generate jobs and run accordingly. +# :param jobrun: Based on this value all the job/scripts will run on "cluster": either on single cluster, "parallel-local": run in parallel on local system, "local": run on local system, "parallel-cluster": submit parallel jobs on cluster. +# :param vcf_filenames: +# :return: +# """ +# +# if jobrun == "parallel-cluster": +# """ +# Supports only PBS clusters for now. +# """ +# for i in vcf_filenames: +# job_name = os.path.basename(i) +# job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) +# job_file_name = "%s_DP.pbs" % (i) +# f1=open(job_file_name, 'w+') +# f1.write(job_print_string) +# f1.close() +# #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) +# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" +# pbs_scripts = glob.glob(pbs_dir) +# for i in pbs_scripts: +# keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') +# #os.system("qsub %s" % i) 
+# call("qsub %s" % i, logger) +# +# +# elif jobrun == "parallel-local" or jobrun == "cluster" : +# """ +# Generate a Command list of each job and run it in parallel on different cores available on local system +# """ +# command_array = [] +# command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir +# f3 = open(command_file, 'w+') +# +# +# for i in vcf_filenames: +# job_name = os.path.basename(i) +# job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) +# job_file_name = "%s_DP.pbs" % (i) +# f1=open(job_file_name, 'w+') +# f1.write(job_print_string) +# f1.close() +# #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) +# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" +# pbs_scripts = glob.glob(pbs_dir) +# +# +# for i in pbs_scripts: +# f3.write("bash %s\n" % i) +# f3.close() +# with open(command_file, 'r') as fpp: +# for lines in fpp: +# lines = lines.strip() +# command_array.append(lines) +# fpp.close() +# print len(command_array) +# if args.numcores: +# num_cores = int(num_cores) +# else: +# num_cores = multiprocessing.cpu_count() +# results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) +# +# # elif jobrun == "cluster": +# # """ Test pending """ +# # command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir +# # f3 = open(command_file, 'w+') +# # for i in vcf_filenames: +# # job_name = os.path.basename(i) +# # job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l 
nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) +# # job_file_name = "%s_DP.pbs" % (i) +# # f1=open(job_file_name, 'w+') +# # f1.write(job_print_string) +# # f1.close() +# # pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" +# # pbs_scripts = glob.glob(pbs_dir) +# # for i in pbs_scripts: +# # f3.write("bash %s\n" % i) +# # f3.close() +# # os.system("bash %s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir) +# +# else: +# """ +# Generate a Command list of each job and run it on local system one at a time +# """ +# command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir +# f3 = open(command_file, 'w+') +# for i in vcf_filenames: +# job_name = os.path.basename(i) +# job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) +# job_file_name = "%s_DP.pbs" % (i) +# f1=open(job_file_name, 'w+') +# f1.write(job_print_string) +# f1.close() +# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" +# pbs_scripts = glob.glob(pbs_dir) +# for i in pbs_scripts: +# f3.write("bash %s\n" % i) +# f3.close() +# #os.system("bash %s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir) +# call("bash %s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir, logger) +# +# def 
generate_vcf_files(): +# if ConfigSectionMap("functional_filters", Config)['apply_functional_filters'] == "yes": +# keep_logging('Removing Variants falling in Functional filters positions file: %s\n' % functional_class_filter_positions, 'Removing Variants falling in Functional filters positions file: %s\n' % functional_class_filter_positions, logger, +# 'info') +# # phage_positions = [] +# # phage_region_positions = "%s/phage_region_positions.txt" % args.filter2_only_snp_vcf_dir +# # with open(phage_region_positions, 'rU') as fp: +# # for line in fp: +# # phage_positions.append(line.strip()) +# # fp.close() +# +# +# functional_filter_pos_array = [] +# with open(functional_class_filter_positions, 'rU') as f_functional: +# for line_func in f_functional: +# functional_filter_pos_array.append(line_func.strip()) +# +# ref_variant_position_array = [] +# ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') +# for line in ffp: +# line = line.strip() +# if line not in functional_filter_pos_array: +# ref_variant_position_array.append(line) +# ffp.close() +# +# # Adding core indel support: 2018-07-24 +# ref_indel_variant_position_array = [] +# ffp = open("%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') +# for line in ffp: +# line = line.strip() +# if line not in functional_filter_pos_array: +# ref_indel_variant_position_array.append(line) +# ffp.close() +# +# else: +# functional_filter_pos_array = [] +# ref_variant_position_array = [] +# ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') +# for line in ffp: +# line = line.strip() +# ref_variant_position_array.append(line) +# ffp.close() +# +# # Adding core indel support: 2018-07-24 +# ref_indel_variant_position_array = [] +# ffp = open("%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') +# for line in ffp: +# line = line.strip() +# if line not in functional_filter_pos_array: +# 
ref_indel_variant_position_array.append(line) +# ffp.close() +# +# print "No. of core SNPs: %s" % len(ref_variant_position_array) +# print "No. of core INDELs: %s" % len(ref_indel_variant_position_array) +# +# f_file = open("%s/Only_ref_variant_positions_for_closely_without_functional_filtered_positions" % args.filter2_only_snp_vcf_dir, 'w+') +# for pos in ref_variant_position_array: +# f_file.write(pos + '\n') +# f_file.close() +# +# # Adding core indel support: 2018-07-24 +# f_file = open( +# "%s/Only_ref_indel_variant_positions_for_closely_without_functional_filtered_positions" % args.filter2_only_snp_vcf_dir, +# 'w+') +# for pos in ref_indel_variant_position_array: +# f_file.write(pos + '\n') +# f_file.close() +# +# base_vcftools_bin = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("vcftools", Config)['vcftools_bin'] +# filter2_files_array = [] +# for i in vcf_filenames: +# filter2_file = i.replace('_no_proximate_snp.vcf', '') +# filter2_files_array.append(filter2_file) +# +# +# filtered_out_vcf_files = [] +# for i in filter2_files_array: +# print_array =[] +# with open(i) as file_open: +# for line in file_open: +# line = line.strip() +# if line.startswith("#"): +# print_array.append(line) +# else: +# split_array = re.split(r'\t+', line) +# if split_array[1] in ref_variant_position_array and 'INDEL' not in split_array[7]: +# print_array.append(line) +# file_open.close() +# file_name = i + "_core.vcf" +# keep_logging('Generating %s' % file_name, 'Generating %s' % file_name, logger, 'info') +# filtered_out_vcf_files.append(file_name) +# f1 = open(file_name, 'w+') +# for ios in print_array: +# print_string = str(ios) + "\n" +# f1.write(print_string) +# f1.close() +# +# filename = "%s/consensus.sh" % args.filter2_only_snp_vcf_dir +# keep_logging('Generating Consensus...', 'Generating Consensus...', logger, 'info') +# for file in filtered_out_vcf_files: +# f1 = open(filename, 'a+') +# bgzip_cmd = "%s/%s/bgzip -f %s\n" % 
(ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], file) +# f1.write(bgzip_cmd) +# subprocess.call([bgzip_cmd], shell=True) +# tabix_cmd = "%s/%s/tabix -f -p vcf %s.gz\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], file) +# f1.write(tabix_cmd) +# subprocess.call([tabix_cmd], shell=True) +# fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s.fa\n" % (args.reference, base_vcftools_bin, file, file.replace('_filter2_final.vcf_core.vcf', '')) +# f1.write(fasta_cmd) +# subprocess.call([fasta_cmd], shell=True) +# base = os.path.basename(file) +# header = base.replace('_filter2_final.vcf_core.vcf', '') +# sed_command = "sed -i 's/>.*/>%s/g' %s.fa\n" % (header, file.replace('_filter2_final.vcf_core.vcf', '')) +# subprocess.call([sed_command], shell=True) +# f1.write(sed_command) +# keep_logging('The consensus commands are in : %s' % filename, 'The consensus commands are in : %s' % filename, logger, 'info') +# sequence_lgth_cmd = "for i in %s/*.fa; do %s/%s/bioawk -c fastx \'{ print $name, length($seq) }\' < $i; done" % (args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bioawk", Config)['bioawk_bin']) +# #os.system(sequence_lgth_cmd) +# call("%s" % sequence_lgth_cmd, logger) +# +# def gatk_filter2(final_raw_vcf, out_path, analysis, reference): +# gatk_filter2_parameter_expression = "MQ > 50 && QUAL > 100 && DP > 9" +# gatk_filter2_command = "java -jar %s/%s/GenomeAnalysisTK.jar -T VariantFiltration -R %s -o %s/%s_filter2_gatk.vcf --variant %s --filterExpression \"%s\" --filterName PASS_filter2" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("gatk", Config)['gatk_bin'], reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) +# keep_logging('Running Command: [%s]' % gatk_filter2_command, 'Running Command: [%s]' % gatk_filter2_command, logger, 'info') +# 
#os.system(gatk_filter2_command) +# call("%s" % gatk_filter2_command, logger) +# filter_flag_command = "grep '#\|PASS_filter2' %s/%s_filter2_gatk.vcf > %s/%s_filter2_final.vcf" % (out_path, analysis, out_path, analysis) +# call("%s" % filter_flag_command, logger) +# gatk_filter2_final_vcf = "%s/%s_filter2_final.vcf" % (out_path, analysis) +# return gatk_filter2_final_vcf +# +# def remove_proximate_snps(gatk_filter2_final_vcf_file, out_path, analysis, reference): +# all_position = [] +# remove_proximate_position_array = [] +# gatk_filter2_final_vcf_file_no_proximate_snp = gatk_filter2_final_vcf_file + "_no_proximate_snp.vcf" +# with open(gatk_filter2_final_vcf_file, 'rU') as csv_file: +# for line in csv_file: +# if not line.startswith('#'): +# line_array = line.split('\t') +# all_position.append(line_array[1]) +# for position in all_position: +# position_index = all_position.index(position) +# next_position_index = position_index + 1 +# +# if next_position_index < len(all_position): +# diff = int(all_position[next_position_index]) - int(position) +# if diff < 10: +# #print position + " " + all_position[next_position_index] +# if position not in remove_proximate_position_array and all_position[next_position_index] not in remove_proximate_position_array: +# remove_proximate_position_array.append(int(position)) +# remove_proximate_position_array.append(int(all_position[next_position_index])) +# f1=open(gatk_filter2_final_vcf_file_no_proximate_snp, 'w+') +# with open(gatk_filter2_final_vcf_file, 'rU') as csv_file2: +# for line in csv_file2: +# if line.startswith('gi') or line.startswith('MRSA_8058'): ##change this! 
+# line_array = line.split('\t') +# if int(line_array[1]) not in remove_proximate_position_array: +# print_string = line +# f1.write(print_string) +# else: +# print_string = line +# f1.write(print_string) +# gatk_filter2_final_vcf_file_no_proximate_snp_positions = gatk_filter2_final_vcf_file + "_no_proximate_snp.vcf_positions_array" +# f2=open(gatk_filter2_final_vcf_file_no_proximate_snp_positions, 'w+') +# for i in remove_proximate_position_array: +# position_print_string = str(i) + "\n" +# f2.write(position_print_string) +# return gatk_filter2_final_vcf_file_no_proximate_snp +# +# def FQ_analysis(): +# for i in vcf_filenames: +# filename_base = os.path.basename(i) +# aln_mpileup_vcf_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_aln_mpileup_raw.vcf_5bp_indel_removed.vcf') +# analysis = filename_base.replace('_filter2_final.vcf_no_proximate_snp.vcf', '') +# #print aln_mpileup_vcf_file +# grep_reference_file = "grep \'^##reference\' %s" % aln_mpileup_vcf_file +# proc = subprocess.Popen([grep_reference_file], stdout=subprocess.PIPE, shell=True) +# (out, err) = proc.communicate() +# out = out.strip() +# reference_file = out.split(':') +# # Change it to multiprocessing +# gatk_filter2_final_vcf_file = gatk_filter2(aln_mpileup_vcf_file, temp_dir, analysis, reference_file[1]) +# #print gatk_filter2_final_vcf_file +# gatk_filter2_final_vcf_file_no_proximate_snp = remove_proximate_snps(gatk_filter2_final_vcf_file, temp_dir, analysis, reference_file[1]) +# grep_fq_field = "awk -F\'\\t\' \'{print $8}\' %s | grep -o \'FQ=.*\' | sed \'s/FQ=//g\' | awk -F\';\' \'{print $1}\' > %s/%s_FQ_values" % (gatk_filter2_final_vcf_file_no_proximate_snp, os.path.dirname(i), analysis) +# #os.system(grep_fq_field) +# call("%s" % grep_fq_field, logger) +# #print grep_fq_field +# +# def DP_analysis(): +# create_job_DP(args.jobrun, vcf_filenames) +# paste_command = "paste %s/extract_DP_positions.txt" % args.filter2_only_snp_vcf_dir +# for i in vcf_filenames: +# label_file = 
i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_DP_values') +# paste_command = paste_command + " " + label_file +# +# paste_file = args.filter2_only_snp_vcf_dir + "/paste_DP_files.sh" +# f2=open(paste_file, 'w+') +# paste_command = paste_command + " > %s/filtered_DP_values_temp.txt" % args.filter2_only_snp_vcf_dir +# #os.system(paste_command) +# f2.write(paste_command + '\n') +# cat_header = "cat %s/header.txt %s/filtered_DP_values_temp.txt > %s/filtered_DP_values.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) +# #os.system(cat_header) +# f2.write(cat_header + '\n') +# sed_command = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/filtered_DP_values.txt" % (args.filter2_only_snp_vcf_dir) +# #os.system(sed_command) +# f2.write(sed_command + '\n') +# cmd = "bash %s" % paste_file +# # os.system("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir) +# +# def DP_analysis_barplot(): +# #os.system("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir) +# call("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir, logger) +# keep_logging('Generating DP barplots data...', 'Generating DP barplots data...', logger, 'info') +# c_reader = csv.reader(open('%s/filtered_DP_values.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') +# columns = list(zip(*c_reader)) +# counts = 1 +# end = len(vcf_filenames) + 1 +# f_bar_count = open("%s/DP_bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# f_bar_perc = open("%s/DP_bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') +# f_bar_count.write("Sample\treference_position\toneto5\tsixto10\televento14\tfifteenorabove\n") +# f_bar_perc.write("Sample\treference_position\toneto5\tsixto10\televento14\tfifteenorabove\n") +# for i in xrange(1, end, 1): +# """ Bar Count Statistics: Variant Position Count Statistics """ +# reference_position = columns[i].count('NA') +# oneto5 = 0 +# for k in list(columns[i][1:]): +# if k != "": 
+# if k != "NA": +# if int(k) < 5: +# oneto5 += 1 +# sixto10 = 0 +# for k in list(columns[i][1:]): +# if k != "": +# if k != "NA": +# if int(k) >= 5 and int(k) <= 10: +# sixto10 += 1 +# elevento14 = 0 +# for k in list(columns[i][1:]): +# if k != "": +# if k != "NA": +# if int(k) >= 11 and int(k) <= 14: +# elevento14 += 1 +# fifteenorabove = 0 +# for k in list(columns[i][1:]): +# if k != "": +# if k != "NA": +# if int(k) >= 15: +# fifteenorabove += 1 +# total = reference_position + oneto5 + sixto10 + elevento14 + fifteenorabove +# filename_count = i - 1 +# bar_string = "%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), reference_position, oneto5, sixto10, elevento14, fifteenorabove) +# f_bar_count.write(bar_string) +# +# """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ +# try: +# reference_position_perc = float(reference_position * 100 / total) +# except ZeroDivisionError: +# reference_position_perc = 0 +# try: +# oneto5_perc = float(oneto5 * 100 / total) +# except ZeroDivisionError: +# oneto5_perc = 0 +# try: +# sixto10_perc = float(sixto10 * 100 / total) +# except ZeroDivisionError: +# sixto10_perc = 0 +# try: +# elevento14_perc = float(elevento14 * 100 / total) +# except ZeroDivisionError: +# elevento14_perc = 0 +# try: +# fifteenorabove_perc = float(fifteenorabove * 100 / total) +# except ZeroDivisionError: +# fifteenorabove_perc = 0 +# bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), reference_position_perc, oneto5_perc, sixto10_perc, elevento14_perc, fifteenorabove_perc) +# f_bar_perc.write(bar_perc_string) + +def extract_only_ref_variant_fasta(core_vcf_fasta_dir): + if ConfigSectionMap("functional_filters", Config)['apply_functional_filters'] == "yes" and ConfigSectionMap("functional_filters", Config)['apply_to_calls'] == "yes": + 
functional_filter = "yes" + create_job_fasta(args.jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filter) + +def extract_only_ref_variant_fasta_from_reference(): + if ConfigSectionMap("functional_filters", Config)['apply_functional_filters'] == "yes" and \ + ConfigSectionMap("functional_filters", Config)['apply_to_calls'] == "yes": + ffp = open("%s/Only_ref_variant_positions_for_closely_without_functional_filtered_positions" % args.filter2_only_snp_vcf_dir).readlines() + else: + ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir).readlines() + fasta_string = "" + #firstLine = ffp.pop(0) + for lines in ffp: + lines = lines.strip() + extract_base = "grep -v \'>\' %s | tr -d \'\\n\'| cut -b%s" % (args.reference, lines) + proc = subprocess.Popen([extract_base], stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + out = out.strip() + fasta_string = fasta_string + out + if not out: + print lines + keep_logging('Error extracting reference allele', 'Error extracting reference allele', logger, 'info') + exit() + + pattern = re.compile(r'\s+') + fasta_string = re.sub(pattern, '', fasta_string) + final_fasta_string = ">%s\n" % os.path.basename(args.reference.replace('.fasta', '').replace('.fa', '')) + fasta_string + "\n" + fp = open("%s/%s_variants.fa" % (args.filter2_only_snp_vcf_dir, os.path.basename(args.reference.replace('.fasta', '').replace('.fa', ''))), 'w+') + fp.write(final_fasta_string) + fp.close() + +def extract_only_ref_variant_fasta_from_reference_allele_variant(): + ffp = open("%s/unique_positions_file" % args.filter2_only_snp_vcf_dir).readlines() + #unique_positions_array = [] + + fasta_string = "" + #firstLine = ffp.pop(0) + for lines in ffp: + lines = lines.strip() + #unique_positions_array.append(lines) + extract_base = "grep -v \'>\' %s | tr -d \'\\n\'| cut -b%s" % (args.reference, lines) + proc = subprocess.Popen([extract_base], stdout=subprocess.PIPE, shell=True) + (out, err) = 
proc.communicate() + out = out.strip() + fasta_string = fasta_string + out + if not out: + print lines + keep_logging('Error extracting reference allele', 'Error extracting reference allele', logger, 'info') + exit() + + pattern = re.compile(r'\s+') + fasta_string = re.sub(pattern, '', fasta_string) + final_fasta_string = ">%s\n" % os.path.basename(args.reference.replace('.fasta', '').replace('.fa', '')) + fasta_string + "\n" + fp = open("%s/%s_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, os.path.basename(args.reference.replace('.fasta', '').replace('.fa', ''))), 'w+') + fp.write(final_fasta_string) + fp.close() + +def prepare_snpEff_db(reference_basename): + keep_logging('Preparing snpEff database requirements.', 'Preparing snpEff database requirements.', logger, 'info') + reference_basename = (os.path.basename(args.reference)).split(".") + if os.path.isfile("%s/%s/snpEff.config" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'])): + #os.system("cp %s/%s/snpEff.config %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], args.filter2_only_snp_vcf_dir)) + keep_logging("cp %s/%s/snpEff.config %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], args.filter2_only_snp_vcf_dir), "cp %s/%s/snpEff.config %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], args.filter2_only_snp_vcf_dir), logger, 'debug') + call("cp %s/%s/snpEff.config %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], args.filter2_only_snp_vcf_dir), logger) + else: + keep_logging("Error: %s/%s/snpEff.config doesn't exists.\nExiting..." % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']),"Error: %s/%s/snpEff.config doesn't exists.\nExiting..." 
def prepare_snpEff_db(reference_basename):
    """Build a custom snpEff database for the reference genome.

    Copies snpEff.config into the working directory, stages the reference
    fasta and its GFF/GenBank annotation under the snpEff data tree, registers
    the genome in the local snpEff.config, and runs `snpEff build -genbank`.
    Exits the program when snpEff.config or the matching .gff file is missing.

    NOTE(review): the incoming `reference_basename` argument is immediately
    recomputed from args.reference (preserving historical behaviour), so the
    parameter is effectively ignored — confirm before relying on it.
    """
    keep_logging('Preparing snpEff database requirements.', 'Preparing snpEff database requirements.', logger, 'info')
    reference_basename = (os.path.basename(args.reference)).split(".")
    # Hoist the repeated bin-path join; identical string to the original calls.
    bin_dir = "%s/%s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'])
    ref_dir = os.path.dirname(args.reference)

    if os.path.isfile("%s/snpEff.config" % bin_dir):
        copy_config_cmd = "cp %s/snpEff.config %s" % (bin_dir, args.filter2_only_snp_vcf_dir)
        keep_logging(copy_config_cmd, copy_config_cmd, logger, 'debug')
        call(copy_config_cmd, logger)
    else:
        keep_logging("Error: %s/snpEff.config doesn't exists.\nExiting..." % bin_dir,
                     "Error: %s/snpEff.config doesn't exists.\nExiting..." % bin_dir, logger, 'exception')
        exit()

    make_sure_path_exists("%s/data/%s" % (bin_dir, reference_basename[0]))
    make_sure_path_exists("%s/data/genomes/" % bin_dir)
    copy_fasta_cmd = "cp %s %s/data/genomes/%s.fa" % (args.reference, bin_dir, reference_basename[0])
    keep_logging(copy_fasta_cmd, copy_fasta_cmd, logger, 'debug')
    call(copy_fasta_cmd, logger)

    # Register the custom genome in the local snpEff.config copy.
    with open("%s/snpEff.config" % args.filter2_only_snp_vcf_dir, "a") as conf_file:
        conf_file.write("\n\n##Building Custom Database###\n%s.genome\t: %s\n\n" % (reference_basename[0], reference_basename[0]))

    if os.path.isfile("%s/%s.gff" % (ref_dir, reference_basename[0])):
        copy_gff_cmd = "cp %s/%s.gff %s/data/%s/genes.gff" % (ref_dir, reference_basename[0], bin_dir, reference_basename[0])
        copy_gbk_cmd = "cp %s/%s.gb* %s/data/%s/genes.gbk" % (ref_dir, reference_basename[0], bin_dir, reference_basename[0])
        keep_logging(copy_gff_cmd, copy_gff_cmd, logger, 'debug')
        # Fixed: the second keep_logging previously logged the gff copy command
        # while executing the genbank copy; log the command actually run.
        keep_logging(copy_gbk_cmd, copy_gbk_cmd, logger, 'debug')
        call(copy_gff_cmd, logger)
        call(copy_gbk_cmd, logger)
    else:
        keep_logging("Error: %s/%s.gff file doesn't exists. Make sure the GFF file has the same prefix as reference fasta file\nExiting..." % (ref_dir, reference_basename[0]),
                     "Error: %s/%s.gff file doesn't exists. Make sure the GFF file has the same prefix as reference fasta file\nExiting..." % (ref_dir, reference_basename[0]),
                     logger, 'exception')
        exit()

    # Build from the GenBank annotation (the gff3 build path is retired).
    build_cmd = "java -jar %s/%s build -genbank -v %s -c %s/snpEff.config -dataDir %s/data" % (
        bin_dir, ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0],
        args.filter2_only_snp_vcf_dir, bin_dir)
    keep_logging(build_cmd, build_cmd, logger, 'debug')
    call(build_cmd, logger)
    keep_logging('Finished Preparing snpEff database requirements.', 'Finished Preparing snpEff database requirements.', logger, 'info')
ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), logger) + call("java -jar %s/%s/%s build -genbank -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), logger) + keep_logging('Finished Preparing snpEff database requirements.', 'Finished Preparing snpEff database requirements.', logger, 'info') + +def variant_annotation(): + keep_logging('Annotating Variants using snpEff.', 'Annotating Variants using snpEff.', logger, 'info') + + if ConfigSectionMap("snpeff", Config)['prebuild'] == "yes": + if ConfigSectionMap("snpeff", Config)['db']: + print "Using pre-built snpEff database: %s" % ConfigSectionMap("snpeff", Config)['db'] + proc = subprocess.Popen(["java -jar %s/%s/%s databases | grep %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], ConfigSectionMap("snpeff", Config)['db'])], + stdout=subprocess.PIPE, shell=True) + (out2, err2) = proc.communicate() + if out2: + snpeffdb = ConfigSectionMap("snpeff", Config)['db'] + else: + print "The database name %s provided was not found. 
Check the name and try again" % ConfigSectionMap("snpeff", Config)['db'] + exit() + else: + print "snpEff db section is not set in config file" + exit() + else: + reference_basename = (os.path.basename(args.reference)).split(".") + snpeffdb = reference_basename[0] + prepare_snpEff_db(reference_basename) + + annotate_vcf_cmd_array = [] + annotate_final_vcf_cmd_array = [] + for i in vcf_filenames: + raw_vcf = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_aln_mpileup_raw.vcf') + annotate_vcf_cmd = "java -Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ + (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], raw_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, raw_vcf, raw_vcf) + print annotate_vcf_cmd + annotate_vcf_cmd_array.append(annotate_vcf_cmd) + final_vcf = i + annotate_final_vcf_cmd = "java -Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ + (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], final_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, final_vcf, final_vcf) + annotate_final_vcf_cmd_array.append(annotate_final_vcf_cmd) + if args.numcores: + num_cores = int(num_cores) + else: + num_cores = multiprocessing.cpu_count() + #print annotate_vcf_cmd_array + results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in annotate_vcf_cmd_array) + results_2 = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in 
annotate_final_vcf_cmd_array) + +def indel_annotation(): + keep_logging('Annotating indels using snpEff.', 'Annotating indels using snpEff.', logger, 'info') + + if ConfigSectionMap("snpeff", Config)['prebuild'] == "yes": + if ConfigSectionMap("snpeff", Config)['db']: + print "Using pre-built snpEff database: %s" % ConfigSectionMap("snpeff", Config)['db'] + proc = subprocess.Popen(["java -jar %s/%s/%s databases | grep %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], ConfigSectionMap("snpeff", Config)['db'])], + stdout=subprocess.PIPE, shell=True) + (out2, err2) = proc.communicate() + if out2: + snpeffdb = ConfigSectionMap("snpeff", Config)['db'] + else: + print "The database name %s provided was not found. Check the name and try again" % ConfigSectionMap("snpeff", Config)['db'] + exit() + else: + print "snpEff db section is not set in config file" + exit() + else: + reference_basename = (os.path.basename(args.reference)).split(".") + snpeffdb = reference_basename[0] + prepare_snpEff_db(reference_basename) + + + annotate_vcf_cmd_array = [] + annotate_final_vcf_cmd_array = [] + for i in vcf_filenames: + raw_vcf = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_aln_mpileup_raw.vcf') + annotate_vcf_cmd = "java -Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ + (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], raw_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, raw_vcf, raw_vcf) + annotate_vcf_cmd_array.append(annotate_vcf_cmd) + final_vcf = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf') + annotate_final_vcf_cmd = "java 
-Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ + (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], final_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, final_vcf, final_vcf) + annotate_final_vcf_cmd_array.append(annotate_final_vcf_cmd) + if args.numcores: + num_cores = int(num_cores) + else: + num_cores = multiprocessing.cpu_count() + results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in annotate_vcf_cmd_array) + results_2 = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in annotate_final_vcf_cmd_array) + +def gatk_combine_variants(files_gatk, reference, out_path, merged_file_suffix, logger, Config): + base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)[ + 'gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] + #files_gatk = "--variant " + ' --variant '.join(vcf_files_array) + keep_logging("java -jar %s -T CombineVariants -R %s %s -o %s/Final_vcf_gatk%s" % (base_cmd, reference, files_gatk, out_path, merged_file_suffix), "java -jar %s -T CombineVariants -R %s %s -o %s/Final_vcf_gatk%s" % (base_cmd, reference, files_gatk, out_path, merged_file_suffix), logger, 'debug') + merge_gatk_commands_file = "%s/gatk_merge.sh" % args.filter2_only_snp_vcf_dir + with open(merge_gatk_commands_file, 'w+') as fopen: + fopen.write("java -jar %s -T CombineVariants -R %s %s -o %s/Final_vcf_gatk%s" % (base_cmd, reference, files_gatk, out_path, merged_file_suffix) + '\n') + fopen.close() + # Commenting out calling gatk combine variants with a custom logging call method, problem with python subprocess, OSError: [Errno 7] Argument list too long + os.system("bash %s" % 
merge_gatk_commands_file) + return "%s/Final_vcf_gatk%s" % (out_path, merged_file_suffix) + +def annotated_snp_matrix(): + """ + :return: Annotate core vcf files generated at core_prep steps. + Read Genbank file and return a dictionary of Prokka ID mapped to Gene Name, Prokka ID mapped to Product Name. + This dictionary will then be used to insert annotation into SNP/Indel matrix + """ + + """Annotate all VCF file formats with SNPeff""" + # Commented for debugging + variant_annotation() + + indel_annotation() + + + """ Start of Extract Annotation information from Genbank file + + Extract Annotation information from Genbank file + + - Check if Reference genome Genbank file exists. + - Initiate dictionaries that maps locus tag to gene name and product. This information will be used for annotating SNP/Indel Matrix + - Read the locus tag and gene annotations into a dictionary that maps locus tags to gene name/product name + + """ + + reference_basename = (os.path.basename(args.reference)).split(".") + if os.path.isfile("%s/%s.gbf" % (os.path.dirname(args.reference), reference_basename[0])): + handle = open("%s/%s.gbf" % (os.path.dirname(args.reference), reference_basename[0]), 'rU') + else: + raise IOError('%s/%s.gbf does not exist.' 
% (os.path.dirname(args.reference), reference_basename[0])) + exit() + + locus_tag_to_gene_name = {} + locus_tag_to_product = {} + locus_tag_to_strand = {} + #locus_tag_to_uniprot = {} + #locus_tag_to_ec_number = {} + + keep_logging( + 'Reading annotations from Reference genome genbank file: %s/%s.gbf' % (os.path.dirname(args.reference), reference_basename[0]), + 'Reading annotations from Reference genome genbank file: %s/%s.gbf' % (os.path.dirname(args.reference), reference_basename[0]), + logger, 'info') + for record in SeqIO.parse(handle, 'genbank') : + for feature in record.features: + location = str(feature.location) + strand = location.split('(')[1].replace(')', '') + if 'locus_tag' in feature.qualifiers: + locus_tag_to_strand[str(feature.qualifiers['locus_tag'][0])] = strand + if 'gene' in feature.qualifiers: + locus_tag_to_gene_name[str(feature.qualifiers['locus_tag'][0])] = str(feature.qualifiers['gene'][0]) + else: + locus_tag_to_gene_name[str(feature.qualifiers['locus_tag'][0])] = "null or hypothetical protein" + if 'product' in feature.qualifiers: + locus_tag_to_product[str(feature.qualifiers['locus_tag'][0])] = str(feature.qualifiers['product'][0]) + else: + locus_tag_to_product[str(feature.qualifiers['locus_tag'][0])] = "null or hypothetical protein" + else: + keep_logging( + 'Error: locus_tag specifications for the below feature doesnt exists. Please check the format of genbank file\n%s' % str(feature), + 'Error: locus_tag specifications for the below feature doesnt exists. 
Please check the format of genbank file\n%s' % str(feature), + logger, 'exception') + + # Annotation Bug fix 1 + first_locus_tag = record.features[1].qualifiers['locus_tag'][0] + last_element = len(record.features) - 1 + last_locus_tag = record.features[last_element].qualifiers['locus_tag'][0] + + # #Debugging prints + # print first_locus_tag + # print locus_tag_to_gene_name[first_locus_tag] + # print last_locus_tag + # print locus_tag_to_gene_name[last_locus_tag] + + """ End of Extract Annotation information from Genbank file + + Extract Annotation information from Genbank file + + - Check if Reference genome Genbank file exists. + - Initiate dictionaries that maps locus tag to gene name and product. This information will be used for annotating SNP/Indel Matrix + - Read the locus tag and gene annotations into a dictionary that maps locus tags to gene name/product name + + """ + + + + """ Start of Merging Step: + + - Merge Individual Annotated raw and filtered vcf files to generate a Final merged vcf file using Gatk combine variants method. + - Parse this merged Final_vcf* file and generate a SNP/Indel matrix + + """ + + keep_logging('Merging Final Annotated VCF files into %s/Final_vcf_no_proximate_snp.vcf using bcftools' % args.filter2_only_snp_vcf_dir, 'Merging Final Annotated VCF files into %s/Final_vcf_no_proximate_snp.vcf using bcftools' % args.filter2_only_snp_vcf_dir, logger, 'info') + + #Commented for debugging + files_for_tabix = glob.glob("%s/*.vcf_no_proximate_snp.vcf_ANN.vcf" % args.filter2_only_snp_vcf_dir) + tabix(files_for_tabix, "vcf", logger, Config) + files_for_tabix = glob.glob("%s/*_filter2_indel_final.vcf_ANN.vcf" % args.filter2_only_snp_vcf_dir) + tabix(files_for_tabix, "vcf", logger, Config) + + files = ' '.join(vcf_filenames) + + + """ bcftools merging is deprecated. 
Replaced with GATK combinevariants """ + merge_commands_file = "%s/bcftools_merge.sh" % args.filter2_only_snp_vcf_dir + + with open(merge_commands_file, 'w+') as fopen: + fopen.write("%s/%s/bcftools merge -i ANN:join -m both -o %s/Final_vcf_no_proximate_snp.vcf -O v %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bcftools", Config)['bcftools_bin'], args.filter2_only_snp_vcf_dir, files.replace("_filter2_final.vcf_no_proximate_snp.vcf", "_filter2_final.vcf_no_proximate_snp.vcf_ANN.vcf.gz")) + '\n') + fopen.write("%s/%s/bcftools merge -i ANN:join -m both -o %s/Final_vcf_indel.vcf -O v %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bcftools", Config)['bcftools_bin'], args.filter2_only_snp_vcf_dir,files.replace("_filter2_final.vcf_no_proximate_snp.vcf","_filter2_indel_final.vcf_ANN.vcf.gz")) + '\n') + + fopen.close() + + os.system("bash %s" % merge_commands_file) + + + """ Merge with Gatk combine variants method """ + # #Commented for debugging + merged_file_suffix = "_no_proximate_snp.vcf" + + annotated_no_proximate_snp_file = "%s/annotated_no_proximate_snp_list.txt" % args.filter2_only_snp_vcf_dir + annotated_no_proximate_snp_indel_file = "%s/annotated_no_proximate_snp_indel_list.txt" % args.filter2_only_snp_vcf_dir + + with open(annotated_no_proximate_snp_file, 'w+') as fopen: + for i in vcf_filenames: + fopen.write(i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_ANN.vcf.gz') + '\n') + fopen.close() + + with open(annotated_no_proximate_snp_indel_file, 'w+') as fopen: + for i in vcf_filenames: + fopen.write(i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf_ANN.vcf.gz') + '\n') + fopen.close() + + #files_gatk = "--variant " + ' --variant '.join(vcf_filenames) + files_gatk = "" + for i in vcf_filenames: + files_gatk = files_gatk + " --variant " + i + final_gatk_snp_merged_vcf = 
gatk_combine_variants(files_gatk.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_ANN.vcf.gz'), args.reference, args.filter2_only_snp_vcf_dir, merged_file_suffix, logger, Config) + + # Test this merge and annotate this merged file - Testing Mode Right now. + #merged_file_suffix = "_no_proximate_snp_1.vcf" + #final_gatk_snp_merged_vcf_1 = gatk_combine_variants(files_gatk,args.reference, args.filter2_only_snp_vcf_dir, merged_file_suffix, logger, Config) + merged_file_suffix = "_indel.vcf" + final_gatk_indel_merged_vcf = gatk_combine_variants(files_gatk.replace('_filter2_final.vcf_no_proximate_snp.vcf', + '_filter2_indel_final.vcf_ANN.vcf.gz'), + args.reference, args.filter2_only_snp_vcf_dir, merged_file_suffix, + logger, Config) + + """ Tabix index the combined GATK Final vcf file """ + files_for_tabix = glob.glob("%s/Final_vcf_*.vcf" % args.filter2_only_snp_vcf_dir) + tabix(files_for_tabix, "vcf", logger, Config) + + + """ End of Merging Step. """ + + + """ Extract ANN information from bcftools Final vcf file. (There is a reason why i am using bcftools merged file to extract ANN information) """ + snp_var_ann_dict = {} + indel_var_ann_dict = {} + + for variants in VCF("%s/Final_vcf_no_proximate_snp.vcf.gz" % args.filter2_only_snp_vcf_dir): + snp_var_ann_dict[variants.POS] = variants.INFO.get('ANN') + + for variants in VCF("%s/Final_vcf_indel.vcf.gz" % args.filter2_only_snp_vcf_dir): + indel_var_ann_dict[variants.POS] = variants.INFO.get('ANN') + + """ End of Extract ANN information from bcftools Final vcf file""" + + + + """ This step is no longer required: Remove this after testing. print_string_header will be the column names of SNP matrix. Column names = Sample names""" + print_string_header = "\t" + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + + + + """ Generate an array of core positions. 
Read Only_ref_variant_positions_for_closely* to get final core variant positions into core_positions array""" + core_positions = [] + if ConfigSectionMap("functional_filters", Config)['apply_to_calls'] == "yes": + core_positions_file = "%s/Only_ref_variant_positions_for_closely_without_functional_filtered_positions" % args.filter2_only_snp_vcf_dir + else: + core_positions_file = "%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir + with open(core_positions_file) as fp: + for line in fp: + line = line.strip() + core_positions.append(line) + fp.close() + + indel_core_positions = [] + if ConfigSectionMap("functional_filters", Config)['apply_to_calls'] == "yes": + core_positions_file = "%s/Only_ref_indel_variant_positions_for_closely_without_functional_filtered_positions" % args.filter2_only_snp_vcf_dir + else: + core_positions_file = "%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir + with open(core_positions_file) as fp: + for line in fp: + line = line.strip() + indel_core_positions.append(line) + fp.close() + + """ End: Generate an array of core positions. """ + + + + """ Generate a list of functional class positions from Phaster, Mummer and Custom Masking results/files""" + """ Read in functional class filter positions. 
""" + functional_filter_pos_array = [] + with open(functional_class_filter_positions, 'rU') as f_functional: + for line_func in f_functional: + functional_filter_pos_array.append(line_func.strip()) + + """ GET individual PHAGE/Repetitive/masked region positions to assign functional class group string """ + phage_positions = [] + repetitive_positions = [] + mask_positions = [] + if ConfigSectionMap("functional_filters", Config)['apply_functional_filters'] == "yes": + if ConfigSectionMap("functional_filters", Config)['find_phage_region'] == "yes": + phage_region_positions = "%s/phage_region_positions.txt" % args.filter2_only_snp_vcf_dir + if os.path.isfile(phage_region_positions): + with open(phage_region_positions, 'rU') as fphage: + for line in fphage: + phage_positions.append(line.strip()) + fphage.close() + else: + raise IOError('%s/phage_region_positions.txt does not exist.' % args.filter2_only_snp_vcf_dir) + exit() + # GET REPETITIVE REGIONS + if ConfigSectionMap("functional_filters", Config)['find_repetitive_region'] == "yes": + repetitive_positions_file = "%s/repeat_region_positions.txt" % args.filter2_only_snp_vcf_dir + if os.path.isfile(repetitive_positions_file): + with open(repetitive_positions_file, 'rU') as frep: + for line in frep: + repetitive_positions.append(line.strip()) + frep.close() + else: + raise IOError('%s/repeat_region_positions.txt does not exist.' % args.filter2_only_snp_vcf_dir) + exit() + # GET MASK REGIONS + if ConfigSectionMap("functional_filters", Config)['mask_region'] == "yes": + mask_positions_file = "%s/mask_positions.txt" % args.filter2_only_snp_vcf_dir + if os.path.isfile(mask_positions_file): + with open(mask_positions_file, 'rU') as fmask: + for line in fmask: + mask_positions.append(line.strip()) + fmask.close() + else: + raise IOError('%s/mask_positions.txt does not exist.' 
% args.filter2_only_snp_vcf_dir) + exit() + + """ End: Generate a list of functional class positions from Phaster, Mummer and Custom Masking results/files""" + + + + + """ Read and parse final GATK merged vcf file cyvcf library; Generate a header string from the sample lis fo this merged vcf file""" + + final_merge_anno_file = VCF("%s/Final_vcf_gatk_no_proximate_snp.vcf.gz" % args.filter2_only_snp_vcf_dir) + + """ Prepare SNP/Indel Matrix print strings and add matrix row information subsequently """ + header_print_string = "Type of SNP at POS > ALT functional=PHAGE_REPEAT_MASK locus_tag=locus_id strand=strand; ALT|Effect|Impact|GeneID|Nrchange|Aachange|Nrgenepos|AAgenepos|gene_symbol|product" + for sample in final_merge_anno_file.samples: + # header_print_string = header_print_string + "," + sample + header_print_string = header_print_string + "\t" + sample + header_print_string = header_print_string + "\n" + + """ End """ + + + + + """ Prepare a All_indel_label_final_ordered_sorted.txt file with sorted unique variant positions. 
""" + paste_label_command = "paste %s/unique_positions_file " % args.filter2_only_snp_vcf_dir + paste_indel_label_command = "paste %s/unique_indel_positions_file " % args.filter2_only_snp_vcf_dir + paste_label_command_exclude_outgroup = "paste %s/unique_positions_file " % args.filter2_only_snp_vcf_dir + paste_indel_label_command_exclude_outgroup = "paste %s/unique_indel_positions_file " % args.filter2_only_snp_vcf_dir + + for filename_base in final_merge_anno_file.samples: + if "R1_001_final.fastq.gz" in filename_base: + second_part = filename_base.replace("R1_001_final.fastq.gz", "R2_001_final.fastq.gz") + first_part_split = filename_base.split('R1_001_final.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + first_part = re.sub("_S.*_", "", first_part) + elif "_R1.fastq.gz" in filename_base: + second_part = filename_base.replace("_R1.fastq.gz", "_R2.fastq.gz") + first_part_split = filename_base.split('_R1.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + first_part = re.sub("_S.*_", "", first_part) + # Changed on 03/15/2019 + elif "R1.fastq.gz" in filename_base: + second_part = filename_base.replace("R1.fastq.gz", "R2.fastq.gz") + first_part_split = filename_base.split('R1.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + first_part = re.sub("_S.*_", "", first_part) + # Changed on 03/15/2019 + first_part = re.sub("_S.*", "", first_part) + elif "1_combine.fastq.gz" in filename_base: + second_part = filename_base.replace("1_combine.fastq.gz", "2_combine.fastq.gz") + first_part_split = filename_base.split('1_combine.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + first_part = re.sub("_S.*_", "", first_part) + elif "1_sequence.fastq.gz" in filename_base: + second_part = filename_base.replace("1_sequence.fastq.gz", "2_sequence.fastq.gz") + first_part_split = filename_base.split('1_sequence.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + first_part = re.sub("_S.*_", "", 
first_part) + elif "_forward.fastq.gz" in filename_base: + second_part = filename_base.replace("_forward.fastq.gz", "_reverse.fastq.gz") + first_part_split = filename_base.split('_forward.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + first_part = re.sub("_S.*_", "", first_part) + elif "R1_001.fastq.gz" in filename_base: + second_part = filename_base.replace("R1_001.fastq.gz", "R2_001.fastq.gz") + first_part_split = filename_base.split('R1_001.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + first_part = re.sub("_S.*_", "", first_part) + elif "_1.fastq.gz" in filename_base: + second_part = filename_base.replace("_1.fastq.gz", "_2.fastq.gz") + first_part_split = filename_base.split('_1.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + first_part = re.sub("_S.*_", "", first_part) + elif ".1.fastq.gz" in filename_base: + second_part = filename_base.replace(".1.fastq.gz", ".2.fastq.gz") + first_part_split = filename_base.split('.1.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + first_part = re.sub("_S.*_", "", first_part) + sample_label_file = "%s/%s_filter2_final.vcf_no_proximate_snp.vcf_positions_label" % ( + args.filter2_only_snp_vcf_dir, first_part) + sample_indel_label_file = "%s/%s_filter2_indel_final.vcf_indel_positions_label" % ( + args.filter2_only_snp_vcf_dir, first_part) + paste_label_command = paste_label_command + sample_label_file + " " + paste_indel_label_command = paste_indel_label_command + sample_indel_label_file + " " + if args.outgroup: + if outgroup not in sample_label_file: + paste_label_command_exclude_outgroup = paste_label_command_exclude_outgroup + sample_label_file + " " + paste_indel_label_command_exclude_outgroup = paste_indel_label_command_exclude_outgroup + sample_indel_label_file + " " + + paste_label_command = paste_label_command + " > %s/All_label_final_ordered.txt" % args.filter2_only_snp_vcf_dir + paste_indel_label_command = paste_indel_label_command + 
" > %s/All_indel_label_final_ordered.txt" % args.filter2_only_snp_vcf_dir + sort_ordered_label_cmd = "sort -n -k1,1 %s/All_label_final_ordered.txt > %s/All_label_final_ordered_sorted.txt" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) + sort_ordered_indel_label_cmd = "sort -n -k1,1 %s/All_indel_label_final_ordered.txt > %s/All_indel_label_final_ordered_sorted.txt" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) + + if args.outgroup: + paste_label_command_exclude_outgroup = paste_label_command_exclude_outgroup + " > %s/All_label_final_ordered_exclude_outgroup.txt" % args.filter2_only_snp_vcf_dir + paste_indel_label_command_exclude_outgroup = paste_indel_label_command_exclude_outgroup + " > %s/All_indel_label_final_ordered_exclude_outgroup.txt" % args.filter2_only_snp_vcf_dir + sort_ordered_label_cmd_exclude_outgroup = "sort -n -k1,1 %s/All_label_final_ordered_exclude_outgroup.txt > %s/All_label_final_ordered_exclude_outgroup_sorted.txt" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) + sort_ordered_indel_label_cmd_exclude_outgroup = "sort -n -k1,1 %s/All_indel_label_final_ordered_exclude_outgroup.txt > %s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) + + + with open('%s/All_label_final_ordered.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: + outfile.write(paste_label_command + '\n') + outfile.write(sort_ordered_label_cmd + '\n') + outfile.write(paste_indel_label_command + '\n') + outfile.write(sort_ordered_indel_label_cmd + '\n') + outfile.close() + + os.system("bash %s/All_label_final_ordered.sh" % args.filter2_only_snp_vcf_dir) + + if args.outgroup: + # Just in case if os.system past commands doesn't work + with open('%s/All_label_final_ordered_exclude_outgroup.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: + outfile.write(paste_label_command_exclude_outgroup + '\n') + 
outfile.write(sort_ordered_label_cmd_exclude_outgroup + '\n') + outfile.write(paste_indel_label_command_exclude_outgroup + '\n') + outfile.write(sort_ordered_indel_label_cmd_exclude_outgroup + '\n') + outfile.close() + + # Changed: Uncomment this + os.system("bash %s/All_label_final_ordered_exclude_outgroup.sh" % args.filter2_only_snp_vcf_dir) + + """ End: Prepare a All_indel_label_final_ordered_sorted.txt file with sorted unique variant positions. """ + + + + + + + + """ Generate a position_label and position_indel_label dictionary that will contain information about each unique variant position that passed variant filters in any sample and reasons for being filtered out in any sample """ + position_label = OrderedDict() + with open("%s/All_label_final_ordered_sorted.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: + keep_logging('Reading All label positions file: %s/All_label_final_ordered_sorted.txt' % args.filter2_only_snp_vcf_dir, + 'Reading All label positions file: %s/All_label_final_ordered_sorted.txt' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + for row in csv_reader: + position_label[row[0]] = ','.join(row[1:]) + csv_file.close() + + # #Commented for debugging + position_indel_label = OrderedDict() + with open("%s/All_indel_label_final_ordered_sorted.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: + keep_logging( + 'Reading All label positions file: %s/All_indel_label_final_ordered_sorted.txt' % args.filter2_only_snp_vcf_dir, + 'Reading All label positions file: %s/All_indel_label_final_ordered_sorted.txt' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + for row in csv_reader: + if row[0] not in position_label.keys(): + position_indel_label[row[0]] = ','.join(row[1:]) + else: + position_indel_label[row[0]] = ','.join(row[1:]) + keep_logging('Warning: position %s already present as a SNP' % row[0], + 'Warning: position %s 
already present as a SNP' % row[0], logger, 'info') + csv_file.close() + + """ End: Generate a position_label and position_indel_label dictionary """ + + + + + + """ Generate mask_fq_mq_positions array with positions where a variant was filtered because of LowFQ or LowMQ """ + mask_fq_mq_positions = [] + mask_fq_mq_positions_outgroup_specific = [] + if args.outgroup: + position_label_exclude_outgroup = OrderedDict() + with open("%s/All_label_final_ordered_exclude_outgroup_sorted.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: + keep_logging( + 'Reading All label positions file: %s/All_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, + 'Reading All label positions file: %s/All_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + for row in csv_reader: + position_label_exclude_outgroup[row[0]] = ','.join(row[1:]) + csv_file.close() + + #Commented for debugging + position_indel_label_exclude_outgroup = OrderedDict() + with open("%s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: + keep_logging( + 'Reading All label positions file: %s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, + 'Reading All label positions file: %s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + for row in csv_reader: + if row[0] not in position_label_exclude_outgroup.keys(): + position_indel_label_exclude_outgroup[row[0]] = ','.join(row[1:]) + else: + position_indel_label_exclude_outgroup[row[0]] = ','.join(row[1:]) + keep_logging('Warning: position %s already present as a SNP' % row[0], + 'Warning: position %s already present as a SNP' % row[0], logger, 'info') + csv_file.close() + + for key in position_label_exclude_outgroup.keys(): + 
label_sep_array = position_label_exclude_outgroup[key].split(',') + for i in label_sep_array: + if "LowFQ" in str(i): + if key not in mask_fq_mq_positions: + if int(key) not in outgroup_specific_positions: + mask_fq_mq_positions.append(key) + elif int(key) in outgroup_specific_positions: + mask_fq_mq_positions_outgroup_specific.append(key) + if i == "HighFQ": + if key not in mask_fq_mq_positions: + if int(key) not in outgroup_specific_positions: + mask_fq_mq_positions.append(key) + elif int(key) in outgroup_specific_positions: + mask_fq_mq_positions_outgroup_specific.append(key) + else: + for key in position_label.keys(): + label_sep_array = position_label[key].split(',') + for i in label_sep_array: + if "LowFQ" in str(i): + if key not in mask_fq_mq_positions: + mask_fq_mq_positions.append(key) + if i == "HighFQ": + if key not in mask_fq_mq_positions: + mask_fq_mq_positions.append(key) + + fp = open("%s/mask_fq_mq_positions.txt" % (args.filter2_only_snp_vcf_dir), 'w+') + for i in mask_fq_mq_positions: + fp.write(i + '\n') + fp.close() + + fp = open("%s/mask_fq_mq_positions_outgroup_specific.txt" % (args.filter2_only_snp_vcf_dir), 'w+') + for i in mask_fq_mq_positions_outgroup_specific: + fp.write(i + '\n') + fp.close() + + print "Length of mask_fq_mq_positions:%s" % len(mask_fq_mq_positions) + print "Length of mask_fq_mq_positions specific to outgroup:%s" % len(mask_fq_mq_positions_outgroup_specific) + + """ End: Generate mask_fq_mq_positions array """ + + + + + + + + + + + """ Main: Generate SNP Matrix """ + + + """ Open Matrix files to write strings """ + fp_code = open("%s/SNP_matrix_code.csv" % args.filter2_only_snp_vcf_dir, 'w+') + fp_allele = open("%s/SNP_matrix_allele_outdated.csv" % args.filter2_only_snp_vcf_dir, 'w+') + fp_allele_new = open("%s/SNP_matrix_allele_new.csv" % args.filter2_only_snp_vcf_dir, 'w+') + fp_allele_new_phage = open("%s/SNP_matrix_allele_unmasked.csv" % args.filter2_only_snp_vcf_dir, 'w+') + fp_code.write(header_print_string) + 
fp_allele.write(header_print_string) + fp_allele_new.write(header_print_string) + fp_allele_new_phage.write(header_print_string) + + """ Parse variant positions from the loaded cyvcf VCF object and generate the matrix row information """ + for variants in VCF("%s/Final_vcf_gatk_no_proximate_snp.vcf.gz" % args.filter2_only_snp_vcf_dir): + # Initiate print_string variable to add matrix row information. + # print_string generator no. 1 + print_string = "" + + # Initiate and assign Functional Field filter string => PHAGE/REPEAT/MASK/NULL + functional_field = "" + if str(variants.POS) in phage_positions: + functional_field = functional_field + "PHAGE_" + else: + functional_field = functional_field + "NULL_" + if str(variants.POS) in repetitive_positions: + functional_field = functional_field + "REPEATS_" + else: + functional_field = functional_field + "NULL_" + if str(variants.POS) in mask_positions: + functional_field = functional_field + "MASK" + else: + functional_field = functional_field + "NULL" + + # Initiate variant code string where the code means: + # REF allele = 0, core = 1, Filtered = 2, unmapped = -1, True but non-core = 3 + # This will be used as row information for SNP_matrix_code file + + code_string = position_label[str(variants.POS)] + code_string = code_string.replace('reference_allele', '0') + code_string = code_string.replace('reference_unmapped_position', '-1') + # Changing LowFQ code from 2 to -3 + # Changing HighFQ but LowMQ code from 2 to -4 + code_string = code_string.replace('LowFQ_QUAL_DP_proximate_SNP', '-3') + code_string = code_string.replace('LowFQ_DP_QUAL_proximate_SNP', '-3') + code_string = code_string.replace('LowFQ_QUAL_proximate_SNP', '-3') + code_string = code_string.replace('LowFQ_DP_proximate_SNP', '-3') + code_string = code_string.replace('LowFQ_proximate_SNP', '-3') + code_string = code_string.replace('LowFQ_QUAL_DP', '-3') + code_string = code_string.replace('LowFQ_DP_QUAL', '-3') + code_string = 
code_string.replace('LowFQ_QUAL', '-3') + code_string = code_string.replace('LowFQ_DP', '-3') + code_string = code_string.replace('HighFQ_QUAL_DP_proximate_SNP', '2') + code_string = code_string.replace('HighFQ_DP_QUAL_proximate_SNP', '2') + code_string = code_string.replace('HighFQ_QUAL_proximate_SNP', '2') + code_string = code_string.replace('HighFQ_DP_proximate_SNP', '2') + code_string = code_string.replace('HighFQ_proximate_SNP', '2') + code_string = code_string.replace('HighFQ_QUAL_DP', '2') + code_string = code_string.replace('HighFQ_DP_QUAL', '2') + code_string = code_string.replace('HighFQ_QUAL', '2') + code_string = code_string.replace('HighFQ_DP', '2') + code_string = code_string.replace('LowFQ', '-3') + code_string = code_string.replace('HighFQ', '-4') + + + if str(variants.POS) in core_positions: + code_string = code_string.replace('VARIANT', '1') + # Adding functional class status code to SNP matrix: 2018-07-24 + elif str(variants.POS) in functional_filter_pos_array: + # Changing Functional class filter code to -2 from 2: 2018-12-04 + code_string = code_string.replace('VARIANT', '-2') + else: + code_string = code_string.replace('VARIANT', '3') + + # Remove this commented section: Deprecated + # Changing SNP type: Date 28/05/2019 + # Assign type of snp: coding / non-coding + # if variants.INFO.get('ANN'): + # if "protein_coding" in variants.INFO.get('ANN'): + # snp_type = "Coding SNP" + # else: + # snp_type = "Non-coding SNP" + # else: + # if len(variants.ALT) > 1 and snp_var_ann_dict[variants.POS]: + # #print variants.ALT + # #print ';'.join(set(snp_var_ann_dict[variants.POS].split(','))) + # #print variants.POS + # #print set(snp_var_ann_dict[variants.POS]) + # if "protein_coding" in set(snp_var_ann_dict[variants.POS].split(',')): + # snp_type = "Coding SNP" + # else: + # snp_type = "Non-coding SNP" + # else: + # snp_type = "Non-coding SNP" + # Remove this commented section: Deprecated + + # Annotation Bug fix 2 + # Changing SNP type: Date 28/05/2019 
+ if variants.POS in snp_var_ann_dict.keys(): + if snp_var_ann_dict[variants.POS] is not None: + if "protein_coding" in set(snp_var_ann_dict[variants.POS].split('|')) and "intergenic_region" not in set(snp_var_ann_dict[variants.POS].split('|')): + snp_type = "Coding SNP" + elif "protein_coding" in set(snp_var_ann_dict[variants.POS].split('|')) and "intergenic_region" in set(snp_var_ann_dict[variants.POS].split('|')): + snp_type = "Coding and Non-coding SNP" + elif "protein_coding" not in set(snp_var_ann_dict[variants.POS].split('|')) and "intergenic_region" in set(snp_var_ann_dict[variants.POS].split('|')): + snp_type = "Non-Coding SNP" + elif "protein_coding" not in set(snp_var_ann_dict[variants.POS].split('|')) and "intragenic_variant" in set(snp_var_ann_dict[variants.POS].split('|')): + snp_type = "Non-Coding SNP" + else: + print set((snp_var_ann_dict[variants.POS].split('|'))) + snp_type = "No_protein_coding/intergenic_region_field_in_ANN SNP" + #print snp_type + else: + keep_logging('Warning: position %s not found in snp_var_ann_dict dictionary. Assigning Not found as SNP type.' % variants.POS, 'Warning: position %s not found in snp_var_ann_dict dictionary. Assigning Not found as SNP type.' % variants.POS, logger, 'info') + print set((snp_var_ann_dict[variants.POS].split('|'))) + snp_type = "Not Found in Annotated VCF file" + + #print snp_type + + # print_string generator no. 2 + print_string = print_string + snp_type + " at %s > " % str(variants.POS) + str(",".join(variants.ALT)) + " functional=%s" % functional_field + + # Annotation Bug fix 3 + # Get ANN field from variant INFO column and save it as an array. 
Split and Go through each elements, add bells and whistles + if variants.INFO.get('ANN'): + + ann_array = (variants.INFO.get('ANN')).split(',') + + # Generate tag string before generating ann_string + if len(ann_array) > 1: + # print variants.INFO.get('ANN') + # print list(set(ann_array)) + tag_list = [] + + for i_again in set(snp_var_ann_dict[variants.POS].split(',')): + i_split_again = i_again.split('|') + + + + + if "-" not in i_split_again[4]: + if i_split_again[4] not in tag_list: + tag_list.append(i_split_again[4]) + + else: + split_tags = i_split_again[4].split('-') + for splittagsindividual in split_tags: + if splittagsindividual not in tag_list: + tag_list.append(splittagsindividual) + + if len(tag_list) == 1: + tag = tag_list[0] + + elif len(tag_list) == 2: + tag = str(tag_list[0]) + "-" + str(tag_list[1]) + + elif len(tag_list) > 2: + print tag_list + tag = tag.replace('CHR_START-', '') + tag = tag.replace('-CHR_END', '') + else: + for i in list(set(ann_array)): + i_split = i.split('|') + tag = str(i_split[4]).replace('CHR_START-', '') + tag = str(tag).replace('-CHR_END', '') + + + ann_string = ";" + for i in list(set(ann_array)): + i_split = i.split('|') + #ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13]]) + ";" + + # MOve this tag before this for loop because of multiple tags associated. 
+ # tag = str(i_split[4]).replace('CHR_START-', '') + # tag = str(tag).replace('-CHR_END', '') + + if "-" in tag: + #print tag + extra_tags = "" + tag_split = tag.split('-') + for i in tag_split: + if i in locus_tag_to_gene_name.keys(): + extra_tags = extra_tags + locus_tag_to_gene_name[i] + "," + else: + extra_tags = extra_tags + "None" + "," + extra_tags_prot = "" + for i in tag_split: + if i in locus_tag_to_product.keys(): + extra_tags_prot = extra_tags_prot + locus_tag_to_product[i] + "," + else: + extra_tags_prot = extra_tags_prot + "None" + "," + ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags, extra_tags_prot]) + ";" + # Changing SNP type: Date 28/05/2019 + elif tag == "": + print "ERROR: Issues with this locus tag. Check this tag in genbank file" + print list(set(ann_array)) + # Adding this so that Ann string is not empty: 30/05/2019 + if tag in locus_tag_to_gene_name.keys() and tag in locus_tag_to_product.keys(): + extra_tags = str(locus_tag_to_gene_name[tag]) + "|" + str(locus_tag_to_product[tag]) + else: + print "tag key not found: %s" % tag + extra_tags = "NULL" + "|" + "NULL" + # ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" + # Added 2019-31-05 + if "ERROR_OUT_OF_CHROMOSOME_RANGE" in i: + ann_string = ann_string + '|'.join( + [i_split[0], "intergenic_region", i_split[2], "ERROR_OUT_OF_CHROMOSOME_RANGE", i_split[9], i_split[10], i_split[11], + i_split[13], extra_tags]) + ";" + else: + ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" + # Debugging + if i_split[3] == "CD630_00290": + print ann_string + # Changing SNP type: Date 28/05/2019 + else: + if tag in locus_tag_to_gene_name.keys() and tag in locus_tag_to_product.keys(): + extra_tags = 
str(locus_tag_to_gene_name[tag]) + "|" + str(locus_tag_to_product[tag]) + else: + print "tag key not found: %s" % tag + extra_tags = "NULL" + "|" + "NULL" + # ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" + ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" + + # Annotation Bug fix 4 + # Changing SNP type: Date 28/05/2019 + # Working/Testing + else: + if len(variants.ALT) > 1 and snp_var_ann_dict[variants.POS]: + #print variants.ALT + #print ';'.join(set(snp_var_ann_dict[variants.POS].split(','))) + + ann_string = ";%s" % ';'.join(set(snp_var_ann_dict[variants.POS].split(','))) + # Get Tag here; Multiple tag names. + tag_list = [] + + + for i in set(snp_var_ann_dict[variants.POS].split(',')): + i_split = i.split('|') + if i_split[4] not in tag_list: + tag_list.append(i_split[4]) + if len(tag_list) > 1: + tag = str(tag_list[0]) + "-" + str(tag_list[1]) + else: + tag = tag_list[0] + + # if len(set(snp_var_ann_dict[variants.POS].split(','))) > 2: + # print tag + # print set(snp_var_ann_dict[variants.POS].split(',')) + + else: + ann_string = ";None" + + # Annotation Bug fix 5 + # Changing SNP type: Date 28/05/2019 + ann_string = ann_string.replace('ERROR_OUT_OF_CHROMOSOME_RANGE', '%s-%s' % (locus_tag_to_gene_name[last_locus_tag], locus_tag_to_gene_name[first_locus_tag])) + ann_string = ann_string.replace('CHR_END', '%s' % locus_tag_to_gene_name[first_locus_tag]) + + # SNP Matrix Bug + # No changes here: 28/05/2019 + ann_string_split = ann_string.split(';') + #print len(ann_string_split) + if len(ann_string_split) == 3: + first_allele_ann_string_split = ann_string_split[1].split('|') + second_allele_ann_string_split = ann_string_split[2].split('|') + if len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10: + ann_string = ann_string + 
elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) == 10: + if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": + prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] + else: + prod = first_allele_ann_string_split[14] + "|" + first_allele_ann_string_split[15] + new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + first_allele_ann_string_split[1] + "|" + first_allele_ann_string_split[2] + "|" + first_allele_ann_string_split[4] + "|" + first_allele_ann_string_split[9] + "|" + first_allele_ann_string_split[10] + "|" + first_allele_ann_string_split[11] + "|" + first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + ann_string = new_first_allele_ann_string + str(ann_string_split[2]) + + elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) > 10: + + if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": + prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] + else: + prod = second_allele_ann_string_split[14] + "|" + second_allele_ann_string_split[15] + new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + second_allele_ann_string_split[1] + "|" + second_allele_ann_string_split[2] + "|" + \ + second_allele_ann_string_split[4] + "|" + second_allele_ann_string_split[9] + "|" + \ + second_allele_ann_string_split[10] + "|" + second_allele_ann_string_split[11] + "|" + \ + second_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + ann_string = str(ann_string_split[1]) + new_second_allele_ann_string + elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) > 10: + + + if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": + prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] + else: + prod = first_allele_ann_string_split[14] + "|" + 
first_allele_ann_string_split[15] + new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + first_allele_ann_string_split[1] + "|" + first_allele_ann_string_split[2] + "|" + first_allele_ann_string_split[4] + "|" + first_allele_ann_string_split[9] + "|" + first_allele_ann_string_split[10] + "|" + first_allele_ann_string_split[11] + "|" + first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": + prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] + else: + prod = second_allele_ann_string_split[14] + "|" + second_allele_ann_string_split[15] + new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + second_allele_ann_string_split[1] + "|" + second_allele_ann_string_split[2] + "|" + \ + second_allele_ann_string_split[4] + "|" + second_allele_ann_string_split[9] + "|" + \ + second_allele_ann_string_split[10] + "|" + second_allele_ann_string_split[11] + "|" + \ + second_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + ann_string = new_first_allele_ann_string + new_second_allele_ann_string + + + if len(ann_string_split) > 3: + first_allele_ann_string_split = ann_string_split[1].split('|') + second_allele_ann_string_split = ann_string_split[2].split('|') + third_allele_ann_string_split = ann_string_split[3].split('|') + if len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10 and len(third_allele_ann_string_split) == 10: + ann_string = ann_string + + elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) == 10 and len(third_allele_ann_string_split) == 10: + if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": + prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] + else: + prod = first_allele_ann_string_split[14] + "|" + first_allele_ann_string_split[15] + 
new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + first_allele_ann_string_split[1] + "|" + first_allele_ann_string_split[2] + "|" + first_allele_ann_string_split[4] + "|" + first_allele_ann_string_split[9] + "|" + first_allele_ann_string_split[10] + "|" + first_allele_ann_string_split[11] + "|" + first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + ann_string = new_first_allele_ann_string + str(ann_string_split[2]) + str(ann_string_split[3]) + + elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) > 10 and len(third_allele_ann_string_split) == 10: + + if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": + prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] + else: + prod = second_allele_ann_string_split[14] + "|" + second_allele_ann_string_split[15] + new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + second_allele_ann_string_split[1] + "|" + second_allele_ann_string_split[2] + "|" + \ + second_allele_ann_string_split[4] + "|" + second_allele_ann_string_split[9] + "|" + \ + second_allele_ann_string_split[10] + "|" + second_allele_ann_string_split[11] + "|" + \ + second_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + ann_string = str(ann_string_split[1]) + new_second_allele_ann_string + str(ann_string_split[3]) + + elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10 and len(third_allele_ann_string_split) > 10: + + if third_allele_ann_string_split[14] == "" and third_allele_ann_string_split[15] == "": + prod = third_allele_ann_string_split[3] + third_allele_ann_string_split[15] + else: + prod = third_allele_ann_string_split[14] + "|" + third_allele_ann_string_split[15] + new_third_allele_ann_string = third_allele_ann_string_split[0] + "|" + third_allele_ann_string_split[1] + "|" + third_allele_ann_string_split[2] + "|" + \ + 
third_allele_ann_string_split[4] + "|" + third_allele_ann_string_split[9] + "|" + \ + third_allele_ann_string_split[10] + "|" + third_allele_ann_string_split[11] + "|" + \ + third_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + ann_string = str(ann_string_split[1]) + str(ann_string_split[2]) + new_third_allele_ann_string + + elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) > 10 and len(third_allele_ann_string_split) > 10: + #print ann_string + if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": + prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] + else: + prod = first_allele_ann_string_split[14] + "|" + first_allele_ann_string_split[15] + new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + first_allele_ann_string_split[1] + "|" + first_allele_ann_string_split[2] + "|" + first_allele_ann_string_split[4] + "|" + first_allele_ann_string_split[9] + "|" + first_allele_ann_string_split[10] + "|" + first_allele_ann_string_split[11] + "|" + first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": + prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] + else: + prod = second_allele_ann_string_split[14] + "|" + second_allele_ann_string_split[15] + new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + second_allele_ann_string_split[1] + "|" + second_allele_ann_string_split[2] + "|" + \ + second_allele_ann_string_split[4] + "|" + second_allele_ann_string_split[9] + "|" + \ + second_allele_ann_string_split[10] + "|" + second_allele_ann_string_split[11] + "|" + \ + second_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + if third_allele_ann_string_split[14] == "" and third_allele_ann_string_split[15] == "": + prod = third_allele_ann_string_split[3] + third_allele_ann_string_split[15] 
+ else: + prod = third_allele_ann_string_split[14] + "|" + third_allele_ann_string_split[15] + new_third_allele_ann_string = third_allele_ann_string_split[0] + "|" + third_allele_ann_string_split[1] + "|" + third_allele_ann_string_split[2] + "|" + \ + third_allele_ann_string_split[4] + "|" + third_allele_ann_string_split[9] + "|" + \ + third_allele_ann_string_split[10] + "|" + third_allele_ann_string_split[11] + "|" + \ + third_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + ann_string = new_first_allele_ann_string + new_second_allele_ann_string + new_third_allele_ann_string + + + # print_string generator no. 3 + + # Annotation Bug fix 6 + # Changing Strandness string: Date 28/05/2019 + # Each Locus ID with a strand information + strandness = " Strand Information: " + if "-" in tag: + tagsplit = tag.split('-') + for i in tagsplit: + if i in locus_tag_to_strand.keys(): + if "," in locus_tag_to_strand[i]: + locus_tag_to_strand_split = locus_tag_to_strand[i].split(',') + strand = locus_tag_to_strand_split[0] + else: + strand = locus_tag_to_strand[i] + strandness = strandness + i + "=" + strand + "/" + else: + if i == "" or i == "None": + strandness = strandness + "NULL=" + "No Strand Information found" + "/" + else: + strandness = strandness + i + "=" + "No Strand Information found" + "/" + else: + if tag in locus_tag_to_strand.keys(): + # strandness = strandness + locus_tag_to_strand[tag] + if "," in locus_tag_to_strand[tag]: + locus_tag_to_strand_split = locus_tag_to_strand[tag].split(',') + strand = locus_tag_to_strand_split[0] + else: + strand = locus_tag_to_strand[tag] + strandness = strandness + tag + "=" + strand + else: + if tag == "" or tag == "None": + strandness = strandness + "NULL=" + "No Strand Information found" + else: + strandness = strandness + tag + "=" + "No Strand Information found" + + # Annotation Bug fix 7 + # Changing tag equals NULL: Date 30/05/2019 + if tag == "" or tag == "None": + tag = "NULL" + + print_string = 
print_string + " locus_tag=" + tag + strandness + ann_string + print_string_phage = print_string + + + + """ Go over each genotype for a variant and generate a gt_string variable """ + gt_string = "" + for gt in variants.gt_bases: + gt = gt.replace('./.', '.') + gt_string = gt_string + "," + gt + gt_string = gt_string.replace('A/A', 'A') + gt_string = gt_string.replace('G/G', 'G') + gt_string = gt_string.replace('C/C', 'C') + gt_string = gt_string.replace('T/T', 'T') + gt_string = gt_string.replace('.', variants.REF) + + + # print_string generator no. 4 + # Replace various seperators that were used in old matrix. Clean up this block of code + final_allele_string = print_string + gt_string.replace(',', '\t') + '\n' + # Replace code at Phage Positions with -2 + if str(variants.POS) in functional_filter_pos_array: + code_string_array = code_string.split(',') + for (i, item) in enumerate(code_string_array): + if item == "0": + code_string_array[i] = "-2" + for (i, item) in enumerate(code_string_array): + if item == "1": + code_string_array[i] = "-2" + for (i, item) in enumerate(code_string_array): + if item == "2": + code_string_array[i] = "-2" + for (i, item) in enumerate(code_string_array): + if item == "3": + code_string_array[i] = "-2" + for (i, item) in enumerate(code_string_array): + if item == "4": + code_string_array[i] = "-2" + for (i, item) in enumerate(code_string_array): + if item == "-1": + code_string_array[i] = "-2" + for (i, item) in enumerate(code_string_array): + if item == "-2": + code_string_array[i] = "-2" + for (i, item) in enumerate(code_string_array): + if item == "-3": + code_string_array[i] = "-2" + for (i, item) in enumerate(code_string_array): + if item == "-4": + code_string_array[i] = "-2" + code_string = ','.join(code_string_array) + + final_code_string = print_string + "\t" + code_string.replace(',', '\t') + '\n' + final_allele_string = final_allele_string.replace(',|', '|') + + final_allele_string = final_allele_string.replace(',;,', 
':::') + final_allele_string = final_allele_string.replace(';,', ':::') + final_code_string = final_code_string.replace(',|', '|') + + + final_code_string = final_code_string.replace(',;,', ':::') + final_code_string = final_code_string.replace(';,', ':::') + final_code_string = final_code_string.replace(';\t\t', ';\t') + final_code_string = final_code_string.replace('\t\t', '\t') + final_allele_string = final_allele_string.replace('\t\t', '\t') + fp_allele.write(final_allele_string) + fp_code.write(final_code_string) + + + + ntd_string = "" + ntd_string_phage = "" + count = 0 + code_string_array = code_string.split(',') + gt_string_array = gt_string[1:].split(',') + + + for i in gt_string_array: + if str(code_string_array[count]) == "0" or str(code_string_array[count]) == "1" or str(code_string_array[count]) == "3": + ntd_string = ntd_string + "\t" + str(i) + ntd_string_phage = ntd_string_phage + "\t" + str(i) + if code_string_array[count] == "-1": + ntd_string = ntd_string + "\t" + "-" + ntd_string_phage = ntd_string_phage + "\t" + "-" + # Changing Functional class filter code to -2 from 2 and replacing variant allele with N: 2018-12-04 + if str(code_string_array[count]) == "2" or str(code_string_array[count]) == "-2" or str(code_string_array[count]) == "-3" or str(code_string_array[count]) == "-4": + + ntd_string = ntd_string + "\t" + "N" + if str(code_string_array[count]) == "2": + ntd_string_phage = ntd_string_phage + "\t" + "N" + if str(code_string_array[count]) == "-2": + ntd_string_phage = ntd_string_phage + "\t" + str(i) + count += 1 + + # Annotation Bug fix 8 + """ Mask Phage positions and LowFQ/MQ positions in SNP_matrix_allele_new.csv. This is the default matrix. 
""" + if str(variants.POS) in functional_filter_pos_array: + ntd_string_array = ntd_string.split('\t') + #print ntd_string_array + ntd_string = "" + for i in ntd_string_array[1:]: + ntd_string = ntd_string + "\t" + "N" + ntd_string_array = ntd_string.split('\t') + #print ntd_string_array + + + if str(variants.POS) in mask_fq_mq_positions: + ntd_string_array = ntd_string.split('\t') + #print ntd_string_array + ntd_string = "" + for i in ntd_string_array[1:]: + ntd_string = ntd_string + "\t" + "N" + ntd_string_array = ntd_string.split('\t') + #print ntd_string_array + + + """ Generate a print_string for each of the matrix - SNP_matrix_allele_new.csv and SNP_matrix_allele_phage.csv """ + print_string = print_string + ntd_string + "\n" + + print_string_phage = print_string_phage + ntd_string_phage + "\n" + + """ This is a hardcoded solution. Find the root cause of these strings getting into the print_strint variable """ + print_string.replace(',;,', '\t') + print_string.replace(';,', '\t') + print_string_phage.replace(',;,', '\t') + print_string_phage.replace(';,', '\t') + + fp_allele_new.write(print_string) + fp_allele_new_phage.write(print_string_phage) + + fp_code.close() + fp_allele.close() + fp_allele_new.close() + fp_allele_new_phage.close() + +###################################### + """ Indel matrix """ + """ Prepare SNP/Indel Matrix print strings and add matrix row information subsequently """ + header_print_string = "Type of SNP at POS > ALT functional=PHAGE_REPEAT_MASK locus_tag=locus_id strand=strand; ALT|Effect|Impact|GeneID|Nrchange|Aachange|Nrgenepos|AAgenepos|gene_symbol|product" + final_merge_anno_file = VCF("%s/Final_vcf_gatk_indel.vcf.gz" % args.filter2_only_snp_vcf_dir) + for sample in final_merge_anno_file.samples: + # header_print_string = header_print_string + "," + sample + header_print_string = header_print_string + "\t" + sample + header_print_string = header_print_string + "\n" + #header_print_string = header_print_string.replace(':::,', 
':::') + #header_print_string = header_print_string.replace(':::,', '\t') + fp_code = open("%s/Indel_matrix_code.csv" % args.filter2_only_snp_vcf_dir, 'w+') + fp_allele = open("%s/Indel_matrix_allele.csv" % args.filter2_only_snp_vcf_dir, 'w+') + fp_code.write(header_print_string) + fp_allele.write(header_print_string) + + # """ Generate mask_fq_mq_positions array with positions where a variant was filtered because of LowFQ or LowMQ""" + # mask_fq_mq_positions = [] + # for key in position_indel_label.keys(): + # label_sep_array = position_indel_label[key].split(',') + # for i in label_sep_array: + # if "LowAF" in i: + # if key not in mask_fq_mq_positions: + # mask_fq_mq_positions.append(key) + # if i == "HighAF": + # if key not in mask_fq_mq_positions: + # mask_fq_mq_positions.append(key) + # + # print "Length of indel mask_fq_mq_positions array:%s" % len(mask_fq_mq_positions) + + """ Generate mask_fq_mq_positions array with positions where a variant was filtered because of LowFQ or LowMQ""" + mask_fq_mq_positions = [] + mask_fq_mq_positions_outgroup_specific = [] + + if args.outgroup: + position_label_exclude_outgroup = OrderedDict() + with open("%s/All_label_final_ordered_exclude_outgroup_sorted.txt" % args.filter2_only_snp_vcf_dir, + 'rU') as csv_file: + keep_logging( + 'Reading All label positions file: %s/All_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, + 'Reading All label positions file: %s/All_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + for row in csv_reader: + position_label_exclude_outgroup[row[0]] = ','.join(row[1:]) + csv_file.close() + + position_indel_label_exclude_outgroup = OrderedDict() + with open("%s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt" % args.filter2_only_snp_vcf_dir, + 'rU') as csv_file: + keep_logging( + 'Reading All label positions file: 
%s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, + 'Reading All label positions file: %s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + for row in csv_reader: + if row[0] not in position_label_exclude_outgroup.keys(): + position_indel_label_exclude_outgroup[row[0]] = ','.join(row[1:]) + else: + position_indel_label_exclude_outgroup[row[0]] = ','.join(row[1:]) + keep_logging('Warning: position %s already present as a SNP' % row[0], + 'Warning: position %s already present as a SNP' % row[0], logger, 'info') + csv_file.close() + for key in position_label_exclude_outgroup.keys(): + label_sep_array = position_label_exclude_outgroup[key].split(',') + for i in label_sep_array: + if "LowFQ" in str(i): + if key not in mask_fq_mq_positions: + if int(key) not in outgroup_specific_positions: + mask_fq_mq_positions.append(key) + elif int(key) in outgroup_specific_positions: + mask_fq_mq_positions_outgroup_specific.append(key) + if i == "HighFQ": + if key not in mask_fq_mq_positions: + if int(key) not in outgroup_specific_positions: + mask_fq_mq_positions.append(key) + elif int(key) in outgroup_specific_positions: + mask_fq_mq_positions_outgroup_specific.append(key) + else: + for key in position_label.keys(): + label_sep_array = position_label[key].split(',') + for i in label_sep_array: + if "LowFQ" in str(i): + if key not in mask_fq_mq_positions: + mask_fq_mq_positions.append(key) + if i == "HighFQ": + if key not in mask_fq_mq_positions: + mask_fq_mq_positions.append(key) + + + + print "Length of Indel mask_fq_mq_positions:%s" % len(mask_fq_mq_positions) + print "Length of Indel mask_fq_mq_positions specific to outgroup:%s" % len(mask_fq_mq_positions_outgroup_specific) + + + + + + + + for variants in VCF("%s/Final_vcf_gatk_indel.vcf.gz" % args.filter2_only_snp_vcf_dir): + print_string = "" + + functional_field = "" + 
if str(variants.POS) in phage_positions: + functional_field = functional_field + "PHAGE_" + else: + functional_field = functional_field + "NULL_" + if str(variants.POS) in repetitive_positions: + functional_field = functional_field + "REPEATS_" + else: + functional_field = functional_field + "NULL_" + if str(variants.POS) in mask_positions: + functional_field = functional_field + "MASK" + else: + functional_field = functional_field + "NULL" + + code_string = position_indel_label[str(variants.POS)] + code_string = code_string.replace('reference_allele', '0') + code_string = code_string.replace('reference_unmapped_position', '-1') + code_string = code_string.replace('LowAF_QUAL_DP_proximate_SNP', '2') + code_string = code_string.replace('LowAF_DP_QUAL_proximate_SNP', '2') + code_string = code_string.replace('LowAF_QUAL_proximate_SNP', '2') + code_string = code_string.replace('LowAF_DP_proximate_SNP', '2') + code_string = code_string.replace('LowAF_proximate_SNP', '2') + code_string = code_string.replace('LowAF_QUAL_DP', '2') + code_string = code_string.replace('LowAF_DP_QUAL', '2') + code_string = code_string.replace('LowAF_QUAL', '2') + code_string = code_string.replace('LowAF_DP', '2') + code_string = code_string.replace('HighAF_QUAL_DP_proximate_SNP', '2') + code_string = code_string.replace('HighAF_DP_QUAL_proximate_SNP', '2') + code_string = code_string.replace('HighAF_QUAL_proximate_SNP', '2') + code_string = code_string.replace('HighAF_DP_proximate_SNP', '2') + code_string = code_string.replace('HighAF_proximate_SNP', '2') + code_string = code_string.replace('HighAF_QUAL_DP', '2') + code_string = code_string.replace('HighAF_DP_QUAL', '2') + code_string = code_string.replace('HighAF_QUAL', '2') + code_string = code_string.replace('HighAF_DP', '2') + code_string = code_string.replace('LowAF', '-3') + code_string = code_string.replace('HighAF', '-4') + + if str(variants.POS) in indel_core_positions: + code_string = code_string.replace('VARIANT', '1') + # Adding 
functional class status code to SNP matrix: 2018-07-24 + elif str(variants.POS) in functional_filter_pos_array: + # Changing Functional class filter code to -2 from 2: 2018-12-04 + code_string = code_string.replace('VARIANT', '-2') + else: + code_string = code_string.replace('VARIANT', '3') + + + + + # Changing SNP type: Date 28/05/2019 + # Assign type of snp: coding / non-coding + if variants.POS in indel_var_ann_dict.keys(): + if indel_var_ann_dict[variants.POS] is not None: + if "protein_coding" in set(indel_var_ann_dict[variants.POS].split('|')) and "intergenic_region" not in set(indel_var_ann_dict[variants.POS].split('|')): + snp_type = "Coding Indel" + elif "protein_coding" in set(indel_var_ann_dict[variants.POS].split('|')) and "intergenic_region" in set(indel_var_ann_dict[variants.POS].split('|')): + snp_type = "Coding and Non-coding Indel" + elif "protein_coding" not in set(indel_var_ann_dict[variants.POS].split('|')) and "intergenic_region" in set(indel_var_ann_dict[variants.POS].split('|')): + snp_type = "Non-Coding Indel" + elif "protein_coding" not in set(indel_var_ann_dict[variants.POS].split('|')) and "intragenic_variant" in set(indel_var_ann_dict[variants.POS].split('|')): + snp_type = "Non-Coding Indel" + else: + print set((indel_var_ann_dict[variants.POS].split('|'))) + snp_type = "No_protein_coding/intergenic_region_field_in_ANN SNP" + #print snp_type + else: + keep_logging('Warning: position %s not found in snp_var_ann_dict dictionary. Assigning Not found as SNP type.' % variants.POS, 'Warning: position %s not found in snp_var_ann_dict dictionary. Assigning Not found as SNP type.' % variants.POS, logger, 'info') + print set((indel_var_ann_dict[variants.POS].split('|'))) + snp_type = "Not Found in Annotated VCF file" + + + + + print_string = print_string + snp_type + " at %s > " % str(variants.POS) + str(",".join(variants.ALT)) + " functional=%s" % functional_field + + # Get ANN field from variant INFO column and save it as an array. 
Split and Go through each elements, add bells and whistles + if variants.INFO.get('ANN'): + + ann_array = (variants.INFO.get('ANN')).split(',') + + # Generate tag string before generating ann_string + if len(ann_array) > 1: + # print variants.INFO.get('ANN') + # print list(set(ann_array)) + tag_list = [] + + for i_again in set(indel_var_ann_dict[variants.POS].split(',')): + i_split_again = i_again.split('|') + + if "-" not in i_split_again[4]: + if i_split_again[4] not in tag_list: + tag_list.append(i_split_again[4]) + + else: + split_tags = i_split_again[4].split('-') + for splittagsindividual in split_tags: + if splittagsindividual not in tag_list: + tag_list.append(splittagsindividual) + + if len(tag_list) == 1: + tag = tag_list[0] + + elif len(tag_list) == 2: + tag = str(tag_list[0]) + "-" + str(tag_list[1]) + + elif len(tag_list) > 2: + print tag_list + tag = tag.replace('CHR_START-', '') + tag = tag.replace('-CHR_END', '') + else: + for i in list(set(ann_array)): + i_split = i.split('|') + tag = str(i_split[4]).replace('CHR_START-', '') + tag = str(tag).replace('-CHR_END', '') + + ann_string = ";" + for i in list(set(ann_array)): + i_split = i.split('|') + # ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13]]) + ";" + + # MOve this tag before this for loop because of multiple tags associated. 
+ # tag = str(i_split[4]).replace('CHR_START-', '') + # tag = str(tag).replace('-CHR_END', '') + + if "-" in tag: + # print tag + extra_tags = "" + tag_split = tag.split('-') + for i in tag_split: + if i in locus_tag_to_gene_name.keys(): + extra_tags = extra_tags + locus_tag_to_gene_name[i] + "," + else: + extra_tags = extra_tags + "None" + "," + extra_tags_prot = "" + for i in tag_split: + if i in locus_tag_to_product.keys(): + extra_tags_prot = extra_tags_prot + locus_tag_to_product[i] + "," + else: + extra_tags_prot = extra_tags_prot + "None" + "," + ann_string = ann_string + '|'.join( + [i_split[0], i_split[1], i_split[2], i_split[3], i_split[9], i_split[10], i_split[11], + i_split[13], extra_tags, extra_tags_prot]) + ";" + # Changing SNP type: Date 28/05/2019 + elif tag == "": + print "ERROR: Issues with this locus tag. Check this tag in genbank file" + print list(set(ann_array)) + # Adding this so that Ann string is not empty: 30/05/2019 + if tag in locus_tag_to_gene_name.keys() and tag in locus_tag_to_product.keys(): + extra_tags = str(locus_tag_to_gene_name[tag]) + "|" + str(locus_tag_to_product[tag]) + else: + print "tag key not found: %s" % tag + extra_tags = "NULL" + "|" + "NULL" + # ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" + # Added 2019-31-05 + if "ERROR_OUT_OF_CHROMOSOME_RANGE" in i: + ann_string = ann_string + '|'.join( + [i_split[0], "intergenic_region", i_split[2], "ERROR_OUT_OF_CHROMOSOME_RANGE", i_split[9], + i_split[10], i_split[11], + i_split[13], extra_tags]) + ";" + else: + ann_string = ann_string + '|'.join( + [i_split[0], i_split[1], i_split[2], i_split[3], i_split[9], i_split[10], i_split[11], + i_split[13], extra_tags]) + ";" + # Debugging + if i_split[3] == "CD630_00290": + print ann_string + # Changing SNP type: Date 28/05/2019 + else: + if tag in locus_tag_to_gene_name.keys() and tag in locus_tag_to_product.keys(): + extra_tags 
= str(locus_tag_to_gene_name[tag]) + "|" + str(locus_tag_to_product[tag]) + else: + print "tag key not found: %s" % tag + extra_tags = "NULL" + "|" + "NULL" + # ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" + ann_string = ann_string + '|'.join( + [i_split[0], i_split[1], i_split[2], i_split[3], i_split[9], i_split[10], i_split[11], + i_split[13], extra_tags]) + ";" + + + # Changing SNP type: Date 28/05/2019 + # Working/Testing + else: + if len(variants.ALT) > 1 and indel_var_ann_dict[variants.POS]: + # print variants.ALT + # print ';'.join(set(snp_var_ann_dict[variants.POS].split(','))) + + ann_string = ";%s" % ';'.join(set(indel_var_ann_dict[variants.POS].split(','))) + # Get Tag here; Multiple tag names. + tag_list = [] + + for i in set(indel_var_ann_dict[variants.POS].split(',')): + i_split = i.split('|') + if i_split[4] not in tag_list: + tag_list.append(i_split[4]) + if len(tag_list) > 1: + tag = str(tag_list[0]) + "-" + str(tag_list[1]) + else: + tag = tag_list[0] + + # if len(set(snp_var_ann_dict[variants.POS].split(','))) > 2: + # print tag + # print set(snp_var_ann_dict[variants.POS].split(',')) + + else: + ann_string = ";None" + + + # Changing SNP type: Date 28/05/2019 + ann_string = ann_string.replace('ERROR_OUT_OF_CHROMOSOME_RANGE', '%s-%s' % (locus_tag_to_gene_name[last_locus_tag], locus_tag_to_gene_name[first_locus_tag])) + ann_string = ann_string.replace('CHR_END', '%s' % locus_tag_to_gene_name[first_locus_tag]) + + + # SNP Matrix Bug + ann_string_split = ann_string.split(';') + if len(ann_string_split) == 3: + first_allele_ann_string_split = ann_string_split[1].split('|') + second_allele_ann_string_split = ann_string_split[2].split('|') + if len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10: + ann_string = ann_string + elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) == 10: + 
if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": + prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] + else: + prod = first_allele_ann_string_split[14] + first_allele_ann_string_split[15] + new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + \ + first_allele_ann_string_split[1] + "|" + \ + first_allele_ann_string_split[2] + "|" + \ + first_allele_ann_string_split[4] + "|" + \ + first_allele_ann_string_split[9] + "|" + \ + first_allele_ann_string_split[10] + "|" + \ + first_allele_ann_string_split[11] + "|" + \ + first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + ann_string = new_first_allele_ann_string + str(ann_string_split[2]) + + elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) > 10: + + if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": + prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] + else: + prod = second_allele_ann_string_split[14] + second_allele_ann_string_split[15] + new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + \ + second_allele_ann_string_split[1] + "|" + \ + second_allele_ann_string_split[2] + "|" + \ + second_allele_ann_string_split[4] + "|" + \ + second_allele_ann_string_split[9] + "|" + \ + second_allele_ann_string_split[10] + "|" + \ + second_allele_ann_string_split[11] + "|" + \ + second_allele_ann_string_split[ + 13] + "|" + prod + "|" + prod + ";" + + ann_string = str(ann_string_split[1]) + new_second_allele_ann_string + elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) > 10: + + if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": + prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] + else: + prod = first_allele_ann_string_split[14] + first_allele_ann_string_split[15] + new_first_allele_ann_string = ";" + 
first_allele_ann_string_split[0] + "|" + \ + first_allele_ann_string_split[1] + "|" + \ + first_allele_ann_string_split[2] + "|" + \ + first_allele_ann_string_split[4] + "|" + \ + first_allele_ann_string_split[9] + "|" + \ + first_allele_ann_string_split[10] + "|" + \ + first_allele_ann_string_split[11] + "|" + \ + first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": + prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] + else: + prod = second_allele_ann_string_split[14] + second_allele_ann_string_split[15] + new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + \ + second_allele_ann_string_split[1] + "|" + \ + second_allele_ann_string_split[2] + "|" + \ + second_allele_ann_string_split[4] + "|" + \ + second_allele_ann_string_split[9] + "|" + \ + second_allele_ann_string_split[10] + "|" + \ + second_allele_ann_string_split[11] + "|" + \ + second_allele_ann_string_split[ + 13] + "|" + prod + "|" + prod + ";" + + ann_string = new_first_allele_ann_string + new_second_allele_ann_string + + + if len(ann_string_split) > 3: + + first_allele_ann_string_split = ann_string_split[1].split('|') + second_allele_ann_string_split = ann_string_split[2].split('|') + third_allele_ann_string_split = ann_string_split[3].split('|') + + if len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10 and len( + third_allele_ann_string_split) == 10: + ann_string = ann_string + + elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) == 10 and len( + third_allele_ann_string_split) == 10: + if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": + prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] + else: + prod = first_allele_ann_string_split[14] + first_allele_ann_string_split[15] + new_first_allele_ann_string = ";" + 
first_allele_ann_string_split[0] + "|" + \ + first_allele_ann_string_split[1] + "|" + \ + first_allele_ann_string_split[2] + "|" + \ + first_allele_ann_string_split[4] + "|" + \ + first_allele_ann_string_split[9] + "|" + \ + first_allele_ann_string_split[10] + "|" + \ + first_allele_ann_string_split[11] + "|" + \ + first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + ann_string = new_first_allele_ann_string + str(ann_string_split[2]) + str(ann_string_split[3]) + + elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) > 10 and len( + third_allele_ann_string_split) == 10: + + if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": + prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] + else: + prod = second_allele_ann_string_split[14] + second_allele_ann_string_split[15] + new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + \ + second_allele_ann_string_split[1] + "|" + \ + second_allele_ann_string_split[2] + "|" + \ + second_allele_ann_string_split[4] + "|" + \ + second_allele_ann_string_split[9] + "|" + \ + second_allele_ann_string_split[10] + "|" + \ + second_allele_ann_string_split[11] + "|" + \ + second_allele_ann_string_split[ + 13] + "|" + prod + "|" + prod + ";" + + ann_string = str(ann_string_split[1]) + new_second_allele_ann_string + str(ann_string_split[3]) + + elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10 and len( + third_allele_ann_string_split) > 10: + + if third_allele_ann_string_split[14] == "" and third_allele_ann_string_split[15] == "": + prod = third_allele_ann_string_split[3] + third_allele_ann_string_split[15] + else: + prod = third_allele_ann_string_split[14] + third_allele_ann_string_split[15] + new_third_allele_ann_string = third_allele_ann_string_split[0] + "|" + \ + third_allele_ann_string_split[1] + "|" + \ + third_allele_ann_string_split[2] + "|" + \ + 
third_allele_ann_string_split[4] + "|" + \ + third_allele_ann_string_split[9] + "|" + \ + third_allele_ann_string_split[10] + "|" + \ + third_allele_ann_string_split[11] + "|" + \ + third_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + ann_string = str(ann_string_split[1]) + str(ann_string_split[2]) + new_third_allele_ann_string + + elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) > 10 and len( + third_allele_ann_string_split) > 10: + # print ann_string + if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": + prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] + else: + prod = first_allele_ann_string_split[14] + first_allele_ann_string_split[15] + new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + \ + first_allele_ann_string_split[1] + "|" + \ + first_allele_ann_string_split[2] + "|" + \ + first_allele_ann_string_split[4] + "|" + \ + first_allele_ann_string_split[9] + "|" + \ + first_allele_ann_string_split[10] + "|" + \ + first_allele_ann_string_split[11] + "|" + \ + first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": + prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] + else: + prod = second_allele_ann_string_split[14] + second_allele_ann_string_split[15] + new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + \ + second_allele_ann_string_split[1] + "|" + \ + second_allele_ann_string_split[2] + "|" + \ + second_allele_ann_string_split[4] + "|" + \ + second_allele_ann_string_split[9] + "|" + \ + second_allele_ann_string_split[10] + "|" + \ + second_allele_ann_string_split[11] + "|" + \ + second_allele_ann_string_split[ + 13] + "|" + prod + "|" + prod + ";" + + if third_allele_ann_string_split[14] == "" and third_allele_ann_string_split[15] == "": + prod = 
third_allele_ann_string_split[3] + third_allele_ann_string_split[15] + else: + prod = third_allele_ann_string_split[14] + third_allele_ann_string_split[15] + new_third_allele_ann_string = third_allele_ann_string_split[0] + "|" + \ + third_allele_ann_string_split[1] + "|" + \ + third_allele_ann_string_split[2] + "|" + \ + third_allele_ann_string_split[4] + "|" + \ + third_allele_ann_string_split[9] + "|" + \ + third_allele_ann_string_split[10] + "|" + \ + third_allele_ann_string_split[11] + "|" + \ + third_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" + + ann_string = new_first_allele_ann_string + new_second_allele_ann_string + new_third_allele_ann_string + + # print ann_string + + # # JUST FOR THE SAKE OF DEBUGGING + # ann_string_split = ann_string.split(';') + # for i in ann_string_split: + # if len(i.split('|')) != 10 and len(i.split('|')) != 1: + # print ann_string + + # Changing Strandness string: Date 28/05/2019 + # Each Locus ID with a strand information + strandness = " Strand Information: " + if "-" in tag: + tagsplit = tag.split('-') + for i in tagsplit: + if i in locus_tag_to_strand.keys(): + if "," in locus_tag_to_strand[i]: + locus_tag_to_strand_split = locus_tag_to_strand[i].split(',') + strand = locus_tag_to_strand_split[0] + else: + strand = locus_tag_to_strand[i] + strandness = strandness + i + "=" + strand + "/" + else: + if i == "" or i == "None": + strandness = strandness + "NULL=" + "No Strand Information found" + "/" + else: + strandness = strandness + i + "=" + "No Strand Information found" + "/" + else: + if tag in locus_tag_to_strand.keys(): + # strandness = strandness + locus_tag_to_strand[tag] + if "," in locus_tag_to_strand[tag]: + locus_tag_to_strand_split = locus_tag_to_strand[tag].split(',') + strand = locus_tag_to_strand_split[0] + else: + strand = locus_tag_to_strand[tag] + strandness = strandness + tag + "=" + strand + else: + if tag == "" or tag == "None": + strandness = strandness + "NULL=" + "No Strand Information 
found" + else: + strandness = strandness + tag + "=" + "No Strand Information found" + + + # Changing tag equals NULL: Date 30/05/2019 + if tag == "" or tag == "None": + tag = "NULL" + + print_string = print_string + " locus_tag=" + tag + strandness + ann_string + + gt_string = "" + for gt in variants.gt_bases: + gt = gt.replace('./.', '.') + if "/" in gt: + gt_split = gt.split('/') + gt = gt_split[1] + gt_string = gt_string + "," + gt + gt_string = gt_string.replace('.', variants.REF) + + """Replacing Phage/Functional filter position code""" + if str(variants.POS) in functional_filter_pos_array: + code_string_array = code_string.split(',') + code_string = "" + for i in code_string_array: + code_string = code_string + "," + "-2" + + final_allele_string = print_string + gt_string.replace(',', '\t') + '\n' + final_code_string = print_string + "\t" + code_string.replace(',', '\t') + '\n' + final_allele_string = final_allele_string.replace(',|', '|') + # final_allele_string = final_allele_string.replace(',;,', ':::') + # final_allele_string = final_allele_string.replace(';,', ':::') + final_allele_string = final_allele_string.replace(',;,', ':::') + final_allele_string = final_allele_string.replace(';,', ':::') + final_code_string = final_code_string.replace(',|', '|') + # final_code_string = final_code_string.replace(',;,', ':::') + # final_code_string = final_code_string.replace(';,', ':::') + final_code_string = final_code_string.replace(',;,', ':::') + final_code_string = final_code_string.replace(';,', ':::') + final_code_string = final_code_string.replace('\t\t', '\t') + final_allele_string = final_allele_string.replace('\t\t', '\t') + fp_allele.write(final_allele_string) + fp_code.write(final_code_string) + fp_code.close() + fp_allele.close() + +def core_prep_snp(core_vcf_fasta_dir): + """ Generate SNP Filter Label Matrix """ + generate_paste_command() + + generate_paste_command_outgroup() + + """ Generate different list of Positions from the 
**All_label_final_sorted_header.txt** SNP position label data matrix. """ + generate_position_label_data_matrix() + + """ Generate VCF files from final list of variants in Only_ref_variant_positions_for_closely; generate commands for consensus generation """ + generate_vcf_files() + + """ Generate consensus fasta file from core vcf files """ + extract_only_ref_variant_fasta_from_reference() + + """ Generate consensus fasta file with only reference and variant position bases """ + extract_only_ref_variant_fasta(core_vcf_fasta_dir) + + # """ Analyze the positions that were filtered out only due to insufficient depth""" + # DP_analysis() + +def core_prep_indel(core_vcf_fasta_dir): + """ Generate SNP Filter Label Matrix """ + generate_indel_paste_command() + + generate_indel_paste_command_outgroup() + + """ Generate different list of Positions from the **All_label_final_sorted_header.txt** SNP position label data matrix. """ + generate_indel_position_label_data_matrix() + +""" report methods """ +def alignment_report(data_matrix_dir): + keep_logging('Generating Alignment report...', 'Generating Alignment report...', logger, 'info') + varcall_dir = os.path.dirname(args.results_dir) + print varcall_dir + report_string = "" + header = "Sample,QC-passed reads,Mapped reads,% mapped reads,mean depth,%_bases_above_5,%_bases_above_10,%_bases_above_15,unmapped_positions,READ_PAIR_DUPLICATES,READ_PAIR_OPTICAL_DUPLICATES,unmapped reads,% unmapped reads" + fp = open("%s/Report_alignment.txt" % (data_matrix_dir), 'w+') + fp.write(header + '\n') + for vcf in vcf_filenames: + sample = os.path.basename(vcf.replace('_filter2_final.vcf_no_proximate_snp.vcf', '')) + #print sample + report_string = sample + "," + qc = (subprocess.check_output("grep \'QC-passed\' %s/%s/%s_alignment_stats | sed \'s/ + 0 in total (QC-passed reads + QC-failed reads)//g\'" % (varcall_dir, sample, sample), shell=True)).strip() + mapped = (subprocess.check_output("grep \'mapped (\' %s/%s/%s_alignment_stats | awk 
-F\' \' \'{print $1}\'" % (varcall_dir, sample, sample), shell=True)).strip() + replace = "%:-nan%)" + perc_mapped = (subprocess.check_output("grep \'mapped (\' %s/%s/%s_alignment_stats | awk -F\' \' \'{print $5}\' | sed \'s/%s//g\' | sed \'s/(//g\'" % (varcall_dir, sample, sample, replace), shell=True)).strip() + depth_of_coverage = (subprocess.check_output("awk -F\'\\t\' \'{OFS=\",\"};FNR==2{print $3,$7,$8,$9}\' %s/%s/%s_depth_of_coverage.sample_summary" % (varcall_dir, sample, sample), shell=True)).strip() + unmapped_positions = (subprocess.check_output("wc -l %s/%s/%s_unmapped.bed_positions | cut -d\' \' -f1" % (varcall_dir, sample, sample), shell=True)).strip() + opt_dup = (subprocess.check_output("awk -F\'\\t\' \'{OFS=\",\"};FNR==8{print $7,$8,$5}\' %s/%s/%s_markduplicates_metrics" % (varcall_dir, sample, sample), shell=True)).strip() + perc_unmapped = str(100 - float(perc_mapped)) + myList = ','.join(map(str, (sample, qc, mapped, perc_mapped, depth_of_coverage, unmapped_positions, opt_dup, perc_unmapped))) + #print myList + fp.write(myList + '\n') + fp.close() + keep_logging('Alignment report can be found in %s/Report_alignment.txt' % data_matrix_dir, 'Alignment report can be found in %s/Report_alignment.txt' % data_matrix_dir, logger, 'info') + +def variant_report(data_matrix_dir): + keep_logging('Generating Variants report...', 'Generating Variants report...', logger, 'info') + varcall_dir = os.path.dirname(os.path.abspath(args.results_dir)) + report_string = "" + header = "Sample,Total Unique Variants,core SNPs,unmapped_positions,reference_allele,true_variant,Only_low_FQ,Only_DP,Only_low_MQ,other,unmapped_positions_perc,true_variant_perc,Only_low_FQ_perc,Only_DP_perc,Only_low_MQ_perc,other_perc" + fp = open("%s/Report_variants.txt" % (data_matrix_dir), 'w+') + fp.write(header + '\n') + + for vcf in vcf_filenames: + sample = os.path.basename(vcf.replace('_filter2_final.vcf_no_proximate_snp.vcf', '')) + report_string = sample + "," + unmapped_positions = 
(subprocess.check_output("wc -l %s/core_temp_dir/unique_positions_file | cut -d\' \' -f1" % (varcall_dir), shell=True)).strip() + core_snps = (subprocess.check_output("wc -l %s/core_temp_dir/Only_ref_variant_positions_for_closely | cut -d\' \' -f1" % (varcall_dir), shell=True)).strip() + filtered_snp_count = (subprocess.check_output("grep -w \'^%s\' %s/core_temp_dir/bargraph_counts.txt | awk -F\'\\t\' \'{OFS=\",\"};{print $2,$3,$4,$5,$6,$7}\'" % (sample, varcall_dir), shell=True)).strip() + filtered_snp_perc = (subprocess.check_output("grep -w \'^%s\' %s/core_temp_dir/bargraph_percentage.txt | awk -F\'\\t\' \'{OFS=\",\"};{print $2,$3,$4,$5,$6,$7}\'" % (sample, varcall_dir), shell=True)).strip() + myList = ','.join(map(str, (sample, unmapped_positions, core_snps, filtered_snp_count, filtered_snp_perc))) + fp.write(myList + '\n') + fp.close() + keep_logging('Variant call report can be found in %s/Report_variants.txt' % data_matrix_dir, 'Variant call report can be found in %s/Report_variants.txt' % data_matrix_dir, logger, 'info') + +def gubbins(gubbins_dir, input_fasta, jobrun, logger, Config): + keep_logging('\nRunning Gubbins on input: %s\n' % input_fasta, '\nRunning Gubbins on input: %s\n' % input_fasta, + logger, + 'info') + + + call("module load bioperl python-anaconda2/201607 biopython dendropy reportlab fasttree RAxML fastml/gub gubbins", logger) + #os.system("module load bioperl python-anaconda2/201607 biopython dendropy reportlab fasttree RAxML fastml/gub gubbins") + #gubbins_cmd = "%s/%s --prefix %s/%s %s" % ( + # ConfigSectionMap("gubbins", Config)['gubbins_bin'], ConfigSectionMap("gubbins", Config)['base_cmd'], gubbins_dir, + # (os.path.basename(input_fasta)).replace('.fa', ''), input_fasta) + + load_module = "module load bioperl python-anaconda2/201607 biopython dendropy reportlab fasttree RAxML fastml/gub gubbins" + gubbins_cmd = "%s --threads 6 --prefix %s/%s %s" % ( + ConfigSectionMap("gubbins", Config)['base_cmd'], gubbins_dir, + 
(os.path.basename(input_fasta)).replace('.fa', ''), input_fasta) + keep_logging('\nRunning Gubbins on: %s' % input_fasta, '\nRunning Gubbins: %s\n' % input_fasta, + logger, + 'info') + + keep_logging('Running: %s' % gubbins_cmd, '%s' % gubbins_cmd, logger, 'info') + if jobrun == "parallel-local" or jobrun == "local": + call("cd %s" % gubbins_dir, logger) + call(gubbins_cmd, logger) + elif jobrun == "cluster": + call("cd %s" % gubbins_dir, logger) + call(gubbins_cmd, logger) + elif jobrun == "parallel-cluster": + job_file_name = "%s/gubbins_%s.pbs" % (gubbins_dir, os.path.basename(input_fasta)) + job_name = os.path.basename(job_file_name) + job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l nodes=1:ppn=12,mem=47000mb,walltime=250:00:00\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\ncd %s\n%s\n%s" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], gubbins_dir, load_module, gubbins_cmd) + f1=open(job_file_name, 'w+') + f1.write(job_print_string) + f1.close() + #os.system("qsub %s" % job_file_name) + call("qsub %s" % job_file_name, logger) + +def get_outgroup(): + """ + Prepare Outgroup Sample name from the argument. 
+ """ + if args.outgroup: + if "R1_001_final.fastq.gz" in args.outgroup: + first_part_split = args.outgroup.split('R1_001_final.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + outgroup = re.sub("_S.*_", "", first_part) + + elif "_R1.fastq.gz" in args.outgroup: + first_part_split = args.outgroup.split('_R1.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + outgroup = re.sub("_S.*_", "", first_part) + + elif "R1.fastq.gz" in args.outgroup: + first_part_split = args.outgroup.split('R1.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + first_part = re.sub("_S.*_", "", first_part) + outgroup = re.sub("_S.*", "", first_part) + + elif "1_combine.fastq.gz" in args.outgroup: + first_part_split = args.outgroup.split('1_combine.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + outgroup = re.sub("_S.*_", "", first_part) + + elif "1_sequence.fastq.gz" in args.outgroup: + first_part_split = args.outgroup.split('1_sequence.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + outgroup = re.sub("_S.*_", "", first_part) + + elif "_forward.fastq.gz" in args.outgroup: + first_part_split = args.outgroup.split('_forward.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + outgroup = re.sub("_S.*_", "", first_part) + + elif "R1_001.fastq.gz" in args.outgroup: + first_part_split = args.outgroup.split('R1_001.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + outgroup = re.sub("_S.*_", "", first_part) + + elif "_1.fastq.gz" in args.outgroup: + first_part_split = args.outgroup.split('_1.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + outgroup = re.sub("_S.*_", "", first_part) + + elif ".1.fastq.gz" in args.outgroup: + first_part_split = args.outgroup.split('.1.fastq.gz') + first_part = first_part_split[0].replace('_L001', '') + outgroup = re.sub("_S.*_", "", first_part) + + keep_logging( + 'Using %s as Outgroup Sample Name' % outgroup, + 'Using 
%s as Outgroup Sample Name' % outgroup, + logger, 'info') + + return outgroup + else: + keep_logging('Outgroup Sample Name not provided\n', 'Outgroup Sample Name not provided\n', logger, 'info') + outgroup = "" + +def mask_fq_mq_positions_specific_to_outgroup(): + """ Generate mask_fq_mq_positions array with positions where a variant was filtered because of LowFQ or LowMQ""" + mask_fq_mq_positions = [] + mask_fq_mq_positions_outgroup_specific = [] + if args.outgroup: + position_label_exclude_outgroup = OrderedDict() + with open("%s/All_label_final_ordered_exclude_outgroup_sorted.txt" % args.filter2_only_snp_vcf_dir, + 'rU') as csv_file: + keep_logging( + 'Reading All label positions file: %s/All_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, + 'Reading All label positions file: %s/All_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + for row in csv_reader: + position_label_exclude_outgroup[row[0]] = ','.join(row[1:]) + csv_file.close() + + position_indel_label_exclude_outgroup = OrderedDict() + with open("%s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt" % args.filter2_only_snp_vcf_dir, + 'rU') as csv_file: + keep_logging( + 'Reading All label positions file: %s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, + 'Reading All label positions file: %s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + for row in csv_reader: + if row[0] not in position_label_exclude_outgroup.keys(): + position_indel_label_exclude_outgroup[row[0]] = ','.join(row[1:]) + else: + position_indel_label_exclude_outgroup[row[0]] = ','.join(row[1:]) + keep_logging('Warning: position %s already present as a SNP' % row[0], + 'Warning: position %s already present as a SNP' % row[0], logger, 
'info') + csv_file.close() + for key in position_label_exclude_outgroup.keys(): + label_sep_array = position_label_exclude_outgroup[key].split(',') + for i in label_sep_array: + if "LowFQ" in str(i): + if key not in mask_fq_mq_positions: + if int(key) not in outgroup_specific_positions: + mask_fq_mq_positions.append(key) + elif int(key) in outgroup_specific_positions: + mask_fq_mq_positions_outgroup_specific.append(key) + if i == "HighFQ": + if key not in mask_fq_mq_positions: + if int(key) not in outgroup_specific_positions: + mask_fq_mq_positions.append(key) + elif int(key) in outgroup_specific_positions: + mask_fq_mq_positions_outgroup_specific.append(key) + + fp = open("%s/mask_fq_mq_positions_outgroup_specific.txt" % (args.filter2_only_snp_vcf_dir), 'w+') + for i in mask_fq_mq_positions_outgroup_specific: + fp.write(i + '\n') + fp.close() + print "Length of mask_fq_mq_positions specific to outgroup:%s" % len(mask_fq_mq_positions_outgroup_specific) + + outgroup = get_outgroup() + fqmqpositionsspecifictooutgroup = [] + + fopen = open("%s/mask_fq_mq_positions_outgroup_specific.txt" % (args.filter2_only_snp_vcf_dir), 'r+') + for i in fopen: + i = i.strip() + fqmqpositionsspecifictooutgroup.append(i) + fopen.close() + + print "Length of low MQ/FQ positions specific to outgroup: %s" % len(fqmqpositionsspecifictooutgroup) + + vcf_filename_unmapped = "%s/%s_ref_allele_unmapped_masked.vcf" % (args.filter2_only_snp_vcf_dir, outgroup) + + fp = open("%s/%s_ref_allele_unmapped_masked.vcf" % (args.filter2_only_snp_vcf_dir, outgroup), 'w+') + + vcf_header = "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % outgroup + fp.write(vcf_header) + + for variants in VCF("%s/%s_ref_allele_unmapped.vcf.gz" % (args.filter2_only_snp_vcf_dir, outgroup)): + print_string = "" + if str(variants.POS) in fqmqpositionsspecifictooutgroup: + print_string_array = [str(variants.CHROM), str(variants.POS), '.', str(variants.REF), 'N', '221.999', + '.', '.', '.', 
'.', '.'] + + + else: + print_string_array = [str(variants.CHROM), str(variants.POS), '.', str(variants.REF), + str(variants.ALT[0]), '221.999', '.', '.', '.', '.', '.'] + print_string = '\t'.join(print_string_array) + fp.write(print_string + '\n') + fp.close() + base_vcftools_bin = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + \ + ConfigSectionMap("vcftools", Config)[ + 'vcftools_bin'] + bgzip_cmd = "%s/%s/bgzip -f %s\n" % ( + ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], + vcf_filename_unmapped) + + tabix_cmd = "%s/%s/tabix -f -p vcf %s.gz\n" % ( + ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], + vcf_filename_unmapped) + + fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_unmapped_variants.fa\n" % ( + args.reference, base_vcftools_bin, vcf_filename_unmapped, outgroup) + + # print bgzip_cmd + # print tabix_cmd + # print fasta_cmd + + subprocess.call([bgzip_cmd], shell=True) + subprocess.call([tabix_cmd], shell=True) + subprocess.call([fasta_cmd], shell=True) + sed_command = "sed -i 's/>.*/>%s/g' %s_ref_allele_unmapped_variants.fa\n" % (outgroup, outgroup) + subprocess.call([sed_command], shell=True) + # print sed_command + + + else: + for key in position_label.keys(): + label_sep_array = position_label[key].split(',') + for i in label_sep_array: + if "LowFQ" in str(i): + if key not in mask_fq_mq_positions: + mask_fq_mq_positions.append(key) + if i == "HighFQ": + if key not in mask_fq_mq_positions: + mask_fq_mq_positions.append(key) + + fp = open("%s/mask_fq_mq_positions.txt" % (args.filter2_only_snp_vcf_dir), 'w+') + for i in mask_fq_mq_positions: + fp.write(i + '\n') + fp.close() + + print "Length of mask_fq_mq_positions:%s" % len(mask_fq_mq_positions) + +""" +Pending inclusion + +class FuncThread(threading.Thread): + def __init__(self, target, *args): + self._target = target + self._args = args + threading.Thread.__init__(self) + def 
run(self): + self._target(*self._args) + +def someOtherFunc(data, key): + print "someOtherFunc was called : data=%s; key=%s" % (str(data), str(key)) + +Pending inclusion +""" + + + +if __name__ == '__main__': + + """ + Main Function for Variant Calling Core Pipeline + :param: + :return: + + This function runs "core_prep" step to generate intermediate files required for extracting core variants at "core" step. + Using these core variants, a "report" step will generate the final reports and output results of the pipeline as well as runs "tree" step to generate fasttree and raxml results + using the core variants consensus in Date_Time_core_results folder. + Steps: + 1. core_prep + 2. core + 3. report + 4. tree + """ + + # Start Timer to use it for generating folder names and Log prefixes. + start_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + start_time_2 = datetime.now() + log_unique_time = datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + global logger + analysis_name_log = "step_" + str(args.steps) + logger = generate_logger(args.filter2_only_snp_vcf_dir, analysis_name_log, log_unique_time) + keep_logging('\nThe Script started at: %s' % start_time, '\nThe Script started at: %s' % start_time, logger, 'info') + print_details = "This step will parse final vcf files(*_no_proximate_snp.vcf) generated at the end of Variant Calling Pipeline. At the end of this step, the following results will be generated and placed in output directory:\n\n" \ + "1. Final Core SNP Positions list(Variant positions that were not filtered out in any of the samples and passed all the filters)\n" \ + "2. SNP Positions that were filtered out with labels indicating the reason (Depth, FQ, MQ, Unmapped in one or other samples, Proximate SNPS, Quality of Variant) why they were filtered out.\n" \ + "3. Barplot Statistics about the filtered variants and their reason for getting filtered.\n" \ + "4. 
Final Consensus fasta file using only Core SNP Positions\n" + keep_logging('%s' % print_details, '%s' % print_details, logger, 'info') + + # Create temporary Directory core_temp_dir/temp for storing temporary intermediate files. Check if core_temp_dir contains all the required files to run these pipeline. + global temp_dir + temp_dir = args.filter2_only_snp_vcf_dir + "/temp" + + # Read Config file into Config object that will be used to extract configuration settings set up in config file. + global config_file + if args.config: + config_file = args.config + else: + config_file = os.path.dirname(os.path.abspath(__file__)) + "/config" + global Config + Config = ConfigParser.ConfigParser() + Config.read(config_file) + keep_logging('Path to config file: %s' % config_file, 'Path to config file: %s' % config_file, logger, 'info') + + make_sure_path_exists(temp_dir) + + # Get outgroup_Sample name + outgroup = get_outgroup() + outgroup_vcf_filename = str(outgroup) + "_filter2_final.vcf_no_proximate_snp.vcf" + outgroup_indel_vcf_filename = str(outgroup) + "_filter2_indel_final.vcf" + + # Read filenames. Core variants and final results will be extracted considering only these files. 
+ filter2_only_snp_vcf_filenames = args.filter2_only_snp_vcf_filenames + vcf_filenames_temp = [] + vcf_filenames_temp_outgroup = [] + + with open(filter2_only_snp_vcf_filenames) as fp: + for line in fp: + line = line.strip() + line = args.filter2_only_snp_vcf_dir + line + vcf_filenames_temp.append(line) + if args.outgroup: + if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in line: + vcf_filenames_temp_outgroup.append(line) + fp.close() + vcf_filenames = sorted(vcf_filenames_temp) + vcf_filenames_outgroup = sorted(vcf_filenames_temp_outgroup) + + make_sure_files_exists(vcf_filenames, Config, logger) + + log_file_handle = "%s/%s_%s.log.txt" % (args.filter2_only_snp_vcf_dir, log_unique_time, analysis_name_log) + + # Start Variant Calling Core Pipeline steps based on steps argument supplied. + if "1" in args.steps: + """ + core_prep step + """ + + # Gather SNP positions from each final *_no_proximate_snp.vcf file (that passed the variant filter parameters from variant calling pipeline) and write to *_no_proximate_snp.vcf_position files for use in downstream methods + keep_logging('Gathering SNP position information from each final *_no_proximate_snp.vcf file...', 'Gathering SNP position information from each final *_no_proximate_snp.vcf file...', logger, 'info') + + core_prep_label(vcf_filenames, args.filter2_only_snp_vcf_dir, args.outgroup, args.reference, log_unique_time, log_file_handle, logger, args.jobrun, Config) + + if "2" in args.steps: + """ + core step + """ + + # Set variables; check if the output from core_prep steps (*label files) exists and was completed without any errors. 
+ snp_unique_positions_file = args.filter2_only_snp_vcf_dir + "/unique_positions_file" + indel_unique_positions_file = args.filter2_only_snp_vcf_dir + "/unique_indel_positions_file" + uniq_snp_positions = sum(1 for line in open('%s' % snp_unique_positions_file)) + uniq_indel_positions = sum(1 for line in open('%s' % indel_unique_positions_file)) + if not os.path.isfile(snp_unique_positions_file) and not os.path.isfile(indel_unique_positions_file): + keep_logging('Error finding unique_positions_file/unique_indel_positions_file. Please rerun core_prep step.','Error finding unique_positions_file/unique_indel_positions_file. Please rerun core_prep step.', logger,'exception') + exit() + + make_sure_label_files_exists(vcf_filenames, uniq_snp_positions, uniq_indel_positions, Config, logger) + + # Set up Report and results directories to transfer the final results. + data_matrix_dir = args.results_dir + '/data_matrix' + core_vcf_fasta_dir = args.results_dir + '/core_snp_consensus' + make_sure_path_exists(data_matrix_dir) + make_sure_path_exists(core_vcf_fasta_dir) + + functional_class_filter_positions = "%s/Functional_class_filter_positions.txt" % args.filter2_only_snp_vcf_dir + + global outgroup_specific_positions + global outgroup_indel_specific_positions + + # Get outgroup specific variant positions + if args.outgroup: + f_outgroup = open("%s/outgroup_indel_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') + + outgroup_indel_specific_positions = [] + for i in f_outgroup: + i = i.strip() + outgroup_indel_specific_positions.append(int(i)) + f_outgroup.close() + + f_outgroup = open("%s/outgroup_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') + + outgroup_specific_positions = [] + for i in f_outgroup: + i = i.strip() + outgroup_specific_positions.append(int(i)) + f_outgroup.close() + + print "No. of outgroup specific variant positions: %s" % len(outgroup_specific_positions) + print "No. 
of outgroup specific Indel variant positions: %s" % len(outgroup_indel_specific_positions) + else: + + outgroup_indel_specific_positions = [] + outgroup_specific_positions = [] + print "No. of outgroup specific variant positions: %s" % len(outgroup_specific_positions) + print "No. of outgroup specific Indel variant positions: %s" % len(outgroup_indel_specific_positions) + + # Run core steps. Generate SNP and data Matrix results. Extract core SNPS and consensus files. + core_prep_indel(core_vcf_fasta_dir) + + core_prep_snp(core_vcf_fasta_dir) + + # Moving this up before core_prep_snp; for some weird reason, it is failing to generate Only_ref_indel + #core_prep_indel(core_vcf_fasta_dir) + + # Annotate core variants. Generate SNP and Indel matrix. + annotated_snp_matrix() + + # Read new allele matrix and generate fasta; generate a seperate function + keep_logging('Generating Fasta from Variant Alleles...\n', 'Generating Fasta from Variant Alleles...\n', logger, 'info') + + create_job_allele_variant_fasta(args.jobrun, vcf_filenames, args.filter2_only_snp_vcf_dir, config_file) + + #extract_only_ref_variant_fasta_from_reference_allele_variant() + + mask_fq_mq_positions_specific_to_outgroup() + + call("cp %s %s/Logs/core/" % ( + log_file_handle, os.path.dirname(os.path.dirname(args.filter2_only_snp_vcf_dir))), logger) + + if "3" in args.steps: + """ + report step + """ + + # Get outgroup_Sample name + outgroup = get_outgroup() + + keep_logging('Step 3: Generate Reports and Results folder.', 'Step 3: Generate Reports and Results folder.', logger, 'info') + + ## Temporary fix. 
A bug was introduced that is causing the pipeline to generate *vcf_no_proximate_snp.vcf_filter2_consensus.fa + call("rm %s/*vcf_no_proximate_snp.vcf_filter2_consensus.fa" % args.filter2_only_snp_vcf_dir, logger) + + # Generate DP barplots data and Analyze the FQ values of all the unique variant + # DP_analysis_barplot() + # FQ_analysis() + + # Set up Report and results directories to transfer the final results. + # Set up Report and results directories to transfer the final results. + data_matrix_dir = args.results_dir + '/data_matrix' + core_vcf_fasta_dir = args.results_dir + '/core_snp_consensus' + make_sure_path_exists(args.results_dir) + make_sure_path_exists(data_matrix_dir) + make_sure_path_exists(core_vcf_fasta_dir) + data_matrix_dir = args.results_dir + '/data_matrix' + data_matrix_snpeff_dir = data_matrix_dir + '/snpEff_results' + core_vcf_fasta_dir = args.results_dir + '/core_snp_consensus' + consensus_var_dir = core_vcf_fasta_dir + '/consensus_variant_positions' + core_vcf_dir = core_vcf_fasta_dir + '/core_vcf' + consensus_allele_var_dir = core_vcf_fasta_dir + '/consensus_allele_variant_positions' + consensus_ref_allele_var_dir = core_vcf_fasta_dir + '/consensus_ref_allele_variant_positions' + consensus_ref_var_dir = core_vcf_fasta_dir + '/consensus_ref_variant_positions' + consensus_ref_allele_unmapped_variant_dir = core_vcf_fasta_dir + '/consensus_ref_allele_unmapped_variant' + make_sure_path_exists(data_matrix_dir) + make_sure_path_exists(data_matrix_snpeff_dir) + make_sure_path_exists(core_vcf_fasta_dir) + make_sure_path_exists(consensus_var_dir) + make_sure_path_exists(core_vcf_dir) + make_sure_path_exists(consensus_allele_var_dir) + #make_sure_path_exists(consensus_ref_allele_var_dir) + make_sure_path_exists(consensus_ref_var_dir) + make_sure_path_exists(consensus_ref_allele_unmapped_variant_dir) + reference_base = os.path.basename(args.reference).split('.')[0] + # Move results to the results directory + move_data_matrix_results = "cp -r 
%s/unique_positions_file %s/unique_indel_positions_file %s/*.csv %s/*.txt %s/temp_* %s/All* %s/Only* %s/*.R %s/R_scripts/generate_diagnostics_plots.R %s/" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, os.path.dirname(os.path.abspath(__file__)), data_matrix_dir) + #move_core_vcf_fasta_results = "cp %s/*_core.vcf.gz %s/*.fa %s/*_variants.fa %s/" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, core_vcf_fasta_dir) + move_core_vcf_fasta_results = "mv %s/*_core.vcf.gz* %s/*_ANN* %s/*.fa %s/" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, core_vcf_fasta_dir) + + + move_consensus_var_fasta_results = "mv %s/*_variants.fa %s/" % (core_vcf_fasta_dir, consensus_var_dir) + move_consensus_ref_var_fasta_results = "mv %s/*.fa %s/" % (core_vcf_fasta_dir, consensus_ref_var_dir) + move_core_vcf = "mv %s/*_core.vcf.gz %s/*vcf_core.vcf.gz.tbi %s/" % (core_vcf_fasta_dir, core_vcf_fasta_dir, core_vcf_dir) + move_consensus_allele_var_fasta_results = "mv %s/*allele_variants.fa %s/" % (consensus_var_dir, consensus_allele_var_dir) + remove_ref_allele = "rm %s/*_ref_allele_variants.fa" % consensus_allele_var_dir + #move_consensus_ref_allele_var_fasta_results = "mv %s/*_ref_allele_variants.fa %s/" % (consensus_allele_var_dir, consensus_ref_allele_var_dir) + move_consensus_ref_allele_unmapped_var_fasta_results = "mv %s/*_ref_allele_unmapped_variants.fa %s/" % (consensus_var_dir, consensus_ref_allele_unmapped_variant_dir) + move_snpeff_results = "mv %s/*ANN* %s/" % (data_matrix_dir, data_matrix_snpeff_dir) + move_snpeff_vcf_results = "mv %s/*ANN* %s/" % (core_vcf_fasta_dir, data_matrix_snpeff_dir) + copy_reference = "cp %s %s/%s.fa" % (args.reference, consensus_ref_var_dir, reference_base) + 
#copy_reference_2 = "cp %s %s/%s.fa" % (args.reference, consensus_ref_allele_var_dir, reference_base) + + call("%s" % move_data_matrix_results, logger) + call("%s" % move_core_vcf_fasta_results, logger) + call("%s" % move_consensus_var_fasta_results, logger) + call("%s" % move_consensus_ref_var_fasta_results, logger) + call("%s" % move_core_vcf, logger) + call("%s" % move_consensus_allele_var_fasta_results, logger) + call("%s" % remove_ref_allele, logger) + #call("%s" % move_consensus_ref_allele_var_fasta_results, logger) + call("%s" % move_consensus_ref_allele_unmapped_var_fasta_results, logger) + call("%s" % copy_reference, logger) + #call("%s" % copy_reference_2, logger) + call("%s" % move_snpeff_results, logger) + call("%s" % move_snpeff_vcf_results, logger) + subprocess.call(["sed -i 's/title_here/%s/g' %s/generate_diagnostics_plots.R" % (os.path.basename(args.results_dir), data_matrix_dir)], shell=True) + + # Sanity Check if the variant consensus files generated are of same length + count = 0 + for line in open("%s/Only_ref_variant_positions_for_closely_matrix.txt" % data_matrix_dir).xreadlines(): + count += 1 + ref_variants = count - 1 + variant_consensus_files = glob.glob("%s/*_variants.fa" % core_vcf_fasta_dir) + for f in variant_consensus_files: + cmd2 = "%s/%s/bioawk -c fastx '{ print length($seq) }' < %s" % ( + ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bioawk", Config)['bioawk_bin'], f) + proc = subprocess.Popen([cmd2], stdout=subprocess.PIPE, shell=True) + (out2, err2) = proc.communicate() + + try: + int(out2) != int(ref_variants) + except OSError as exception: + if exception.errno != errno.EEXIST: + keep_logging('Error generating variant consensus position file: %s' % f, + 'Error generating variant consensus position file: %s' % f, logger, 'info') + keep_logging('Error generating variant consensus position file: %s' % f, + 'Error generating variant consensus position file: %s' % f, logger, 'exception') + exit() + + # Move and 
organize data_matrix_dir directory + os.chdir(data_matrix_dir) + plots_dir = "%s/plots" % data_matrix_dir + matrices_dir = "%s/matrices" % data_matrix_dir + functional_ann_dir = "%s/Functional_annotation_results" % data_matrix_dir + logs_dir = "%s/logs" % data_matrix_dir + make_sure_path_exists(plots_dir) + make_sure_path_exists(matrices_dir) + make_sure_path_exists(functional_ann_dir) + make_sure_path_exists(logs_dir) + call("mv *.log.txt %s" % logs_dir, logger) + call("mv summary.txt detail.txt Functional_class_filter_positions.txt inexact_repeat_region_positions.txt phage_region_positions.txt repeat_region_positions.txt %s" % functional_ann_dir, logger) + call("mv temp_* All* Only* SNP_matrix_* Indel* extract_DP_positions.txt header.txt unique_indel_positions_file unique_positions_file %s" % matrices_dir, logger) + call("mv annotated_no_proximate_snp_* %s/snpEff_results/" % data_matrix_dir, logger) + call("mv bargraph* generate_diagnostics_plots.R %s" % plots_dir, logger) + call("cp %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt %s/" % (matrices_dir, plots_dir), logger) + + # """ Generate alignment report """ + # alignment_report(data_matrix_dir) + # + # """ Generate core snps report """ + # variant_report(data_matrix_dir) + + """ Generating Gubbins MFA files""" + reference_base = os.path.basename(args.reference).split('.')[0] + gubbins_dir = args.results_dir + '/gubbins' + tree_dir = args.results_dir + '/trees' + + make_sure_path_exists(gubbins_dir) + #make_sure_path_exists(tree_dir) + + + prepare_ref_var_consensus_input = "%s/gubbins/%s_%s_genome_aln_w_ref_allele.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) + prepare_var_consensus_input = "%s/gubbins/%s_%s_core_var_aln.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) + prepare_allele_var_consensus_input = 
"%s/gubbins/%s_%s_noncore_plus_core_variants_aln.fa" % ( + args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), + reference_base) + #prepare_ref_allele_var_consensus_input = "%s/gubbins/%s_%s_ref_allele_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''),reference_base) + prepare_ref_allele_unmapped_consensus_input = "%s/gubbins/%s_%s_genome_aln_w_alt_allele_unmapped.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) + + prepare_ref_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_ref_variant_positions/*.fa > %s" % (args.results_dir, prepare_ref_var_consensus_input) + prepare_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_variant_positions/*_variants.fa > %s" % (args.results_dir, prepare_var_consensus_input) + prepare_allele_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_allele_variant_positions/*_allele_variants.fa > %s" % ( + args.results_dir, prepare_allele_var_consensus_input) + #prepare_ref_allele_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_ref_allele_variant_positions/*.fa > %s" % (args.results_dir, prepare_ref_allele_var_consensus_input) + prepare_ref_allele_unmapped_consensus_input_cmd = "cat %s %s/core_snp_consensus/consensus_ref_allele_unmapped_variant/*.fa > %s" % (args.reference, args.results_dir, prepare_ref_allele_unmapped_consensus_input) + call("%s" % prepare_ref_var_consensus_input_cmd, logger) + call("%s" % prepare_var_consensus_input_cmd, logger) + call("%s" % prepare_allele_var_consensus_input_cmd, logger) + #call("%s" % prepare_ref_allele_var_consensus_input_cmd, logger) + call("%s" % prepare_ref_allele_unmapped_consensus_input_cmd, logger) + # os.system(prepare_ref_var_consensus_input_cmd) + # os.system(prepare_var_consensus_input_cmd) + + print_details = "Results for core pipeline can 
be found in: %s\n" \ + "Description of Results:\n" \ + "1. data_matrix folder contains all the data matrices and other temporary files generated during the core pipeline. bargraph_counts.txt and bargraph_percentage.txt: contains counts/percentage of unique positions filtered out due to different filter parameters for each sample. Run bargraph.R to plot bargraph statistics." \ + "2. core_snp_consensus contains all the core vcf and fasta files. *_core.vcf.gz: core vcf files, *.fa and *_variants.fa: core consensus fasta file and core consensus fasta with only variant positions." % (args.results_dir) + keep_logging(print_details, print_details, logger, 'info') + + call("cp %s %s/Logs/report/" % ( + log_file_handle, os.path.dirname(os.path.dirname(args.filter2_only_snp_vcf_dir))), logger) + + if "4" in args.steps: + """ + Gubbins/Raxml step + """ + + + keep_logging('Step 4: Run Gubbins on core alignments and generate iqtree/RaxML trees.', 'Step 4: Run Gubbins on core alignments and generate iqtree/RaxML trees.', logger, 'info') + + #parse_phaster(args.reference) + reference_base = os.path.basename(args.reference).split('.')[0] + gubbins_dir = args.results_dir + '/gubbins' + tree_dir = args.results_dir + '/trees' + + make_sure_path_exists(gubbins_dir) + #make_sure_path_exists(tree_dir) + + + prepare_ref_var_consensus_input = "%s/gubbins/%s_%s_genome_aln_w_ref_allele.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) + prepare_var_consensus_input = "%s/gubbins/%s_%s_core_var_aln.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) + prepare_allele_var_consensus_input = "%s/gubbins/%s_%s_noncore_plus_core_variants_aln.fa" % ( + args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), + reference_base) + #prepare_ref_allele_var_consensus_input = 
"%s/gubbins/%s_%s_ref_allele_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''),reference_base) + prepare_ref_allele_unmapped_consensus_input = "%s/gubbins/%s_%s_genome_aln_w_alt_allele_unmapped.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) + + prepare_ref_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_ref_variant_positions/*.fa > %s" % (args.results_dir, prepare_ref_var_consensus_input) + prepare_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_variant_positions/*_variants.fa > %s" % (args.results_dir, prepare_var_consensus_input) + prepare_allele_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_allele_variant_positions/*_allele_variants.fa > %s" % ( + args.results_dir, prepare_allele_var_consensus_input) + #prepare_ref_allele_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_ref_allele_variant_positions/*.fa > %s" % (args.results_dir, prepare_ref_allele_var_consensus_input) + prepare_ref_allele_unmapped_consensus_input_cmd = "cat %s %s/core_snp_consensus/consensus_ref_allele_unmapped_variant/*.fa > %s" % (args.reference, args.results_dir, prepare_ref_allele_unmapped_consensus_input) + call("%s" % prepare_ref_var_consensus_input_cmd, logger) + call("%s" % prepare_var_consensus_input_cmd, logger) + call("%s" % prepare_allele_var_consensus_input_cmd, logger) + call("%s" % prepare_ref_allele_unmapped_consensus_input_cmd, logger) + + + if args.gubbins and args.gubbins == "yes": + os.chdir(gubbins_dir) + if args.outgroup: + # Get outgroup_Sample name + outgroup = get_outgroup() + keep_logging('%s/scripts/gubbins_iqtree_raxml.sh %s 1 esnitkin_flux \'%s\'' % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input, outgroup), + '%s/scripts/gubbins_iqtree_raxml.sh %s 1 esnitkin_flux \'%s\'' % (os.path.dirname(os.path.abspath(__file__)), 
prepare_ref_var_consensus_input, outgroup), logger, 'info') + call("%s/scripts/gubbins_iqtree_raxml.sh %s 1 esnitkin_flux \'%s\'" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input, outgroup), logger) + keep_logging('%s/scripts/gubbins_iqtree_raxml.sh %s 1 esnitkin_flux \'%s\'' % ( + os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input, outgroup), + '%s/scripts/gubbins_iqtree_raxml.sh %s 1 esnitkin_flux \'%s\'' % ( + os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input, outgroup), + logger, 'info') + call("%s/scripts/gubbins_iqtree_raxml.sh %s 1 esnitkin_flux \'%s\'" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input, outgroup), logger) + # call("%s/scripts/gubbins_iqtree_raxml.sh %s 1" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_var_consensus_input), logger) + else: + keep_logging('%s/scripts/gubbins_iqtree_raxml.sh %s 1' % ( + os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input), + '%s/scripts/gubbins_iqtree_raxml.sh %s 1' % ( + os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input), + logger, 'info') + call("%s/scripts/gubbins_iqtree_raxml.sh %s 1" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input), logger) + keep_logging('%s/scripts/gubbins_iqtree_raxml.sh %s 1' % ( + os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input), + '%s/scripts/gubbins_iqtree_raxml.sh %s 1' % ( + os.path.dirname(os.path.abspath(__file__)), + prepare_ref_allele_unmapped_consensus_input), + logger, 'info') + call("%s/scripts/gubbins_iqtree_raxml.sh %s 1" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input), logger) + #call("%s/scripts/gubbins_iqtree_raxml.sh %s 1" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_var_consensus_input), logger) + else: + if args.outgroup: 
+ # Get outgroup_Sample name + outgroup = get_outgroup() + keep_logging('The gubbins argument is set to No.', 'The gubbins argument is set to No.', logger, 'info') + keep_logging('%s/scripts/gubbins_iqtree_raxml.sh %s 0 esnitkin_flux \'%s\'' % ( + os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input, outgroup), + '%s/scripts/gubbins_iqtree_raxml.sh %s 0 esnitkin_flux \'%s\'' % ( + os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input, outgroup), + logger, 'info') + print "%s/scripts/gubbins_iqtree_raxml.sh %s 0 esnitkin_flux \'%s\'" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input, outgroup) + keep_logging('%s/scripts/gubbins_iqtree_raxml.sh %s 0 esnitkin_flux \'%s\'' % ( + os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input, outgroup), + '%s/scripts/gubbins_iqtree_raxml.sh %s 0 esnitkin_flux \'%s\'' % ( + os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input, outgroup), + logger, 'info') + print "%s/scripts/gubbins_iqtree_raxml.sh %s 0 esnitkin_flux \'%s\'" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input, outgroup) + else: + keep_logging('The gubbins argument is set to No.', 'The gubbins argument is set to No.', logger, 'info') + print "%s/scripts/gubbins_iqtree_raxml.sh %s 0" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input) + print "%s/scripts/gubbins_iqtree_raxml.sh %s 0" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input) + + call("cp %s %s/Logs/tree/" % ( + log_file_handle, os.path.dirname(os.path.dirname(args.filter2_only_snp_vcf_dir))), logger) + + """ The below steps are for debugging purpose only.""" + if "5" in args.steps: + """ + Debugging Purposes only: Run only SNP matrix annotation step + """ + + keep_logging('Step 5: Running SNP matrix annotation step.', 'Step 5: Running SNP matrix annotation 
step.', logger, 'info') + + functional_class_filter_positions = "%s/Functional_class_filter_positions.txt" % args.filter2_only_snp_vcf_dir + + global outgroup_specific_positions + global outgroup_indel_specific_positions + + # Get outgroup specific variant positions + if args.outgroup: + f_outgroup = open("%s/outgroup_indel_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') + + outgroup_indel_specific_positions = [] + for i in f_outgroup: + i = i.strip() + outgroup_indel_specific_positions.append(int(i)) + f_outgroup.close() + + f_outgroup = open("%s/outgroup_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') + + outgroup_specific_positions = [] + for i in f_outgroup: + i = i.strip() + outgroup_specific_positions.append(int(i)) + f_outgroup.close() + + print "No. of outgroup specific variant positions: %s" % len(outgroup_specific_positions) + print "No. of outgroup specific Indel variant positions: %s" % len(outgroup_indel_specific_positions) + else: + + outgroup_indel_specific_positions = [] + outgroup_specific_positions = [] + print "No. of outgroup specific variant positions: %s" % len(outgroup_specific_positions) + print "No. of outgroup specific Indel variant positions: %s" % len(outgroup_indel_specific_positions) + + # Annotate core variants. Generate SNP and Indel matrix. 
+ annotated_snp_matrix() + + # # Read new allele matrix and generate fasta; generate a seperate function + keep_logging('Generating Fasta from Variant Alleles...\n', 'Generating Fasta from Variant Alleles...\n', logger, 'info') + + create_job_allele_variant_fasta(args.jobrun, vcf_filenames, args.filter2_only_snp_vcf_dir, config_file) + + extract_only_ref_variant_fasta_from_reference_allele_variant() + + mask_fq_mq_positions_specific_to_outgroup() + + call("cp %s %s/Logs/core/" % ( + log_file_handle, os.path.dirname(os.path.dirname(args.filter2_only_snp_vcf_dir))), logger) + + if "6" in args.steps: + """ + Debugging Purposes only: Run only Gubbins + """ + reference_base = os.path.basename(args.reference).split('.')[0] + gubbins_dir = args.results_dir + '/gubbins' + tree_dir = args.results_dir + '/trees' + + make_sure_path_exists(gubbins_dir) + #make_sure_path_exists(tree_dir) + + + prepare_ref_var_consensus_input = "%s/gubbins/%s_%s_ref_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) + prepare_var_consensus_input = "%s/gubbins/%s_%s_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) + prepare_allele_var_consensus_input = "%s/gubbins/%s_%s_allele_var_consensus.fa" % ( + args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), + reference_base) + prepare_ref_allele_var_consensus_input = "%s/gubbins/%s_%s_ref_allele_var_consensus.fa" % ( + args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), + reference_base) + prepare_ref_allele_unmapped_consensus_input = "%s/gubbins/%s_%s_ref_allele_unmapped_consensus.fa" % ( + args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), + reference_base) + + if args.gubbins and args.gubbins == "yes": + 
gubbins(gubbins_dir, prepare_ref_var_consensus_input, args.jobrun, logger, Config) + #gubbins(gubbins_dir, prepare_ref_allele_var_consensus_input, logger, Config) + gubbins(gubbins_dir, prepare_ref_allele_unmapped_consensus_input,args.jobrun, logger, Config) + call("cp %s %s/Logs/tree/" % ( + log_file_handle, os.path.dirname(os.path.dirname(args.filter2_only_snp_vcf_dir))), logger) + + if "7" in args.steps: + """ + Debugging Purposes only: Run iqtree + """ + reference_base = os.path.basename(args.reference).split('.')[0] + gubbins_dir = args.results_dir + '/gubbins' + tree_dir = args.results_dir + '/trees' + + make_sure_path_exists(gubbins_dir) + #make_sure_path_exists(tree_dir) + + + prepare_ref_var_consensus_input = "%s/gubbins/%s_%s_ref_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) + prepare_var_consensus_input = "%s/gubbins/%s_%s_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) + prepare_allele_var_consensus_input = "%s/gubbins/%s_%s_allele_var_consensus.fa" % ( + args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), + reference_base) + prepare_ref_allele_var_consensus_input = "%s/gubbins/%s_%s_ref_allele_var_consensus.fa" % ( + args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), + reference_base) + prepare_ref_allele_unmapped_consensus_input = "%s/gubbins/%s_%s_ref_allele_unmapped_consensus.fa" % ( + args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), + reference_base) + iqtree(tree_dir, prepare_ref_allele_var_consensus_input, args.jobrun, logger, Config) + iqtree(tree_dir, prepare_ref_var_consensus_input, args.jobrun, logger, Config) + iqtree(tree_dir, prepare_var_consensus_input, args.jobrun, logger, Config) + iqtree(tree_dir, 
prepare_ref_allele_unmapped_consensus_input, args.jobrun, logger, Config) + + time_taken = datetime.now() - start_time_2 + if args.remove_temp: + del_command = "rm -r %s" % temp_dir + os.system(del_command) + + + +