Skip to content

Commit

Permalink
Making changes to Barplot code - excluding Phage positions
Browse files Browse the repository at this point in the history
  • Loading branch information
alipirani88 committed Oct 21, 2019
1 parent 88dbe81 commit a96fbce
Showing 1 changed file with 86 additions and 32 deletions.
118 changes: 86 additions & 32 deletions modules/variant_diagnostics/core_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -1101,6 +1101,31 @@ def temp_generate_position_label_data_matrix_All_label():
print_string_header = print_string_header + os.path.basename(i) + "\t"

f33.write('\t' + print_string_header.strip() + '\n')

""" GET individual PHAGE/Repetitive/masked region positions to assign functional class group string """

phage_positions = []
repetitive_positions = []
mask_positions = []

phage_region_positions = "%s/phage_region_positions.txt" % args.filter2_only_snp_vcf_dir
if os.path.isfile(phage_region_positions):
with open(phage_region_positions, 'rU') as fphage:
for line in fphage:
phage_positions.append(line.strip())
fphage.close()
else:
raise IOError('%s/phage_region_positions.txt does not exist.' % args.filter2_only_snp_vcf_dir)
exit()

""" End: Generate a list of functional class positions from Phaster, Mummer and Custom Masking results/files"""

f_open_temp_Only_filtered_positions_for_closely_matrix = open("%s/temp_Only_filtered_positions_for_closely_matrix_exclude_phage.txt" % args.filter2_only_snp_vcf_dir, 'w+')

f_open_temp_Only_filtered_positions_for_closely_matrix.write('\t' + print_string_header.strip() + '\n')



keep_logging('Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, 'Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, logger, 'info')
lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP']
ref_var = ['reference_allele', 'VARIANT']
Expand All @@ -1120,8 +1145,16 @@ def temp_generate_position_label_data_matrix_All_label():
print_string = print_string + "\t" + i
STRR2 = row[0] + print_string + "\n"
f33.write(STRR2)

if str(row[0]) not in phage_positions:
print_string_2 = ""
for i in row[1:]:
print_string_2 = print_string_2 + "\t" + i
STRR3 = row[0] + print_string_2 + "\n"
f_open_temp_Only_filtered_positions_for_closely_matrix.write(STRR3)
csv_file.close()
f33.close()
f_open_temp_Only_filtered_positions_for_closely_matrix.close()

else:
with open("%s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file:
Expand All @@ -1130,12 +1163,19 @@ def temp_generate_position_label_data_matrix_All_label():
for row in csv_reader:
if set(ref_var) & set(row[1:]):
if set(lll) & set(row[1:]):

print_string = ""
for i in row[1:]:
print_string = print_string + "\t" + i
STRR2 = row[0] + print_string + "\n"
f33.write(STRR2)
if str(row[0]) not in phage_positions:
print_string_2 = ""
for i in row[1:]:
print_string_2 = print_string_2 + "\t" + i
STRR3 = row[0] + print_string_2 + "\n"
f_open_temp_Only_filtered_positions_for_closely_matrix.write(STRR3)


csv_file.close()
f33.close()
"""
Expand Down Expand Up @@ -1259,8 +1299,19 @@ def barplot_stats():
This will give a visual explanation of how many positions in each samples were filtered out because of different reason
"""

c_reader = csv.reader(open('%s/temp_Only_filtered_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t')
columns = list(zip(*c_reader))
print "Exluding Phage regions from temp_Only_filtered_positions_for_closely_matrix.txt file. The results will be outputed to temp_Only_filtered_positions_for_closely_matrix_exclude_phage.txt"





temp_Only_filtered_positions_for_closely_matrix_exclude_phage = "%s/temp_Only_filtered_positions_for_closely_matrix_exclude_phage.txt" % args.filter2_only_snp_vcf_dir
print temp_Only_filtered_positions_for_closely_matrix_exclude_phage
#c_reader = csv.reader(open('%s/temp_Only_filtered_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t')
c_reader_2 = csv.reader(
open(temp_Only_filtered_positions_for_closely_matrix_exclude_phage, 'r'), delimiter='\t')
columns_2 = list(zip(*c_reader_2))
print len(columns_2)
keep_logging('Finished reading columns...', 'Finished reading columns...', logger, 'info')
counts = 1

Expand All @@ -1277,14 +1328,14 @@ def barplot_stats():

for i in xrange(1, end, 1):
""" Bar Count Statistics: Variant Position Count Statistics """
true_variant = columns[i].count('VARIANT')
unmapped_positions = columns[i].count('reference_unmapped_position')
reference_allele = columns[i].count('reference_allele')
Only_low_FQ = columns[i].count('LowFQ')
Only_DP = columns[i].count('HighFQ_DP')
Only_low_MQ = columns[i].count('HighFQ')
low_FQ_other_parameters = columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count('LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + columns[i].count('LowFQ_DP')
high_FQ_other_parameters = columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count('HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL')
true_variant = columns_2[i].count('VARIANT')
unmapped_positions = columns_2[i].count('reference_unmapped_position')
reference_allele = columns_2[i].count('reference_allele')
Only_low_FQ = columns_2[i].count('LowFQ')
Only_DP = columns_2[i].count('HighFQ_DP')
Only_low_MQ = columns_2[i].count('HighFQ')
low_FQ_other_parameters = columns_2[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_DP_proximate_SNP') + columns_2[i].count('LowFQ_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_DP') + columns_2[i].count('LowFQ_DP_QUAL') + columns_2[i].count('LowFQ_QUAL') + columns_2[i].count('LowFQ_DP')
high_FQ_other_parameters = columns_2[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_DP_proximate_SNP') + columns_2[i].count('HighFQ_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_DP') + columns_2[i].count('HighFQ_DP_QUAL') + columns_2[i].count('HighFQ_QUAL')
other = low_FQ_other_parameters + high_FQ_other_parameters

total = true_variant + unmapped_positions + reference_allele + Only_low_FQ + Only_DP + low_FQ_other_parameters + high_FQ_other_parameters + Only_low_MQ
Expand All @@ -1302,35 +1353,35 @@ def barplot_stats():
#f_bar_count.write(bar_string)
""" Bar Count Percentage Statistics: Variant Position Percentage Statistics """
try:
true_variant_perc = float((columns[i].count('VARIANT') * 100) / total)
true_variant_perc = float((columns_2[i].count('VARIANT') * 100) / total)
except ZeroDivisionError:
true_variant_perc = 0
try:
unmapped_positions_perc = float((columns[i].count('reference_unmapped_position') * 100) / total)
unmapped_positions_perc = float((columns_2[i].count('reference_unmapped_position') * 100) / total)
except ZeroDivisionError:
unmapped_positions_perc = 0
try:
reference_allele_perc = float((columns[i].count('reference_allele') * 100) / total)
reference_allele_perc = float((columns_2[i].count('reference_allele') * 100) / total)
except ZeroDivisionError:
reference_allele_perc = 0
try:
Only_low_FQ_perc = float((columns[i].count('LowFQ') * 100) / total)
Only_low_FQ_perc = float((columns_2[i].count('LowFQ') * 100) / total)
except ZeroDivisionError:
Only_low_FQ_perc = 0
try:
Only_DP_perc = float((columns[i].count('HighFQ_DP') * 100) / total)
Only_DP_perc = float((columns_2[i].count('HighFQ_DP') * 100) / total)
except ZeroDivisionError:
Only_DP_perc = 0
try:
Only_low_MQ_perc = float((columns[i].count('HighFQ') * 100) / total)
Only_low_MQ_perc = float((columns_2[i].count('HighFQ') * 100) / total)
except ZeroDivisionError:
Only_low_MQ_perc = 0
try:
low_FQ_other_parameters_perc = float(((columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count('LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + columns[i].count('LowFQ_DP')) * 100) / total)
low_FQ_other_parameters_perc = float(((columns_2[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_DP_proximate_SNP') + columns_2[i].count('LowFQ_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_DP') + columns_2[i].count('LowFQ_DP_QUAL') + columns_2[i].count('LowFQ_QUAL') + columns_2[i].count('LowFQ_DP')) * 100) / total)
except ZeroDivisionError:
low_FQ_other_parameters_perc = 0
try:
high_FQ_other_parameters_perc = float(((columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count('HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL')) * 100) / total)
high_FQ_other_parameters_perc = float(((columns_2[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_DP_proximate_SNP') + columns_2[i].count('HighFQ_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_DP') + columns_2[i].count('HighFQ_DP_QUAL') + columns_2[i].count('HighFQ_QUAL')) * 100) / total)
except ZeroDivisionError:
high_FQ_other_parameters_perc = 0

Expand Down Expand Up @@ -2988,11 +3039,6 @@ def annotated_snp_matrix():
""" End: Prepare a All_indel_label_final_ordered_sorted.txt file with sorted unique variant positions. """







""" Generate a position_label and position_indel_label dictionary that will contain information about each unique variant position that passed variant filters in any sample and reasons for being filtered out in any sample """
position_label = OrderedDict()
with open("%s/All_label_final_ordered_sorted.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file:
Expand Down Expand Up @@ -3024,9 +3070,6 @@ def annotated_snp_matrix():
""" End: Generate a position_label and position_indel_label dictionary """





""" Generate mask_fq_mq_positions array with positions where a variant was filtered because of LowFQ or LowMQ """
mask_fq_mq_positions = []
mask_fq_mq_positions_outgroup_specific = []
Expand Down Expand Up @@ -4532,8 +4575,19 @@ def mask_fq_mq_positions_specific_to_outgroup():


else:
position_label = OrderedDict()

with open("%s/All_label_final_ordered_sorted.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file:
keep_logging(
'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir,
'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir,
logger, 'info')
csv_reader = csv.reader(csv_file, delimiter='\t')
next(csv_reader, None)
for row in csv_reader:
position_label[row[0]] = row[1:]
for key in position_label.keys():
label_sep_array = position_label[key].split(',')
label_sep_array = str(position_label[key]).split(',')
for i in label_sep_array:
if "LowFQ" in str(i):
if key not in mask_fq_mq_positions:
Expand Down Expand Up @@ -4656,14 +4710,14 @@ def someOtherFunc(data, key):
unique_position_file = create_positions_filestep(vcf_filenames)
unique_indel_position_file = create_indel_positions_filestep(vcf_filenames)

# bgzip and tabix all the vcf files in core_temp_dir.
files_for_tabix = glob.glob("%s/*.vcf" % args.filter2_only_snp_vcf_dir)
tabix(files_for_tabix, "vcf", logger, Config)
# # bgzip and tabix all the vcf files in core_temp_dir.
# files_for_tabix = glob.glob("%s/*.vcf" % args.filter2_only_snp_vcf_dir)
# tabix(files_for_tabix, "vcf", logger, Config)

# Get the cluster option; create and run jobs based on given parameter. The jobs will parse all the intermediate vcf file to extract information such as if any unique variant position was unmapped in a sample, if it was filtered out dur to DP,MQ, FQ, proximity to indel, proximity to other SNPs and other variant filter parameters set in config file.
tmp_dir = "/tmp/temp_%s/" % log_unique_time

create_job(args.jobrun, vcf_filenames, unique_position_file, tmp_dir)
#create_job(args.jobrun, vcf_filenames, unique_position_file, tmp_dir)

create_indel_job(args.jobrun, vcf_filenames, unique_indel_position_file, tmp_dir)

Expand Down

0 comments on commit a96fbce

Please sign in to comment.