-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_condenser.py
142 lines (125 loc) · 6.15 KB
/
read_condenser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
__author__ = 'mwelland'
class Condenser:
"""
A class called from fqRunner.py, used to combine several FastQ files into a single file pair
For each gene and transcript, the modified contents are combined with the unmodified contents
For single transcripts this will mean each variant appears as a perfect heterozygote
For multiple transcripts the variant will be progressively diluted
- Transcript 1: modified dictionary has 100% of variant locations as the changed bases
default dictionary has 100% unmodified bases
combined this gives 50% modified at variant positions
This is written to the output file
- Transcript 2: Same again, with a default copy combined with a 100% variant copy
This will be added to the final output file along with transcript 1
assuming the variants are not made at the same positions, this will give
75% normal bases, 25% variant at each location
So far this has not been a problem, as the read depth is more than enough to pick these up
Transcripts will not always overlap as well, so the dilution of every site is a worst-case
It may be counter-productive to have mentioned this, but its too late now
"""
def __init__(self, gene_list, run_number):
"""
:param gene_list: A set object containing all genes processed
:param run_number: The numerical identifier of the run
"""
self.gene_list = gene_list
self.run_number = run_number
self.file_list = os.listdir('fastQs')
self.gene_dictionary = {}
def run(self):
"""
Control method for condenser, this is called from fqRunner
"""
for gene in self.gene_list:
# The length of the name is used in substring selection to find appropriate files
name_length = len(gene)
# identify all files in the fastQ folder relevant to the gene
# This includes R1 & 2s, as well as separating the modified and unmodified
self.populate_dict(gene, name_length)
r1_pairs = self.create_file_pairings(gene, name_length, 1)
r2_pairs = self.create_file_pairings(gene, name_length, 2)
self.combine_files(r1_pairs)
self.combine_files(r2_pairs)
self.erase_old_files()
def populate_dict(self, gene, name_length):
"""
The dictionary will contain the names of all files relating to the selected gene. This method uses the gene
name and the transcript numbers to identify the unchanged sequence and the accompanying changed transcripts.
This is done to allow each separate transcript to be paired with the unchanged sequence, so as to represent
a homozygous variant as well as improving the read depth for each area.
:param gene:
:param name_length:
:return:
"""
self.gene_dictionary = {1: {'ref': '', 'transcripts': []}, 2: {'ref': '', 'transcripts': []}}
read1s = []
read2s = []
fq_list = [name for name in self.file_list if name[:name_length] == gene]
# Separate read 1s from read 2s
for filename in fq_list:
first_part_of_name = filename.split('.')[0]
if first_part_of_name[-1] == '1':
read1s.append(filename)
elif first_part_of_name[-1] == '2':
read2s.append(filename)
# For each of Read 1 and 2, separate reference from altered
for filename in read1s:
transcript = filename[name_length:name_length+1]
if transcript == '0':
self.gene_dictionary[1]['ref'] = filename
else:
self.gene_dictionary[1]['transcripts'].append(filename)
for filename in read2s:
transcript = filename[name_length:name_length+1]
if transcript == '0':
self.gene_dictionary[2]['ref'] = filename
else:
self.gene_dictionary[2]['transcripts'].append(filename)
def create_file_pairings(self, gene, name_length, read):
"""
:param gene: gene name
:param name_length: length of gene name
:param read: 1 or 2
:return: a list of 3-element tuples
element 1: reference file name (unchanged seq)
element 2: changed file name
element 3: a name for the file once combined
"""
file_pairs = []
read_dict = self.gene_dictionary[read]
for filename in read_dict['transcripts']:
file_pairs.append([read_dict['ref'], filename, self.create_file_name(gene, name_length, filename, read)])
return file_pairs
@staticmethod
def create_file_name(gene, name_length, filename, read):
"""
Creates a new file name which combines the gene name, the transcript number and the read number
"""
transcript = filename[name_length:name_length+1]
filename = '%s_transcript%s_R%d.fq' % (gene, transcript, read)
return filename
@staticmethod
def combine_files(read_pairs):
"""
This will combine the created files in memory and write the output to a new file
the filename for the new file will be the one created in create_file_name
:param read_pairs:
:return:
"""
for triple_tuple in read_pairs:
outfile_name = triple_tuple[2]
with open(os.path.join('fastQs', outfile_name), 'w') as output_file:
reference_file_name = triple_tuple[0]
reference_file = open(os.path.join('fastQs', reference_file_name), "r")
contents = reference_file.readlines()
output_file.writelines(contents)
alt_file = triple_tuple[1]
contents = open(os.path.join('fastQs', alt_file), 'r').readlines()
output_file.writelines(contents)
def erase_old_files(self):
"""
Deletes all the old files which have now been combined
"""
for filename in self.file_list:
os.remove(os.path.join('fastQs', filename))