-
Notifications
You must be signed in to change notification settings - Fork 0
/
sequence_modifier.py
131 lines (122 loc) · 5.7 KB
/
sequence_modifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import random
__author__ = 'mwelland'
class Modifier:
""" This class creates single nucleotide changes in each exon of the input sequence
The exons are located using the exon coordinates from the input file
Random is used to choose a random location within the exon and a new base
The base substitution is made and the position is recorded in a dict
"""
def __init__(self, file_dict, type):
self.type = type
self.dict = file_dict
self.output_dict = {}
self.genomic = file_dict['full sequence']
self.padding = file_dict['offset']
self.transcript_dict = {}
self.transcript_pos = 0
def run_modifier(self):
"""
Main control method for the class
:return: the completed dictionary object
"""
for transcript in self.dict['transcripts']:
self.transcript_pos = 0
self.transcript_dict = self.dict['transcripts'][transcript]
self.output_dict[transcript] = {'variants': {}, 'exons': {}}
exon_list = self.transcript_dict['list_of_exons']
self.output_dict[transcript]['exon list'] = exon_list
for exon in exon_list:
try:
start = self.transcript_dict['exons'][exon]['genomic_start']
end = self.transcript_dict['exons'][exon]['genomic_end']
b_or_a = self.transcript_dict['exons'][exon]['cds']
old_cds = self.transcript_dict['old_cds_offset']
self.modify(exon, transcript, start, end, b_or_a, old_cds)
except IndexError:
print 'The index was out of range, line 43 seq_mod'
for exon in exon_list:
start = self.transcript_dict['exons'][exon]['genomic_start']
end = self.transcript_dict['exons'][exon]['genomic_end']
exon_seq = self.genomic[start - self.padding: end + self.padding]
self.output_dict[transcript]['exons'][exon] = {'start': start,
'end': end,
'padded seq': exon_seq,
'padded length': len(exon_seq),
'length': end - start}
return self.output_dict
def modify(self, exon_number, transcript, start, end, b_or_a, old_cds):
"""
Method to execute substitutions. This is called for each exon number.
This creates a single substitution per exon.
:param exon_number:
:param transcript:
"""
if self.type == 'lrg':
start -= 1
old_exon = self.genomic[start:end]
edit_coord = random.randint(start, end-1)
base = str(random.sample(['A', 'C', 'G', 'T'], 1)[0])
old_base = self.genomic[edit_coord]
while base == old_base:
base = str(random.sample(['A', 'C', 'G', 'T'], 1)[0])
self.genomic[edit_coord] = base
# Identify the position of the variant within the transcript (For HGVS)
if b_or_a == 'before':
cds_delay = old_cds
else:
cds_delay = self.transcript_dict['cds_offset']
p_length = self.transcript_dict['protein_length'] + 3 # For stop codon length
exon_length = end - start
"""
Put in something here to deal with if the exon is pre-coding
b_or_a is 'before' if the exon is pre-coding sequence
"""
if b_or_a == 'before':
hgvs_pos = (edit_coord) - (old_cds - 1) # Trying to fix indexing
else:
variant_pos = (edit_coord + 1) - start
hgvs_pos = (self.transcript_pos + variant_pos) - cds_delay
"""
This is the correct hgvs nomenclature for annotation by annovar
"""
if hgvs_pos <= 0 and self.type == 'lrg':
hgvs = 'c.%d%s>%s' % (hgvs_pos, old_base, base)
elif hgvs_pos <= 0:
hgvs = 'c.%d%s>%s' % (hgvs_pos - 1, old_base, base)
elif hgvs_pos > p_length:
after_coding = hgvs_pos - p_length # compensate for no 0s
hgvs = 'c.*%d%s>%s' % (after_coding, old_base, base)
else:
hgvs = 'c.%s%d%s' % (old_base, hgvs_pos, base)
self.transcript_pos += exon_length
"""
# Un comment this section for positional debugging
print 'start: %d, end: %d' % (start, end)
print 'Length: %d' % (exon_length)
print 'for gene: %s, file: %s cds: %d' % (self.dict['genename'], self.dict['filename'], cds_delay)
print 'position %d in exon %d' % (variant_pos, exon_number)
print 'changed %s to %s' % (old_base, base)
print 'HGVS: %s' % hgvs
print 'Exon: \n%s' % self.genomic[start:end]
new_exon = self.genomic[start:end]
i = 0
while i <= len(new_exon):
base1 = old_exon[i]
base2 = new_exon[i]
if base1 == base2:
i += 1
else:
print base1
print base2
print 'position: %d' % (i + 1)
j = (i + 1) - cds_delay
if j <= 0:
j -= 1
print 'Assumed HGVS: c.%s%d%s' % (base1, j, base2)
break
this = raw_input()
"""
self.output_dict[transcript]['variants'][exon_number] = {'position': edit_coord,
'new base': base,
'hgvs': hgvs,
'transcript': self.transcript_dict['NM_number']}