forked from InSilicoSolutions/COVID19
-
Notifications
You must be signed in to change notification settings - Fork 1
/
extractByGene.py
63 lines (49 loc) · 1.72 KB
/
extractByGene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os
geneAlias = {''}
#load reference sequence of each virus gene. Return it in a hash keyed by gene.
#Put the sequence in a list becasue we will add more sequences when samples are loaded.
def load_ref():
ref_file = open("data/reference.fasta", "r")
line = ref_file.readline()
gene = ""
seq_dict = {}
while (line):
if line.startswith(">"):
if gene != "":
seq_dict[gene] = ['Reference|' + sequence]
gene = line.split()[1].upper()
sequence = ""
else:
sequence += line.strip()
line = ref_file.readline()
seq_dict[gene] = [sequence]
return(seq_dict)
#Load the sequencing samples in the down loaded genbank records. For now,
#just look for translated protein sequences that identify the gene. Don't
#worry about the DNA sequences.
def load_samples(sequences):
samp_file = open("data/genbank_sequences.gb", "r")
line = samp_file.readline()
id = ''
gene = ''
seq_dict = {}
while (line):
line = line.strip()
if line.startswith("LOCUS"):
toks = line.split()
id = toks[1] + '(' + toks[len(toks)-1] + ')'
if line.startswith("/country="):
id += ' ' + line[10:-1]
if line.startswith("/gene="):
gene=line[6:-1]
if line.startswith("/translation="):
sequence = line[14:-1]
line = samp_file.readline()
seq_dict[gene] = [sequence]
return seq_dict
sequences = load_ref()
samples = load_samples(sequences)
print(sequences['ORF7B'])
print(sequences.keys())
print(samples)
# = open("data/genbank_sequences.gb", "r")