-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_parsing.py
60 lines (51 loc) · 1.76 KB
/
data_parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"http://biopython.org/DIST/docs/tutorial/Tutorial.html"
import os
import pandas
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO
import csv
keys_match={
"Pid":"protein_id",
"Name":"gene",
"Name_alt":"locus_tag",
"Product":"product",
"Note":"note",
"Seqa":"translation",
}
orf_file = pandas.read_csv("CP001509_170113.orf",sep='\t')
for row in orf_file.iterrows():
print(row)
break
orfs=[]
keys=[]
record = SeqIO.read("CP001509_20180315.gbk.txt", "genbank")
for i,feature in enumerate(record.features):
if feature.type!="source":
matches=[doc for doc in orfs if doc["Begin"]==int(feature.location.start) and doc["End"]==int(feature.location.end)]
if len(matches)==0:
doc={
"Genome_accn":record.id,
"Id_orf":i+1,
"Begin":int(feature.location.start),
"End":int(feature.location.end),
"Strand":"+" if feature.location.strand>0 else "-",
"Size":int(feature.location.end)-int(feature.location.start),
"Seqn":record.seq[int(feature.location.start):int(feature.location.end)]
}
orfs.append(doc)
else:
match=matches[0]
for key in keys_match.keys():
# print(key, feature.qualifiers.keys())
if keys_match[key] in feature.qualifiers.keys():
match[key]=feature.qualifiers[keys_match[key]]
for rc in orfs:
print(rc)
# output_file=open("output_file.orf", 'w')
# writer = csv.DictWriter(output_file, fieldnames=orfs[0].keys())
# writer.writeheader()
# for data in orfs:
# writer.writerow(data)
exit()