-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.uniprot.py
executable file
·115 lines (90 loc) · 4.13 KB
/
parser.uniprot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
# Parser to translate UniProt XML into protein metadata
#
# Copyright Vera-Licona Research Group (C) 2016
#
# This software is licensed under the Artistic License 2.0, see the
# LICENSE file or
# http://www.opensource.org/licenses/artistic-license-2.0.php for
# details
import argparse
from lxml import etree
import simplejson as json
import logging
# Helper class to handle JSON encoding with sets
class SetEncoder(json.JSONEncoder):
    # JSON encoder that additionally accepts set and frozenset values
    #
    # Sets have no JSON representation, so they are written out as
    # arrays in sorted order.  Every other type the base encoder cannot
    # serialize is handed back to the base class.
    def default(self, obj):
        if not isinstance(obj, (set, frozenset)):
            return json.JSONEncoder.default(self, obj)
        members = list(obj)
        members.sort()
        return members
def _default_namespace(root):
    # Return the namespace URI to bind to the "dbns" prefix in path queries
    #
    # The document's declared default namespace (root.nsmap[None]) is
    # preferred.  When no default namespace is declared, fall back to the
    # URI embedded in the root element's "{uri}tag" name, so documents
    # that declare the namespace with an explicit prefix still work.
    # Raises ValueError if the root element carries no namespace at all.
    declared = getattr(root, "nsmap", None) or {}
    uri = declared.get(None)
    if uri is None and root.tag.startswith("{"):
        uri = root.tag[1:].split("}", 1)[0]
    if uri is None:
        raise ValueError("XML root element {0} has no namespace".format(root.tag))
    return uri


def _isoform_id(refseq_entry, ns):
    # Return the isoform part of a RefSeq reference's molecule id
    #
    # This is the second "-"-separated field of the <molecule> id (for
    # an id of the form "X-2" the result is "2").  None is returned when
    # the reference has no <molecule> child, the molecule has no id
    # attribute, or the id contains no "-" at all.
    molecule = refseq_entry.find("dbns:molecule", namespaces=ns)
    if molecule is None:
        return None
    molecule_id = molecule.get("id")
    if molecule_id is None:
        return None
    fields = molecule_id.split("-")
    # An id without "-" used to escape as an uncaught IndexError; treat
    # it like a reference that names no isoform.
    return fields[1] if len(fields) > 1 else None


def parse_xml(filename):
    # Return dict of protein metadata read from a UniProt XML file
    #
    # The result is keyed by the first accession of each entry and maps
    # to a dict with these keys:
    #   "upids"     - all accessions of the entry, in document order
    #   "name"      - fullName of the recommendedName, or of the first
    #                 child of <protein> when no recommendedName exists
    #   "geneNames" - every <name> of the first <gene> ([] if no gene)
    #   "function"  - text of the first comment of type "function"
    #                 ("" if there is none)
    #   "isoforms"  - dict from isoform id (None when the reference names
    #                 no isoform) to the list of RefSeq ids, each cut at
    #                 its first "."
    #
    # Raises ValueError if the file cannot be read or the root element
    # has no namespace.
    logging.info(u"Reading XML file")
    try:
        tree = etree.parse(filename)
    except IOError:
        raise ValueError("Could not read DB file {0}".format(filename))

    logging.info(u"Parsing XML tree")
    root = tree.getroot()
    ns = {'dbns': _default_namespace(root)}

    proteins = {}
    for protein in root.findall("dbns:entry", namespaces=ns):
        primary_id = protein.findtext("dbns:accession", namespaces=ns).strip()
        # Logging arguments are passed lazily so that the message is only
        # rendered when DEBUG output is actually enabled.
        logging.debug("Processing protein %s", primary_id)
        all_ids = [accession.text.strip() for accession in protein.findall("dbns:accession", namespaces=ns)]

        protein_entry = protein.find("dbns:protein", namespaces=ns)
        protein_name_entry = protein_entry.find("dbns:recommendedName", namespaces=ns)
        if protein_name_entry is None:
            # No recommended name: use whichever name element comes first
            protein_name_entry = protein_entry[0]
        protein_name = protein_name_entry.findtext("dbns:fullName", namespaces=ns).strip()

        gene_entry = protein.find("dbns:gene", namespaces=ns)
        if gene_entry is None:
            gene_names = []
        else:
            gene_names = [name.text.strip() for name in gene_entry.findall("dbns:name", namespaces=ns)]

        function_entry = protein.find("dbns:comment[@type='function']", namespaces=ns)
        if function_entry is None:
            function = ""
        else:
            # findtext returns None when the comment has no <text> child
            function = (function_entry.findtext("dbns:text", namespaces=ns) or "").strip()

        isoforms = {}
        for refseq_entry in protein.findall("dbns:dbReference[@type='RefSeq']", namespaces=ns):
            # Drop the version suffix of the RefSeq id (text after ".")
            refseq_id = refseq_entry.get("id").split(".")[0]
            isoforms.setdefault(_isoform_id(refseq_entry, ns), []).append(refseq_id)

        protein_data = {"upids": all_ids, "name": protein_name, "geneNames": gene_names, "function": function, "isoforms": isoforms}
        logging.debug("Protein data: %s", protein_data)
        proteins[primary_id] = protein_data

    logging.info(u"Found {} proteins in UniProt XML file".format(len(proteins)))
    return proteins
def main():
    # Command-line entry point
    #
    # Parses the UniProt XML file given as the first positional argument
    # with parse_xml and writes the resulting dict as JSON to the path
    # given as the second positional argument.  Each -v raises the log
    # verbosity: none = WARNING, one = INFO, two or more = DEBUG.

    # Set up argument processing
    parser = argparse.ArgumentParser(description="UniProt data parser")
    parser.add_argument("uniprot_xml_file", help="XML file of UniProt data (download from http://www.uniprot.org/uniprot/?query=proteome:UP000005640)")
    parser.add_argument("json_output_file", help="JSON file to write with protein data")
    parser.add_argument('-v', '--verbose', action="count", default=0, help="Print verbose logs (may be used multiple times)")
    args = parser.parse_args()

    # Set up logging; any count above two is clamped to DEBUG
    log_levels = (logging.WARNING, logging.INFO, logging.DEBUG)
    log_level = log_levels[min(args.verbose, len(log_levels) - 1)]
    logging.basicConfig(level=log_level)

    # Parse the XML file into a dict for JSON serialization
    proteins = parse_xml(args.uniprot_xml_file)

    # Write the result; SetEncoder lets set values serialize as arrays
    logging.info(u"Writing JSON file")
    with open(args.json_output_file, 'w') as outfile:
        json.dump(proteins, outfile, cls=SetEncoder)
# Run the command-line interface only when this file is executed as a
# script; importing it as a module must not trigger any parsing.
if __name__ == "__main__":
    main()