forked from JMante1/jet-plugin
-
Notifications
You must be signed in to change notification settings - Fork 0
/
input_data.py
105 lines (84 loc) · 4.22 KB
/
input_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import json
import requests
from pandas.io.json import json_normalize
def input_data(uri, instance):
    """
    Finds information about an SBOL part based on its uri

    Requirements
    -------
    json, requests and pandas (json_normalize)
    Input_Query.txt: the pre-written sparql query, opened relative to the
        current working directory

    Parameters
    ----------
    uri : string
        the unique identifier of a part, note that due to spoofing it may not be the same as the url
        e.g. uri = 'https://synbiohub.org/public/igem/BBa_E0040/1' (url may be https://dev.synbiohub.org/public/igem/BBa_E0040/1)
    instance : string
        the synbiohub instance where information is to be retrieved from (where the sparql query is to be run)
        e.g. 'https://synbiohub.org/'
        It must end with '/' because 'sparql' is appended to it directly.

    Returns
    -------
    self_df: pandas dataframe
        The rows of the query result whose 'deff' equals uri (they keep the
        index labels they had in the full result), with the columns
        'count', 'deff' (uri), 'displayId', 'title', 'role' in that order
    display_id: string
        The display id of the poi e.g. 'BBa_E0040'
    title: string
        The human readable name of the poi e.g. 'GFP'; falls back to
        display_id when the part has no title
    role: string
        The number (as a string) of the sequence ontology of the role of the poi e.g. '0000316'
    count:
        The number of times the poi is used (how often it is a subpart) e.g. 2348.
        Returned exactly as found in the 'count.value' field of the response,
        without numeric conversion.
        NOTE(review): SPARQL JSON results encode values as strings, so this is
        presumably '2348' rather than an int - confirm before doing arithmetic.

    Raises
    ------
    ConnectionError
        If a GET request to instance does not answer with HTTP status 200.
    KeyError
        If no row of the query result has uri as its 'deff' value.

    Example
    --------
    self_df, display_id, title, role, count = input_data('https://synbiohub.org/public/igem/BBa_E0040/1', 'https://dev.synbiohub.org/')

    Output:
    self_df: (dataframe with 1 row, columns shown below in the correct order)
        count       2348
        deff        https://synbiohub.org/public/igem/BBa_E0040/1
        displayId   BBa_E0040
        title       GFP
        role        0000316
    display_id: 'BBa_E0040'
    title: 'GFP'
    role: '0000316'
    count: 2348
    """
    # pandas >= 1.0 exposes json_normalize at the top level and deprecated the
    # pandas.io.json alias; fall back to the old location for older versions.
    try:
        from pandas import json_normalize
    except ImportError:
        from pandas.io.json import json_normalize

    # if synbiohub is offline fail loudly instead of continuing without data
    req = requests.get(instance)
    if req.status_code != 200:
        raise ConnectionError(
            "SynBioHub instance {} answered with HTTP status {}".format(
                instance, req.status_code
            )
        )

    # the context manager closes the query file even if reading fails
    with open("Input_Query.txt", "r") as query_file:
        sparqlquery = query_file.read()

    # replace the uri in the pre written sparql query with the uri of the part
    sparqlquery = sparqlquery.replace(
        'https://synbiohub.org/public/igem/BBa_B0012/1', uri
    )

    # run the query and ask for a json response
    r = requests.post(
        instance + "sparql",
        data={"query": sparqlquery},
        headers={"Accept": "application/json"},
    )

    # flatten the bindings into one column per '<variable>.<field>'
    d = json.loads(r.text)
    a = json_normalize(d['results']['bindings'])

    # rename by label so that unexpected extra columns do not raise
    rename_dict = {
        'count.datatype': 'cd', 'count.type': 'ct', 'count.value': 'count',
        'def.type': 'dt', 'def.value': 'deff',
        'displayId.type': 'dist', 'displayId.value': 'displayId',
        'role.type': 'rt', 'role.value': 'roletog',
        'title.type': 'tt', 'title.value': 'title',
    }
    a = a.rename(columns=rename_dict)

    # keep only the role number (e.g. 0000141) that follows 'SO:' in roletog;
    # partition always yields three parts, whatever the number of 'SO:' found
    if 'roletog' in a.columns:
        a['role'] = a['roletog'].str.partition("SO:")[2]

    # keep exactly the documented columns; a column that is absent from the
    # response (e.g. no part has a title) is created and filled with NaN
    a = a.reindex(columns=['count', 'deff', 'displayId', 'title', 'role'])

    # creates a df that has only the rows where the deff is the part in question
    self_df = a[a['deff'] == uri]
    if self_df.empty:
        raise KeyError(
            "no row of the query result matches uri {!r}".format(uri)
        )

    # the filtered frame keeps its original index labels, so take the first
    # row by position rather than by the label 0
    display_id = self_df['displayId'].iloc[0]
    title = self_df['title'].iloc[0]
    # in case there was no title (missing values show up as NaN)
    if str(title) == "nan":
        title = display_id
    role = self_df['role'].iloc[0]
    count = self_df['count'].iloc[0]

    return (self_df, display_id, title, role, count)