-
Notifications
You must be signed in to change notification settings - Fork 0
/
semcor_db.py
196 lines (153 loc) · 6.26 KB
/
semcor_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
"""semcor_db.py
Contains functions for performing useful queries over the sqlite database
semcor.db, created by semcor_db.sh.
Dependencies: nested_dict package must be installed
SCHEMA:
sent(
sent_id integer primary key,
doc_id varchar(10),
para_no integer,
sent_no integer,
first_token_id integer,
sentence text)
token(
token_id integer primary key,
sent_id integer,
token_no integer,
surface text,
lemma text,
pos varchar(6),
sense_no integer,
sense_key text,
ssid varchar(10),
dom_token_no integer,
dom_token_id integer,
rel varchar(14) )
USAGE:
Open a connection and get a cursor:
>>> c = semcor_db.get_cursor()
Given a cursor, lemma, synset_id (ssid), and relation, return a list of [lemma,
ssid] pairs for dominating words:
>>> r = semcor_db.get_dom_lemma_ssid(c, "victory", "07488581", "dobj")
>>> r
[['give', '01632595'], ['gain', '02293158'], ['describe', '00007846'], ['give', '01632595'], ['total', '02651091']]
"""
import os
import sys
import itertools
import pdb
import sqlite3
from collections import defaultdict
from nested_dict import nested_dict
from config import CLDATA_DIR
from wordnet import WordNet
import semcor_cl
# to reload: >>> from importlib import reload
# c = semcor_db.get_cursor()
def get_cursor():
"""establish a connection to semcor.db and return a connection object"""
database_filename = "semcor.db"
db_path = os.path.join(CLDATA_DIR, database_filename)
conn = sqlite3.connect(db_path)
# set row_factory method to allow access to fields in a row by name
conn.row_factory = sqlite3.Row
c = conn.cursor()
return(c)
# victory|07488581|dobj|
# r = semcor_db.get_dom(c, "victory", "07488581", "dobj")
def get_dom(c, lemma, ssid, rel):
# select m.lemma, m.rel, d.lemma, d.ssid from token m inner join token d on m.dom_token_id = d.token_id where m.rel = "dobj";
sql = "SELECT d.lemma, d.ssid, m.sent_id from token m INNER JOIN token d on m.dom_token_id = d.token_id where m.lemma = ? AND m.ssid = ? AND m.rel = ?"
args = (lemma, ssid, rel)
c.execute(sql, args)
results = c.fetchall()
return(results)
# return matches for sense, lemma, or synset, given a list of dominant relations
# Note that in order to place a variable number of ? into the IN clause of the query,
# we have to do this dynamically using the length of the rels list. Then rels have
# to be added to the list of args for the query.
# rs = semcor_db.get_sense_dom(c, "victory", "07488581", ["dobj", "nsubj"])
def get_sense_dom(c, lemma, ssid, rels):
# select m.lemma, m.rel, d.lemma, d.ssid from token m inner join token d on m.dom_token_id = d.token_id where m.rel = "dobj";
sql = "SELECT d.lemma, d.ssid, m.sent_id, m.rel from token m INNER JOIN token d on m.dom_token_id = d.token_id where m.lemma = ? AND m.ssid = ? AND m.rel IN (" + ",".join(["?"]*len(rels)) + ")"
args = [lemma, ssid]
args.extend(rels)
c.execute(sql, args)
results = c.fetchall()
return(results)
# rss = semcor_db.get_synset_dom(c, "07488581", ["dobj", "nsubj"])
def get_synset_dom(c, ssid, rels):
# select m.lemma, m.rel, d.lemma, d.ssid from token m inner join token d on m.dom_token_id = d.token_id where m.rel = "dobj";
sql = "SELECT d.lemma, d.ssid, m.sent_id, m.rel from token m INNER JOIN token d on m.dom_token_id = d.token_id where m.ssid = ? AND m.rel IN (" + ",".join(["?"]*len(rels)) + ")"
args = [ssid]
args.extend(rels)
c.execute(sql, args)
results = c.fetchall()
return(results)
# rl = semcor_db.get_lemma_dom(c, "victory", ["dobj", "nsubj"])
def get_lemma_dom(c, lemma, rels):
# select m.lemma, m.rel, d.lemma, d.ssid from token m inner join token d on m.dom_token_id = d.token_id where m.rel = "dobj";
sql = "SELECT d.lemma, d.ssid, m.sent_id, m.rel from token m INNER JOIN token d on m.dom_token_id = d.token_id where m.lemma = ? AND m.rel IN (" + ",".join(["?"]*len(rels)) + ")"
args = [lemma]
args.extend(rels)
c.execute(sql, args)
results = c.fetchall()
return(results)
# r = semcor_db.get_sent(c, sent_id)
def get_sent(c, sent_id):
sql = "SELECT sentence, doc_id, sent_no FROM sent WHERE sent_id = ?"
# Note the need to include a trailing comma if the arg list contains only one parameter.
args = (sent_id, )
c.execute(sql, args)
results = c.fetchall()
return(results)
# r = semcor_db.get_dom_lemma_ssid(c, "victory", "07488581", "dobj")
def get_dom_lemma_ssid(c, lemma, ssid, rel):
rs = get_dom(c, lemma, ssid, rel)
doms = []
for r in rs:
doms.append( [ r['lemma'], r['ssid'] ] )
return(doms)
# l_results is a list of result lists, so that we
# can combine counts from multiple lists (as generated by sister synsets,
# for example). All result lists should be of the same type, however.
# result_type is lemma, sense, or synset
# drl = semcor_db.dictify_mult([rss], 'synset')
def dictify_mult(l_results, result_type):
nd = nested_dict(2, int)
for each_results in l_results:
for row in each_results:
#pdb.set_trace()
rel = row['rel']
if result_type == 'lemma':
type_instance = row['lemma']
elif result_type == 'synset':
type_instance = row['ssid']
elif result_type == 'sense':
type_instance = (row['lemma'], row['ssid'])
nd[rel][type_instance] += 1
for key1 in nd.keys():
sub_dic = nd[key1]
print("key: %s" % key1)
for key2 in sub_dic.keys():
print("%s: %i" % (key2, sub_dic[key2]))
return(nd)
# dictify a single result
def dictify(results, result_type):
nd = nested_dict(2, int)
for row in results:
#pdb.set_trace()
rel = row['rel']
if result_type == 'lemma':
type_instance = row['lemma']
elif result_type == 'synset':
type_instance = row['ssid']
elif result_type == 'sense':
type_instance = (row['lemma'], row['ssid'])
nd[rel][type_instance] += 1
for key1 in nd.keys():
sub_dic = nd[key1]
print("key: %s" % key1)
for key2 in sub_dic.keys():
print("%s: %i" % (key2, sub_dic[key2]))
return(nd)