-
Notifications
You must be signed in to change notification settings - Fork 46
/
clean.py
executable file
·171 lines (131 loc) · 5.5 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import datetime
import sys
sys.path.append( '.' )
sys.path.append( './lib/' )
import SQLite
# ascit was refactored from senAdd in favor of ascit in fwork.
# They differ by 1 line. Leave this comment until covered
# by unit test.
from senAdd import *
from fwork import *
import locFunc
import orgClean
debug = False
#debug = True
t1 = datetime.datetime.now()
#print "Start", t1
# TODO: Move the location handling code into a different script,
# or call it from the driver file.
##### Run B2_LocationMatch.py
#import B2_LocationMatch
# Geocode needs to run by itself.
print "START: geocode", t1
import geocode
#print " - Loc Merge", "\n -", datetime.datetime.now()-t1
print"DONE: geocode"
print " -", datetime.datetime.now()-t1
# TODO: Refactor assignee statements
### Create copy of assignee table, add column for assigneeAsc
s = SQLite.SQLite(db = 'assignee.sqlite3', tbl = 'assignee_1')
s.conn.create_function("ascit", 1, ascit)
s.conn.create_function("cc", 3, locFunc.cityctry)
def handle_assignee():
#s.attach(database='NBER_asg',name='NBER')
s.attach(db='NBER_asg',name='NBER')
s.c.execute("DROP TABLE IF EXISTS assignee_1")
s.replicate(tableTo = 'assignee_1', table = 'assignee')
#s.addSQL(data='assignee', insert="IGNORE")
s.c.execute("INSERT INTO assignee_1 SELECT * FROM assignee %s" % (debug and "LIMIT 2500" or ""))
s.add('assigneeAsc', 'VARCHAR(30)')
s.c.execute("UPDATE assignee_1 SET assigneeAsc = ascit(assignee);")
s.commit()
#print "DONE: assignee_1 table created in assignee.sqlite3 with new column assigneeAsc", "\n -", datetime.datetime.now()-t1
#s.merge(key=[['AsgNum', 'pdpass']], on=[['assigneeAsc', 'assignee']], keyType=['INTEGER'], tableFrom='main', db='db')
#s.attach(database = 'NBER_asg')
#print "Tables call from script ", s.tables()
s.merge(key=[['AsgNum', 'pdpass']], on=[['assigneeAsc', 'assignee']],
keyType=['INTEGER'], tableFrom='assignee', db='NBER')
#s.merge(key=[['AsgNum', 'pdpass']], on=['assigneeAsc', 'assignee'], keyType=['INTEGER'], tableFrom='assignee', db='NBER')
s.c.execute("UPDATE assignee_1 SET AsgNum=NULL WHERE AsgNum<0")
print"DONE: NBER pdpass added to assignee_1 in column AsgNum", "\n -", datetime.datetime.now()-t1
s.commit()
handle_assignee()
# TODO: Refactor to function
### Run orgClean.py and generate grp
def run_org_clean():
org = orgClean.orgClean(db = 'assignee.sqlite3', fld = 'assigneeAsc', table = 'assignee_1', other = "")
org.disambig()
print"DONE: orgClean"
#print " -", datetime.datetime.now()-t1
# Copy assignee num from grp to assignee table
s.merge(key=[['AsgNum', 'AsgNum2']], on=['AssigneeAsc'], tableFrom='grp')
print "DONE: Replaced Asgnum!", "\n -", datetime.datetime.now()-t1
s.c.execute("""update assignee_1 set City = cc(city, country, 'city'), Country = cc(city, country, 'ctry');""")
s.attach('hashTbl.sqlite3')
s.merge(key=['NCity', 'NState', 'NCountry', 'NZipcode', 'NLat', 'NLong'],
on=['City', 'State', 'Country'],
tableFrom='locMerge', db='db')
s.commit()
print "DONE: Asg Locationize!", "\n -", datetime.datetime.now()-t1
run_org_clean()
s.close()
###########################
### ###
## I N V E N T O R ##
### ###
###########################
def handle_inventor():
## Clean inventor: ascit(Firstname, Lastname, Street)
## Create new table inventor_1 to hold prepped data
i = SQLite.SQLite(db = 'inventor.sqlite3', tbl = 'inventor_1')
i.conn.create_function("ascit", 1, ascit)
i.conn.create_function("cc", 3, locFunc.cityctry)
i.c.execute('drop table if exists inventor_1')
i.replicate(tableTo = 'inventor_1', table = 'inventor')
i.c.execute('insert or ignore into inventor_1 select * from inventor %s' % (debug and "LIMIT 2500" or ""))
i.c.execute("""
UPDATE inventor_1
SET firstname = ascit(firstname),
lastname = ascit(lastname),
street = ascit(street),
City = cc(city, country, 'city'),
Country = cc(city, country, 'ctry');
""")
i.commit()
i.attach('hashTbl.sqlite3')
i.merge(key=['NCity', 'NState', 'NCountry', 'NZipcode', 'NLat', 'NLong'],
on=['City', 'State', 'Country'],
tableFrom='locMerge',
db='db')
i.merge(key=['NCity', 'NState', 'NCountry', 'NZipcode', 'NLat', 'NLong'],
on=['City', 'State', 'Country', 'Zipcode'],
tableFrom='locMerge',
db='db')
i.commit()
i.close()
print "DONE: Inv Locationize!", "\n -", datetime.datetime.now()-t1
handle_inventor()
###########################
### ###
## C L A S S ##
### ###
###########################
# Clean up classes
# see CleanDataSet.py --> classes()
# FIXME: Module importing not allowed in function.
from CleanDataset import *
classes()
print "DONE: Classes!", "\n -", datetime.datetime.now()-t1
###########################
### ###
## P A T E N T ##
### ###
###########################
def handle_patent():
p = SQLite.SQLite(db = 'patent.sqlite3', tbl = 'patent')
p.conn.create_function('dVert', 1, dateVert)
p.c.execute("""update patent set AppDate=dVert(AppDate), GDate=dVert(GDate);""")
p.commit()
p.close()
print "DONE: Patent Date!", "\n -", datetime.datetime.now()-t1
handle_patent()