-
Notifications
You must be signed in to change notification settings - Fork 0
/
combineLaptops.py
319 lines (284 loc) · 10.8 KB
/
combineLaptops.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
import csv
import threading
import pandas as pd
from fuzzywuzzy import fuzz
import pymysql
conn = pymysql.connect(host='localhost', user='root', db='productdata')
conn.set_charset('utf8')
cursor = conn.cursor()
sql = 'SELECT * from `laptops`;'
cursor.execute(sql)
countrow = cursor.execute(sql)
print(countrow)
# The combined table of laptop is extracted and placed into a pandas dataframe
combinedData = pd.read_sql("SELECT * FROM laptops", con= conn)
# Each product is sorted by vendor and stored into appropriate dataframe
bestBuy = pd.read_sql("SELECT * FROM laptops WHERE VENDOR='bestbuy'", con= conn)
amazon = pd.read_sql("SELECT * FROM laptops WHERE VENDOR='amazon'", con= conn)
newEgg = pd.read_sql("SELECT * FROM laptops WHERE VENDOR='newegg'", con= conn)
flipkart = pd.read_sql("SELECT * FROM laptops WHERE VENDOR='flipkart'", con= conn)
source = pd.read_sql("SELECT * FROM laptops WHERE VENDOR='source'", con= conn)
staples = pd.read_sql("SELECT * FROM laptops WHERE VENDOR='staples'", con= conn)
walmart = pd.read_sql("SELECT * FROM laptops WHERE VENDOR='walmart'", con= conn)
# The price correction method removes any string literals in the price such as '$' ',' and outputs only the price
def priceCorrection(dataset):
newPriceBestBuy= []
for price in dataset["Price"]:
try:
aInt = int(filter(str.isdigit, price))
except:
print "wrong price"
aInt = 0
if ("." in str(price)):
newPriceBestBuy.append(aInt * 0.01)
else:
newPriceBestBuy.append(aInt)
dataset.drop('Price', axis=1, inplace= True)
dataset.insert(4, "Price", newPriceBestBuy)
priceCorrection(bestBuy)
priceCorrection(amazon)
priceCorrection(source)
priceCorrection(newEgg)
priceCorrection(staples)
priceCorrection(walmart)
dollars = []
dollars = []
# The price of the Indian items is converted to Canadian Dollars
for price in flipkart["Price"]:
aInt = int(filter(str.isdigit, price))
dollars.append(round(aInt * 0.019,2))
flipkart.drop('Price', axis=1, inplace= True)
flipkart.insert(4, "Price", dollars)
print flipkart.head()
# All datasets are combined into a master dataframe
combinedDataset = [bestBuy, amazon, source, newEgg, staples, walmart, flipkart]
# To make matching easier some of the key brands are identified
brands = []
brands.append("acer")
brands.append("asus")
brands.append("dell")
brands.append("hp")
brands.append("lenovo")
brands.append("microsoft")
brands.append("toshiba")
brands.append("apple")
brands.append("other")
# The brands are identified and a new column is added to the dataframe
def identifyBrand(brands, dataset):
company = []
notFound = True
counter = 0
for name in dataset["Name"]:
counter += 1
for brand in brands:
if brand in str(name).lower():
company.append(brand)
notFound = False
break
if notFound:
company.append("other")
notFound = True
dataset.insert(5, "Company", company)
identifyBrand(brands, bestBuy)
identifyBrand(brands, amazon)
identifyBrand(brands, newEgg)
identifyBrand(brands, source)
identifyBrand(brands, flipkart)
identifyBrand(brands, staples)
identifyBrand(brands, walmart)
# groupByBrand accepts:
# combined Dataset
# combination of all the various vendors
# d
# the dictionary of the specific Brand
# companyNames
# A list containing all the vendor Names
# brandName
# The current brandName to group
def groupByBrand(combinedDataset , d, companyNames, brandName):
i = 0
# For each dataset in the combined dataset
# The dataset is grouped by the Company
# A dynamic variable is set and used as the key for the dictionary to store the dataframe
#bestBuyData or amazonData etc
for dataset in combinedDataset:
name = dataset.groupby("Company")
try:
name.get_group(name=brandName)
d["{0}Data".format(companyNames[i])] = pd.DataFrame
print brandName + str(len(name.get_group(name=brandName)))
d["{0}Data".format(companyNames[i])] = name.get_group(name=brandName)
print d["{0}Data".format(companyNames[i])].head()
print d["{0}Data".format(companyNames[i])].shape
i += 1
except KeyError as e:
i+=1
continue
# Dictionaries are Initialized to hold each of the dataframes
dAcer = {}
dAsus = {}
dDell = {}
dHp = {}
dLenovo = {}
dMicrosoft = {}
dToshiba = {}
dApple = {}
dOther = {}
combinedBrandDataset = [dAcer, dAsus, dDell, dHp, dLenovo, dMicrosoft, dToshiba, dApple, dOther]
companyNames = []
companyNames.append("bestbuy")
companyNames.append("amazon")
companyNames.append("source")
companyNames.append("newEgg")
companyNames.append("staples")
companyNames.append("walmart")
companyNames.append("flipkart")
groupByBrand(combinedDataset, dAcer, companyNames, "acer")
groupByBrand(combinedDataset, dAsus, companyNames, "asus")
groupByBrand(combinedDataset, dDell, companyNames, "dell")
groupByBrand(combinedDataset, dHp, companyNames, "hp")
groupByBrand(combinedDataset, dLenovo, companyNames, "lenovo")
groupByBrand(combinedDataset, dMicrosoft, companyNames, "microsoft")
groupByBrand(combinedDataset, dToshiba, companyNames, "toshiba")
groupByBrand(combinedDataset, dApple, companyNames, "apple")
groupByBrand(combinedDataset, dOther, companyNames, "other")
#An array is initialized to hold all the threads
threads = []
# The check equality checks the equality of two products utilizing an external library called fuzzy wuzzy
# A similarity percentage is extracted
# Prices are adjusted
# If the similarity is greater than 85 and the price difference is less than 250 it is declared as a match
def checkEquality (name1, name2, price1,price2, countSame,vendor):
similarity = fuzz.token_set_ratio(name1, name2)
if (vendor != "flipkart"):
try:
modPrice = int(filter(str.isdigit, price2))
except:
print "wrong price"
modPrice = 0
if ("." in str(price2)):
modPrice = (modPrice * 0.01)
else:
modPrice = int(filter(str.isdigit, price2))
modPrice = (round(modPrice * 0.019, 2))
if (similarity > 85):
if (abs(modPrice - price1) < 250.0):
return True
else:
return False
else:
return False
# A list to hold all the masters
combinedMaster = []
# The comparePlain is iterated through
# The name of the product in the master is extracted
# The product is checked with the current product for equality
# If match is found then product is appended to the ID
# Else process repeats
def comparePlain (combinedPlain, master):
count = 0
found = False
for dataset in combinedPlain:
for i, row in dataset.iterrows():
for modelNumber in master.keys():
count += 1
productID = master[modelNumber][0]
rowName = combinedData[combinedData.ID == productID].Name.item()
rowPrice = combinedData[combinedData.ID == productID].Price.item()
rowVendor = combinedData[combinedData.ID == productID].Vendor.item()
if (checkEquality(rowName, row["Name"], row["Price"], rowPrice, count, rowVendor)):
productDetails = master[modelNumber]
productDetails.append(row["ID"])
master[modelNumber] = productDetails
found = True
break
if found == False:
print count
print "Entered"
productDetails = []
productDetails.append(row["ID"])
master[row["Name"]] = productDetails
found = False
combinedMaster.append(master)
# Each brand is iterated though
# A master dictionary is initialized to hold all the resulting combination of products
# Key: Model Number or Name
# Value: List of ID's
# Each dataset from each vendor is taken from the dBrand dictionary
# The dataset is broken into two, one without modelNumbers and one with Model Numbers
# The modelNumber dataframe is iterated through
# If model number exists in the master its appended to the ID list
# Else a new key is made and stored
# A new thread is started on the comparePlain method passing in the plain dataframe and the master
for dBrand in combinedBrandDataset:
combinedPlain = []
master = {}
for data in dBrand:
d = dBrand[data]
dPlain = d[d["ModelNumber"] == "none"]
dModelNumber = d[d["ModelNumber"] != "none"]
combinedPlain.append(dPlain)
for i, row in dModelNumber.iterrows():
if row['ModelNumber'].strip() in master:
productDetails = master[row['ModelNumber'].strip()]
productDetails.append(row["ID"])
else:
productDetails = []
productDetails.append(row["ID"])
master[row["ModelNumber"].strip()] = productDetails
t1 = threading.Thread(target=comparePlain, args=(combinedPlain,master))
t1.start()
threads.append(t1)
# Program waits for all threads to finish execution
for thread in threads:
thread.join()
laptopMaster = {}
# For each master Diction in the combined Master
# If more than one product is matched then the rating is set as 2
# The data is inserted into the laptopCombination Database
for masterDict in combinedMaster:
IDMapping = {}
for key in masterDict:
productIDs = masterDict[key]
rating = 2
if (len(productIDs) > 2):
rating = 1
try:
cursor.execute(
"INSERT into laptopcombination(Name,Rating) VALUES ('%s', '%s')" % \
(key, rating))
except:
print "error while inserting into laptopsCombo"
continue
IDMapping[key] = cursor.lastrowid
conn.commit()
# For all the IDs in the master dictionary
# The Foreign key is set to the updated value
for key in masterDict:
try:
insertedId = IDMapping[key]
productIDs = masterDict[key]
for currid in productIDs:
try:
cursor.execute("UPDATE laptops SET laptopMapping=%d WHERE ID=%d" % (insertedId, currid))
except:
print "error while inserting into laptops"
continue
except:
print "wrong key"
continue
conn.commit()
conn.close()
# print len(laptopMaster)
# count = 0
# for i in laptopMaster:
# list = laptopMaster[i]
# if (len(list) > 2):
# count +=1
#
# print count
#
# with open('dictLaptops.csv', 'wb') as csv_file:
# writer = csv.writer(csv_file)
# for key, value in laptopMaster.items():
# writer.writerow([key, value])