-
Notifications
You must be signed in to change notification settings - Fork 0
/
reducer_newSimilarity_test.py
163 lines (148 loc) · 6.64 KB
/
reducer_newSimilarity_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python
import sys
import math
from itertools import groupby
from operator import itemgetter
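'''
reducer input (one profile per line; lines with the same bucket_idx are adjacent):
bucket_idx \t profile1
bucket_idx \t profile2
...
Each profile is a comma-separated field list whose last field is the identifier.
reducer output: one line per pair of profiles that share a bucket
identifier1,identifier2 \t similarity_score
NOTE(review): this header previously described a one-line-per-cluster output
(bucket_idx \t identifier1, identifier2, ..._city1, city2, ...) so that three
groups of buckets could be compared by bucket_idx. main() does not print
bucket_idx, so confirm where that comparison happens now.
The pairwise scores can also be used to evaluate how good our LSH is...
'''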
# Field positions (0-based, within a comma-separated profile) that
# getSimilarity leaves out of the Jaccard comparison.
CITY_IND = 5
TIME_IND = 3
HID_IND = 17
# Profile length (field count) recognised by getSimilarity; also used there as
# the upper bound of the compared prefix and as the index of one extra field
# appended in the BEC_ONLY-vs-RB_UNION case.
# NOTE(review): name suggests "request-only" profiles - confirm.
REQ_ONLY = 31
# Profile length recognised by getSimilarity.
# NOTE(review): name suggests "beacon-only" profiles - confirm.
BEC_ONLY = 27
# Number of leading fields compared when a BEC_ONLY-length profile is matched
# against a longer one.
RB_COMMON = 20
# Split point for BEC_ONLY-length profiles: fields before this index go to
# cal_jaccard, fields from this index onward go to cal_cosine.
BEC_CAT = 21
# Same split point for the longer profiles handled by getSimilarity.
RB_UNION_CAT = 32
# Profile length recognised by getSimilarity for the longest profile kind.
RB_UNION = 38
# Positions at which cal_jaccard treats '0' as a real value instead of
# skipping the field.
IS_PREMISE = 18
IS_PREFETCH = 28
# Not referenced anywhere in the visible part of this file.
THRESHOLD = 0.893
def cal_jaccard(record1, record2):
    """Return the fraction of comparable fields on which two records agree.

    The records are compared position by position:

    * A field is skipped when both values are the same missing-value marker
      ("null", "n/a" or "na", case-insensitive).
    * At every position other than IS_PREFETCH and IS_PREMISE, a field is
      also skipped when either value is the string '0'.
    * Every remaining field counts towards the denominator, and towards the
      numerator when the two values are equal.

    NOTE(review): getSimilarity passes lists from which the CITY/TIME/HID
    fields were already removed, so IS_PREMISE / IS_PREFETCH are applied to
    positions in that filtered list - confirm this is the intended indexing.

    Args:
        record1: sequence of string fields.
        record2: sequence of string fields, at least as long as record1.

    Returns:
        matches / compared as a float, or 0.0 when no field is comparable
        (previously this case raised ZeroDivisionError).

    Raises:
        IndexError: if record2 is shorter than record1.
    """
    matches = 0
    compared = 0
    for i, left in enumerate(record1):
        right = record2[i]
        left_lower = left.lower()
        # Both sides carry the same "missing" marker: nothing to compare.
        if left_lower == right.lower() and left_lower in ("null", "n/a", "na"):
            continue
        # Outside the two flag positions a '0' on either side means "no value".
        if i != IS_PREFETCH and i != IS_PREMISE and (left == '0' or right == '0'):
            continue
        compared += 1
        if left == right:
            matches += 1
    if compared == 0:
        # Mirrors cal_cosine, which also returns 0.0 for a zero denominator.
        return 0.0
    return float(matches) / compared
def cal_cosine(record1, record2):
    """Return the cosine similarity of two records of numeric strings.

    Every element of both records is converted with float(). The result is
    dot(record1, record2) / (|record1| * |record2|), or 0.0 when that
    denominator is zero (for example when either record is all zeros or
    record1 is empty).

    Raises:
        ValueError: if an element cannot be converted by float().
        IndexError: if record2 is shorter than record1.
    """
    dot = 0.0
    squares1 = 0.0
    squares2 = 0.0
    for idx, raw in enumerate(record1):
        a = float(raw)
        b = float(record2[idx])
        dot += a * b
        squares1 += a * a
        squares2 += b * b
    magnitude = math.sqrt(squares1) * math.sqrt(squares2)
    if magnitude == 0:
        return 0.0
    return dot / magnitude
def getSimilarity(profile1, profile2):
    """Score the similarity of two comma-separated profiles.

    Both profiles are split on ',' and ordered so the one with fewer fields
    comes first. The pair of field counts then selects the formula:

    * equal counts, REQ_ONLY: Jaccard over the first REQ_ONLY fields.
    * equal counts, BEC_ONLY: 0.75 * Jaccard over the first BEC_CAT fields
      + 0.25 * cosine over the fields from BEC_CAT onward.
    * equal counts, anything else: 0.8 * Jaccard over the first RB_UNION_CAT
      fields + 0.2 * cosine over the fields from RB_UNION_CAT onward.
    * BEC_ONLY vs REQ_ONLY: Jaccard over the first RB_COMMON fields.
    * BEC_ONLY vs RB_UNION: 0.75 * Jaccard + 0.25 * cosine; the longer
      profile contributes its first RB_COMMON fields plus field REQ_ONLY,
      and its cosine part starts at RB_UNION_CAT.
    * REQ_ONLY vs RB_UNION: Jaccard over the first REQ_ONLY fields.
    * any other pair of counts: 0.0.

    The CITY_IND, TIME_IND and HID_IND positions are always left out of the
    Jaccard part.
    """
    def pick(fields, limit):
        # fields[0:limit] without the city, time and HID positions.
        return [fields[pos] for pos in range(limit)
                if pos != CITY_IND and pos != TIME_IND and pos != HID_IND]

    shorter = profile1.split(',')
    longer = profile2.split(',')
    if len(shorter) > len(longer):
        # Keep the profile with fewer fields on the left.
        shorter, longer = longer, shorter
    n_short = len(shorter)
    n_long = len(longer)

    if n_short == n_long:
        if n_short == REQ_ONLY:
            left = pick(shorter, REQ_ONLY)
            right = pick(longer, REQ_ONLY)
            return cal_jaccard(left, right)
        if n_short == BEC_ONLY:
            left = pick(shorter, BEC_CAT)
            right = pick(longer, BEC_CAT)
            left_tail = shorter[BEC_CAT:]
            right_tail = longer[BEC_CAT:]
            return 0.75 * cal_jaccard(left, right) + 0.25 * cal_cosine(left_tail, right_tail)
        left = pick(shorter, RB_UNION_CAT)
        right = pick(longer, RB_UNION_CAT)
        left_tail = shorter[RB_UNION_CAT:]
        right_tail = longer[RB_UNION_CAT:]
        return 0.8 * cal_jaccard(left, right) + 0.2 * cal_cosine(left_tail, right_tail)

    if n_short == BEC_ONLY and n_long == REQ_ONLY:
        left = pick(shorter, RB_COMMON)
        right = pick(longer, RB_COMMON)
        return cal_jaccard(left, right)
    if n_short == BEC_ONLY and n_long == RB_UNION:
        left = pick(shorter, BEC_CAT)
        right = pick(longer, RB_COMMON)
        # The longer profile supplies one extra field to line up with `left`.
        right.append(longer[REQ_ONLY])
        left_tail = shorter[BEC_CAT:]
        right_tail = longer[RB_UNION_CAT:]
        return 0.75 * cal_jaccard(left, right) + 0.25 * cal_cosine(left_tail, right_tail)
    if n_short == REQ_ONLY and n_long == RB_UNION:
        left = pick(shorter, REQ_ONLY)
        right = pick(longer, REQ_ONLY)
        return cal_jaccard(left, right)
    # No rule for this pair of field counts.
    return 0.0
def _emit_pairwise_scores(features, separator):
    """Print the similarity of every unordered pair of records in one bucket.

    Each record is a comma-separated string whose last field is the
    identifier; everything before the last comma is the profile handed to
    getSimilarity. One line ``id1,id2<separator>score`` is printed per pair,
    in the order (0,1), (0,2), ..., (1,2), ...
    """
    # Split each record once up front instead of inside the O(n^2) pair loop.
    parsed = [record.rpartition(',') for record in features]
    for x, (profile1, _, x_id) in enumerate(parsed):
        for profile2, _, y_id in parsed[x + 1:]:
            score = getSimilarity(profile1, profile2)
            print("%s,%s%s%s" % (x_id, y_id, separator, str(score)))


def main(separator='\t'):
    """Read ``key<TAB>value`` lines from stdin and print pairwise scores.

    Consecutive lines with the same key form one bucket. When the key
    changes, and again at end of input, every pair of values in the finished
    bucket is scored with getSimilarity and printed as
    ``id1,id2<separator>score``. Blank lines are ignored.

    Args:
        separator: string placed between the id pair and the score in the
            output. Input lines are always split on a tab.

    Raises:
        ValueError: if a non-blank input line contains no tab after
            stripping surrounding whitespace.
    """
    last_key = None
    running_features = []
    for input_line in sys.stdin:
        input_line = input_line.strip()
        if not input_line:
            continue
        this_key, value = input_line.split("\t", 1)
        if this_key != last_key:
            # Key changed: the previous bucket is complete, so score it.
            # (On the very first line the bucket is empty and nothing prints.)
            _emit_pairwise_scores(running_features, separator)
            running_features = []
            last_key = this_key
        running_features.append(value)
    # Flush the last bucket; a no-op when stdin had no usable lines.
    _emit_pairwise_scores(running_features, separator)
# Run the reducer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()