-
Notifications
You must be signed in to change notification settings - Fork 26
/
match_within.py
217 lines (160 loc) · 6.24 KB
/
match_within.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""
Given two lists of words, return the elements from the
first list that are in the second list.
"""
import random
import string
import pandas as pd
from typing import List, Tuple, Set, Dict
from utils.profiler import time_this, timed_report
from utils.profiler import ExponentialRange
def random_words(n: int) -> List[str]:
characters = string.ascii_uppercase + string.digits
big_list = random.choices(characters, k=n*7)
return [''.join(big_list[i:(i+7)]) for i in range(n)]
def random_characters(n: int) -> List[str]:
characters = string.ascii_uppercase + string.digits
return random.choices(characters, k=n)
@time_this(lambda *args, **kwargs: 2*len(args[0]))
def very_slow_match_within(first_list: List[str],
second_list: List[str]) -> Set[str]:
"""
This algorithm is O(nm) for n total words in the first
list and m total words in the second list
"""
words = set()
for word in first_list:
if word in second_list:
words.add(word)
return words
@time_this(lambda *args, **kwargs: 2*len(args[0]))
def slow_match_within(first_list: List[str],
second_list: List[str]) -> List[str]:
"""
This algorithm is O(nm) for n unique words in the first
list and m total words in the second list
"""
# Get the unique words
unique_words = set(first_list)
words = list()
for word in unique_words:
if word in second_list:
words.append(word)
return words
@time_this(lambda *args, **kwargs: 2*len(args[0]))
def fast_match_within(first_list: List[str],
second_list: List[str]) -> List[str]:
"""
This algorithm is O(n + m) for n unique words in the
first list and m total words in the second list
"""
# Get the unique words
first_list_unique_words = set(first_list)
second_list_unique_words = set(second_list)
words = list()
for word in first_list_unique_words:
if word in second_list_unique_words:
words.append(word)
return words
@time_this(lambda *args, **kwargs: 2*len(args[0]))
def fast_intersection(first_list: List[str],
second_list: List[str]) -> Set[str]:
"""
This algorithm is O(n + m) for n words in the first
list and m words in the second list.
This problem boils down to a set intersection.
"""
return set(first_list) & set(second_list)
@time_this(lambda *args, **kwargs: 2*len(args[0]))
def pd_naive_index_match(first_list: List[str],
second_list: List[str]) -> List[str]:
"""
This algorithm is O(n + m) for n words in the first
list and m words in the second list
This is to show that pandas indexes are dict-like
even when the elements are not unique
"""
# Create a pandas index of of the list as-is
index: pd.Index = pd.Index(first_list)
words: List[str] = []
for word in set(second_list):
if word in index:
words.append(word)
return words
@time_this(lambda *args, **kwargs: 2*len(args[0]))
def pd_unique_index_match(first_list: List[str],
second_list: List[str]) -> List[str]:
"""
This algorithm is O(n + m) for n words in the first
list and m words in the second list
This is to show that pandas indexes are dict-like,
and performance increases for unique elements
"""
# Create a pandas index from unique elements
index: pd.Index = pd.Index(set(first_list))
words: List[str] = []
for word in set(second_list):
if word in index:
words.append(word)
return words
@time_this(lambda *args, **kwargs: 2*len(args[0]))
def pd_native_index_match(first_list: List[str],
second_list: List[str]) -> pd.Index:
"""
This algorithm is O(n + m) for n words in the first
list and m words in the second list
This is to show that pandas indexes are dict-like
"""
# Create a pandas index from unique elements
first_index = pd.Index(set(first_list))
second_index = pd.Index(set(second_list))
return first_index.intersection(second_index)
if __name__ == '__main__':
# words_a = ['A', 'B', 'C', 'D']
# words_b = ['B', 'C', 'D', 'E']
# print(very_slow_match_within(words_a, words_b))
# print(slow_match_within(words_a, words_b))
# print(fast_match_within(words_a, words_b))
# print(fast_intersection(words_a, words_b))
# print(pd_unique_index_match(words_a, words_b))
# print(pd_native_index_match(words_a, words_b))
exp_range = ExponentialRange(0, 7, 1/4)
words_a = random_words(exp_range.max)
words_b = random_words(exp_range.max)
with timed_report():
for i in exp_range.iterator(4):
very_slow_match_within(words_a[:i], words_b[:i])
for i in exp_range.iterator(4):
slow_match_within(words_a[:i], words_b[:i])
for i in exp_range.iterator():
fast_match_within(words_a[:i], words_b[:i])
for i in exp_range.iterator():
fast_intersection(words_a[:i], words_b[:i])
for i in exp_range.iterator():
pd_naive_index_match(words_a[:i], words_b[:i])
for i in exp_range.iterator():
pd_unique_index_match(words_a[:i], words_b[:i])
for i in exp_range.iterator():
pd_native_index_match(words_a[:i], words_b[:i])
print('words_a', len(words_a), len(set(words_a)))
print('words_a', len(words_b), len(set(words_b)))
exp_range = ExponentialRange(0, 8, 1/4)
words_a = random_characters(exp_range.max)
words_b = random_characters(exp_range.max)
with timed_report():
for i in exp_range.iterator(5):
very_slow_match_within(words_a[:i], words_b[:i])
for i in exp_range.iterator():
slow_match_within(words_a[:i], words_b[:i])
for i in exp_range.iterator():
fast_match_within(words_a[:i], words_b[:i])
for i in exp_range.iterator():
fast_intersection(words_a[:i], words_b[:i])
for i in exp_range.iterator():
pd_naive_index_match(words_a[:i], words_b[:i])
for i in exp_range.iterator():
pd_unique_index_match(words_a[:i], words_b[:i])
for i in exp_range.iterator():
pd_native_index_match(words_a[:i], words_b[:i])
print('words_a', len(words_a), len(set(words_a)))
print('words_a', len(words_b), len(set(words_b)))