-
Notifications
You must be signed in to change notification settings - Fork 1
/
SetJoin.h
145 lines (119 loc) · 3.12 KB
/
SetJoin.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#ifndef _SETJOIN_H_
#define _SETJOIN_H_
#include <iostream>
#include <fstream>
#include <vector>
#include <unordered_map>
#include <unordered_set>
#include <string>
#include <algorithm>
#include <string.h>
#include <queue>
#include <inttypes.h>
#include <sys/time.h>
#include <cmath>
#include <cstdio>
using namespace std;
#define google_unordered_map unordered_map
#define google_unordered_set unordered_set
#define PACK(x, y) ((x << 32) + y)
#define PRIME 2017
#define EPS 1e-5
#define NEG -1
#define INF 100000000
#define MAX_LINE_LENGTH 1000000
#define CACHE_SIZE 5
extern vector<pair<int, int>> cacheVec;
extern vector<vector<pair<int, int>>> indexVecs;
class SimPairx
{
public:
int id1;
int id2;
double sim;
SimPairx() {}
SimPairx(int i1, int i2, double s)
{
id1 = i1;
id2 = i2;
sim = s;
}
bool operator<(const SimPairx& rhs) const
{
return this->sim > rhs.sim;
}
};
class SetJoin
{
public:
int same, diff;
double det;
uint64_t resultNum = 0;
uint64_t candidateNum = 0;
uint64_t lengthSum = 0;
uint64_t listlens = 0;
int prime_exp[MAX_LINE_LENGTH];
vector<vector<int>> dataset;
vector<double> &wordwt;
vector<double> &recordwt;
int maxlimit;
bool has_limit;
vector<pair<int, int>> result_pairs;
priority_queue<SimPairx> result_pairs_;
void get_results()
{
while(!result_pairs_.empty())
{
result_pairs.emplace_back(result_pairs_.top().id1, result_pairs_.top().id2);
result_pairs_.pop();
}
}
struct invertedList {
int vec_no, cnt;
pair<int, int> cache[CACHE_SIZE];
vector<pair<int, int>>& getVector() const {
if (cnt <= CACHE_SIZE) {
cacheVec.assign(cache, cache + cnt);
return cacheVec;
} else
return indexVecs[vec_no];
}
void add(pair<int, int> data) {
if (cnt < CACHE_SIZE) cache[cnt++] = data;
else {
if (CACHE_SIZE == cnt) {
indexVecs.push_back(vector<pair<int, int>>());
vec_no = indexVecs.size() - 1;
indexVecs[vec_no].assign(cache, cache + CACHE_SIZE);
}
++cnt;
indexVecs[vec_no].push_back(data);
}
}
};
struct invIndexStruct {
int list_no;
int* oneList;
//vector<int> oneList;
invIndexStruct() : list_no(0) {}
};
vector<invertedList> indexLists;
SetJoin(vector<vector<int>> &sorted_records, vector<double> &ww, vector<double> &rw, int ml, bool islimit)
: wordwt(ww), recordwt(rw)
{
dataset = sorted_records;
maxlimit = ml;
has_limit = islimit;
indexVecs.clear();
cacheVec.clear();
cacheVec.resize(CACHE_SIZE);
}
~SetJoin()
{
cacheVec.clear();
indexVecs.clear();
}
bool overlap(int x, int y, int posx = 0, int posy = 0, int current_overlap = 0);
void setjoin(double threshold);
};
#endif