forked from yuangh-x/2022-NIPS-Tenrec
-
Notifications
You must be signed in to change notification settings - Fork 0
/
splitter.py
192 lines (149 loc) · 7.24 KB
/
splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import numpy as np
from sklearn.model_selection import KFold
class TestSplitter(object):
    """Configuration holder that splits a DataFrame into train/test index arrays.

    The split strategy and ratio are read from ``args``; the column names
    handed to ``split_test`` are fixed to ``'user_id'`` and ``'item_id'``.
    """

    def __init__(self, args):
        """Read ``test_method`` and ``test_size`` from ``args``."""
        # NOTE(review): ``tid`` is what ``split_test`` ranks on for 'tloo';
        # here it names the item column rather than a timestamp - confirm intent.
        self.uid, self.tid = 'user_id', 'item_id'
        self.test_method = args.test_method
        self.test_size = args.test_size

    def split(self, df):
        """Return ``(train_index, test_index)`` for ``df`` as produced by ``split_test``."""
        return split_test(
            df,
            test_method=self.test_method,
            test_size=self.test_size,
            uid=self.uid,
            tid=self.tid,
        )
class ValidationSplitter(object):
    """Configuration holder that splits a training DataFrame into train/validation ids.

    The split strategy and ratio are read from ``args``; the column names
    handed to ``split_validation`` are fixed to ``'user_id'`` and ``'item_id'``.
    ``args.fold_num`` is not read.
    """

    def __init__(self, args):
        """Read ``val_method`` and ``val_size`` from ``args``."""
        self.uid, self.tid = 'user_id', 'item_id'
        self.val_method = args.val_method
        self.val_size = args.val_size

    def split(self, df):
        """Return whatever ``split_validation`` produces for ``df`` with the stored settings."""
        return split_validation(
            df,
            val_method=self.val_method,
            val_size=self.val_size,
            uid=self.uid,
            tid=self.tid,
        )
def split_test(df, test_method='rsbr', test_size=.2, uid='user', tid='timestamp'):
    """
    method of splitting data into training data and test data

    Parameters
    ----------
    df : pd.DataFrame raw data waiting for test set splitting (not modified)
    test_method : str, way to split test set
                    'rsbr': random split by ratio
                    'tsbr': split by ratio in row order; the last rows become the test set
                            (time-aware only if df is already sorted by time)
                    'tloo': leave one out per user, holding out the row ranked first
                            by descending `tid`
                    'rloo': random leave one out per user
                    'ufo': random split by ratio in user level
                    'utfo': split by ratio in user level, holding out the last rows of
                            each user in df order (df must already be sorted by time)
    test_size : float, size of test set
    uid : str, name of the column used to group rows by user
    tid : str, name of the column used to rank rows for 'tloo'

    Returns
    -------
    train_ids : np.array index labels for training dataset
    test_ids : np.array index labels for test dataset

    Raises
    ------
    ValueError if `test_method` is not one of the methods listed above
    """
    all_ids = df.index.values

    if test_method == 'ufo':
        # Sample per user and concatenate the labels directly; users whose
        # sample is empty simply contribute nothing (no NaN placeholders).
        sampled = [grp.sample(frac=test_size).index.values for _, grp in df.groupby(uid)]
        test_ids = np.concatenate(sampled) if sampled else all_ids[:0]
        train_ids = np.setdiff1d(all_ids, test_ids)

    elif test_method == 'utfo':
        # make sure df already been sorted by timestamp
        # Slice each user's own labels positionally so the result is correct
        # even when a user's rows are not contiguous in the index.
        held_out = []
        for _, grp in df.groupby(uid):
            keep_len = int(np.ceil(len(grp) * (1 - test_size)))
            held_out.append(grp.index.values[keep_len:])
        test_ids = np.concatenate(held_out) if held_out else all_ids[:0]
        train_ids = np.setdiff1d(all_ids, test_ids)

    elif test_method == 'tsbr':
        split_idx = int(np.ceil(len(df) * (1 - test_size)))
        # Return index labels (like every other branch), not raw positions.
        train_ids, test_ids = all_ids[:split_idx], all_ids[split_idx:]

    elif test_method == 'rsbr':
        test_ids = np.random.choice(all_ids, size=int(len(df) * test_size), replace=False)
        train_ids = np.setdiff1d(all_ids, test_ids)

    elif test_method == 'tloo':  # utloo
        # Keep the rank in a local Series instead of a temporary column so the
        # caller's DataFrame is never mutated.
        rank_latest = df.groupby([uid])[tid].rank(method='first', ascending=False)
        train_ids = all_ids[(rank_latest > 1).values]
        test_ids = all_ids[(rank_latest == 1).values]

    elif test_method == 'rloo':  # urloo
        chosen = [np.random.choice(grp.index.values) for _, grp in df.groupby(uid)]
        test_ids = np.asarray(chosen, dtype=all_ids.dtype)
        train_ids = np.setdiff1d(all_ids, test_ids)

    else:
        raise ValueError('Invalid data_split value, expect: rloo, rsbr, tloo, tsbr, ufo, utfo')

    return train_ids, test_ids
def split_validation(train_set, val_method='rsbr', val_size=.1, uid='user', tid='timestamp'):
    """
    method of splitting data into training data and validation data.

    The split is computed on a copy of `train_set` whose index has been reset
    to 0..n-1, so the returned ids are row positions in `train_set`.

    Parameters
    ----------
    train_set : pd.DataFrame train set waiting for validation splitting (not modified)
    val_method : str, way to split validation
                    'rsbr': random split by ratio
                    'tsbr': split by ratio in row order; the last rows become the validation
                            set (time-aware only if train_set is already sorted by time)
                    'tloo': leave one out per user, holding out the row ranked first
                            by descending `tid`
                    'rloo': random leave one out per user
                    'ufo': random split by ratio in user level
                    'utfo': split by ratio in user level, holding out the last rows of
                            each user in row order (rows must already be sorted by time)
    val_size : float, the size of validation dataset
    uid : str, name of the column used to group rows by user
    tid : str, name of the column used to rank rows for 'tloo'

    Returns
    -------
    train_ids : np.array row ids for the training part
    val_ids : np.array row ids for the validation part

    Raises
    ------
    ValueError if `val_method` is not one of the methods listed above
    """
    # reset_index returns a new frame, so the caller's DataFrame is untouched.
    train_set = train_set.reset_index(drop=True)
    all_ids = train_set.index.values

    if val_method == 'ufo':
        # Sample per user and concatenate the ids directly; users whose
        # sample is empty simply contribute nothing (no NaN placeholders).
        sampled = [grp.sample(frac=val_size).index.values for _, grp in train_set.groupby(uid)]
        val_ids = np.concatenate(sampled) if sampled else all_ids[:0]
        train_ids = np.setdiff1d(all_ids, val_ids)

    elif val_method == 'utfo':
        # Slice each user's own ids positionally so the result is correct
        # even when a user's rows are not contiguous.
        held_out = []
        for _, grp in train_set.groupby(uid):
            keep_len = int(np.ceil(len(grp) * (1 - val_size)))
            held_out.append(grp.index.values[keep_len:])
        val_ids = np.concatenate(held_out) if held_out else all_ids[:0]
        train_ids = np.setdiff1d(all_ids, val_ids)

    elif val_method == 'rsbr':
        val_ids = np.random.choice(all_ids, size=int(len(train_set) * val_size), replace=False)
        train_ids = np.setdiff1d(all_ids, val_ids)

    elif val_method == 'tsbr':
        split_idx = int(np.ceil(len(train_set) * (1 - val_size)))
        train_ids, val_ids = np.arange(split_idx), np.arange(split_idx, len(train_set))

    elif val_method == 'rloo':
        chosen = [np.random.choice(grp.index.values) for _, grp in train_set.groupby(uid)]
        val_ids = np.asarray(chosen, dtype=all_ids.dtype)
        train_ids = np.setdiff1d(all_ids, val_ids)

    elif val_method == 'tloo':
        # Rank kept in a local Series; no temporary column is needed.
        rank_latest = train_set.groupby([uid])[tid].rank(method='first', ascending=False)
        train_ids = all_ids[(rank_latest > 1).values]
        val_ids = all_ids[(rank_latest == 1).values]

    else:
        # Previously an unknown method fell through to an UnboundLocalError.
        raise ValueError('Invalid val_method value, expect: rloo, rsbr, tloo, tsbr, ufo, utfo')

    return train_ids, val_ids