# Copyright 2021 ETH Zurich, Media Technology Center
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module is mainly used to transform the data from the partners into our desired format.
In the end, only load_data and get_metadata are used by the algorithms.
"""
import datetime
import os

import pandas as pd
def load_data(folder, input_path='user_item', cut=40, high_cut=1000000, seed=None):
    """
    Loads the train, test and validation sets from the folder, restricts them to users with at least "cut"
    and at most "high_cut" read articles, and returns the sets. The format of the sets is a pd.Series with
    the UserID as index and a list of ArticleIDs as value.
    :param folder: {folder}/{input_path} is the path to look for the *_train.pkl files
    :param input_path: see :param folder:
    :param cut: users with fewer than "cut" read articles are dropped
    :param high_cut: users with more than "high_cut" read articles are dropped
    :param seed: unused, kept for interface compatibility
    :return: three pd.Series (train, test, validation). The index of each series is the UserID,
        the value is a list of ArticleIDs. (Look at create_split to see how the split is defined.)
    """
    # The train set holds only the first 70% of each user's reads (see create_split),
    # so the thresholds on the total read count are scaled by 0.7 here.
    user_item_train = pd.read_pickle(f'{folder}/{input_path}_train.pkl')
    user_item_test = pd.read_pickle(f'{folder}/{input_path}_test.pkl')
    user_item_validation = pd.read_pickle(f'{folder}/{input_path}_validation.pkl')
    user_item_train = user_item_train[user_item_train.str.len() > cut * 0.7]
    user_item_train = user_item_train[user_item_train.str.len() < high_cut * 0.7]
    user_item_test = user_item_test.loc[user_item_train.index]
    user_item_validation = user_item_validation.loc[user_item_train.index]
    return user_item_train, user_item_test, user_item_validation
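
# A minimal usage sketch (assumes the *_train.pkl files were created by create_split below):
# train, test, validation = load_data('processed', cut=40, high_cut=1000)
# train.head()  # index: UserID, value: chronologically sorted list of ArticleIDs
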
def load_data_vertical(folder, input_path='user_item_vertical', cut=40):
    """
    Loads the train, test and validation sets in vertical format from the folder, restricts them to users
    with more than "cut" read articles, and returns the sets. Each set is a pd.DataFrame with one row per
    click, containing at least the columns "user_ix", "article_id" and "count".
    :param folder: {folder}/{input_path} is the path to look for the *_train.pq files
    :param input_path: see :param folder:
    :param cut: users with fewer than "cut" read articles are dropped
    :return: three pd.DataFrames (train, test, validation) with one row per click and an added
        "resource_id" column mirroring "article_id".
    (Look at create_split_vertical to see how the split is defined.)
    """
    # "cut" cuts off users that read fewer than "cut" articles
    user_item_train = pd.read_parquet(f'{folder}/{input_path}_train.pq')
    user_item_test = pd.read_parquet(f'{folder}/{input_path}_test.pq')
    user_item_validation = pd.read_parquet(f'{folder}/{input_path}_validation.pq')
    user_item_train = user_item_train[user_item_train['count'] > cut]
    user_item_test = user_item_test[user_item_test['count'] > cut]
    user_item_validation = user_item_validation[user_item_validation['count'] > cut]
    # the algorithms expect the article column to be called "resource_id" (see get_metadata)
    user_item_train['resource_id'] = user_item_train['article_id']
    user_item_test['resource_id'] = user_item_test['article_id']
    user_item_validation['resource_id'] = user_item_validation['article_id']
return user_item_train, user_item_test, user_item_validation
def load_data_cv(folder, input_path='user_item', cut=40, high_cut=100000, seed=1):
    """
    Same as load_data, but returns only a random 80% sample of the training set
    (the test set is shuffled with the same seed).
    """
    user_item_train, user_item_test, user_item_validation = load_data(folder, input_path=input_path,
                                                                      cut=cut, high_cut=high_cut)
    user_item_train = user_item_train.sample(frac=0.8, random_state=seed)
    user_item_test = user_item_test.sample(frac=1, random_state=seed)
return user_item_train, user_item_test, user_item_validation
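
# A minimal cross-validation sketch, assuming one varies "seed" to draw different 80% training samples:
# for seed in range(1, 6):
#     train, test, validation = load_data_cv('processed', cut=40, seed=seed)
#     # ... fit a model on `train` and evaluate it on `test` here ...
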
def load_data_vertical_cv(folder, input_path='user_item_vertical', cut=40, high_cut=100000, seed=1):
    """
    Same as load_data_vertical, but returns only a random 80% sample of the training set
    (the test set is shuffled with the same seed). "high_cut" is accepted for interface
    compatibility but unused here.
    """
    user_item_train, user_item_test, user_item_validation = load_data_vertical(folder,
                                                                               input_path=input_path, cut=cut)
    user_item_train = user_item_train.sample(frac=0.8, random_state=seed)
    user_item_test = user_item_test.sample(frac=1, random_state=seed)
    return user_item_train, user_item_test, user_item_validation
def get_metadata(folder, usecols=None):
    """
    Loads and returns the article metadata.
    The algorithms expect the format to be a DataFrame with two columns:
    - "resource_id": unique id for the article
    - "text": full text of the article (without HTML tags)
    """
    if not usecols:
        usecols = ['text', 'resource_id']
    metadata = pd.read_csv(f"{folder}/meta.csv", usecols=usecols)
    return metadata.dropna(subset=['text'])
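
# A minimal usage sketch; the 'title' column below is hypothetical and only illustrates the usecols override:
# metadata = get_metadata('processed')  # default columns: "resource_id" and "text"
# metadata = get_metadata('processed', usecols=['resource_id', 'text', 'title'])
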
def transform_item_matrix_to_horizontal_format(folder, output_path='user_item_matrix.pkl',
                                               input_path='user_item_matrix_vertical.pq', sortby='ts'):
    """
    Transforms a vertical user-item matrix, where each row is one click, into a horizontal user-item
    matrix, where there is one row per user and each row contains the (sorted) list of articles she/he
    clicked on.
    :param folder: input folder
    :param output_path: filename/path for the output file
    :param input_path: filename/path for the input file. This parquet file contains a DataFrame with three
        columns: "user_ix" (the UserID), "article_id" (the ArticleID) and "<sortby>" (the timestamp to sort
        by). Each UserID-ArticleID pair indicates a click of the user on the article at that time.
    :param sortby: column name of the timestamp column to sort by
    The result, a pd.Series with the UserID as index and the timestamp-sorted list of clicked ArticleIDs
    as value, is pickled to {folder}/{output_path}; nothing is returned.
    """
now = datetime.datetime.now()
matrices = pd.read_parquet(f"{folder}/{input_path}")
grouped = matrices.sort_values(sortby).groupby(['user_ix']).apply(lambda x: list(x['article_id']))
grouped.to_pickle(f"{folder}/{output_path}")
print(f"Data transformed {datetime.datetime.now() - now}")
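
# A worked example of the transformation (illustrative UserIDs/ArticleIDs):
# vertical input (one row per click):      horizontal output (one row per user):
#   user_ix  article_id  ts                  user_ix
#   u1       a2           1       ->         u1         [a2, a7]
#   u1       a7           2                  u2         [a5]
#   u2       a5           3
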
def create_split(folder, input_path='user_item_matrix.pkl', output_path='user_item', cut_dump=10):
    """
    Loads the horizontal user-item data from the folder and creates a user-wise 70% train, 20% test,
    10% validation split. This means that for each user, the first 70% of read articles go into the train
    set, the next 20% into the test set and the last 10% into the validation set. Users with fewer than
    "cut_dump" clicked articles are removed.
    This is the data that is loaded to train/test the models in the end.
    """
    now = datetime.datetime.now()
    user_item = pd.read_pickle(f"{folder}/{input_path}")
    user_item = user_item[user_item.str.len() > cut_dump]
    # user-wise chronological split: first 70% train, next 20% test, last 10% validation
    user_item_train = user_item.apply(lambda x: x[:int(len(x) * 0.7)])
    user_item_test = user_item.apply(lambda x: x[int(len(x) * 0.7):int(len(x) * 0.9)])
    user_item_validation = user_item.apply(lambda x: x[int(len(x) * 0.9):])
    user_item_train.name = 'article_id'
    user_item_test.name = 'article_id'
    user_item_validation.name = 'article_id'
    user_item_train.to_pickle(f'{folder}/{output_path}_train.pkl')
    user_item_test.to_pickle(f'{folder}/{output_path}_test.pkl')
    user_item_validation.to_pickle(f'{folder}/{output_path}_validation.pkl')
    print(f"Split created {datetime.datetime.now() - now}")
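
# For example, a user with 10 chronologically sorted reads is split into
# train = reads[0:7], test = reads[7:9] and validation = reads[9:10].
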
def create_split_vertical(folder, input_path='user_item_matrix_vertical.pq', output_path='user_item_vertical',
                          cut_dump=10, time_column='ts'):
    """
    Loads the vertical user-item data from the folder and creates a user-wise 70% train, 20% test,
    10% validation split. This means that for each user, the first 70% of read articles go into the train
    set, the next 20% into the test set and the last 10% into the validation set. Users with fewer than
    "cut_dump" clicked articles are removed.
    This is the data that is loaded to train/test the models in the end.
    """
    now = datetime.datetime.now()
    user_item = pd.read_parquet(f"{folder}/{input_path}").sort_values(time_column)
    user_item['count'] = user_item.groupby(['user_ix']).article_id.transform('count')
    user_item = user_item[user_item['count'] > cut_dump]
    grouped = user_item.groupby(['user_ix'])
    # the percentile is the rank of a click within the user's chronologically sorted history, in (0, 1]
    user_item['percentile'] = (grouped.article_id.cumcount() + 1) / grouped.article_id.transform('count')
    user_item_train = user_item[user_item['percentile'] <= 0.7]
    # "<= 0.9" (rather than "< 0.9") ensures clicks at exactly the 90th percentile are not dropped
    user_item_test = user_item[(user_item['percentile'] > 0.7) & (user_item['percentile'] <= 0.9)]
    user_item_validation = user_item[user_item['percentile'] > 0.9]
    user_item_train.to_parquet(f'{folder}/{output_path}_train.pq')
    user_item_test.to_parquet(f'{folder}/{output_path}_test.pq')
    user_item_validation.to_parquet(f'{folder}/{output_path}_validation.pq')
    print(f"Split created {datetime.datetime.now() - now}")
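
# For example, a user with 10 clicks gets percentiles 0.1, 0.2, ..., 1.0: the first 7 clicks
# (percentile <= 0.7) land in train, clicks 8 and 9 (0.7 < percentile <= 0.9) in test,
# and click 10 in validation.
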
def transform_horizontal_to_vertical(df):
    """
    Transforms the horizontal format into the vertical format.
    :param df: pd.Series with the UserID as index and a list of ArticleIDs as value
    :return: pd.DataFrame with one row per (UserID, ArticleID) pair
    """
    return df.explode().reset_index()
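
# For example, a horizontal pd.Series({'u1': ['a2', 'a7']}, name='article_id') explodes into the
# vertical rows (u1, a2) and (u1, a7), with the UserID restored as a regular column.
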
if __name__ == "__main__":
    folder = os.getenv('DATA_FOLDER', 'processed')
    # Transforms the user-item matrix into a user series: for each user we store the read articles as one
    # sorted list and save the new format.
    # This format is more convenient for creating the split and for training some of the algorithms.
    transform_item_matrix_to_horizontal_format(folder=folder)
    # Create a 70%/20%/10% train/test/validation split and save it.
    create_split(folder=folder, cut_dump=10)
    create_split_vertical(folder=folder, cut_dump=10)
    # Load the saved train/test/validation split.
    train, test, validation = load_data(folder=folder, cut=40)
    # # if you wish to transform back into the vertical user-item format:
    # train_vertical = transform_horizontal_to_vertical(train)
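    # # if you also need the article metadata for content-based algorithms (assumes meta.csv exists):
    # metadata = get_metadata(folder=folder)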