-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
152 lines (120 loc) · 3.89 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# -*- coding: utf-8 -*-
__author__ = 'yixuanhe'
import time
def get_time(ts):
    """
    Format a Unix timestamp as a compact date string in local time.

    :param ts: seconds since the epoch
    :return: the local calendar date of ``ts`` formatted as ``YYYYMMDD``
    """
    return time.strftime("%Y%m%d", time.localtime(ts))
def get_test_data(train_name, test_name):
    """
    Build the rows to predict for the 60 days starting on 2015-09-01.

    For every song in ``train_name`` the row with the latest record date is
    used as a template (the first such row wins on a tie). The template is
    written to ``test_name`` 60 times, once per day from 2015-09-01 on, with
    two columns rewritten for that day:

    * column 6 becomes the day itself, formatted ``YYYYMMDD``;
    * column 5 becomes the number of days between the date in column 7 and
      that day, written as a float (for example ``2.0``).

    Rows are space separated; column 1 is the song id. Column 7 is presumably
    the publish date of the song -- confirm against the data description.
    Blank rows are skipped.

    :param train_name: path of the input file
    :param test_name: path of the output file (overwritten)
    :return: None
    """
    import datetime

    # Pick one template row per song: the row with the greatest record date.
    # Dates are fixed-width YYYYMMDD strings, so string order is date order.
    latest = {}
    with open(train_name) as src:
        for line in src:
            fields = line.rstrip("\n").split(" ")
            if len(fields) == 1:
                # blank row
                continue
            song_id = fields[1]
            if song_id not in latest or fields[6] > latest[song_id][6]:
                latest[song_id] = fields

    # The target days are the same for every song, so compute them once.
    # Calendar arithmetic is used instead of adding 86400 seconds so that a
    # daylight-saving change cannot repeat or skip a date.
    first_day = datetime.date(2015, 9, 1)
    days = [first_day + datetime.timedelta(days=offset) for offset in range(60)]

    with open(test_name, "w") as out:
        for fields in latest.values():
            start = datetime.datetime.strptime(fields[7].strip(), "%Y%m%d").date()
            for day in days:
                fields[6] = day.strftime("%Y%m%d")
                # float() keeps the "N.0" text form this column has always had
                fields[5] = str(float((day - start).days))
                out.write(" ".join(fields).strip() + "\n")
def deal_with_data(origin, write_file):
    """
    Copy ``origin`` to ``write_file`` without the rows that precede a song's start.

    Rows are space separated: column 1 is the song id, column 2 the play
    count, column 6 the record date and column 7 the begin date of the song
    (the publish date, going by the original description). Dates are compared
    as strings.

    A row "qualifies" when its play count is positive or its record date is
    not earlier than its begin date. For each song the latest record date
    among the qualifying rows seen so far is remembered, and a row is copied
    only if the song already has a qualifying row and the row is not dated
    earlier than that remembered date. Presumably the input is ordered by date
    within each song, in which case this drops exactly the rows before the
    first qualifying one -- confirm against the data.

    Blank rows are skipped.

    :param origin: path of the input file
    :param write_file: path of the output file (overwritten)
    :return: None
    """
    last_active = {}  # song id -> latest record date among qualifying rows
    with open(origin) as src, open(write_file, "w") as out:
        for line in src:
            # Drop the newline before splitting: if the begin date is the last
            # column, a trailing "\n" would make it compare greater than an
            # equal record date and the begin day itself would not qualify.
            fields = line.rstrip("\n").split(" ")
            if len(fields) == 1:
                # blank row
                continue
            song_id = fields[1]
            plays = int(fields[2])
            record_date = fields[6]
            begin_date = fields[7]

            if plays > 0 or begin_date <= record_date:
                if song_id not in last_active or last_active[song_id] < record_date:
                    last_active[song_id] = record_date

            if song_id in last_active and record_date >= last_active[song_id]:
                # the row is copied exactly as read
                out.write(line)
def devide(origin_file, test_file, train_file, date):
    """
    Split the rows of a file into a test part and a train part by date.

    Rows are space separated and column 6 holds the record date. A row whose
    record date, compared as a string, is greater than or equal to ``date``
    is copied to ``test_file``; every other row is copied to ``train_file``.
    Rows that contain no space at all (empty rows) are skipped.

    :param origin_file: path of the file to split
    :param test_file: path receiving the rows dated ``date`` or later
    :param train_file: path receiving the rows dated before ``date``
    :param date: cut-off date, in the same text form as column 6
    :return: None
    """
    with open(origin_file, "r") as source:
        with open(test_file, "w") as late_rows, open(train_file, "w") as early_rows:
            for row in source:
                columns = row.split(" ")
                if len(columns) > 1:
                    target = late_rows if columns[6] >= date else early_rows
                    target.write(row)
def avg_data(path, avg_path, avg_days=3):
    """
    Replace each row's play count with a trailing average for its song.

    Rows are space separated: column 1 is the song id and column 2 the play
    count. For every row read, column 2 is replaced by the mean of the play
    counts of that song's last ``avg_days`` rows read so far, the current row
    included (fewer rows are averaged while the song has not appeared
    ``avg_days`` times yet). All other columns are copied unchanged.

    NOTE: despite the parameter names, ``path`` is the file that is WRITTEN
    and ``avg_path`` is the file that is READ.

    Blank rows are skipped.

    :param path: path of the output file (overwritten)
    :param avg_path: path of the input file
    :param avg_days: number of most recent rows of a song to average over
    :return: None
    """
    history = {}  # song id -> every play count read so far, in file order
    with open(path, "w") as out, open(avg_path) as src:
        for line in src:
            fields = line.rstrip("\n").split(" ")
            if len(fields) == 1:
                # blank row
                continue
            counts = history.setdefault(fields[1], [])
            counts.append(int(fields[2]))
            window = counts[-avg_days:]
            fields[2] = str(sum(window) / len(window))
            # join, so that no trailing space (and thus no empty extra column
            # when the row is split again) ends up in the output
            out.write(" ".join(fields) + "\n")
# Pipeline, run at module level: trim the raw data, split the trimmed data at
# 20150801 into test and train parts, then build the rows to predict.
deal_with_data(origin='data/data', write_file='data/data_start')
devide(
    origin_file="data/data_start",
    test_file="data/test_p2",
    train_file="data/train_p2",
    date="20150801",
)
get_test_data(train_name='data/data', test_name='data/pose')