# train_dsin.py
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
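"""Standalone training script for the DSIN CTR model on PaddlePaddle.

Reads runtime and model settings from config_bigdata.yaml, trains
DSIN_layer with BCE loss and the Adam optimizer, periodically evaluates
AUC on the test set, and stops early once the test AUC reaches the
hard-coded thresholds. Progress is logged to train&test.log.
"""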
import logging
import os
import time

import paddle
import paddle.nn as nn
import yaml
from paddle.io import DataLoader

from dsin_reader import RecDataset
from net import DSIN_layer
if __name__ == "__main__":
    # set random seed
    paddle.seed(12345)

    # read config
    with open('config_bigdata.yaml', 'r') as f:
        config = yaml.safe_load(f)
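    # config_bigdata.yaml is assumed to provide at least the keys read below:
    #   runner:           train_batch_size, infer_batch_size, epochs,
    #                     train_data_dir, test_data_dir
    #   hyper_parameters: feat_embed_size, optimizer.learning_rate, and the
    #                     per-feature vocabulary sizes passed to DSIN_layer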
    train_batch_size = config['runner']['train_batch_size']
    test_batch_size = config['runner']['infer_batch_size']
    epochs = config['runner']['epochs']

    # read dataset
    train_data_dir = config['runner']['train_data_dir']
    test_data_dir = config['runner']['test_data_dir']
    dataloader_train = DataLoader(
        RecDataset(train_data_dir, mode='train'),
        batch_size=train_batch_size,
        shuffle=False)
    dataloader_test = DataLoader(
        RecDataset(test_data_dir, mode='test'),
        batch_size=test_batch_size,
        shuffle=False)
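    # Each RecDataset sample is assumed to be a [features, label] pair; the
    # train/test loops below unpack batches as datas[0] and datas[1].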
    # set logger config
    logging.basicConfig(
        filename='train&test.log',
        format='%(asctime)s - %(levelname)s - %(message)s',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info('--------------common.configs--------------')
    logger.info(
        f"train_batch_size: {train_batch_size}, "
        f"test_batch_size: {test_batch_size}, "
        f"feature_embed_size: {config['hyper_parameters']['feat_embed_size']}")
    logger.info('--------------common.configs--------------')
    # create model
    feat_size = config['hyper_parameters']
    model = DSIN_layer(
        user_size=feat_size['user_size'],
        adgroup_size=feat_size['adgroup_size'],
        pid_size=feat_size['pid_size'],
        cms_segid_size=feat_size['cms_segid_size'],
        cms_group_size=feat_size['cms_group_size'],
        final_gender_size=feat_size['final_gender_size'],
        age_level_size=feat_size['age_level_size'],
        pvalue_level_size=feat_size['pvalue_level_size'],
        shopping_level_size=feat_size['shopping_level_size'],
        occupation_size=feat_size['occupation_size'],
        new_user_class_level_size=feat_size['new_user_class_level_size'],
        campaign_size=feat_size['campaign_size'],
        customer_size=feat_size['customer_size'],
        cate_size=feat_size['cate_size'],
        brand_size=feat_size['brand_size'],
        l2_reg_embedding=1e-6)
    # set loss and optimizer
    model.train()
    # nn.BCELoss expects probabilities in [0, 1], so DSIN_layer is assumed to
    # output a post-sigmoid click probability
    criterion = nn.BCELoss()
    optimizer = paddle.optimizer.Adam(
        learning_rate=config['hyper_parameters']['optimizer']['learning_rate'],
        parameters=model.parameters())
    # training
    best_test_auc = 0
    for i in range(epochs):
        start_time = time.asctime()
        auc_metric = paddle.metric.Auc("ROC")
        for batch_id, datas in enumerate(dataloader_train):
            data, label = datas[0], datas[1]
            label = label.reshape([-1, 1])
            output = model(data)
            # paddle.metric.Auc expects per-class probabilities
            pred_2d = paddle.concat(x=[1 - output, output], axis=1)
            auc_metric.update(preds=pred_2d.numpy(), labels=label.numpy())
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            if batch_id % 20 == 0:
                logger.info(
                    f"epoch:{i}, batch_id:{batch_id}, log_loss:{loss.numpy()}, "
                    f"train_auc: {auc_metric.accumulate()}")
            # evaluate on the test set every 50 batches once train AUC reaches 0.6
            if batch_id % 50 == 0 and auc_metric.accumulate() >= 0.6:
                model.eval()
                test_metric = paddle.metric.Auc('ROC')
                for test_batch_id, test_datas in enumerate(dataloader_test):
                    data, label = test_datas[0], test_datas[1]
                    label = label.reshape([-1, 1])
                    output = model(data)
                    pred_2d = paddle.concat(x=[1 - output, output], axis=1)
                    test_metric.update(preds=pred_2d.numpy(), labels=label.numpy())
                test_auc = test_metric.accumulate()
                best_test_auc = max(test_auc, best_test_auc)
                logger.info("------------test stage------------")
                logger.info(f"epoch:{i}, batch_id:{batch_id}, test_auc: {test_auc}")
                logger.info(f"best_test_auc:{best_test_auc}")
                logger.info("------------test stage------------")
                model.train()
                # leave the batch loop early once the target test AUC is reached
                if best_test_auc >= 0.6375:
                    break
        # stop training entirely once the test AUC is good enough
        if best_test_auc >= 0.63:
            break
    print()
    print(f'After {i} epochs, best_test_auc: {best_test_auc}, '
          f'log_loss:{loss}, train_auc:{auc_metric.accumulate()}')
    print(f'training start at {start_time} finish at {time.asctime()}')
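
    # Optional sketch (not part of the original run): persist the final
    # parameters for later inference; the file name is an arbitrary example.
    # paddle.save(model.state_dict(), "dsin.pdparams")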