-
Notifications
You must be signed in to change notification settings - Fork 2
/
train.py
119 lines (94 loc) · 3.22 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from typing import List, Tuple
import torch
from torch import nn, optim
import pandas as pd
from pathlib import Path
from category_encoders import OrdinalEncoder
from model.esmm import ESMM
from model.dataset import EsmmDataset
import tqdm
# Absolute path of the current working directory at import time.
# NOTE(review): not referenced anywhere in this module — confirm whether
# it is used by importers or can be removed.
WORK_DIR = Path().resolve()
def load_data(path: str = '/workspace/data/sample.pkl') -> pd.DataFrame:
    """
    Load sample data from local storage.

    Args:
        path (str): Location of the pickled dataframe. Defaults to the
            project's sample dataset, so existing callers are unaffected.

    Returns:
        pd.DataFrame: pandas dataframe
    """
    # Parameterized so tests and alternative datasets can reuse this loader.
    df = pd.read_pickle(path)
    return df
def get_embedding_size(df: pd.DataFrame, embedding_dim: int) -> List[Tuple[int, int]]:
    """
    Get embedding size for each feature column.

    Args:
        df (pd.DataFrame): Train dataset (ordinal-encoded features plus
            'click' and 'conversion' label columns)
        embedding_dim (int): Number of embedded dimensions

    Returns:
        List[Tuple[int, int]]: List of (unique number of categories, embedding_dim),
            one entry per feature column in column order
    """
    # Labels are not features and get no embedding layer.
    df_feature = df.drop(columns=['click', 'conversion'])
    # Features are ordinal-encoded, so (max index + 1) is the vocabulary size.
    return [(int(max_idx) + 1, embedding_dim) for max_idx in df_feature.max()]
def train(df: pd.DataFrame) -> None:
    """
    Train ESMM.

    CTR and CTCVR towers are optimized jointly: the total loss is the sum
    of a BCE loss on clicks and a BCE loss on conversions.

    Args:
        df (pd.DataFrame): Encoded dataset with 'click' and 'conversion' labels
    """
    # Prefer GPU when available.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Build dataset
    dataset = EsmmDataset(df)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

    # Build model
    embedding_sizes = get_embedding_size(df, 5)
    model = ESMM(embedding_sizes)
    model = model.to(device)

    # Settings (fixed: original had a duplicated `loss_fn = loss_fn = ...`)
    loss_fn = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    epochs = 30

    # Start fitting
    model.train()
    for epoch in range(epochs):
        running_total_loss = 0.0
        running_ctr_loss = 0.0
        running_ctcvr_loss = 0.0
        # Unused enumerate() index removed; tqdm still shows batch progress.
        for inputs, click, conversion in tqdm.tqdm(train_loader, total=len(train_loader)):
            inputs = inputs.to(device)
            # BCELoss expects targets shaped (batch, 1) to match the model output.
            click = torch.unsqueeze(click.to(device), 1)
            conversion = torch.unsqueeze(conversion.to(device), 1)

            # Initialize gradient
            optimizer.zero_grad()

            # Calculate losses
            p_ctr, p_ctcvr = model(inputs)
            ctr_loss = loss_fn(p_ctr, click)
            ctcvr_loss = loss_fn(p_ctcvr, conversion)
            total_loss = ctr_loss + ctcvr_loss

            # Backpropagation
            total_loss.backward()

            # Update parameters
            optimizer.step()

            running_total_loss += total_loss.item()
            running_ctr_loss += ctr_loss.item()
            running_ctcvr_loss += ctcvr_loss.item()

        # Per-batch averages for the epoch.
        running_total_loss = running_total_loss / len(train_loader)
        running_ctr_loss = running_ctr_loss / len(train_loader)
        running_ctcvr_loss = running_ctcvr_loss / len(train_loader)
        print(f'epoch: {epoch+1}, train_loss: {running_total_loss}')
def main():
    """Entry point: load the sample data, encode categoricals, and train."""
    df = load_data()

    # Ordinal-encode the categorical feature columns before training.
    category_columns = ['feature1', 'feature2', 'feature3']
    encoder = OrdinalEncoder(cols=category_columns, handle_unknown='impute').fit(df)
    df = encoder.transform(df)

    # Kick off model fitting on the encoded frame.
    train(df)


if __name__ == '__main__':
    main()