
Commit

fix
fukushima_daisuke committed Feb 5, 2022
1 parent 3487724 commit 0265342
Showing 8 changed files with 90 additions and 15 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1 +1,2 @@
.vscode
.vscode
__pycache__
8 changes: 8 additions & 0 deletions Dockerfile
@@ -0,0 +1,8 @@
FROM python:3.8.6

WORKDIR /app
COPY ./requirements.txt /app/requirements.txt
RUN pip3 install --upgrade pip && pip3 install -r ./requirements.txt

ADD . /workspace
WORKDIR /workspace
36 changes: 35 additions & 1 deletion README.md
@@ -2,4 +2,38 @@

## Overview
PyTorch implementation of the paper
[Xiao Ma, et al., Entire Space Multi-Task Model: An Effective Approach for Estimating Post-Click Conversion Rate, SIGIR-2018](https://dl.acm.org/doi/abs/10.1145/3209978.3210104)
[Xiao Ma, et al., Entire Space Multi-Task Model: An Effective Approach for Estimating Post-Click Conversion Rate, SIGIR-2018](https://dl.acm.org/doi/abs/10.1145/3209978.3210104)

## Dataset
Columns
- `feature1` ... `feature_n`: categorical feature columns (all features are assumed to be categorical)
- `click`: click label
- `conversion`: conversion label (always 0 when there is no click)

Sample data
```
feature1 feature2 feature3 click conversion
0 1 10 cc 1 1
1 4 11 a 0 0
2 6 30 5 0 0
3 10 3 bb 1 0
4 3 33 cc 1 1
5 1 2 d 0 0
6 4 5 cd 1 0
```
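
For reference, a DataFrame with this layout can be built and pickled as below. This is a minimal sketch with illustrative values; the script that actually produced `data/sample.pkl` is not part of this commit.
```
import pandas as pd

# Illustrative sample matching the column layout above
df = pd.DataFrame({
    'feature1': [1, 4, 6, 10, 3, 1, 4],
    'feature2': [10, 11, 30, 3, 33, 2, 5],
    'feature3': ['cc', 'a', '5', 'bb', 'cc', 'd', 'cd'],
    'click': [1, 0, 0, 1, 1, 0, 1],
    'conversion': [1, 0, 0, 0, 1, 0, 0],
})
df.to_pickle('data/sample.pkl')
```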

## Usage
Build the environment with Docker.

Start the container
```
$ docker-compose up -d --build
```
Attach to the container
```
$ docker exec -it esmm /bin/bash
```
Run ESMM training
```
$ python train.py
```
Binary file added data/sample.pkl
Binary file not shown.
13 changes: 13 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,13 @@
# For development
version: '3'
services:
app:
build:
context: ./
dockerfile: Dockerfile
image: esmm
volumes:
- './:/workspace'
container_name: esmm
tty: true
working_dir: '/workspace'
5 changes: 3 additions & 2 deletions model/dataset.py
@@ -8,11 +8,12 @@ class EsmmDataset(torch.utils.data.Dataset):
"""

def __init__(self, df: pd.DataFrame):
# Drop supervised columns
df_feature = df.drop(columns=['click', 'conversion'])

self.X = torch.from_numpy(df_feature.values).long()
self.click = torch.from_numpy(df['click'].values).float() # label of click
self.conversion = torch.from_numpy(df['conversion'].values).float() # label of conversion
self.click = torch.from_numpy(df['click'].values).float() # click label
self.conversion = torch.from_numpy(df['conversion'].values).float() # conversion label

self.data_num = len(self.X)

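The remainder of `EsmmDataset` is collapsed in this diff. Based on the attributes set in `__init__` and how the training loop consumes the loader, the missing methods are presumably along these lines (a sketch, not the repository's exact code):
```
    def __len__(self) -> int:
        # Number of samples in the dataset
        return self.data_num

    def __getitem__(self, idx):
        # One sample: categorical features plus click and conversion labels
        return self.X[idx], self.click[idx], self.conversion[idx]
```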
8 changes: 8 additions & 0 deletions requirements.txt
@@ -0,0 +1,8 @@
pandas==1.4.0
numpy==1.22.2
category-encoders==1.3.0
tqdm==4.62.3
-f https://download.pytorch.org/whl/cpu/torch_stable.html
torch==1.10.2+cpu
torchvision==0.11.3+cpu
torchaudio==0.10.2+cpu
32 changes: 21 additions & 11 deletions main.py → train.py
@@ -12,13 +12,20 @@


def load_data() -> pd.DataFrame:
# TODO implement data_load
"""
Load sample data from local storage.
Returns:
pd.DataFrame: pandas dataframe
"""
df = pd.read_pickle('/workspace/data/sample.pkl')
return df


def get_embedding_size(df: pd.DataFrame, embedding_dim: int) -> List[Tuple[int, int]]:
"""
Get embedding size
Args:
df (pd.DataFrame): Train dataset
embedding_dim (int): Number of embedded dimensions
@@ -36,27 +43,32 @@ def get_embedding_size(df: pd.DataFrame, embedding_dim: int) -> List[Tuple[int,
return embedding_sizes
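
The body of `get_embedding_size` is collapsed above. Given its signature and docstring, a plausible implementation is sketched here (an assumption for clarity, not necessarily the exact code in the repository):
```
def get_embedding_size(df: pd.DataFrame, embedding_dim: int) -> List[Tuple[int, int]]:
    feature_columns = df.drop(columns=['click', 'conversion']).columns
    # One (number of categories, embedding dimension) pair per categorical feature
    return [(int(df[col].max()) + 1, embedding_dim) for col in feature_columns]
```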


def train(df: pd.DataFrame):
def train(df: pd.DataFrame) -> None:
"""
Train ESMM.
Args:
df (pd.DataFrame): Encoded dataset
"""
if torch.cuda.is_available():
device = 'cuda'
else:
device = 'cpu'

# Build dataset
dataset = EsmmDataset(df)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

# Build model
embedding_sizes = get_embedding_size(df, 5)
model = ESMM(embedding_sizes)
model = model.to(device)

# Settings
batch_size = 64
loss_fn = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
epochs = 30

# Build dataloader
dataset = EsmmDataset(df)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=2)

# Start fitting
model.train()
for epoch in range(epochs):
@@ -87,16 +99,14 @@ def train(df: pd.DataFrame):
running_total_loss = running_total_loss / len(train_loader)
running_ctr_loss = running_ctr_loss / len(train_loader)
running_ctcvr_loss = running_ctcvr_loss / len(train_loader)
print(
f'epoch: {epoch+1}, total_loss: {running_total_loss}, ctr_loss: {running_ctr_loss}, ctcvr_loss: {running_ctcvr_loss}'
)
print(f'epoch: {epoch+1}, train_loss: {running_total_loss}')


def main():
# Load data
df = load_data()

# Encode dataset
# Encode categorical columns
category_columns = ['feature1', 'feature2', 'feature3']
encoder = OrdinalEncoder(cols=category_columns, handle_unknown='impute').fit(df)
df = encoder.transform(df)
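The inner training loop is collapsed in this diff. Following the ESMM formulation, where pCTCVR = pCTR * pCVR and both heads are supervised over all impressions, one iteration plausibly looks like the sketch below. It assumes `model(X)` returns `(p_ctr, p_ctcvr)`; the actual code may differ in detail.
```
for X, click, conversion in train_loader:
    X, click, conversion = X.to(device), click.to(device), conversion.to(device)

    # ESMM outputs: pCTR and pCTCVR (= pCTR * pCVR)
    p_ctr, p_ctcvr = model(X)

    ctr_loss = loss_fn(p_ctr, click)           # CTR loss over all impressions
    ctcvr_loss = loss_fn(p_ctcvr, conversion)  # CTCVR loss over all impressions
    total_loss = ctr_loss + ctcvr_loss

    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    running_total_loss += total_loss.item()
    running_ctr_loss += ctr_loss.item()
    running_ctcvr_loss += ctcvr_loss.item()
```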
