Commit

Merge branch 'main' into feat/9-mlflow

twndus authored May 16, 2024
2 parents a74f275 + cde55f5 commit d87889c
Showing 13 changed files with 501 additions and 57 deletions.
33 changes: 27 additions & 6 deletions README.md
@@ -10,6 +10,7 @@ This project focuses on matching benchmark performance in recommendation systems

### Models Implemented
- **Collaborative Filtering**: Predicts user preferences based on user-item interactions.
- **[Collaborative Denoising Auto-Encoders (2016)](https://alicezheng.org/papers/wsdm16-cdae.pdf)**: Applies Denoising Auto-Encoders (DAE) to top-N recommendation, generalizing several collaborative filtering (CF) models. Unlike AutoRec (2015), CDAE adds a per-user input node and trains on corrupted preference vectors (a minimal sketch follows this list).
- **Matrix Factorization**: Reduces the dimensionality of the interaction matrix to uncover latent features.
- **Deep Neural Networks**: Leverages deep learning to enhance prediction accuracy using complex feature interactions.
- **Hybrid Models**: Integrates several models to capitalize on their individual strengths for superior performance.
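
For orientation, here is a minimal sketch of a CDAE forward pass: a per-user embedding is added to the encoding of the corrupted preference vector, which is what distinguishes CDAE from a plain denoising auto-encoder. The class name, layer sizes, and activations below are illustrative assumptions, not this repository's implementation:

```python
import torch
import torch.nn as nn

class CDAESketch(nn.Module):
    """Minimal CDAE: encode a corrupted preference vector plus a user embedding."""

    def __init__(self, num_users, num_items, hidden_size=64, corruption_level=0.6):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, hidden_size)
        self.encoder = nn.Linear(num_items, hidden_size)
        self.decoder = nn.Linear(hidden_size, num_items)
        self.dropout = nn.Dropout(corruption_level)  # input corruption

    def forward(self, user_id, preferences):
        corrupted = self.dropout(preferences)
        # The user-specific node: add the user embedding to the encoded input.
        hidden = torch.sigmoid(self.encoder(corrupted) + self.user_embedding(user_id))
        return torch.sigmoid(self.decoder(hidden))
```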
@@ -35,16 +36,36 @@ To run this project, you will need:

The following table shows the performance of different models used in the project. Each model was evaluated based on multiple metrics:

```diff
-| Model                   | Accuracy | Precision    | Recall    | F1 Score |
+| Model                   | MAP@10   | Precision@10 | Recall@10 | NDCG@10  |
 |-------------------------|----------|--------------|-----------|----------|
-| Collaborative Filtering | 82.5%    | 80.3%        | 84.1%     | 82.1%    |
-| Matrix Factorization    | 85.0%    | 83.7%        | 86.4%     | 85.0%    |
-| Deep Neural Networks    | 87.5%    | 85.8%        | 89.2%     | 87.4%    |
-| Hybrid Models           | 90.2%    | 88.9%        | 91.5%     | 90.2%    |
+| CDAE                    | 82.5%    | 80.3%        | 84.1%     | 82.1%    |
+| DCN                     | 85.0%    | 83.7%        | 86.4%     | 85.0%    |
+| NGCF                    | 87.5%    | 85.8%        | 89.2%     | 87.4%    |
+| S3Rec                   | 90.2%    | 88.9%        | 91.5%     | 90.2%    |
+| Multi-armed bandit      | 90.2%    | 88.9%        | 91.5%     | 90.2%    |
```

These results were obtained from the Yelp 2018 dataset under controlled test conditions.
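
For reference, the sketch below shows one way to compute two of the table's metrics, Recall@10 and NDCG@10, for a single user with binary relevance. It is a standalone illustration, not the project's evaluation code:

```python
import numpy as np

def recall_at_k(ranked_items, relevant_items, k=10):
    # Fraction of the user's relevant items that appear in the top-k ranking.
    hits = len(set(ranked_items[:k]) & set(relevant_items))
    return hits / len(relevant_items)

def ndcg_at_k(ranked_items, relevant_items, k=10):
    # DCG over the top-k list with binary relevance, normalized by the ideal DCG.
    relevant = set(relevant_items)
    dcg = sum(1.0 / np.log2(i + 2) for i, item in enumerate(ranked_items[:k]) if item in relevant)
    idcg = sum(1.0 / np.log2(i + 2) for i in range(min(len(relevant), k)))
    return dcg / idcg if idcg > 0 else 0.0

print(recall_at_k([5, 2, 9, 1], [2, 7], k=10))           # 0.5
print(round(ndcg_at_k([5, 2, 9, 1], [2, 7], k=10), 3))   # ~0.387
```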


## How to Run

### Prerequisites
- Python >= 3.11
- Poetry >= 1.8.2
- [PyTorch](https://pytorch.org/)

```
# set up the environment
$ poetry install
$ poetry shell

# generate input data: download the dataset from https://www.yelp.com/dataset/download
# and set the data directory in the config
$ vi configs/data_preprocess.yaml
$ python data/data_preprocess.py

# train a model
$ vi configs/train_config.yaml
$ python train.py
```

## Contributors
22 changes: 14 additions & 8 deletions configs/train_config.yaml
@@ -14,17 +14,23 @@ tags: [test, yelp, cdae]

```diff
 # train config
 device: cuda # cpu
-epochs: 1
+epochs: 10
 batch_size: 32
 lr: 0.001
-optimizer: adamw
-loss: bce
+optimizer: sgd # adamw
+loss_name: bpr # pointwise # bce
 patience: 5
 top_n: 10
 weight_decay: 0 # 1e-5
 
 # model config
-model_name: CDAE
-hidden_size: 64
-corruption_level: 0.6
-hidden_activation: sigmoid
-output_activation: sigmoid
+#model_name: CDAE
+#negative_sampling: True # False
+#neg_times: 5 # used only when negative_sampling == True; 5 draws five negatives per positive sample for each user
+#hidden_size: 64
+#corruption_level: 0.6
+#hidden_activation: sigmoid
+#output_activation: sigmoid
+
+model_name: MF
+embed_size: 64
```
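
A sketch of how a config like this might be consumed. OmegaConf is an assumption here (the repository's loader is not shown in this diff); attribute access matches the cfg.loss_name / cfg.neg_times usage in the pipeline code below:

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load('configs/train_config.yaml')
print(cfg.model_name, cfg.loss_name, cfg.lr)  # MF bpr 0.001
# cfg.embed_size then feeds MatrixFactorization; cfg.neg_times feeds the
# pointwise negative sampler in MFDataPipeline.
```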
39 changes: 33 additions & 6 deletions data/datasets/cdae_dataset.py
@@ -1,32 +1,59 @@
```diff
 import numpy as np
 
 import torch
 from torch.utils.data import Dataset
 
 from loguru import logger
 
 class CDAEDataset(Dataset):
 
-    def __init__(self, data, mode='train'):
+    def __init__(self, data, mode='train', neg_times: int = 5):
         super().__init__()
         self.data = data
         self.mode = mode
+        if self.mode != 'test':
+            self.neg_times = neg_times
 
     def __len__(self):
         return len(self.data.keys())
 
+    def _negative_sampling(self, input_mask):
+        # Calculate the number of positive samples.
+        num_pos = int(input_mask.sum())
+        # Flip zeros and ones to obtain the negative candidates.
+        flipped_mask = 1 - input_mask
+        # Retrieve the indexes of the negative candidates.
+        negative_indexes = flipped_mask.nonzero()[0]
+        # Sample num_pos * neg_times indexes from the candidates, without replacement.
+        negative_samples = np.random.choice(negative_indexes, num_pos * self.neg_times, replace=False)
+        # Create a negative mask of the same shape as input_mask.
+        negative_mask = np.zeros_like(input_mask)
+        # Set the sampled indexes to 1; only masked indexes enter the loss.
+        negative_mask[negative_samples] = 1.
+        return negative_mask
+
     def __getitem__(self, user_id):
+        input_mask = self.data[user_id]['input_mask'].astype('float32')
         if self.mode == 'train':
             return {
                 'user_id': user_id,
-                'input_mask': self.data[user_id]['input_mask'].astype('float32'),
+                'input_mask': input_mask,
+                'negative_mask': self._negative_sampling(input_mask)
             }
         elif self.mode == 'valid':
+            valid_mask = self.data[user_id]['valid_mask'].astype('float32')
             return {
                 'user_id': user_id,
-                'input_mask': self.data[user_id]['input_mask'].astype('float32'),
-                'valid_mask': self.data[user_id]['valid_mask'].astype('float32'),
+                'input_mask': input_mask,
+                'valid_mask': valid_mask,
+                'negative_mask': self._negative_sampling(input_mask + valid_mask)
             }
         else:
+            test_mask = self.data[user_id]['test_mask'].astype('float32')
             return {
                 'user_id': user_id,
-                'input_mask': self.data[user_id]['input_mask'].astype('float32'),
-                'test_mask': self.data[user_id]['test_mask'].astype('float32')
+                'input_mask': input_mask,
+                'test_mask': test_mask,
             }
```
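
A usage sketch for CDAEDataset. The data dict layout (one entry per user holding a binary input_mask over the item set) is inferred from __getitem__, and the sizes are made up:

```python
import numpy as np
from torch.utils.data import DataLoader

num_users, num_items = 8, 100
data = {}
for u in range(num_users):
    mask = np.zeros(num_items)
    mask[np.random.choice(num_items, 10, replace=False)] = 1  # 10 positives per user
    data[u] = {'input_mask': mask}

dataset = CDAEDataset(data, mode='train', neg_times=5)
loader = DataLoader(dataset, batch_size=4, shuffle=True)
batch = next(iter(loader))
print(batch['input_mask'].shape, batch['negative_mask'].shape)  # torch.Size([4, 100]) each
```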

96 changes: 96 additions & 0 deletions data/datasets/mf_data_pipeline.py
@@ -0,0 +1,96 @@
```python
import os

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from loguru import logger

from .data_pipeline import DataPipeline

class MFDataPipeline(DataPipeline):

    def __init__(self, cfg):
        super().__init__(cfg)
        self.num_items = None
        self.num_users = None

    def split(self, df):
        '''
        df: interaction DataFrame with (user_id, business_id, rating) rows
        '''
        logger.info('start random user split...')
        train_df, valid_df, test_df = [], [], []

        for _, user_df in df.groupby('user_id'):
            if self.cfg.loss_name == 'pointwise':
                user_train_df, user_test_df = train_test_split(user_df, test_size=.2, stratify=user_df['rating'])
                user_train_df, user_valid_df = train_test_split(user_train_df, test_size=.25, stratify=user_train_df['rating'])
            else:
                user_train_df, user_test_df = train_test_split(user_df, test_size=.2)
                user_train_df, user_valid_df = train_test_split(user_train_df, test_size=.25)
            train_df.append(user_train_df)
            valid_df.append(user_valid_df)
            test_df.append(user_test_df)

        train_df = pd.concat(train_df).reset_index()
        valid_df = pd.concat(valid_df).reset_index()
        test_df = pd.concat(test_df).reset_index()

        # Per-user positive-item lists for training, evaluation, and masking.
        train_pos_df = train_df.groupby('user_id').agg({'business_id': [('pos_items', list)]}).droplevel(0, 1)
        valid_pos_df = valid_df.groupby('user_id').agg({'business_id': [('pos_items', list)]}).droplevel(0, 1)
        train_valid_pos_df = pd.concat([train_df, valid_df], axis=0).groupby('user_id').agg({'business_id': [('pos_items', list)]}).droplevel(0, 1)
        test_pos_df = test_df.groupby('user_id').agg({'business_id': [('pos_items', list)]}).droplevel(0, 1)

        train_data = pd.merge(train_df, train_pos_df, left_on='user_id', right_on='user_id', how='left')
        valid_data = pd.merge(valid_df, train_valid_pos_df, left_on='user_id', right_on='user_id', how='left')
        valid_eval_data = pd.merge(valid_pos_df, train_pos_df.rename(columns={'pos_items': 'mask_items'}), left_on='user_id', right_on='user_id', how='left')
        test_eval_data = pd.merge(test_pos_df, train_valid_pos_df.rename(columns={'pos_items': 'mask_items'}), left_on='user_id', right_on='user_id', how='left')

        return train_data, valid_data, valid_eval_data, test_eval_data

    def preprocess(self) -> pd.DataFrame:
        '''
        output: interaction DataFrame (user_id, business_id, rating), with
        sampled negatives appended when training with a pointwise loss
        '''
        logger.info("start preprocessing...")
        # load df
        df = self._load_df()
        # set num items and num users
        self._set_num_items_and_num_users(df)
        # negative sampling
        if self.cfg.loss_name == 'pointwise':
            df = self._negative_sampling(df, self.cfg.neg_times)
        logger.info("done")
        return df

    def _load_df(self):
        logger.info("load df...")
        return pd.read_csv(os.path.join(self.cfg.data_dir, 'yelp_interactions.tsv'), sep='\t', index_col=False)

    def _set_num_items_and_num_users(self, df):
        self.num_items = df.business_id.nunique()
        self.num_users = df.user_id.nunique()

    def _negative_sampling(self, df: pd.DataFrame, neg_times: int = 5) -> pd.DataFrame:
        logger.info("negative sampling...")
        logger.info(f"before neg sampling: {df.shape}")
        all_items = df.business_id.unique()

        df['rating'] = 1
        neg_data = []
        for _, user_df in df.groupby('user_id'):
            user_id = user_df.user_id.values[0]
            pos_items = user_df.business_id.unique()
            neg_items = []
            # Rejection-sample until the user has neg_times negatives per positive.
            while len(neg_items) < len(pos_items) * neg_times:
                neg_item = np.random.choice(all_items)
                if (neg_item in pos_items) or (neg_item in neg_items):
                    continue
                neg_items.append(neg_item)
            neg_data.extend([[user_id, neg_item, 0] for neg_item in neg_items])

        df = pd.concat([df, pd.DataFrame(neg_data, columns=df.columns)], axis=0)
        df = df.sample(frac=1).reset_index(drop=True)
        logger.info(f"after neg sampling: {df.shape}")
        logger.info("done...")
        return df
```
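
A sketch of driving the pipeline end to end. It assumes the DataPipeline base class only stores cfg and that yelp_interactions.tsv exists under cfg.data_dir; in the project, cfg would come from train_config.yaml rather than a SimpleNamespace:

```python
from types import SimpleNamespace

# Only the fields referenced above; real runs use configs/train_config.yaml.
cfg = SimpleNamespace(data_dir='data/', loss_name='bpr', neg_times=5)
pipeline = MFDataPipeline(cfg)

df = pipeline.preprocess()   # loads data/yelp_interactions.tsv
train_data, valid_data, valid_eval_data, test_eval_data = pipeline.split(df)
print(pipeline.num_users, pipeline.num_items)
```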
32 changes: 32 additions & 0 deletions data/datasets/mf_dataset.py
@@ -0,0 +1,32 @@
```python
import numpy as np

import torch
from torch.utils.data import Dataset

from loguru import logger

class MFDataset(Dataset):

    def __init__(self, data, num_items=None):
        super().__init__()
        self.data = data
        self.num_items = num_items

    def __len__(self):
        return self.data.shape[0]

    def _negative_sampling(self, user_positives):
        # Draw random item ids until one outside the user's positives is found.
        neg_item = np.random.randint(self.num_items)
        while neg_item in user_positives:
            neg_item = np.random.randint(self.num_items)
        return neg_item

    def __getitem__(self, index):
        data = self.data.iloc[index, :]
        pos_item = data['business_id'].astype('int64')
        user_pos_items = data['pos_items']
        return {
            'user_id': data['user_id'].astype('int64'),
            'pos_item': pos_item,
            'neg_item': self._negative_sampling(user_pos_items)
        }
```
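
A usage sketch for MFDataset with a toy frame shaped like the train_data returned by MFDataPipeline.split(): one row per positive interaction plus the user's full positive-item list for negative sampling:

```python
import pandas as pd
from torch.utils.data import DataLoader

train_data = pd.DataFrame({
    'user_id': [0, 0, 1],
    'business_id': [3, 7, 2],
    'pos_items': [[3, 7], [3, 7], [2]],
})

dataset = MFDataset(train_data, num_items=10)
batch = next(iter(DataLoader(dataset, batch_size=3)))
print(batch['user_id'], batch['pos_item'], batch['neg_item'])
```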
27 changes: 27 additions & 0 deletions loss.py
@@ -0,0 +1,27 @@
```python
from typing import Optional

import torch
import torch.nn as nn
from torch import Tensor

class NSBCELoss(nn.BCELoss):
    '''BCE computed only over observed positives and sampled negatives.'''

    def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean') -> None:
        super().__init__(weight, size_average, reduce, reduction)

    def forward(self, input: Tensor, target: Tensor, negative_mask: Tensor) -> Tensor:
        # Build the loss mask: add negative_mask to target and keep the nonzero indices.
        loss_targets = (target.add(negative_mask)).nonzero(as_tuple=True)
        # Compute the loss only at the masked indices.
        return nn.functional.binary_cross_entropy(input[loss_targets], target[loss_targets], weight=self.weight, reduction=self.reduction)


class BPRLoss(nn.Module):
    '''Bayesian Personalized Ranking loss: -mean(log sigmoid(pos - neg)).'''

    def __init__(self):
        super().__init__()
        self.logsigmoid = nn.LogSigmoid()

    def forward(self, positive_preds, negative_preds):
        difference = positive_preds - negative_preds
        return torch.mean(-self.logsigmoid(difference))
```
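
A quick check of both losses on toy tensors (values arbitrary). NSBCELoss evaluates BCE only where target + negative_mask is nonzero, i.e. on observed positives and sampled negatives:

```python
import torch

# BPR: positive items should score higher than sampled negatives.
criterion = BPRLoss()
pos_preds = torch.tensor([2.0, 0.5, 1.2])
neg_preds = torch.tensor([1.0, 0.7, -0.3])
print(criterion(pos_preds, neg_preds).item())  # -mean(log sigmoid(pos - neg))

# NSBCE: BCE restricted to positives plus sampled negatives.
ns_bce = NSBCELoss()
preds = torch.sigmoid(torch.randn(2, 5))
target = torch.tensor([[1., 0., 0., 1., 0.], [0., 1., 0., 0., 0.]])
neg_mask = torch.tensor([[0., 1., 0., 0., 1.], [1., 0., 0., 1., 0.]])
print(ns_bce(preds, target, neg_mask).item())
```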
23 changes: 23 additions & 0 deletions models/mf.py
@@ -0,0 +1,23 @@
```python
import torch
import torch.nn as nn

from loguru import logger

from models.base_model import BaseModel

class MatrixFactorization(BaseModel):

    def __init__(self, cfg, num_users, num_items):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, cfg.embed_size, dtype=torch.float32)
        self.item_embedding = nn.Embedding(num_items, cfg.embed_size, dtype=torch.float32)
        self._init_weights()

    def _init_weights(self):
        for child in self.children():
            if isinstance(child, nn.Embedding):
                nn.init.xavier_uniform_(child.weight)

    def forward(self, user_id, item_id):
        user_emb = self.user_embedding(user_id)
        item_emb = self.item_embedding(item_id)
        # Dot product of user and item embeddings as the preference score.
        return torch.sum(user_emb * item_emb, dim=1)
```
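
Putting the pieces together: one BPR training step for MatrixFactorization, using BPRLoss from loss.py above. The SimpleNamespace stands in for the YAML config, and the batch tensors mirror what MFDataset yields:

```python
import torch
from types import SimpleNamespace

cfg = SimpleNamespace(embed_size=64)               # stands in for train_config.yaml
model = MatrixFactorization(cfg, num_users=100, num_items=500)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
criterion = BPRLoss()

# A fake batch in the shape MFDataset yields.
user_id = torch.randint(0, 100, (32,))
pos_item = torch.randint(0, 500, (32,))
neg_item = torch.randint(0, 500, (32,))

optimizer.zero_grad()
loss = criterion(model(user_id, pos_item), model(user_id, neg_item))
loss.backward()
optimizer.step()
print(loss.item())
```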
