From f701e9124a9342b49c059b16f116bd9c75ed1d50 Mon Sep 17 00:00:00 2001
From: GangBean
Date: Mon, 15 Jul 2024 01:52:02 +0000
Subject: [PATCH 01/22] refactor: order by date in data preprocessing #21

---
 data/data_preprocess.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/data/data_preprocess.py b/data/data_preprocess.py
index ea68694..1fb4ccb 100644
--- a/data/data_preprocess.py
+++ b/data/data_preprocess.py
@@ -30,7 +30,10 @@ def run(self):
         review_df = self._filter_by_min_interactions(review_df, self.cfg.min_interactions)
         logger.info(f'필터링 후 데이터: {review_df.shape}')
 
-        review_df = self._id_mapping(review_df)
+        review_df: pd.DataFrame = self._id_mapping(review_df)
+        # logger.info(f"review df dtypes: {review_df.dtypes}")
+        review_df = review_df.sort_values(['date'])
+        # logger.info(f"after order by: {review_df[review_df.user_id == review_df.iloc[0].user_id].head()}")
 
         review_df = review_df[['user_id', 'business_id', 'stars']].rename(columns={'stars':'rating'})
         self._save_interactions(review_df)
@@ -145,7 +148,7 @@ def _save_entities2attributes(self, entities2attributes, entity_name):
         logger.info(f"done...")
 
 
-@hydra.main(version_base=None, config_path="../config", config_name="data_preprocess")
+@hydra.main(version_base=None, config_path="../configs", config_name="data_preprocess")
 def main(cfg: OmegaConf):
     ypp = YelpPreprocessPipe(cfg)
     ypp.run()

From f3f13ee31ff9fef67834a67d90715ba493528fb7 Mon Sep 17 00:00:00 2001
From: GangBean
Date: Mon, 15 Jul 2024 01:53:50 +0000
Subject: [PATCH 02/22] feat: implements S3Rec preprocess method #21

---
 configs/train_config.yaml            |  4 ++-
 data/datasets/s3rec_data_pipeline.py | 40 ++++++++++++++++++++++++++++
 train.py                             |  3 +++
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 data/datasets/s3rec_data_pipeline.py

diff --git a/configs/train_config.yaml b/configs/train_config.yaml
index a32c8c2..4509527 100644
--- a/configs/train_config.yaml
+++ b/configs/train_config.yaml
@@ -26,7 +26,7 @@ weight_decay: 0 #1e-5
 best_metric: loss # loss, precision, recall, map, ndcg
 
 # model config
-model_name: NGCF
+model_name: S3Rec
 model:
   CDAE:
     negative_sampling: True # False
@@ -44,3 +44,5 @@ model:
   NGCF:
     embed_size: 64
     num_orders: 3
+  S3Rec:
+    max_seq_len: 50

diff --git a/data/datasets/s3rec_data_pipeline.py b/data/datasets/s3rec_data_pipeline.py
new file mode 100644
index 0000000..7cc8d87
--- /dev/null
+++ b/data/datasets/s3rec_data_pipeline.py
@@ -0,0 +1,40 @@
+import os
+from loguru import logger
+import pandas as pd
+from .data_pipeline import DataPipeline
+
+class S3RecDataPipeline(DataPipeline):
+    def __init__(self, cfg):
+        super().__init__(cfg)
+        self.num_users = None
+        self.num_items = None
+
+    def split(self):
+        return None
+
+    def preprocess(self) -> pd.DataFrame:
+        '''
+        output: pivot table (row: user, col: user-specific vector + item set, values: binary preference)
+        '''
+        logger.info("start preprocessing...")
+        # load df
+        df = self._load_df()
+        # set num items and num users
+        self._set_num_items_and_num_users(df)
+
+        # user 별로 item sequence 뽑아야돼
+        # train_pos_df = train_df.groupby('user_id').agg({'business_id': [('pos_items', list)]}).droplevel(0, 1)
+        df = df.groupby(['user_id']).agg({'business_id': [('behaviors', list)]}).droplevel(0, 1)
+        # logger.info(f"after groupby: {df.head()}")
+
+        logger.info("done")
+        return df
+
+    def _load_df(self):
+        logger.info("load df...")
+        return pd.read_csv(os.path.join(self.cfg.data_dir, 'yelp_interactions.tsv'), sep='\t', index_col=False)
+
+
+    def _set_num_items_and_num_users(self, df):
+        self.num_items = df.business_id.nunique()
+        self.num_users = df.user_id.nunique()

diff --git a/train.py b/train.py
index 8e25862..f0b0ac2 100644
--- a/train.py
+++ b/train.py
@@ -17,6 +17,7 @@
 from data.datasets.ngcf_data_pipeline import NGCFDataPipeline
 from data.datasets.cdae_dataset import CDAEDataset
 from data.datasets.mf_dataset import MFDataset
+from data.datasets.s3rec_data_pipeline import S3RecDataPipeline
 from trainers.cdae_trainer import CDAETrainer
 from trainers.dcn_trainer import DCNTrainer
 from trainers.mf_trainer import MFTrainer
@@ -122,6 +123,8 @@ def main(cfg: OmegaConf):
         data_pipeline = DCNDatapipeline(cfg)
     elif cfg.model_name == 'NGCF':
         data_pipeline = NGCFDataPipeline(cfg)
+    elif cfg.model_name == 'S3Rec':
+        data_pipeline = S3RecDataPipeline(cfg)
     else:
         raise ValueError()

From 2ccf3a43f5e50b05034583645b32649162bbad8f Mon Sep 17 00:00:00 2001
From: GangBean
Date: Mon, 15 Jul 2024 03:03:45 +0000
Subject: [PATCH 03/22] feat: implements S3Rec split method #21

---
 data/datasets/s3rec_data_pipeline.py | 53 +++++++++++++++++++++++++---
 train.py                             |  9 +++++
 2 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/data/datasets/s3rec_data_pipeline.py b/data/datasets/s3rec_data_pipeline.py
index 7cc8d87..daa612a 100644
--- a/data/datasets/s3rec_data_pipeline.py
+++ b/data/datasets/s3rec_data_pipeline.py
@@ -9,8 +9,41 @@ def __init__(self, cfg):
         self.num_users = None
         self.num_items = None
 
-    def split(self):
-        return None
+
+    def split(self, df: pd.DataFrame):
+        # train X: [:-3] y: -3
+        train_df_X = df.behaviors.apply(lambda row: row[: -3]).rename({'behaviors': 'X'})
+        train_df_Y = df.behaviors.apply(lambda row: row[-3]).rename({'behaviors': 'y'})
+
+        # valid X: [:-2] y: -2
+        valid_df_X = df.behaviors.apply(lambda row: row[: -2]).rename({'behaviors': 'X'})
+        valid_df_Y = df.behaviors.apply(lambda row: row[-2]).rename({'behaviors': 'y'})
+
+        # test X: [:-1] y: -1
+        test_df_X = df.behaviors.apply(lambda row: row[: -1]).rename({'behaviors': 'X'})
+        test_df_Y = df.behaviors.apply(lambda row: row[-1]).rename({'behaviors': 'y'})
+
+        # pre-padding for input sequence X
+        train_df_X = self._adjust_seq_len(train_df_X)
+        valid_df_X = self._adjust_seq_len(valid_df_X)
+        test_df_X = self._adjust_seq_len(test_df_X)
+
+        return pd.concat([train_df_X, train_df_Y], axis=1),\
+            pd.concat([valid_df_X, valid_df_Y], axis=1),\
+            pd.concat([test_df_X, test_df_Y], axis=1)
+
+
+    def _adjust_seq_len(self, df):
+        def _adjust_seq_len_by_user(row):
+            if len(row) > self.cfg.max_seq_len:
+                row = row[-self.cfg.max_seq_len:]
+            elif len(row) < self.cfg.max_seq_len:
+                row = [0] * (self.cfg.max_seq_len - len(row)) + row
+            return row
+
+        df = df.apply(_adjust_seq_len_by_user)
+        return df
+
 
     def preprocess(self) -> pd.DataFrame:
         '''
         output: pivot table (row: user, col: user-specific vector + item set, values: binary preference)
         '''
         logger.info("start preprocessing...")
         # load df
         df = self._load_df()
         # set num items and num users
         self._set_num_items_and_num_users(df)
 
-        # user 별로 item sequence 뽑아야돼
-        # train_pos_df = train_df.groupby('user_id').agg({'business_id': [('pos_items', list)]}).droplevel(0, 1)
+        # group by user_id
         df = df.groupby(['user_id']).agg({'business_id': [('behaviors', list)]}).droplevel(0, 1)
-        # logger.info(f"after groupby: {df.head()}")
+
+        # load attributes
+        self.item2attributes = self._load_attributes()
 
         logger.info("done")
         return df
+
 
     def _load_df(self):
         logger.info("load df...")
         return pd.read_csv(os.path.join(self.cfg.data_dir, 'yelp_interactions.tsv'), sep='\t', index_col=False)
 
+    def _load_attributes(self):
+        logger.info("load item2attributes...")
+        df = pd.read_json(os.path.join(self.cfg.data_dir, 'yelp_item2attributes.json')).transpose()
+        self.attributes_count = [df.categories.explode().nunique(), df.statecity.nunique()]
+
+        return df.transpose().to_dict()
+
+
     def _set_num_items_and_num_users(self, df):
         self.num_items = df.business_id.nunique()
         self.num_users = df.user_id.nunique()

diff --git a/train.py b/train.py
index f0b0ac2..4b1fb3c 100644
--- a/train.py
+++ b/train.py
@@ -159,6 +159,15 @@ def main(cfg: OmegaConf):
         valid_dataset = MFDataset(valid_data, num_items=data_pipeline.num_items)
         args.update({'valid_eval_data': valid_eval_data, 'test_eval_data': test_eval_data})
         model_info['num_items'], model_info['num_users'] = data_pipeline.num_items, data_pipeline.num_users
+    elif cfg.model_name == 'S3Rec':
+        train_data, valid_data, test_data = data_pipeline.split(df)
+        # logger.info(f"train: {train_data.loc[1]}")
+        # logger.info(f"valid: {valid_data.loc[1]}")
+        # logger.info(f"test: {test_data.loc[1]}")
+        # train_dataset = MFDataset(train_data, num_items=data_pipeline.num_items)
+        # valid_dataset = MFDataset(valid_data, num_items=data_pipeline.num_items)
+        # args.update({'valid_eval_data': valid_eval_data, 'test_eval_data': test_eval_data})
+        # model_info['num_items'], model_info['num_users'] = data_pipeline.num_items, data_pipeline.num_users
     else:
         raise ValueError()

From 1ebe96af9e3d06d53e1bcb8fc0e76980270ec9b4 Mon Sep 17 00:00:00 2001
From: Judy
Date: Mon, 15 Jul 2024 13:26:37 +0000
Subject: [PATCH 04/22] feat: implements s3rec dataset #21

---
 data/datasets/s3rec_data_pipeline.py | 17 ++++++-----
 data/datasets/s3rec_dataset.py       | 45 ++++++++++++++++++++++++++++
 train.py                             | 17 +++++------
 3 files changed, 62 insertions(+), 17 deletions(-)
 create mode 100644 data/datasets/s3rec_dataset.py

diff --git a/data/datasets/s3rec_data_pipeline.py b/data/datasets/s3rec_data_pipeline.py
index daa612a..a2e881c 100644
--- a/data/datasets/s3rec_data_pipeline.py
+++ b/data/datasets/s3rec_data_pipeline.py
@@ -12,16 +12,16 @@ def __init__(self, cfg):
 
     def split(self, df: pd.DataFrame):
         # train X: [:-3] y: -3
-        train_df_X = df.behaviors.apply(lambda row: row[: -3]).rename({'behaviors': 'X'})
-        train_df_Y = df.behaviors.apply(lambda row: row[-3]).rename({'behaviors': 'y'})
+        train_df_X = df.behaviors.apply(lambda row: row[: -3]).rename('X')
+        train_df_Y = df.behaviors.apply(lambda row: row[-3]).rename('y')
 
         # valid X: [:-2] y: -2
-        valid_df_X = df.behaviors.apply(lambda row: row[: -2]).rename({'behaviors': 'X'})
-        valid_df_Y = df.behaviors.apply(lambda row: row[-2]).rename({'behaviors': 'y'})
+        valid_df_X = df.behaviors.apply(lambda row: row[: -2]).rename('X')
+        valid_df_Y = df.behaviors.apply(lambda row: row[-2]).rename('y')
 
         # test X: [:-1] y: -1
-        test_df_X = df.behaviors.apply(lambda row: row[: -1]).rename({'behaviors': 'X'})
-        test_df_Y = df.behaviors.apply(lambda row: row[-1]).rename({'behaviors': 'y'})
+        test_df_X = df.behaviors.apply(lambda row: row[: -1]).rename('X')
+        test_df_Y = df.behaviors.apply(lambda row: row[-1]).rename('y')
 
         # pre-padding for input sequence X
         train_df_X = self._adjust_seq_len(train_df_X)
@@ -38,8 +38,9 @@ def _adjust_seq_len_by_user(row):
             if len(row) > self.cfg.max_seq_len:
                 row = row[-self.cfg.max_seq_len:]
             elif len(row) < self.cfg.max_seq_len:
-                row = [0] * (self.cfg.max_seq_len - len(row)) + row
-            return row
+                row = [-1] * (self.cfg.max_seq_len - len(row)) + row
+            # item 0: pad, item starts from 1
+            return [e+1 for e in row]
 
         df = df.apply(_adjust_seq_len_by_user)
         return df

diff --git a/data/datasets/s3rec_dataset.py b/data/datasets/s3rec_dataset.py
new file mode 100644
index 0000000..e7b657d
--- /dev/null
+++ b/data/datasets/s3rec_dataset.py
@@ -0,0 +1,45 @@
+import numpy as np
+
+import torch
+from torch.utils.data import Dataset
+
+from loguru import logger
+
+class S3RecDataset(Dataset):
+
+    def __init__(self, data, num_items=None, train=True):
+        super().__init__()
+        self.data = data
+        self.num_items = num_items
+        self.train = train
+
+    def __len__(self):
+        return self.data.shape[0]
+
+    def _negative_sampling(self, pos_item):
+        sample_size = 1 if self.train else 99
+        neg_items = []
+        for _ in range(sample_size):
+            neg_item = np.random.randint(1, self.num_items+1)
+            while (neg_item == pos_item) or (neg_item in neg_items):
+                neg_item = np.random.randint(1, self.num_items+1)
+            neg_items.append(neg_item)
+        return neg_items
+
+    def __getitem__(self, user_id):
+        data = self.data.iloc[user_id,:]
+        pos_item = data['y'].astype('int64')
+        if self.train:
+            return {
+                'user_id': user_id,
+                'X': data['X'],
+                'pos_item': pos_item,
+                'neg_item': self._negative_sampling(pos_item)[0]
+            }
+        else:
+            return {
+                'user_id': user_id,
+                'X': data['X'],
+                'pos_item': pos_item,
+                'neg_items': self._negative_sampling(pos_item)
+            }

diff --git a/train.py b/train.py
index 4b1fb3c..ed235ae 100644
--- a/train.py
+++ b/train.py
@@ -11,13 +11,14 @@
 from loguru import logger
 
 from data.datasets.cdae_data_pipeline import CDAEDataPipeline
-from data.datasets.dcn_dataset import DCNDataset
 from data.datasets.mf_data_pipeline import MFDataPipeline
 from data.datasets.dcn_data_pipeline import DCNDatapipeline
 from data.datasets.ngcf_data_pipeline import NGCFDataPipeline
+from data.datasets.s3rec_data_pipeline import S3RecDataPipeline
 from data.datasets.cdae_dataset import CDAEDataset
 from data.datasets.mf_dataset import MFDataset
-from data.datasets.s3rec_data_pipeline import S3RecDataPipeline
+from data.datasets.dcn_dataset import DCNDataset
+from data.datasets.s3rec_dataset import S3RecDataset
 from trainers.cdae_trainer import CDAETrainer
 from trainers.dcn_trainer import DCNTrainer
 from trainers.mf_trainer import MFTrainer
@@ -161,13 +162,11 @@ def main(cfg: OmegaConf):
         model_info['num_items'], model_info['num_users'] = data_pipeline.num_items, data_pipeline.num_users
     elif cfg.model_name == 'S3Rec':
         train_data, valid_data, test_data = data_pipeline.split(df)
-        # logger.info(f"train: {train_data.loc[1]}")
-        # logger.info(f"valid: {valid_data.loc[1]}")
-        # logger.info(f"test: {test_data.loc[1]}")
-        # train_dataset = MFDataset(train_data, num_items=data_pipeline.num_items)
-        # valid_dataset = MFDataset(valid_data, num_items=data_pipeline.num_items)
-        # args.update({'valid_eval_data': valid_eval_data, 'test_eval_data': test_eval_data})
-        # model_info['num_items'], model_info['num_users'] = data_pipeline.num_items, data_pipeline.num_users
+        train_dataset = S3RecDataset(train_data, num_items=data_pipeline.num_items)
+        valid_dataset = S3RecDataset(valid_data, num_items=data_pipeline.num_items)
+        test_dataset = S3RecDataset(test_data, num_items=data_pipeline.num_items, train=False)
+        args.update({'test_dataset': test_data})
+        model_info['num_items'], model_info['num_users'] = data_pipeline.num_items, data_pipeline.num_users
     else:
         raise ValueError()

From 4377c9f2fc74cdf2dbbaac594ac86df51e0d2c75 Mon Sep 17 00:00:00 2001
From: Judy
Date: Mon, 15 Jul 2024 13:42:50 +0000
Subject: [PATCH 05/22] refactor: exclude behaviors in negative sampling #21

---
 data/datasets/s3rec_data_pipeline.py | 6 +++---
 data/datasets/s3rec_dataset.py       | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/data/datasets/s3rec_data_pipeline.py b/data/datasets/s3rec_data_pipeline.py
index a2e881c..ecf5547 100644
--- a/data/datasets/s3rec_data_pipeline.py
+++ b/data/datasets/s3rec_data_pipeline.py
@@ -28,9 +28,9 @@ def split(self, df: pd.DataFrame):
         valid_df_X = self._adjust_seq_len(valid_df_X)
         test_df_X = self._adjust_seq_len(test_df_X)
 
-        return pd.concat([train_df_X, train_df_Y], axis=1),\
-            pd.concat([valid_df_X, valid_df_Y], axis=1),\
-            pd.concat([test_df_X, test_df_Y], axis=1)
+        return pd.concat([df, train_df_X, train_df_Y], axis=1),\
+            pd.concat([df, valid_df_X, valid_df_Y], axis=1),\
+            pd.concat([df, test_df_X, test_df_Y], axis=1)
 
 
     def _adjust_seq_len(self, df):

diff --git a/data/datasets/s3rec_dataset.py b/data/datasets/s3rec_dataset.py
index e7b657d..cbd575a 100644
--- a/data/datasets/s3rec_dataset.py
+++ b/data/datasets/s3rec_dataset.py
@@ -16,12 +16,12 @@ def __init__(self, data, num_items=None, train=True):
     def __len__(self):
         return self.data.shape[0]
 
-    def _negative_sampling(self, pos_item):
+    def _negative_sampling(self, behaviors):
         sample_size = 1 if self.train else 99
         neg_items = []
         for _ in range(sample_size):
             neg_item = np.random.randint(1, self.num_items+1)
-            while (neg_item == pos_item) or (neg_item in neg_items):
+            while (neg_item in behaviors) or (neg_item in neg_items):
                 neg_item = np.random.randint(1, self.num_items+1)
             neg_items.append(neg_item)
         return neg_items
@@ -34,12 +34,12 @@ def __getitem__(self, user_id):
                 'user_id': user_id,
                 'X': data['X'],
                 'pos_item': pos_item,
-                'neg_item': self._negative_sampling(pos_item)[0]
+                'neg_item': self._negative_sampling(data['behaviors'])[0]
             }
         else:
             return {
                 'user_id': user_id,
                 'X': data['X'],
                 'pos_item': pos_item,
-                'neg_items': self._negative_sampling(pos_item)
+                'neg_items': self._negative_sampling(data['behaviors'])
             }

From 0737c28734c0f8874855dca711d83379281f5393 Mon Sep 17 00:00:00 2001
From: Judy
Date: Mon, 15 Jul 2024 14:28:04 +0000
Subject: [PATCH 06/22] feat: implement s3rec trainer #21

---
 trainers/s3rec_trainer.py | 140 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 trainers/s3rec_trainer.py

diff --git a/trainers/s3rec_trainer.py b/trainers/s3rec_trainer.py
new file mode 100644
index 0000000..f490c8d
--- /dev/null
+++ b/trainers/s3rec_trainer.py
@@ -0,0 +1,140 @@
+import numpy as np
+from tqdm import tqdm
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+from torch.utils.data import DataLoader
+from torch.optim import Optimizer
+
+from loguru import logger
+from omegaconf.dictconfig import DictConfig
+
+from models.cdae import CDAE
+from utils import log_metric
+from .base_trainer import BaseTrainer
+from metric import *
+from loss import BPRLoss
+
+class CDAETrainer(BaseTrainer):
+    def __init__(self, cfg: DictConfig, num_items: int, num_users: int) -> None:
+        super().__init__(cfg)
+        self.model = CDAE(self.cfg, num_items, num_users) ##
+        self.optimizer: Optimizer = self._optimizer(self.cfg.optimizer, self.model, self.cfg.lr)
+        self.loss = self._loss()
+
+    def _loss(self):
+        return BPRLoss()
+
+    def run(self, train_dataloader: DataLoader, valid_dataloader: DataLoader):
+        logger.info(f"[Trainer] run...")
+
+        best_valid_loss: float = 1e+6
+        best_epoch: int = 0
+        endurance: int = 0
+
+        # train
+        for epoch in range(self.cfg.epochs):
+            train_loss: float = self.train(train_dataloader)
+            valid_loss = self.validate(valid_dataloader)
+            logger.info(f'''\n[Trainer] epoch: {epoch} > train loss: {train_loss:.4f} / 
+                valid loss: {valid_loss:.4f}''')
+
+            if self.cfg.wandb:
+                wandb.log({
+                    'train_loss': train_loss,
+                    'valid_loss': valid_loss,
+                })
+
+            # update model
+            if self._is_surpass_best_metric(
+                current=(valid_loss,)
+                best=(best_valid_loss,)):
+
+                logger.info(f"[Trainer] update best model...")
+                best_valid_loss = valid_loss
+                best_epoch = epoch
+                endurance = 0
+
+                torch.save(self.model.state_dict(), f'{self.cfg.model_dir}/best_model.pt')
+            else:
+                endurance += 1
+                if endurance > self.cfg.patience:
+                    logger.info(f"[Trainer] ealry stopping...")
+                    break
+
+    def train(self, train_dataloader: DataLoader) -> float:
+        self.model.train()
+        train_loss = 0
+        for data in tqdm(train_dataloader):
+            X, pos_item, neg_item = data['X'].to(self.device), data['pos_item'].to(self.device), data['neg_item'].to(self.device)
+            pos_pred, neg_pred = self.model(X, pos_item, neg_item)
+
+            self.optimizer.zero_grad()
+            loss = self.loss(pos_pred, neg_pred)
+            loss.backward()
+            self.optimizer.step()
+
+            train_loss += loss.item()
+
+        return train_loss
+
+    def validate(self, valid_dataloader: DataLoader) -> tuple[float]:
+        self.model.eval()
+        valid_loss = 0
+        actual, predicted = [], []
+        for data in tqdm(valid_dataloader):
+            X, pos_item, neg_item = data['X'].to(self.device), data['pos_item'].to(self.device), data['neg_item'].to(self.device)
+            pos_pred, neg_pred = self.model(X, pos_item, neg_item)
+
+            self.optimizer.zero_grad()
+            loss = self.loss(pos_pred, neg_pred)
+
+            valid_loss += loss.item()
+
+        return valid_loss
+
+    @log_metric
+    def evaluate(self, test_dataloader: DataLoader) -> tuple[float]:
+        self.model.eval()
+        actual, predicted = [], []
+        for data in tqdm(test_dataloader):
+            X, pos_item, neg_items = data['X'].to(self.device), data['pos_item'].to(self.device), data['neg_items'].to(self.device)
+            scores = self.model.evaluate(X, pos_item, neg_items)
+
+            batch_actual, batch_predicted = \
+                self._generate_target_and_top_k_recommendation(pred, test_mask, input_mask)
+            actual.append(batch_actual)
+            predicted.append(batch_predicted)
+
+        predicted = np.concatenate(predicted, axis=0)
+
+        test_precision_at_k = precision_at_k(actual, predicted, self.cfg.top_n)
+        test_recall_at_k = recall_at_k(actual, predicted, self.cfg.top_n)
+        test_map_at_k = map_at_k(actual, predicted, self.cfg.top_n)
+        test_ndcg_at_k = ndcg_at_k(actual, predicted, self.cfg.top_n)
+
+        logger.info(f'''\n[Trainer] Test >
+            precision@{self.cfg.top_n} : {test_precision_at_k:.4f} / 
+            Recall@{self.cfg.top_n}: {test_recall_at_k:.4f} / 
+            MAP@{self.cfg.top_n}: {test_map_at_k:.4f} / 
+            NDCG@{self.cfg.top_n}: {test_ndcg_at_k:.4f}''')
+
+        return (test_precision_at_k,
+                test_recall_at_k,
+                test_map_at_k,
+                test_ndcg_at_k)
+
+    def _generate_target_and_top_k_recommendation(self, scores: Tensor, pos_item) -> tuple[list]:
+        actual = [pos_item,]
+
+        # create item index information
+        scores_idx = np.zeros_like(scores.cpu().detach().numpy())
+        scores_idx[0,:] = pos_item
+
+        # sort topK probs and find their indexes
+        sorted_indices = np.argsort(-scores.cpu().detach().numpy(), axis=1)[:self.cfg.top_n]
+        # apply sorted indexes to item indexes to get sorted topK item indexes by user
+        predicted = np.take_along_axis(scores_idx, sorted_indices, axis=1)
+
+        return actual, predicted

From a4b2721250e44b7b88b9443c997b1040d04a2bf5 Mon Sep 17 00:00:00 2001
From: GangBean
Date: Fri, 19 Jul 2024 22:22:37 +0900
Subject: [PATCH 07/22] refactor: validate top-k recommendation method #21

---
 trainers/s3rec_trainer.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/trainers/s3rec_trainer.py b/trainers/s3rec_trainer.py
index f490c8d..d9723f0 100644
--- a/trainers/s3rec_trainer.py
+++ b/trainers/s3rec_trainer.py
@@ -9,6 +9,7 @@
 
 from loguru import logger
 from omegaconf.dictconfig import DictConfig
+import wandb
 
 from models.cdae import CDAE
 from utils import log_metric
@@ -48,7 +49,7 @@ def run(self, train_dataloader: DataLoader, valid_dataloader: DataLoader):
 
             # update model
             if self._is_surpass_best_metric(
-                current=(valid_loss,)
+                current=(valid_loss,),
                 best=(best_valid_loss,)):
 
                 logger.info(f"[Trainer] update best model...")
@@ -82,7 +83,7 @@ def train(self, train_dataloader: DataLoader) -> float:
     def validate(self, valid_dataloader: DataLoader) -> tuple[float]:
         self.model.eval()
         valid_loss = 0
-        actual, predicted = [], []
+        # actual, predicted = [], []
        for data in tqdm(valid_dataloader):
             X, pos_item, neg_item = data['X'].to(self.device), data['pos_item'].to(self.device), data['neg_item'].to(self.device)
             pos_pred, neg_pred = self.model(X, pos_item, neg_item)
@@ -103,9 +104,9 @@ def evaluate(self, test_dataloader: DataLoader) -> tuple[float]:
             scores = self.model.evaluate(X, pos_item, neg_items)
 
             batch_actual, batch_predicted = \
-                self._generate_target_and_top_k_recommendation(pred, test_mask, input_mask)
-            actual.append(batch_actual)
-            predicted.append(batch_predicted)
+                self._generate_target_and_top_k_recommendation(scores, pos_item)
+            actual.extend(batch_actual)
+            predicted.extend(batch_predicted)
 
         predicted = np.concatenate(predicted, axis=0)
 
@@ -125,16 +126,16 @@ def evaluate(self, test_dataloader: DataLoader) -> tuple[float]:
                 test_map_at_k,
                 test_ndcg_at_k)
 
-    def _generate_target_and_top_k_recommendation(self, scores: Tensor, pos_item) -> tuple[list]:
-        actual = [pos_item,]
+    def _generate_target_and_top_k_recommendation(self, scores: Tensor, pos_item: Tensor) -> tuple[list]:
+        actual = pos_item.cpu().detach().numpy()
 
         # create item index information
         scores_idx = np.zeros_like(scores.cpu().detach().numpy())
-        scores_idx[0,:] = pos_item
+        scores_idx[:, 0] = pos_item
 
         # sort topK probs and find their indexes
-        sorted_indices = np.argsort(-scores.cpu().detach().numpy(), axis=1)[:self.cfg.top_n]
+        sorted_indices = np.argsort(-scores.cpu().detach().numpy(), axis=1)[:, :self.cfg.top_n]
         # apply sorted indexes to item indexes to get sorted topK item indexes by user
         predicted = np.take_along_axis(scores_idx, sorted_indices, axis=1)
 
-        return actual, predicted
+        return actual.reshape(pos_item.size(0),1).tolist(), predicted[:, :self.cfg.top_n]

From 5c222dcd95290ba973347d984740c6057a746d75 Mon Sep 17 00:00:00 2001
From: GangBean
Date: Sat, 20 Jul 2024 00:04:49 +0900
Subject: [PATCH 08/22] feat: implements s3rec model #21

---
 configs/train_config.yaml            |  5 ++++-
 data/datasets/s3rec_data_pipeline.py |  4 ++--
 data/datasets/s3rec_dataset.py       |  4 ++--
 train.py                             |  7 +++++++
 trainers/s3rec_trainer.py            | 19 ++++++++++++++++---
 5 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/configs/train_config.yaml b/configs/train_config.yaml
index 4509527..70cb9b7 100644
--- a/configs/train_config.yaml
+++ b/configs/train_config.yaml
@@ -14,7 +14,7 @@ notes: "..."
 tags: [sweep, yelp, cdae, hyper-parameter, model-structure]
 
 # train config
-device: cuda # cpu
+device: cpu
 epochs: 100
 batch_size: 32
 lr: 0.0001
@@ -45,4 +45,7 @@ model:
     embed_size: 64
     num_orders: 3
   S3Rec:
+    embed_size: 64
     max_seq_len: 50
+    num_heads: 2
+    num_blocks: 2

diff --git a/data/datasets/s3rec_data_pipeline.py b/data/datasets/s3rec_data_pipeline.py
index ecf5547..938eaca 100644
--- a/data/datasets/s3rec_data_pipeline.py
+++ b/data/datasets/s3rec_data_pipeline.py
@@ -74,9 +74,9 @@ def _load_df(self):
     def _load_attributes(self):
         logger.info("load item2attributes...")
         df = pd.read_json(os.path.join(self.cfg.data_dir, 'yelp_item2attributes.json')).transpose()
-        self.attributes_count = [df.categories.explode().nunique(), df.statecity.nunique()]
+        self.attributes_count = df.categories.explode().nunique()
 
-        return df.transpose().to_dict()
+        return df.drop(columns=['statecity']).transpose().to_dict()
 
 
     def _set_num_items_and_num_users(self, df):

diff --git a/data/datasets/s3rec_dataset.py b/data/datasets/s3rec_dataset.py
index cbd575a..8360b7d 100644
--- a/data/datasets/s3rec_dataset.py
+++ b/data/datasets/s3rec_dataset.py
@@ -32,14 +32,14 @@ def __getitem__(self, user_id):
         if self.train:
             return {
                 'user_id': user_id,
-                'X': data['X'],
+                'X': np.array(data['X'], dtype='int64'),
                 'pos_item': pos_item,
                 'neg_item': self._negative_sampling(data['behaviors'])[0]
             }
         else:
             return {
                 'user_id': user_id,
-                'X': data['X'],
+                'X': np.array(data['X'], dtype='int64'),
                 'pos_item': pos_item,
                 'neg_items': self._negative_sampling(data['behaviors'])
             }

diff --git a/train.py b/train.py
index ed235ae..5210259 100644
--- a/train.py
+++ b/train.py
@@ -22,6 +22,7 @@
 from trainers.cdae_trainer import CDAETrainer
 from trainers.dcn_trainer import DCNTrainer
 from trainers.mf_trainer import MFTrainer
+from trainers.s3rec_trainer import S3RecTrainer
 
 from utils import set_seed
 
@@ -98,6 +99,12 @@ def train(cfg, args):#train_dataset, valid_dataset, test_dataset, model_info):
         trainer.run(train_dataloader, valid_dataloader, args.valid_eval_data)
         trainer.load_best_model()
         trainer.evaluate(args.test_eval_data, 'test')
+    elif cfg.model_name in ('S3Rec',):
+        trainer = S3RecTrainer(cfg, args.model_info['num_items'], args.model_info['num_users'],
+                               args.data_pipeline.item2attributes, args.data_pipeline.attributes_count)
+        trainer.run(train_dataloader, valid_dataloader)
+        trainer.load_best_model()
+        trainer.evaluate(test_dataloader)
 
 def unpack_model(cfg: OmegaConf) -> OmegaConf:
     if cfg.model_name not in cfg.model:

diff --git a/trainers/s3rec_trainer.py b/trainers/s3rec_trainer.py
index d9723f0..daff18e 100644
--- a/trainers/s3rec_trainer.py
+++ b/trainers/s3rec_trainer.py
@@ -12,20 +12,33 @@
 import wandb
 
 from models.cdae import CDAE
+from models.s3rec import S3Rec
 from utils import log_metric
 from .base_trainer import BaseTrainer
 from metric import *
 from loss import BPRLoss
 
-class CDAETrainer(BaseTrainer):
-    def __init__(self, cfg: DictConfig, num_items: int, num_users: int) -> None:
+class S3RecTrainer(BaseTrainer):
+    def __init__(self, cfg: DictConfig, num_items: int, num_users: int, item2attributes, attributes_count: int) -> None:
         super().__init__(cfg)
-        self.model = CDAE(self.cfg, num_items, num_users) ##
+        self.model = S3Rec(self.cfg, num_items, num_users, attributes_count)
         self.optimizer: Optimizer = self._optimizer(self.cfg.optimizer, self.model, self.cfg.lr)
         self.loss = self._loss()
 
     def _loss(self):
         return BPRLoss()
+
+    def _is_surpass_best_metric(self, **metric) -> bool:
+        (valid_loss,
+            ) = metric['current']
+
+        (best_valid_loss,
+            ) = metric['best']
+
+        if self.cfg.best_metric == 'loss':
+            return valid_loss < best_valid_loss
+        else:
+            return False
 
     def run(self, train_dataloader: DataLoader, valid_dataloader: DataLoader):
         logger.info(f"[Trainer] run...")

From 2d931e1456edd618e0621a1bd2e8be86e82db3bf Mon Sep 17 00:00:00 2001
From: GangBean
Date: Sun, 21 Jul 2024 09:50:12 +0900
Subject: [PATCH 09/22] fix: add missed module s3rec model #21

---
 models/s3rec.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 models/s3rec.py

diff --git a/models/s3rec.py b/models/s3rec.py
new file mode 100644
index 0000000..eb126a2
--- /dev/null
+++ b/models/s3rec.py
@@ -0,0 +1,47 @@
+import torch
+import torch.nn as nn
+
+from models.base_model import BaseModel
+
+from loguru import logger
+
+class S3Rec(BaseModel):
+
+    def __init__(self, cfg, num_users, num_items, attributes_count):
+        super().__init__()
+        self.cfg = cfg
+        # self.user_embedding = nn.Embedding(num_users, cfg.embed_size, dtype=torch.float32)
+        self.item_embedding = nn.Embedding(num_items + 1, self.cfg.embed_size, dtype=torch.float32)
+        # self.attribute_embedding = nn.Embedding(attributes_count, self.cfg.embed_size, dtype=torch.float32)
+        self.positional_encoding = nn.Parameter(torch.rand(self.cfg.max_seq_len, self.cfg.embed_size))
+
+        # self.query = nn.ModuleList([nn.Linear(self.cfg.embed_size / self.num_heads) for _ in range(self.cfg.num_heads)])
+        # self.key = nn.ModuleList([nn.Linear(self.cfg.embed_size) for _ in range(self.cfg.num_heads)])
+        # self.value = nn.ModuleList([nn.Linear(self.cfg.embed_size) for _ in range(self.cfg.num_heads)])
+        self.ffn1s = nn.ModuleList([nn.Linear(self.cfg.embed_size, self.cfg.embed_size) for _ in range(self.cfg.num_blocks)])
+        self.ffn2s = nn.ModuleList([nn.Linear(self.cfg.embed_size, self.cfg.embed_size) for _ in range(self.cfg.num_blocks)])
+        self.multihead_attns = nn.ModuleList([nn.MultiheadAttention(self.cfg.embed_size, self.cfg.num_heads) for _ in range(self.cfg.num_blocks)])
+        self._init_weights()
+
+    def _init_weights(self):
+        for child in self.children():
+            if isinstance(child, nn.Embedding):
+                nn.init.xavier_uniform_(child.weight)
+
+    def _embedding_layer(self, X):
+        return self.item_embedding(X) + self.positional_encoding
+
+    def _self_attention_block(self, X):
+        for multihead_attn, ffn1, ffn2 in zip(self.multihead_attns, self.ffn1s, self.ffn2s):
+            attn_output, attn_output_weights = multihead_attn(X, X, X)
+            X = ffn2(nn.functional.relu(ffn1(attn_output)))
+        return X
+
+    def _prediction_layer(self, item, self_attn_output):
+        return torch.matmul(item.T, self_attn_output)
+
+    def forward(self, X, pos_item, neg_item):
+        X = self._embedding_layer(X)
+        X = self._self_attention_block(X)
+        pos_pred = self._prediction_layer(self.item_embedding(pos_item), X[:, -1])
+        neg_pred = self._prediction_layer(self.item_embedding(neg_item), X[:, -1])
+        return pos_pred, neg_pred

From 1e6fc4ff466f9ddcc7aaea1cf78bd59ed8f502a0 Mon Sep 17 00:00:00 2001
From: Judy
Date: Sun, 21 Jul 2024 02:43:01 +0000
Subject: [PATCH 10/22] fix: fix s3rec evaluation function #21

---
 data/datasets/s3rec_dataset.py |  2 +-
 models/s3rec.py                | 11 ++++++++++-
 train.py                       |  2 +-
 trainers/s3rec_trainer.py      | 11 ++++++-----
 4 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/data/datasets/s3rec_dataset.py b/data/datasets/s3rec_dataset.py
index 8360b7d..ca15892 100644
--- a/data/datasets/s3rec_dataset.py
+++ b/data/datasets/s3rec_dataset.py
@@ -41,5 +41,5 @@ def __getitem__(self, user_id):
                 'user_id': user_id,
                 'X': np.array(data['X'], dtype='int64'),
                 'pos_item': pos_item,
-                'neg_items': self._negative_sampling(data['behaviors'])
+                'neg_items': np.array(self._negative_sampling(data['behaviors']), dtype='int64')
             }

diff --git a/models/s3rec.py b/models/s3rec.py
index eb126a2..7f4109f 100644
--- a/models/s3rec.py
+++ b/models/s3rec.py
@@ -37,7 +37,7 @@ def _self_attention_block(self, X):
         return X
 
     def _prediction_layer(self, item, self_attn_output):
-        return torch.matmul(item.T, self_attn_output)
+        return torch.einsum('bi,bi->b', (item, self_attn_output))
 
     def forward(self, X, pos_item, neg_item):
         X = self._embedding_layer(X)
@@ -45,3 +45,12 @@ def forward(self, X, pos_item, neg_item):
         pos_pred = self._prediction_layer(self.item_embedding(pos_item), X[:, -1])
         neg_pred = self._prediction_layer(self.item_embedding(neg_item), X[:, -1])
         return pos_pred, neg_pred
+
+    def evaluate(self, X, pos_item, neg_items):
+        X = self._embedding_layer(X)
+        X = self._self_attention_block(X)
+        pos_pred = self._prediction_layer(self.item_embedding(pos_item), X[:, -1]).view(pos_item.size(0), -1)
+        neg_preds = [self._prediction_layer(
+            self.item_embedding(neg_items[:,i]), X[:, -1]).view(neg_items.size(0), -1) for i in range(neg_items.size(-1))]
+        neg_preds = torch.concat(neg_preds, dim=1)
+        return pos_pred, neg_preds

diff --git a/train.py b/train.py
index 5210259..6c4c264 100644
--- a/train.py
+++ b/train.py
@@ -172,7 +172,7 @@ def main(cfg: OmegaConf):
         train_dataset = S3RecDataset(train_data, num_items=data_pipeline.num_items)
         valid_dataset = S3RecDataset(valid_data, num_items=data_pipeline.num_items)
         test_dataset = S3RecDataset(test_data, num_items=data_pipeline.num_items, train=False)
-        args.update({'test_dataset': test_data})
+        args.update({'test_dataset': test_dataset})
         model_info['num_items'], model_info['num_users'] = data_pipeline.num_items, data_pipeline.num_users
     else:
         raise ValueError()

diff --git a/trainers/s3rec_trainer.py b/trainers/s3rec_trainer.py
index daff18e..6022d17 100644
--- a/trainers/s3rec_trainer.py
+++ b/trainers/s3rec_trainer.py
@@ -114,14 +114,15 @@ def evaluate(self, test_dataloader: DataLoader) -> tuple[float]:
         actual, predicted = [], []
         for data in tqdm(test_dataloader):
             X, pos_item, neg_items = data['X'].to(self.device), data['pos_item'].to(self.device), data['neg_items'].to(self.device)
-            scores = self.model.evaluate(X, pos_item, neg_items)
+            pos_scores, neg_scores = self.model.evaluate(X, pos_item, neg_items)
 
             batch_actual, batch_predicted = \
-                self._generate_target_and_top_k_recommendation(scores, pos_item)
+                self._generate_target_and_top_k_recommendation(torch.concat([pos_scores, neg_scores], dim=1), pos_item)
             actual.extend(batch_actual)
             predicted.extend(batch_predicted)
 
-        predicted = np.concatenate(predicted, axis=0)
+        predicted = np.array(predicted)
+        logger.info(f'predicted: {predicted.shape}')
 
         test_precision_at_k = precision_at_k(actual, predicted, self.cfg.top_n)
         test_recall_at_k = recall_at_k(actual, predicted, self.cfg.top_n)
@@ -147,8 +148,8 @@ def _generate_target_and_top_k_recommendation(self, scores: Tensor, pos_item: Tensor) -> tuple[list]:
         scores_idx[:, 0] = pos_item
 
         # sort topK probs and find their indexes
-        sorted_indices = np.argsort(-scores.cpu().detach().numpy(), axis=1)[:self.cfg.top_n]
+        sorted_indices = np.argsort(-scores.cpu().detach().numpy(), axis=1)[:, :self.cfg.top_n]
         # apply sorted indexes to item indexes to get sorted topK item indexes by user
         predicted = np.take_along_axis(scores_idx, sorted_indices, axis=1)
 
-        return actual.reshape(pos_item.size(0),1).tolist(), predicted[:,:self.cfg.top_n]
+        return actual.reshape(pos_item.size(0),1).tolist(), predicted[:, :self.cfg.top_n]

From facb224e7fdd2f32aae329bd915418fb55738272 Mon Sep 17 00:00:00 2001
From: GangBean
Date: Tue, 23 Jul 2024 00:28:40 +0000
Subject: [PATCH 11/22] fix: add ModuleList init and sync model device with
 input data #21

---
 models/s3rec.py           | 6 ++++++
 trainers/s3rec_trainer.py | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/models/s3rec.py b/models/s3rec.py
index 7f4109f..45f1b0a 100644
--- a/models/s3rec.py
+++ b/models/s3rec.py
@@ -26,6 +26,12 @@ def _init_weights(self):
         for child in self.children():
             if isinstance(child, nn.Embedding):
                 nn.init.xavier_uniform_(child.weight)
+            elif isinstance(child, nn.ModuleList): # nn.Linear):
+                for sub_child in child.children():
+                    if not isinstance(sub_child, nn.MultiheadAttention):
+                        nn.init.xavier_uniform_(sub_child.weight)
+            else:
+                logger.info(f"other type: {child} / {type(child)}")
 
     def _embedding_layer(self, X):
         return self.item_embedding(X) + self.positional_encoding

diff --git a/trainers/s3rec_trainer.py b/trainers/s3rec_trainer.py
index 6022d17..ec4bf97 100644
--- a/trainers/s3rec_trainer.py
+++ b/trainers/s3rec_trainer.py
@@ -21,7 +21,7 @@ class S3RecTrainer(BaseTrainer):
     def __init__(self, cfg: DictConfig, num_items: int, num_users: int, item2attributes, attributes_count: int) -> None:
         super().__init__(cfg)
-        self.model = S3Rec(self.cfg, num_items, num_users, attributes_count)
+        self.model = S3Rec(self.cfg, num_items, num_users, attributes_count).to(self.device)
         self.optimizer: Optimizer = self._optimizer(self.cfg.optimizer, self.model, self.cfg.lr)
         self.loss = self._loss()
 
@@ -145,7 +145,7 @@ def _generate_target_and_top_k_recommendation(self, scores: Tensor, pos_item: Te
 
         # create item index information
         scores_idx = np.zeros_like(scores.cpu().detach().numpy())
-        scores_idx[:, 0] = pos_item
+        scores_idx[:, 0] = pos_item.cpu().detach()
 
         # sort topK probs and find their indexes
         sorted_indices = np.argsort(-scores.cpu().detach().numpy(), axis=1)[:, :self.cfg.top_n]

From dafe357aea44d4a8b2b91e985f3b87e639d66f64 Mon Sep 17 00:00:00 2001
From: GangBean
Date: Tue, 23 Jul 2024 02:43:35 +0000
Subject: [PATCH 12/22] feat: implements aap #21

---
 configs/train_config.yaml |   8 +-
 models/s3rec.py           |  12 ++-
 train.py                  |  18 ++--
 trainers/s3rec_trainer.py | 170 +++++++++++++++++++++++++++++++++++++-
 4 files changed, 198 insertions(+), 10 deletions(-)

diff --git a/configs/train_config.yaml b/configs/train_config.yaml
index 70cb9b7..e0ad81d 100644
--- a/configs/train_config.yaml
+++ b/configs/train_config.yaml
@@ -14,8 +14,8 @@ notes: "..."
 tags: [sweep, yelp, cdae, hyper-parameter, model-structure]
 
 # train config
-device: cpu
-epochs: 100
+device: cuda # cpu
+epochs: 5
 batch_size: 32
 lr: 0.0001
 optimizer: adam # adamw
@@ -49,3 +49,7 @@ model:
     max_seq_len: 50
     num_heads: 2
     num_blocks: 2
+    pretrain: True # False
+    pretrain_epochs: 1 # 100
+    mask_portion: 0.2
+    iter_nums: 200
\ No newline at end of file

diff --git a/models/s3rec.py b/models/s3rec.py
index 45f1b0a..fadf918 100644
--- a/models/s3rec.py
+++ b/models/s3rec.py
@@ -11,7 +11,7 @@ def __init__(self, cfg, num_users, num_items, attributes_count):
         self.cfg = cfg
         # self.user_embedding = nn.Embedding(num_users, cfg.embed_size, dtype=torch.float32)
         self.item_embedding = nn.Embedding(num_items + 1, self.cfg.embed_size, dtype=torch.float32)
-        # self.attribute_embedding = nn.Embedding(attributes_count, self.cfg.embed_size, dtype=torch.float32)
+        self.attribute_embedding = nn.Embedding(attributes_count, self.cfg.embed_size, dtype=torch.float32)
         self.positional_encoding = nn.Parameter(torch.rand(self.cfg.max_seq_len, self.cfg.embed_size))
 
         # self.query = nn.ModuleList([nn.Linear(self.cfg.embed_size / self.num_heads) for _ in range(self.cfg.num_heads)])
@@ -20,8 +20,11 @@ def __init__(self, cfg, num_users, num_items, attributes_count):
         self.ffn1s = nn.ModuleList([nn.Linear(self.cfg.embed_size, self.cfg.embed_size) for _ in range(self.cfg.num_blocks)])
         self.ffn2s = nn.ModuleList([nn.Linear(self.cfg.embed_size, self.cfg.embed_size) for _ in range(self.cfg.num_blocks)])
         self.multihead_attns = nn.ModuleList([nn.MultiheadAttention(self.cfg.embed_size, self.cfg.num_heads) for _ in range(self.cfg.num_blocks)])
+        self.aap_weight = nn.Linear(self.cfg.embed_size, self.cfg.embed_size, bias=False)
+
         self._init_weights()
 
+
     def _init_weights(self):
         for child in self.children():
             if isinstance(child, nn.Embedding):
                 nn.init.xavier_uniform_(child.weight)
@@ -30,6 +33,8 @@ def _init_weights(self):
                 for sub_child in child.children():
                     if not isinstance(sub_child, nn.MultiheadAttention):
                         nn.init.xavier_uniform_(sub_child.weight)
+            elif isinstance(child, nn.Linear):
+                nn.init.xavier_uniform_(child.weight)
             else:
                 logger.info(f"other type: {child} / {type(child)}")
 
@@ -60,3 +65,8 @@ def evaluate(self, X, pos_item, neg_items):
             self.item_embedding(neg_items[:,i]), X[:, -1]).view(neg_items.size(0), -1) for i in range(neg_items.size(-1))]
         neg_preds = torch.concat(neg_preds, dim=1)
         return pos_pred, neg_preds
+
+    def aap(self, items):
+        # item
+        item_embeddings = self.item_embedding(items)
+        return torch.matmul(self.aap_weight(item_embeddings), self.attribute_embedding.weight.T) # (batch, embed_size) * (attribute_size, embed_size) (batch, attribute_size)

diff --git a/train.py b/train.py
index 6c4c264..e1c101b 100644
--- a/train.py
+++ b/train.py
@@ -22,7 +22,7 @@
 from trainers.cdae_trainer import CDAETrainer
 from trainers.dcn_trainer import DCNTrainer
 from trainers.mf_trainer import MFTrainer
-from trainers.s3rec_trainer import S3RecTrainer
+from trainers.s3rec_trainer import S3RecTrainer, S3RecPreTrainer
 
 from utils import set_seed
 
@@ -100,11 +100,17 @@ def train(cfg, args):#train_dataset, valid_dataset, test_dataset, model_info):
         trainer.load_best_model()
         trainer.evaluate(args.test_eval_data, 'test')
     elif cfg.model_name in ('S3Rec',):
-        trainer = S3RecTrainer(cfg, args.model_info['num_items'], args.model_info['num_users'],
-                               args.data_pipeline.item2attributes, args.data_pipeline.attributes_count)
-        trainer.run(train_dataloader, valid_dataloader)
-        trainer.load_best_model()
-        trainer.evaluate(test_dataloader)
+        if cfg.pretrain:
+            trainer = S3RecPreTrainer(cfg, args.model_info['num_items'], args.model_info['num_users'],
+                                      args.data_pipeline.item2attributes, args.data_pipeline.attributes_count)
+            trainer.pretrain(args.train_dataset, args.valid_dataset)
+            trainer.load_best_model()
+        else:
+            trainer = S3RecTrainer(cfg, args.model_info['num_items'], args.model_info['num_users'],
+                                   args.data_pipeline.item2attributes, args.data_pipeline.attributes_count)
+            trainer.run(train_dataloader, valid_dataloader)
+            trainer.load_best_model()
+            trainer.evaluate(test_dataloader)

diff --git a/trainers/s3rec_trainer.py b/trainers/s3rec_trainer.py
index ec4bf97..dd1d011 100644
--- a/trainers/s3rec_trainer.py
+++ b/trainers/s3rec_trainer.py
@@ -11,13 +11,181 @@
 from loguru import logger
 from omegaconf.dictconfig import DictConfig
 import wandb
 
-from models.cdae import CDAE
 from models.s3rec import S3Rec
 from utils import log_metric
 from .base_trainer import BaseTrainer
 from metric import *
 from loss import BPRLoss
 
+class S3RecPreTrainer:
+    def __init__(self, cfg: DictConfig, num_items: int, num_users: int, item2attributes, attributes_count: int) -> None:
+        self.cfg = cfg
+        self.device = self.cfg.device
+        self.model = S3Rec(self.cfg, num_items, num_users, attributes_count).to(self.device)
+        self.optimizer: Optimizer = self._optimizer(self.cfg.optimizer, self.model, self.cfg.lr)
+        self.loss = self._loss()
+        self.item2attribute = item2attributes
+        self.num_items = num_items
+        self.num_users = num_users
+        self.attributes_count = attributes_count
+
+    def _loss(self):
+        # AAP + MIP + MAP + SP
+        return nn.BCEWithLogitsLoss()
+
+    def _optimizer(self, optimizer_name: str, model: nn.Module, learning_rate: float, weight_decay: float=0) -> Optimizer:
+        if optimizer_name.lower() == 'adam':
+            return torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+        elif optimizer_name.lower() == 'adamw':
+            return torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+        elif optimizer_name.lower() == 'sgd':
+            return torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+        else:
+            logger.error(f"Optimizer Not Exists: {optimizer_name}")
+            raise NotImplementedError(f"Optimizer Not Exists: {optimizer_name}")
+
+    def _is_surpass_best_metric(self, **metric) -> bool:
+        (valid_loss,
+            ) = metric['current']
+
+        (best_valid_loss,
+            ) = metric['best']
+
+        if self.cfg.best_metric == 'loss':
+            return valid_loss < best_valid_loss
+        else:
+            return False
+
+    def pretrain(self, train_dataset, valid_dataset):
+        logger.info(f"[Trainer] run...")
+
+        best_valid_loss: float = 1e+6
+        best_epoch: int = 0
+        endurance: int = 0
+
+        # train
+        for epoch in range(self.cfg.pretrain_epochs):
+            train_loss: float = self.train(torch.tensor([i for i in range(1, self.num_items+1)], dtype=torch.int32).to(self.device), train_dataset)
+            valid_loss = self.validate(torch.tensor([i for i in range(1, self.num_items+1)], dtype=torch.int32).to(self.device), valid_dataset)
+            logger.info(f'''\n[Trainer] epoch: {epoch} > train loss: {train_loss:.4f} / 
+                valid loss: {valid_loss:.4f}''')
+
+            if self.cfg.wandb:
+                wandb.log({
+                    'train_loss': train_loss,
+                    'valid_loss': valid_loss,
+                })
+
+            # update model
+            if self._is_surpass_best_metric(
+                current=(valid_loss,),
+                best=(best_valid_loss,)):
+
+                logger.info(f"[Trainer] update best model...")
+                best_valid_loss = valid_loss
+                best_epoch = epoch
+                endurance = 0
+
+                torch.save(self.model.state_dict(), f'{self.cfg.model_dir}/best_pretrain_model.pt')
+            else:
+                endurance += 1
+                if endurance > self.cfg.patience:
+                    logger.info(f"[Trainer] ealry stopping...")
+                    break
+
+    def train(self, item_datasets, sequence_datasets) -> float:
+        self.model.train()
+        train_loss = 0
+
+        for iter_num in tqdm(range(self.cfg.iter_nums)): # sequence
+            item_chunk_size = self.num_items // self.cfg.iter_nums
+            items = item_datasets[item_chunk_size * iter_num: item_chunk_size * (iter_num + 1)]
+
+            sequence_chunk_size = self.num_users // self.cfg.iter_nums
+            # sequences = sequence_datasets[sequence_chunk_size * iter_num: sequence_chunk_size * (iter_num + 1)]
+
+            # AAP: item + atrributes
+            pred = self.model.aap(items) # (item_chunk_size, attributes_count)
+            actual = torch.Tensor([[1 if attriute in self.item2attribute[item.item()] else 0 for attriute in range(self.attributes_count)] for item in items]).to(self.device) # (item_chunk_size, attributes_count)
+            aap_loss = nn.functional.binary_cross_entropy_with_logits(pred, actual)
+
+            # MIP: sequence + item
+            # mask
+            # def random_mask(sequence):
+            #     # mask = torch.Tensor([0] * sequence.size(0))
+            #     non_zero_count = torch.nonzero(sequence, as_tuple=True)[0].size(0)
+            #     mask_indices = torch.randint(sequence.size(0) - non_zero_count, sequence.size(0), size=1)
+            #     # mask[mask_indices] = 1
+            #     return mask_indices
+
+            # masks = torch.Tensor([random_mask(sequence) for sequence in sequences]) # ()
+            # masked_sequences = sequences * (1 - masks)
+            # pred = self.model.mip(masked_sequences, ) # (sequence_chunk_size, mask_count, sequence_len) item idx pred
+            # nn.functional.binary_cross_entropy
+            # # MAP: sequence + attributes
+            # map_loss = self.loss()
+            # # SP: sequence + segment
+            # sp_loss = self.loss()
+            # # X, pos_item, neg_item = data['X'].to(self.device), data['pos_item'].to(self.device), data['neg_item'].to(self.device)
+            # # pos_pred, neg_pred = self.model(X, pos_item, neg_item)
+
+            self.optimizer.zero_grad()
+            # loss = self.loss(pos_pred, neg_pred)
+            loss = aap_loss # + mip_loss + map_loss + sp_loss
+            loss.backward()
+            self.optimizer.step()
+
+            train_loss += loss.item()
+
+        return train_loss
+
+    def validate(self, item_datasets, sequence_datasets) -> float:
+        self.model.eval()
+        valid_loss = 0
+
+        for iter_num in tqdm(range(self.cfg.iter_nums)): # sequence
+            item_chunk_size = self.num_items // self.cfg.iter_nums
+            items = item_datasets[item_chunk_size * iter_num: item_chunk_size * (iter_num + 1)]
+
+            sequence_chunk_size = self.num_users // self.cfg.iter_nums
+            # sequences = sequence_datasets[sequence_chunk_size * iter_num: sequence_chunk_size * (iter_num + 1)]
+
+            # AAP: item + atrributes
+            pred = self.model.aap(items) # (item_chunk_size, attributes_count)
+            actual = torch.Tensor([[1 if attriute in self.item2attribute[item.item()] else 0 for attriute in range(self.attributes_count)] for item in items]).to(self.device) # (item_chunk_size, attributes_count)
+            aap_loss = nn.functional.binary_cross_entropy_with_logits(pred, actual)
+
+            # MIP: sequence + item
+            # mask
+            # def random_mask(sequence):
+            #     # mask = torch.Tensor([0] * sequence.size(0))
+            #     non_zero_count = torch.nonzero(sequence, as_tuple=True)[0].size(0)
+            #     mask_indices = torch.randint(sequence.size(0) - non_zero_count, sequence.size(0), size=1)
+            #     # mask[mask_indices] = 1
+            #     return mask_indices
+
+            # masks = torch.Tensor([random_mask(sequence) for sequence in sequences]) # ()
+            # masked_sequences = sequences * (1 - masks)
+            # pred = self.model.mip(masked_sequences, ) # (sequence_chunk_size, sequence_len) item idx pred
+            # nn.functional.binary_cross_entropy
+            # # MAP: sequence + attributes
+            # map_loss = self.loss()
+            # # SP: sequence + segment
+            # sp_loss = self.loss()
+            # # X, pos_item, neg_item = data['X'].to(self.device), data['pos_item'].to(self.device), data['neg_item'].to(self.device)
+            # # pos_pred, neg_pred = self.model(X, pos_item, neg_item)
+
+            # loss = self.loss(pos_pred, neg_pred)
+            loss = aap_loss # + mip_loss + map_loss + sp_loss
+
+            valid_loss += loss.item()
+
+        return valid_loss
+
+    def load_best_model(self):
+        logger.info(f"[Trainer] Load best model...")
+        self.model.load_state_dict(torch.load(f'{self.cfg.model_dir}/best_pretrain_model.pt'))
+
 class S3RecTrainer(BaseTrainer):
     def __init__(self, cfg: DictConfig, num_items: int, num_users: int, item2attributes, attributes_count: int) -> None:
         super().__init__(cfg)

From 18fca50b0b1356bb3b5c00536a7a5775ca719b7c Mon Sep 17 00:00:00 2001
From: twndus
Date: Fri, 26 Jul 2024 00:10:29 -0400
Subject: [PATCH 13/22] fix: implements S3Rec pretrain method and insert
 self-attention network to pretraining stage #21

---
 models/s3rec.py           |  79 +++++++++++++++++++---
 train.py                  |   2 +-
 trainers/s3rec_trainer.py | 127 ++++++++++++++++++--------------------
 3 files changed, 129 insertions(+), 79 deletions(-)

diff --git a/models/s3rec.py b/models/s3rec.py
index fadf918..88056ea 100644
--- a/models/s3rec.py
+++ b/models/s3rec.py
@@ -9,22 +9,20 @@ class S3Rec(BaseModel):
     def __init__(self, cfg, num_users, num_items, attributes_count):
         super().__init__()
         self.cfg = cfg
-        # self.user_embedding = nn.Embedding(num_users, cfg.embed_size, dtype=torch.float32)
         self.item_embedding = nn.Embedding(num_items + 1, self.cfg.embed_size, dtype=torch.float32)
         self.attribute_embedding = nn.Embedding(attributes_count, self.cfg.embed_size, dtype=torch.float32)
         self.positional_encoding = nn.Parameter(torch.rand(self.cfg.max_seq_len, self.cfg.embed_size))
 
-        # self.query = nn.ModuleList([nn.Linear(self.cfg.embed_size / self.num_heads) for _ in range(self.cfg.num_heads)])
-        # self.key = nn.ModuleList([nn.Linear(self.cfg.embed_size) for _ in range(self.cfg.num_heads)])
-        # self.value = nn.ModuleList([nn.Linear(self.cfg.embed_size) for _ in range(self.cfg.num_heads)])
         self.ffn1s = nn.ModuleList([nn.Linear(self.cfg.embed_size, self.cfg.embed_size) for _ in range(self.cfg.num_blocks)])
         self.ffn2s = nn.ModuleList([nn.Linear(self.cfg.embed_size, self.cfg.embed_size) for _ in range(self.cfg.num_blocks)])
         self.multihead_attns = nn.ModuleList([nn.MultiheadAttention(self.cfg.embed_size, self.cfg.num_heads) for _ in range(self.cfg.num_blocks)])
         self.aap_weight = nn.Linear(self.cfg.embed_size, self.cfg.embed_size, bias=False)
+        self.mip_weight = nn.Linear(self.cfg.embed_size, self.cfg.embed_size, bias=False)
+        self.map_weight = nn.Linear(self.cfg.embed_size, self.cfg.embed_size, bias=False)
+        self.sp_weight = nn.Linear(self.cfg.embed_size, self.cfg.embed_size, bias=False)
 
         self._init_weights()
-
 
     def _init_weights(self):
         for child in self.children():
             if isinstance(child, nn.Embedding):
@@ -48,7 +46,7 @@ def _self_attention_block(self, X):
     def _prediction_layer(self, item, self_attn_output):
         return torch.einsum('bi,bi->b', (item, self_attn_output))
 
-    def forward(self, X, pos_item, neg_item):
+    def finetune(self, X, pos_item, neg_item):
         X = self._embedding_layer(X)
         X = self._self_attention_block(X)
         pos_pred = self._prediction_layer(self.item_embedding(pos_item), X[:, -1])
@@ -63,8 +61,69 @@ def evaluate(self, X, pos_item, neg_items):
             self.item_embedding(neg_items[:,i]), X[:, -1]).view(neg_items.size(0), -1) for i in range(neg_items.size(-1))]
         neg_preds = torch.concat(neg_preds, dim=1)
         return pos_pred, neg_preds
+
+    def encode(self, X):
+        return self._self_attention_block(self._embedding_layer(X))
 
-    def aap(self, items):
-        # item
-        item_embeddings = self.item_embedding(items)
-        return torch.matmul(self.aap_weight(item_embeddings), self.attribute_embedding.weight.T) # (batch, embed_size) * (attribute_size, embed_size) (batch, attribute_size)
+    def pretrain(self, item_masked_sequences, subsequence_masked_sequences, pos_subsequences, neg_subsequences):
+        # encode
+        attention_output = self.encode(item_masked_sequences)
+        subsequence_attention_output = self.encode(subsequence_masked_sequences)
+        pos_subsequence_attention_output = self.encode(pos_subsequences)
+        neg_subsequence_attention_output = self.encode(neg_subsequences)
+        # aap
+        aap_output = self.aap(attention_output) # (B, L, A)
+        # mip
+        mip_output = self.mip(attention_output)
+        # map
+        map_output = self.map(attention_output)
+        # sp
+        sp_output_pos = self.sp(attention_output, pos_subsequence_attention_output) # pos 1
+        sp_output_neg = self.sp(attention_output, neg_subsequence_attention_output) # neg 1
+        return aap_output, mip_output, map_output, (sp_output_pos, sp_output_neg)
+
+    def aap(self, attention_output):
+        '''
+        inputs:
+            attention_output: [ B, L, H ]
+        output:
+            [ B, L, A ]
+        '''
+        FW = self.aap_weight(attention_output) # [ B L H ]
+        return torch.matmul(FW, self.attribute_embedding.weight.T) # [ B L H ] [ H A ] -> [ B L A ]
+
+    def mip(self, attention_output):
+        '''
+        inputs:
+            attention_output: [ B, L, H ]
+        output:
+        '''
+        FW = self.mip_weight(attention_output) # [ B L H ]
+        return torch.matmul(FW, self.item_embedding.weight.t()) # [ B L H ] [ H I ] -> [ B L I ]
+
+    def map(self, attention_output):
+        '''
+        inputs:
+            attention_output: [ B, L, H ]
+        output:
+            [ B, L, A ]
+        '''
+        FW = self.aap_weight(attention_output) # [ B L H ]
+        return torch.matmul(FW, self.attribute_embedding.weight.T) # [ B L H ] [ H A ] -> [ B L A ]
+
+    def sp(self, context_attention_output, subsequence_attention_output):
+        '''
+        inputs:
+            context_attention_output: [ B, L, H ]
+            subsequence_attention_output: [ B, len_subsequence, H ]
+        output:
+            [ B ]
+
+        s - input [ i1, i2, mask, mask, mask, ..., in ]
+        s~ - input [ i3, i4, i5 ]
+
+        '''
+        s = context_attention_output[:, -1, :] # [ B H ]
+        s_tilde = subsequence_attention_output[:, -1, :] # [ B H ]
+        SW = self.sp_weight(s)
+        return torch.einsum('bi,bi->b', SW, s_tilde) # [ B ]

diff --git a/train.py b/train.py
index e1c101b..3637fe2 100644
--- a/train.py
+++ b/train.py
@@ -103,7 +103,7 @@ def train(cfg, args):#train_dataset, valid_dataset, test_dataset, model_info):
         if cfg.pretrain:
             trainer = S3RecPreTrainer(cfg, args.model_info['num_items'], args.model_info['num_users'],
                                       args.data_pipeline.item2attributes, args.data_pipeline.attributes_count)
-            trainer.pretrain(args.train_dataset, args.valid_dataset)
+            trainer.pretrain(train_dataloader)
             trainer.load_best_model()
         else:
             trainer = S3RecTrainer(cfg, args.model_info['num_items'], args.model_info['num_users'],

diff --git a/trainers/s3rec_trainer.py b/trainers/s3rec_trainer.py
index dd1d011..46ad4a6 100644
--- a/trainers/s3rec_trainer.py
+++ b/trainers/s3rec_trainer.py
@@ -56,7 +56,7 @@ def _is_surpass_best_metric(self, **metric) -> bool:
         else:
             return False
 
-    def pretrain(self, train_dataset, valid_dataset):
+    def pretrain(self, train_dataset):
         logger.info(f"[Trainer] run...")
 
         best_valid_loss: float = 1e+6
@@ -65,8 +65,8 @@ def pretrain(self, train_dataset, valid_dataset):
 
         # train
         for epoch in range(self.cfg.pretrain_epochs):
-            train_loss: float = self.train(torch.tensor([i for i in range(1, self.num_items+1)], dtype=torch.int32).to(self.device), train_dataset)
-            valid_loss = self.validate(torch.tensor([i for i in range(1, self.num_items+1)], dtype=torch.int32).to(self.device), valid_dataset)
+            train_loss: float = self.train(train_dataset)
+            valid_loss = self.validate(train_dataset)
             logger.info(f'''\n[Trainer] epoch: {epoch} > train loss: {train_loss:.4f} / 
                 valid loss: {valid_loss:.4f}''')
 
             if self.cfg.wandb:
@@ -92,42 +92,66 @@ def pretrain(self, train_dataset, valid_dataset):
                 if endurance > self.cfg.patience:
                     logger.info(f"[Trainer] ealry stopping...")
                     break
-
-    def train(self, item_datasets, sequence_datasets) -> float:
+
+    def item_level_masking(self, sequences):
+        masks = torch.rand_like(sequences, dtype=torch.float32) < .2
+        item_masked_sequences = masks * sequences
+        return masks, item_masked_sequences
+
+    def segment_masking(self, sequences):
+        masks, pos_sequences, neg_sequences = torch.zeros_like(sequences), torch.zeros_like(sequences), torch.zeros_like(sequences)
+        for i in range(sequences.size(0)):
+            # sample segment length randomly
+            segment_len = torch.randint(low=2, high=self.cfg.max_seq_len//2, size=(1,))
+            # start_index
+            start_idx = torch.randint(self.cfg.max_seq_len-segment_len, size=(1,))
+            masks[i, start_idx:start_idx+segment_len] = 1
+            # pos_sequence
+            pos_sequences[i, -segment_len:] = sequences[i, start_idx:start_idx+segment_len]
+            # neg_sequence
+            ## other user in same batch
+            neg_user_idx = torch.randint(sequences.size(0), size=(1,))
+            while neg_user_idx != i:
+                neg_user_idx = torch.randint(sequences.size(0), size=(1,))
+            ## start_idx
+            neg_start_idx = torch.randint(self.cfg.max_seq_len-segment_len, size=(1,))
+            neg_sequences[i, -segment_len:] = sequences[neg_user_idx, neg_start_idx:neg_start_idx+segment_len]
+        segment_masked_sequences = (1-masks) * sequences
+        return segment_masked_sequences, pos_sequences, neg_sequences
+
+    # def train(self, item_datasets, sequence_datasets) -> float:
+    def train(self, train_dataloader) -> float:
         self.model.train()
         train_loss = 0
 
-        for iter_num in tqdm(range(self.cfg.iter_nums)): # sequence
-            item_chunk_size = self.num_items // self.cfg.iter_nums
-            items = item_datasets[item_chunk_size * iter_num: item_chunk_size * (iter_num + 1)]
+        for data in tqdm(train_dataloader): # sequence
+            sequences = data['X'].to(self.device)
+            # item_masked_sequences
+            masks, item_masked_sequences = self.item_level_masking(sequences)
+            # segment_masked_sequences
+            segment_masked_sequences, pos_segments, neg_segments = self.segment_masking(sequences)
 
-            sequence_chunk_size = self.num_users // self.cfg.iter_nums
-            # sequences = sequence_datasets[sequence_chunk_size * iter_num: sequence_chunk_size * (iter_num + 1)]
+            # pretrain
+            aap_output, mip_output, map_output, (sp_output_pos, sp_output_neg) = self.model.pretrain(
+                item_masked_sequences, segment_masked_sequences, pos_segments, neg_segments)
 
             # AAP: item + atrributes
-            pred = self.model.aap(items) # (item_chunk_size, attributes_count)
-            actual = torch.Tensor([[1 if attriute in self.item2attribute[item.item()] else 0 for attriute in range(self.attributes_count)] for item in items]).to(self.device) # (item_chunk_size, attributes_count)
-            aap_loss = nn.functional.binary_cross_entropy_with_logits(pred, actual)
+            aap_actual = torch.ones_like(aap_output).to(self.device)
+#            actual = torch.Tensor([
+#                [1 if attriute in self.item2attribute[item.item()] else 0 \
+    #                for attriute in range(self.attributes_count)] for item in items]
+#            ).to(self.device) # (item_chunk_size, attributes_count)
+            ## compute unmasked area only
+            aap_loss = nn.functional.binary_cross_entropy_with_logits(aap_output, aap_actual)
 
             # MIP: sequence + item
-            # mask
-            # def random_mask(sequence):
-            #     # mask = torch.Tensor([0] * sequence.size(0))
-            #     non_zero_count = torch.nonzero(sequence, as_tuple=True)[0].size(0)
-            #     mask_indices = torch.randint(sequence.size(0) - non_zero_count, sequence.size(0), size=1)
-            #     # mask[mask_indices] = 1
-            #     return mask_indices
-
-            # masks = torch.Tensor([random_mask(sequence) for sequence in sequences]) # ()
-            # masked_sequences = sequences * (1 - masks)
-            # pred = self.model.mip(masked_sequences, ) # (sequence_chunk_size, mask_count, sequence_len) item idx pred
-            # nn.functional.binary_cross_entropy
-            # # MAP: sequence + attributes
-            # map_loss = self.loss()
-            # # SP: sequence + segment
-            # sp_loss = self.loss()
-            # # X, pos_item, neg_item = data['X'].to(self.device), data['pos_item'].to(self.device), data['neg_item'].to(self.device)
-            # # pos_pred, neg_pred = self.model(X, pos_item, neg_item)
+            ## compute masked area only
+
+            # MAP: sequence + attribute
+            ## compute masked area only
+
+            # SP: sequence + segment
+            ## pos_segment > neg_segment
 
             self.optimizer.zero_grad()
             # loss = self.loss(pos_pred, neg_pred)
             loss = aap_loss # + mip_loss + map_loss + sp_loss
             loss.backward()
             self.optimizer.step()
 
             train_loss += loss.item()
 
         return train_loss
 
-    def validate(self, item_datasets, sequence_datasets) -> float:
+    def validate(self, sequence_datasets) -> float:
         self.model.eval()
         valid_loss = 0
 
         for iter_num in tqdm(range(self.cfg.iter_nums)): # sequence
-            item_chunk_size = self.num_items // self.cfg.iter_nums
-            items = item_datasets[item_chunk_size * iter_num: item_chunk_size * (iter_num + 1)]
-
-            sequence_chunk_size = self.num_users // self.cfg.iter_nums
-            # sequences = sequence_datasets[sequence_chunk_size * iter_num: sequence_chunk_size * (iter_num + 1)]
-
-            # AAP: item + atrributes
-            pred = self.model.aap(items) # (item_chunk_size, attributes_count)
-            actual = torch.Tensor([[1 if attriute in self.item2attribute[item.item()] else 0 for attriute in range(self.attributes_count)] for item in items]).to(self.device) # (item_chunk_size, attributes_count)
-            aap_loss = nn.functional.binary_cross_entropy_with_logits(pred, actual)
-
-            # MIP: sequence + item
-            # mask
-            # def random_mask(sequence):
-            #     # mask = torch.Tensor([0] * sequence.size(0))
-            #     non_zero_count = torch.nonzero(sequence, as_tuple=True)[0].size(0)
-            #     mask_indices = torch.randint(sequence.size(0) - non_zero_count, sequence.size(0), size=1)
-            #     # mask[mask_indices] = 1
-            #     return mask_indices
-
-            # masks = torch.Tensor([random_mask(sequence) for sequence in sequences]) # ()
-            # masked_sequences = sequences * (1 - masks)
-            # pred = self.model.mip(masked_sequences, ) # (sequence_chunk_size, sequence_len) item idx pred
-            # nn.functional.binary_cross_entropy
-            # # MAP: sequence + attributes
-            # map_loss = self.loss()
-            # # SP: sequence + segment
-            # sp_loss = self.loss()
-            # # X, pos_item, neg_item = data['X'].to(self.device), data['pos_item'].to(self.device), data['neg_item'].to(self.device)
-            # # pos_pred, neg_pred = self.model(X, pos_item, neg_item)
-
-            # loss = self.loss(pos_pred, neg_pred)
-            loss = aap_loss # + mip_loss + map_loss + sp_loss
-
+            break
             valid_loss += loss.item()
 
         return valid_loss
@@ -250,7 +241,7 @@ def train(self, train_dataloader: DataLoader) -> float:
         train_loss = 0
         for data in tqdm(train_dataloader):
            X, 
pos_item, neg_item = data['X'].to(self.device), data['pos_item'].to(self.device), data['neg_item'].to(self.device) - pos_pred, neg_pred = self.model(X, pos_item, neg_item) + pos_pred, neg_pred = self.model.finetune(X, pos_item, neg_item) self.optimizer.zero_grad() loss = self.loss(pos_pred, neg_pred) @@ -267,7 +258,7 @@ def validate(self, valid_dataloader: DataLoader) -> tuple[float]: # actual, predicted = [], [] for data in tqdm(valid_dataloader): X, pos_item, neg_item = data['X'].to(self.device), data['pos_item'].to(self.device), data['neg_item'].to(self.device) - pos_pred, neg_pred = self.model(X, pos_item, neg_item) + pos_pred, neg_pred = self.model.finetune(X, pos_item, neg_item) self.optimizer.zero_grad() loss = self.loss(pos_pred, neg_pred) From ced93d5b8017daae3d0db366bdb7ed7064d3cd2e Mon Sep 17 00:00:00 2001 From: GangBean Date: Fri, 2 Aug 2024 01:56:24 +0000 Subject: [PATCH 14/22] feat: implements aap actual #21 --- data/datasets/s3rec_data_pipeline.py | 8 ++++++-- data/datasets/s3rec_dataset.py | 12 +++++++++--- train.py | 6 +++--- trainers/s3rec_trainer.py | 7 ++----- 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/data/datasets/s3rec_data_pipeline.py b/data/datasets/s3rec_data_pipeline.py index 938eaca..a3b7410 100644 --- a/data/datasets/s3rec_data_pipeline.py +++ b/data/datasets/s3rec_data_pipeline.py @@ -61,6 +61,7 @@ def preprocess(self) -> pd.DataFrame: # load attributes self.item2attributes = self._load_attributes() + logger.info(f"item2attributes : {len(self.item2attributes)}") logger.info("done") return df @@ -75,8 +76,11 @@ def _load_attributes(self): logger.info("load item2attributes...") df = pd.read_json(os.path.join(self.cfg.data_dir, 'yelp_item2attributes.json')).transpose() self.attributes_count = df.categories.explode().nunique() - - return df.drop(columns=['statecity']).transpose().to_dict() + + df = df.drop(columns=['statecity']).transpose().to_dict() + df = {key+1:value for key,value in df.items()} + df.update({0: {'categories': []}}) + return df def _set_num_items_and_num_users(self, df): diff --git a/data/datasets/s3rec_dataset.py b/data/datasets/s3rec_dataset.py index ca15892..bb16acc 100644 --- a/data/datasets/s3rec_dataset.py +++ b/data/datasets/s3rec_dataset.py @@ -7,11 +7,13 @@ class S3RecDataset(Dataset): - def __init__(self, data, num_items=None, train=True): + def __init__(self, data, item2attribute, attributes_count, num_items=None, train=True): super().__init__() self.data = data self.num_items = num_items self.train = train + self.item2attribute = item2attribute + self.attributes_count = attributes_count def __len__(self): return self.data.shape[0] @@ -29,17 +31,21 @@ def _negative_sampling(self, behaviors): def __getitem__(self, user_id): data = self.data.iloc[user_id,:] pos_item = data['y'].astype('int64') + aap_actual = np.array([[1 if attriute in self.item2attribute[item]['categories'] else 0 \ + for attriute in range(self.attributes_count)] for item in data['X']], dtype='float') if self.train: return { 'user_id': user_id, 'X': np.array(data['X'], dtype='int64'), 'pos_item': pos_item, - 'neg_item': self._negative_sampling(data['behaviors'])[0] + 'neg_item': self._negative_sampling(data['behaviors'])[0], + 'aap_actual': aap_actual } else: return { 'user_id': user_id, 'X': np.array(data['X'], dtype='int64'), 'pos_item': pos_item, - 'neg_items': np.array(self._negative_sampling(data['behaviors']), dtype='int64') + 'neg_items': np.array(self._negative_sampling(data['behaviors']), dtype='int64'), + 'aap_actual': aap_actual } diff 
--git a/train.py b/train.py index 3637fe2..1ef5500 100644 --- a/train.py +++ b/train.py @@ -175,9 +175,9 @@ def main(cfg: OmegaConf): model_info['num_items'], model_info['num_users'] = data_pipeline.num_items, data_pipeline.num_users elif cfg.model_name == 'S3Rec': train_data, valid_data, test_data = data_pipeline.split(df) - train_dataset = S3RecDataset(train_data, num_items=data_pipeline.num_items) - valid_dataset = S3RecDataset(valid_data, num_items=data_pipeline.num_items) - test_dataset = S3RecDataset(test_data, num_items=data_pipeline.num_items, train=False) + train_dataset = S3RecDataset(train_data, data_pipeline.item2attributes, data_pipeline.attributes_count, num_items=data_pipeline.num_items) + valid_dataset = S3RecDataset(valid_data, data_pipeline.item2attributes, data_pipeline.attributes_count, num_items=data_pipeline.num_items) + test_dataset = S3RecDataset(test_data, data_pipeline.item2attributes, data_pipeline.attributes_count, num_items=data_pipeline.num_items, train=False) args.update({'test_dataset': test_dataset}) model_info['num_items'], model_info['num_users'] = data_pipeline.num_items, data_pipeline.num_users else: diff --git a/trainers/s3rec_trainer.py b/trainers/s3rec_trainer.py index 46ad4a6..f23ccbd 100644 --- a/trainers/s3rec_trainer.py +++ b/trainers/s3rec_trainer.py @@ -126,6 +126,7 @@ def train(self, train_dataloader) -> float: for data in tqdm(train_dataloader): # sequence sequences = data['X'].to(self.device) + aap_actual = data['aap_actual'].to(self.device) # item_masked_sequences masks, item_masked_sequences = self.item_level_masking(sequences) # segment_masked_sequences @@ -136,11 +137,7 @@ def train(self, train_dataloader) -> float: item_masked_sequences, segment_masked_sequences, pos_segments, neg_segments) # AAP: item + atrributes - aap_actual = torch.ones_like(aap_output).to(self.device) -# actual = torch.Tensor([ -# [1 if attriute in self.item2attribute[item.item()] else 0 \ - # for attriute in range(self.attributes_count)] for item in items] -# ).to(self.device) # (item_chunk_size, attributes_count) + aap_actual = aap_actual * masks.unsqueeze(-1) ## compute unmasked area only aap_loss = nn.functional.binary_cross_entropy_with_logits(aap_output, aap_actual) From cab7dd37c1294a0601ec94483edfe7f6bebfe40f Mon Sep 17 00:00:00 2001 From: GangBean Date: Fri, 2 Aug 2024 02:27:52 +0000 Subject: [PATCH 15/22] feat: implements mip actual #21 --- data/datasets/s3rec_dataset.py | 9 +++++++-- models/s3rec.py | 2 +- trainers/s3rec_trainer.py | 9 +++++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/data/datasets/s3rec_dataset.py b/data/datasets/s3rec_dataset.py index bb16acc..5ae2364 100644 --- a/data/datasets/s3rec_dataset.py +++ b/data/datasets/s3rec_dataset.py @@ -33,13 +33,17 @@ def __getitem__(self, user_id): pos_item = data['y'].astype('int64') aap_actual = np.array([[1 if attriute in self.item2attribute[item]['categories'] else 0 \ for attriute in range(self.attributes_count)] for item in data['X']], dtype='float') + mip_actual = np.zeros((len(data['X']), self.num_items+1), dtype='float') + for i, item in enumerate(data['X']): + mip_actual[i, item] = 1 if self.train: return { 'user_id': user_id, 'X': np.array(data['X'], dtype='int64'), 'pos_item': pos_item, 'neg_item': self._negative_sampling(data['behaviors'])[0], - 'aap_actual': aap_actual + 'aap_actual': aap_actual, + 'mip_actual': mip_actual, } else: return { @@ -47,5 +51,6 @@ def __getitem__(self, user_id): 'X': np.array(data['X'], dtype='int64'), 'pos_item': pos_item, 'neg_items': 
np.array(self._negative_sampling(data['behaviors']), dtype='int64'), - 'aap_actual': aap_actual + 'aap_actual': aap_actual, + 'mip_actual': mip_actual, } diff --git a/models/s3rec.py b/models/s3rec.py index 88056ea..e22d0da 100644 --- a/models/s3rec.py +++ b/models/s3rec.py @@ -6,7 +6,7 @@ from loguru import logger class S3Rec(BaseModel): - def __init__(self, cfg, num_users, num_items, attributes_count): + def __init__(self, cfg, num_items, attributes_count): super().__init__() self.cfg = cfg self.item_embedding = nn.Embedding(num_items + 1, self.cfg.embed_size, dtype=torch.float32) diff --git a/trainers/s3rec_trainer.py b/trainers/s3rec_trainer.py index f23ccbd..2e7c3aa 100644 --- a/trainers/s3rec_trainer.py +++ b/trainers/s3rec_trainer.py @@ -21,7 +21,7 @@ class S3RecPreTrainer: def __init__(self, cfg: DictConfig, num_items: int, num_users: int, item2attributes, attributes_count: int) -> None: self.cfg = cfg self.device = self.cfg.device - self.model = S3Rec(self.cfg, num_items, num_users, attributes_count).to(self.device) + self.model = S3Rec(self.cfg, num_items, attributes_count).to(self.device) self.optimizer: Optimizer = self._optimizer(self.cfg.optimizer, self.model, self.cfg.lr) self.loss = self._loss() self.item2attribute = item2attributes @@ -127,6 +127,7 @@ def train(self, train_dataloader) -> float: for data in tqdm(train_dataloader): # sequence sequences = data['X'].to(self.device) aap_actual = data['aap_actual'].to(self.device) + mip_actual = data['mip_actual'].to(self.device) # item_masked_sequences masks, item_masked_sequences = self.item_level_masking(sequences) # segment_masked_sequences @@ -143,16 +144,20 @@ def train(self, train_dataloader) -> float: # MIP: sequence + item ## compute masked area only + mip_actual = mip_actual * masks.unsqueeze(-1) + mip_loss = nn.functional.binary_cross_entropy_with_logits(mip_output, mip_actual) # MAP: sequence + attribute ## compute masked area only + map_loss = 0 # SP: sequence + segment ## pos_segment > neg_segment + sp_loss = 0 self.optimizer.zero_grad() # loss = self.loss(pos_pred, neg_pred) - loss = aap_loss # + mip_loss + map_loss + sp_loss + loss = aap_loss + mip_loss + map_loss + sp_loss loss.backward() self.optimizer.step() From acfa24122e700bf31402a39548336a5139194548 Mon Sep 17 00:00:00 2001 From: GangBean Date: Fri, 2 Aug 2024 04:01:46 +0000 Subject: [PATCH 16/22] feat: implements mip, map, sp actual and loss #21 --- configs/train_config.yaml | 10 +++---- train.py | 18 ++++++------ trainers/s3rec_trainer.py | 60 +++++++++++++++++++-------------------- 3 files changed, 43 insertions(+), 45 deletions(-) diff --git a/configs/train_config.yaml b/configs/train_config.yaml index e0ad81d..2f90bb7 100644 --- a/configs/train_config.yaml +++ b/configs/train_config.yaml @@ -8,14 +8,14 @@ log_dir: logs/ sweep: False # wandb config -wandb: False # True/ False +wandb: True # True/ False project: YelpRecommendation notes: "..." 
-tags: [sweep, yelp, cdae, hyper-parameter, model-structure] +tags: [yelp, s3rec] # train config device: cuda # cpu -epochs: 5 +epochs: 100 batch_size: 32 lr: 0.0001 optimizer: adam # adamw @@ -50,6 +50,6 @@ model: num_heads: 2 num_blocks: 2 pretrain: True # False - pretrain_epochs: 1 # 100 + load_pretrain: True + pretrain_epochs: 100 # 100 mask_portion: 0.2 - iter_nums: 200 \ No newline at end of file diff --git a/train.py b/train.py index 1ef5500..5f23ff9 100644 --- a/train.py +++ b/train.py @@ -101,16 +101,16 @@ def train(cfg, args):#train_dataset, valid_dataset, test_dataset, model_info): trainer.evaluate(args.test_eval_data, 'test') elif cfg.model_name in ('S3Rec',): if cfg.pretrain: - trainer = S3RecPreTrainer(cfg, args.model_info['num_items'], args.model_info['num_users'], + pretrainer = S3RecPreTrainer(cfg, args.model_info['num_items'], args.data_pipeline.item2attributes, args.data_pipeline.attributes_count) - trainer.pretrain(train_dataloader) - trainer.load_best_model() - else: - trainer = S3RecTrainer(cfg, args.model_info['num_items'], args.model_info['num_users'], - args.data_pipeline.item2attributes, args.data_pipeline.attributes_count) - trainer.run(train_dataloader, valid_dataloader) - trainer.load_best_model() - trainer.evaluate(test_dataloader) + pretrainer.pretrain(train_dataloader) + pretrainer.load_best_model() + + trainer = S3RecTrainer(cfg, args.model_info['num_items'], + args.data_pipeline.item2attributes, args.data_pipeline.attributes_count) + trainer.run(train_dataloader, valid_dataloader) + trainer.load_best_model() + trainer.evaluate(test_dataloader) def unpack_model(cfg: OmegaConf) -> OmegaConf: if cfg.model_name not in cfg.model: diff --git a/trainers/s3rec_trainer.py b/trainers/s3rec_trainer.py index 2e7c3aa..4df1d2c 100644 --- a/trainers/s3rec_trainer.py +++ b/trainers/s3rec_trainer.py @@ -1,3 +1,4 @@ +import os import numpy as np from tqdm import tqdm @@ -18,7 +19,7 @@ from loss import BPRLoss class S3RecPreTrainer: - def __init__(self, cfg: DictConfig, num_items: int, num_users: int, item2attributes, attributes_count: int) -> None: + def __init__(self, cfg: DictConfig, num_items: int, item2attributes, attributes_count: int) -> None: self.cfg = cfg self.device = self.cfg.device self.model = S3Rec(self.cfg, num_items, attributes_count).to(self.device) @@ -26,7 +27,6 @@ def __init__(self, cfg: DictConfig, num_items: int, num_users: int, item2attribu self.loss = self._loss() self.item2attribute = item2attributes self.num_items = num_items - self.num_users = num_users self.attributes_count = attributes_count def _loss(self): @@ -57,33 +57,28 @@ def _is_surpass_best_metric(self, **metric) -> bool: return False def pretrain(self, train_dataset): - logger.info(f"[Trainer] run...") + logger.info(f"[Pre-Trainer] run...") - best_valid_loss: float = 1e+6 - best_epoch: int = 0 + best_train_loss: float = 1e+6 endurance: int = 0 # train for epoch in range(self.cfg.pretrain_epochs): train_loss: float = self.train(train_dataset) - valid_loss = self.validate(train_dataset) - logger.info(f'''\n[Trainer] epoch: {epoch} > train loss: {train_loss:.4f} / - valid loss: {valid_loss:.4f}''') + logger.info(f'''\n[Pre-Trainer] epoch: {epoch} > pretrain loss: {train_loss:.4f}''') if self.cfg.wandb: wandb.log({ - 'train_loss': train_loss, - 'valid_loss': valid_loss, + 'pretrain_loss': train_loss, }) # update model if self._is_surpass_best_metric( - current=(valid_loss,), - best=(best_valid_loss,)): + current=(train_loss,), + best=(best_train_loss,)): logger.info(f"[Trainer] update best 
model...") - best_valid_loss = valid_loss - best_epoch = epoch + best_train_loss = train_loss endurance = 0 torch.save(self.model.state_dict(), f'{self.cfg.model_dir}/best_pretrain_model.pt') @@ -94,7 +89,7 @@ def pretrain(self, train_dataset): break def item_level_masking(self, sequences): - masks = torch.rand_like(sequences, dtype=torch.float32) < .2 + masks = torch.rand_like(sequences, dtype=torch.float32) < self.cfg.mask_portion item_masked_sequences = masks * sequences return masks, item_masked_sequences @@ -128,6 +123,8 @@ def train(self, train_dataloader) -> float: sequences = data['X'].to(self.device) aap_actual = data['aap_actual'].to(self.device) mip_actual = data['mip_actual'].to(self.device) + map_actual = data['aap_actual'].to(self.device) + # item_masked_sequences masks, item_masked_sequences = self.item_level_masking(sequences) # segment_masked_sequences @@ -144,19 +141,23 @@ def train(self, train_dataloader) -> float: # MIP: sequence + item ## compute masked area only - mip_actual = mip_actual * masks.unsqueeze(-1) + mip_output = mip_output * masks.logical_not().unsqueeze(-1) + mip_actual = mip_actual * masks.logical_not().unsqueeze(-1) mip_loss = nn.functional.binary_cross_entropy_with_logits(mip_output, mip_actual) # MAP: sequence + attribute ## compute masked area only - map_loss = 0 + map_output = map_output * masks.logical_not().unsqueeze(-1) + map_actual = map_actual * masks.logical_not().unsqueeze(-1) + map_loss = nn.functional.binary_cross_entropy_with_logits(map_output, map_actual) # SP: sequence + segment ## pos_segment > neg_segment - sp_loss = 0 + sp_output = torch.concat([sp_output_neg, sp_output_pos], dim=0) + sp_actual = torch.concat([torch.zeros(data['X'].size(0)), torch.ones(data['X'].size(0))]).to(self.device) + sp_loss = nn.functional.binary_cross_entropy_with_logits(sp_output, sp_actual) self.optimizer.zero_grad() - # loss = self.loss(pos_pred, neg_pred) loss = aap_loss + mip_loss + map_loss + sp_loss loss.backward() self.optimizer.step() @@ -165,26 +166,17 @@ def train(self, train_dataloader) -> float: return train_loss - def validate(self, sequence_datasets) -> float: - self.model.eval() - valid_loss = 0 - - for iter_num in tqdm(range(self.cfg.iter_nums)): # sequence - break - valid_loss += loss.item() - - return valid_loss - def load_best_model(self): logger.info(f"[Trainer] Load best model...") self.model.load_state_dict(torch.load(f'{self.cfg.model_dir}/best_pretrain_model.pt')) class S3RecTrainer(BaseTrainer): - def __init__(self, cfg: DictConfig, num_items: int, num_users: int, item2attributes, attributes_count: int) -> None: + def __init__(self, cfg: DictConfig, num_items: int, item2attributes, attributes_count: int) -> None: super().__init__(cfg) - self.model = S3Rec(self.cfg, num_items, num_users, attributes_count).to(self.device) + self.model = S3Rec(self.cfg, num_items, attributes_count).to(self.device) self.optimizer: Optimizer = self._optimizer(self.cfg.optimizer, self.model, self.cfg.lr) self.loss = self._loss() + self._load_best_pretrain_model() def _loss(self): return BPRLoss() @@ -300,6 +292,12 @@ def evaluate(self, test_dataloader: DataLoader) -> tuple[float]: test_recall_at_k, test_map_at_k, test_ndcg_at_k) + + def _load_best_pretrain_model(self): + pretrain_model_dir = f'{self.cfg.model_dir}/best_pretrain_model.pt' + if self.cfg.load_pretrain and os.path.exists(pretrain_model_dir): + logger.info(f"[Trainer] Load best pretrain model...") + self.model.load_state_dict(torch.load(f'{self.cfg.model_dir}/best_pretrain_model.pt')) def 
_generate_target_and_top_k_recommendation(self, scores: Tensor, pos_item: Tensor) -> tuple[list]: actual = pos_item.cpu().detach().numpy() From fdca1cbace44f8f0aa488b8888c8311053d4efbd Mon Sep 17 00:00:00 2001 From: twndus Date: Thu, 22 Aug 2024 21:21:53 -0400 Subject: [PATCH 17/22] test: data preprocess for s3rec official code (https://github.com/RUCAIBox/CIKM2020-S3Rec) #21 --- data/data_preprocess_for_s3rec.py | 191 ++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 data/data_preprocess_for_s3rec.py diff --git a/data/data_preprocess_for_s3rec.py b/data/data_preprocess_for_s3rec.py new file mode 100644 index 0000000..9b2e455 --- /dev/null +++ b/data/data_preprocess_for_s3rec.py @@ -0,0 +1,191 @@ +import os, json + +import numpy as np +import pandas as pd + +from sklearn.preprocessing import OrdinalEncoder + +import hydra +from omegaconf import DictConfig, OmegaConf +from loguru import logger + +class YelpPreprocessPipe: + + def __init__(self, cfg): + + self.cfg = cfg + os.makedirs(self.cfg.result_dir, exist_ok=True) + + self.user2id = None + self.id2user = None + + self.item2id = None + self.id2item = None + + logger.info('YelpPreprocessPipe instanciated') + + def run(self): + review_df = self._read_yelp_json('review') + logger.info(f'원본 리뷰 데이터: {review_df.shape}') + + review_df = self._filter_by_min_interactions(review_df, self.cfg.min_interactions) + logger.info(f'필터링 후 데이터: {review_df.shape}') + + review_df: pd.DataFrame = self._id_mapping(review_df) + # logger.info(f"review df dtypes: {review_df.dtypes}") + review_df = review_df.sort_values(['date']) + # logger.info(f"after order by: {review_df[review_df.user_id == review_df.iloc[0].user_id].head()}") + review_df = review_df[['user_id', 'business_id', 'stars']].rename(columns={'stars':'rating'}) + + item_df = self._read_yelp_json('business') + items2attributes = self._create_items2attributes(item_df, review_df) + self._save_entities2attributes(items2attributes, 'item') + + behaviors, behaviors_df = self._agg_behaviors(review_df) + self._save_behaviors(behaviors) + + samples = self._negative_sampling(behaviors_df) + self._save_samples(samples) + + def _read_yelp_json(self, datatype, query: str=None): + logger.info(f"load {datatype} raw data ...") + + reader = pd.read_json( + f'{self.cfg.data_dir}/yelp_academic_dataset_{datatype}.json', + orient="records", lines=True, chunksize=10000) + + target_dfs = [] + for raw_df in reader: + if datatype == 'review': + target_df = raw_df.query( + f"`date` >= '{self.cfg.start_date}' and `date` <= '{self.cfg.end_date}'") + else: + target_df = raw_df + if target_df.shape[0] > 0: + target_dfs.append(target_df) + + logger.info(f"done...") + return pd.concat(target_dfs) + + def _filter_by_min_interactions(self, df, min_interactions=5): + logger.info(f"filter users and items having {min_interactions} or more interactions ...") + user_ids_under_5, business_ids_under_5 = [0], [0] + + while len(user_ids_under_5) > 0 or len(business_ids_under_5) > 0: + user_ids_under_5 = df.user_id.value_counts()[df.user_id.value_counts() < 5].index + business_ids_under_5 = df.business_id.value_counts()[df.business_id.value_counts() < 5].index + + df = df[~df.user_id.isin(user_ids_under_5)] + df = df[~df.business_id.isin(business_ids_under_5)] + + logger.info(f"done...") + return df + + def _id_mapping(self, df): + logger.info(f"map user_id and business_id to new numeric ids...") + + self.user2id = {user_id: id_ for id_, user_id in enumerate(df['user_id'].unique(), 1)} + self.id2user = {id_: 
user_id for user_id, id_ in self.user2id.items()} + + self.item2id = {item_id: id_ for id_, item_id in enumerate(df['business_id'].unique(), 1)} + self.id2item = {id_: item_id for item_id, id_ in self.item2id.items()} + + df['user_id'] = df['user_id'].map(self.user2id) + df['business_id'] = df['business_id'].map(self.item2id) + + logger.info(f"done...") + return df + + def _create_users2attributes(self, user_df, review_df): + logger.info(f"create user2attributes...") + users2attributes = None + + user_df = user_df[user_df.user_id.isin(self.user2id.keys())] + user_df['user_id'] = user_df['user_id'].map(self.user2id) + + logger.info(f"done...") + return users2attributes + + def _create_items2attributes(self, item_df, review_df): + logger.info(f"create item2attributes...") + items2attributes = None + + item_df = item_df[item_df.business_id.isin(self.item2id.keys())] + item_df['business_id'] = item_df['business_id'].map(self.item2id) + + # categories 810 + categories = item_df.categories.str.split(', ').explode().unique() + categories2id = {category: id_ for id_, category in enumerate(categories, 1)} + id2categories = {id_:category for category, id_ in categories2id.items()} + + # encoding + item_df['category_encoded'] = item_df.categories.str.split(', ').apply(lambda x: [categories2id[y] for y in x]) + + items2attributes = { + int(row['business_id']): row['category_encoded']\ + for i, row in item_df.iterrows() + } + + logger.info(f"done...") + return items2attributes + + def _save_entities2attributes(self, entities2attributes, entity_name): + logger.info(f"save {entity_name}2attributes...") + + filename = os.path.join(self.cfg.result_dir, f'yelp_{entity_name}2attributes.json') + with open(filename, 'w') as f: + json.dump(entities2attributes, f) + + logger.info(f"done...") + + def _save_samples(self, samples: list): + logger.info(f"save samples...") + with open(os.path.join(self.cfg.result_dir, 'yelp_samples.txt'), 'w') as f: + for line in samples: + f.write(' '.join(line) + '\n') + logger.info(f"done...") + + def _save_behaviors(self, behaviors): + logger.info(f"save behaviors...") + with open(os.path.join(self.cfg.result_dir, 'yelp.txt'), 'w') as f: + for line in behaviors: + f.write(' '.join(line) + '\n') + logger.info(f"done...") + + def _agg_behaviors(self, df: pd.DataFrame) -> pd.DataFrame: + logger.info(f"aggregate user behaviors...") + # group by user_id + df.business_id = df.business_id.astype('str') + df = df.groupby(['user_id']).agg({'business_id': [('behaviors', list)]}).droplevel(0, 1) + + behaviors = [] + for user, row in df.iterrows(): + behaviors.append([str(user), *row['behaviors']]) + + return behaviors, df + + def _negative_sampling(self, behavior_df: pd.DataFrame) -> pd.DataFrame: + logger.info(f"negative sampling...") + + samples = [] + sample_size = 99 + num_items = len(self.item2id) + for user, behaviors in behavior_df.iterrows(): + neg_items = [] + for _ in range(sample_size): + neg_item = np.random.randint(0, num_items) + while (neg_item in behaviors) or (neg_item in neg_items): + neg_item = np.random.randint(0, num_items) + neg_items.append(str(neg_item)) + samples.append([str(user), *neg_items]) + return samples + + +@hydra.main(version_base=None, config_path="../configs", config_name="data_preprocess") +def main(cfg: OmegaConf): + ypp = YelpPreprocessPipe(cfg) + ypp.run() + + +if __name__ == '__main__': + main() From 59937ab51d5fcf734dd686b2952c135ad1e734a6 Mon Sep 17 00:00:00 2001 From: twndus Date: Thu, 22 Aug 2024 22:27:26 -0400 Subject: [PATCH 18/22] refactor: 
add add & norm and dropout #21 --- configs/train_config.yaml | 9 +++++---- models/s3rec.py | 36 ++++++++++++++++++++++++++++++------ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/configs/train_config.yaml b/configs/train_config.yaml index 2f90bb7..b5c3dfb 100644 --- a/configs/train_config.yaml +++ b/configs/train_config.yaml @@ -8,14 +8,14 @@ log_dir: logs/ sweep: False # wandb config -wandb: True # True/ False +wandb: False project: YelpRecommendation notes: "..." tags: [yelp, s3rec] # train config -device: cuda # cpu -epochs: 100 +device: cpu +epochs: 5 batch_size: 32 lr: 0.0001 optimizer: adam # adamw @@ -49,7 +49,8 @@ model: max_seq_len: 50 num_heads: 2 num_blocks: 2 - pretrain: True # False + pretrain: False # False load_pretrain: True pretrain_epochs: 100 # 100 mask_portion: 0.2 + dropout_ratio: 0.1 diff --git a/models/s3rec.py b/models/s3rec.py index e22d0da..26c0286 100644 --- a/models/s3rec.py +++ b/models/s3rec.py @@ -4,6 +4,8 @@ from models.base_model import BaseModel from loguru import logger + + class S3Rec(BaseModel): def __init__(self, cfg, num_items, attributes_count): @@ -12,10 +14,21 @@ def __init__(self, cfg, num_items, attributes_count): self.item_embedding = nn.Embedding(num_items + 1, self.cfg.embed_size, dtype=torch.float32) self.attribute_embedding = nn.Embedding(attributes_count, self.cfg.embed_size, dtype=torch.float32) self.positional_encoding = nn.Parameter(torch.rand(self.cfg.max_seq_len, self.cfg.embed_size)) + + self.multihead_attns = nn.ModuleList( + [nn.MultiheadAttention(self.cfg.embed_size, self.cfg.num_heads) for _ in range(self.cfg.num_blocks)]) + self.layernorm1s = nn.ModuleList( + [nn.LayerNorm(self.cfg.embed_size) for _ in range(self.cfg.num_blocks)]) - self.ffn1s = nn.ModuleList([nn.Linear(self.cfg.embed_size, self.cfg.embed_size) for _ in range(self.cfg.num_blocks)]) - self.ffn2s = nn.ModuleList([nn.Linear(self.cfg.embed_size, self.cfg.embed_size) for _ in range(self.cfg.num_blocks)]) - self.multihead_attns = nn.ModuleList([nn.MultiheadAttention(self.cfg.embed_size, self.cfg.num_heads) for _ in range(self.cfg.num_blocks)]) + self.ffn1s = nn.ModuleList( + [nn.Linear(self.cfg.embed_size, self.cfg.embed_size) for _ in range(self.cfg.num_blocks)]) + self.ffn2s = nn.ModuleList( + [nn.Linear(self.cfg.embed_size, self.cfg.embed_size) for _ in range(self.cfg.num_blocks)]) + self.layernorm2s = nn.ModuleList( + [nn.LayerNorm(self.cfg.embed_size) for _ in range(self.cfg.num_blocks)]) + + self.dropout = nn.Dropout(self.cfg.dropout_ratio) + self.aap_weight = nn.Linear(self.cfg.embed_size, self.cfg.embed_size, bias=False) self.mip_weight = nn.Linear(self.cfg.embed_size, self.cfg.embed_size, bias=False) self.map_weight = nn.Linear(self.cfg.embed_size, self.cfg.embed_size, bias=False) @@ -29,7 +42,7 @@ def _init_weights(self): nn.init.xavier_uniform_(child.weight) elif isinstance(child, nn.ModuleList): # nn.Linear): for sub_child in child.children(): - if not isinstance(sub_child, nn.MultiheadAttention): + if isinstance(sub_child, nn.Linear): nn.init.xavier_uniform_(sub_child.weight) elif isinstance(child, nn.Linear): nn.init.xavier_uniform_(child.weight) @@ -40,9 +53,20 @@ def _embedding_layer(self, X): return self.item_embedding(X) + self.positional_encoding def _self_attention_block(self, X): - for multihead_attn, ffn1, ffn2 in zip(self.multihead_attns, self.ffn1s, self.ffn2s): + for multihead_attn, ffn1, ffn2, layernorm1, layernorm2 in zip( + self.multihead_attns, self.ffn1s, self.ffn2s, self.layernorm1s, self.layernorm2s): + # multi-head 
self-attention attn_output, attn_output_weights = multihead_attn(X, X, X) - X = ffn2(nn.functional.relu(ffn1(attn_output))) + # dropout + attn_output = self.dropout(attn_output) + # add & norm + normalized_attn_output = layernorm1(X + attn_output) + # feed-forward network + ffn_output = ffn2(nn.functional.relu(ffn1(normalized_attn_output))) + # dropout + ffn_output = self.dropout(ffn_output) + # add & norm + X = layernorm2(X + ffn_output) return X def _prediction_layer(self, item, self_attn_output): From bfce575e7f4eb52a88d95350e1d156fe05e81c99 Mon Sep 17 00:00:00 2001 From: twndus Date: Fri, 23 Aug 2024 00:45:38 -0400 Subject: [PATCH 19/22] feat: add padding and causal masks #21 --- models/s3rec.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/models/s3rec.py b/models/s3rec.py index 26c0286..263c52c 100644 --- a/models/s3rec.py +++ b/models/s3rec.py @@ -16,7 +16,7 @@ def __init__(self, cfg, num_items, attributes_count): self.positional_encoding = nn.Parameter(torch.rand(self.cfg.max_seq_len, self.cfg.embed_size)) self.multihead_attns = nn.ModuleList( - [nn.MultiheadAttention(self.cfg.embed_size, self.cfg.num_heads) for _ in range(self.cfg.num_blocks)]) + [nn.MultiheadAttention(self.cfg.embed_size, self.cfg.num_heads, batch_first=True) for _ in range(self.cfg.num_blocks)]) self.layernorm1s = nn.ModuleList( [nn.LayerNorm(self.cfg.embed_size) for _ in range(self.cfg.num_blocks)]) @@ -52,11 +52,14 @@ def _init_weights(self): def _embedding_layer(self, X): return self.item_embedding(X) + self.positional_encoding - def _self_attention_block(self, X): + def _self_attention_block(self, X, padding_mask, attn_mask): for multihead_attn, ffn1, ffn2, layernorm1, layernorm2 in zip( self.multihead_attns, self.ffn1s, self.ffn2s, self.layernorm1s, self.layernorm2s): # multi-head self-attention - attn_output, attn_output_weights = multihead_attn(X, X, X) + merged_mask,_ = multihead_attn.merge_masks(attn_mask, padding_mask, X) + attn_output, attn_output_weights = multihead_attn( + X, X, X, #key_padding_mask=padding_mask, + is_causal=True, attn_mask=attn_mask) # dropout attn_output = self.dropout(attn_output) # add & norm @@ -73,8 +76,13 @@ def _prediction_layer(self, item, self_attn_output): return torch.einsum('bi,bi->b', (item, self_attn_output)) def finetune(self, X, pos_item, neg_item): + # create padding mask + padding_mask = (X <= 0).to(self.cfg.device) + attn_mask = torch.triu( + torch.ones(self.cfg.max_seq_len, self.cfg.max_seq_len), diagonal=1 + ).bool().to(self.cfg.device) X = self._embedding_layer(X) - X = self._self_attention_block(X) + X = self._self_attention_block(X, padding_mask, attn_mask) pos_pred = self._prediction_layer(self.item_embedding(pos_item), X[:, -1]) neg_pred = self._prediction_layer(self.item_embedding(neg_item), X[:, -1]) return pos_pred, neg_pred From b7e9701fed0cb0d492af80e2bdc7e05fcee5c090 Mon Sep 17 00:00:00 2001 From: GangBean Date: Thu, 29 Aug 2024 00:48:22 +0000 Subject: [PATCH 20/22] feat: implemenets MultiheadAttention #21 --- models/s3rec.py | 59 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/models/s3rec.py b/models/s3rec.py index 263c52c..06f513a 100644 --- a/models/s3rec.py +++ b/models/s3rec.py @@ -1,3 +1,4 @@ +import math import torch import torch.nn as nn @@ -16,7 +17,7 @@ def __init__(self, cfg, num_items, attributes_count): self.positional_encoding = nn.Parameter(torch.rand(self.cfg.max_seq_len, self.cfg.embed_size)) self.multihead_attns = nn.ModuleList( - 
[nn.MultiheadAttention(self.cfg.embed_size, self.cfg.num_heads, batch_first=True) for _ in range(self.cfg.num_blocks)]) + [MultiHeadAttention(self.cfg.embed_size, self.cfg.num_heads) for _ in range(self.cfg.num_blocks)]) self.layernorm1s = nn.ModuleList( [nn.LayerNorm(self.cfg.embed_size) for _ in range(self.cfg.num_blocks)]) @@ -55,11 +56,12 @@ def _embedding_layer(self, X): def _self_attention_block(self, X, padding_mask, attn_mask): for multihead_attn, ffn1, ffn2, layernorm1, layernorm2 in zip( self.multihead_attns, self.ffn1s, self.ffn2s, self.layernorm1s, self.layernorm2s): - # multi-head self-attention - merged_mask,_ = multihead_attn.merge_masks(attn_mask, padding_mask, X) - attn_output, attn_output_weights = multihead_attn( - X, X, X, #key_padding_mask=padding_mask, - is_causal=True, attn_mask=attn_mask) + # # multi-head self-attention + # merged_mask,_ = multihead_attn.merge_masks(attn_mask, padding_mask, X) + # attn_output, attn_output_weights = multihead_attn( + # X, X, X, #key_padding_mask=padding_mask, + # is_causal=True, attn_mask=attn_mask) + attn_output = multihead_attn(X, X, X, padding_mask, attn_mask) # dropout attn_output = self.dropout(attn_output) # add & norm @@ -161,3 +163,48 @@ def sp(self, context_attention_output, subsequence_attention_output): s_tilde = subsequence_attention_output[:, -1, :] # [ B H ] SW = self.sp_weight(s) return torch.einsum('bi,bi->b', SW, s_tilde) # [ B ] + + +class MultiHeadAttention(nn.Module): + def __init__(self, embed_size, num_heads): + super().__init__() + # self.multihead_attns = nn.ModuleList( + # [nn.MultiheadAttention(self.cfg.embed_size, self.cfg.num_heads, batch_first=True) for _ in range(self.cfg.num_blocks)]) + self.embed_size = embed_size + self.num_heads = num_heads + self.q_weights = nn.ModuleList( + [nn.Linear(self.embed_size, self.embed_size, bias=False) for _ in range(self.num_heads)]) + self.k_weights = nn.ModuleList( + [nn.Linear(self.embed_size, self.embed_size, bias=False) for _ in range(self.num_heads)]) + self.v_weights = nn.ModuleList( + [nn.Linear(self.embed_size, self.embed_size, bias=False) for _ in range(self.num_heads)]) + self.output = nn.Linear(num_heads * embed_size, embed_size) + + def forward(self, q, k, v, padding_mask, attn_mask): + # merged_mask,_ = multihead_attn.merge_masks(attn_mask, padding_mask, X) + # attn_output, attn_output_weights = multihead_attn( + # X, X, X, #key_padding_mask=padding_mask, + # is_causal=True, attn_mask=attn_mask) + # # dropout + attention_outputs = [] + for q_weight, k_weight, v_weight in zip(self.q_weights, self.k_weights, self.v_weights): + Q = q_weight(q) + K = k_weight(k) # (B, L, E) + K = K * padding_mask.unsqueeze(2) # (B, L) + logger.info(K) + V = v_weight(v) + + attention_score = torch.matmul(Q, K.permute(0,2,1).contiguous()) # (B, L, L) + attention_score /= math.sqrt(self.embed_size) + attention_score = attention_score * attn_mask.unsqueeze(0) + logger.info(attention_score) + attention_score = torch.nn.functional.softmax(attention_score) + logger.info(attention_score) + attention_score = torch.matmul(attention_score, V) # (B, L, E) + # torch.einsum('bi,bi->b', Q, K) # (batch, seq_len, embed_size) + attention_outputs.append(attention_score) # (H, B, L, E) + + attention_scores = torch.cat(attention_outputs, dim=-1) # (B, L, E*H) + + return self.output(attention_scores) # (B, L, E) + \ No newline at end of file From 2cb455ce5a8ab047ea52585efdb2a04bf5ce0ac5 Mon Sep 17 00:00:00 2001 From: Judy Date: Thu, 29 Aug 2024 03:14:54 +0000 Subject: [PATCH 21/22] fix: fix 
attention mask #21 --- models/s3rec.py | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/models/s3rec.py b/models/s3rec.py index 06f513a..33e624e 100644 --- a/models/s3rec.py +++ b/models/s3rec.py @@ -57,10 +57,6 @@ def _self_attention_block(self, X, padding_mask, attn_mask): for multihead_attn, ffn1, ffn2, layernorm1, layernorm2 in zip( self.multihead_attns, self.ffn1s, self.ffn2s, self.layernorm1s, self.layernorm2s): # # multi-head self-attention - # merged_mask,_ = multihead_attn.merge_masks(attn_mask, padding_mask, X) - # attn_output, attn_output_weights = multihead_attn( - # X, X, X, #key_padding_mask=padding_mask, - # is_causal=True, attn_mask=attn_mask) attn_output = multihead_attn(X, X, X, padding_mask, attn_mask) # dropout attn_output = self.dropout(attn_output) @@ -80,9 +76,10 @@ def _prediction_layer(self, item, self_attn_output): def finetune(self, X, pos_item, neg_item): # create padding mask padding_mask = (X <= 0).to(self.cfg.device) + # create attn mask attn_mask = torch.triu( - torch.ones(self.cfg.max_seq_len, self.cfg.max_seq_len), diagonal=1 - ).bool().to(self.cfg.device) + torch.zeros(self.cfg.max_seq_len, self.cfg.max_seq_len) + torch.finfo(torch.float32).min, + diagonal=1).to(self.cfg.device) X = self._embedding_layer(X) X = self._self_attention_block(X, padding_mask, attn_mask) pos_pred = self._prediction_layer(self.item_embedding(pos_item), X[:, -1]) @@ -168,8 +165,6 @@ def sp(self, context_attention_output, subsequence_attention_output): class MultiHeadAttention(nn.Module): def __init__(self, embed_size, num_heads): super().__init__() - # self.multihead_attns = nn.ModuleList( - # [nn.MultiheadAttention(self.cfg.embed_size, self.cfg.num_heads, batch_first=True) for _ in range(self.cfg.num_blocks)]) self.embed_size = embed_size self.num_heads = num_heads self.q_weights = nn.ModuleList( @@ -181,30 +176,21 @@ def __init__(self, embed_size, num_heads): self.output = nn.Linear(num_heads * embed_size, embed_size) def forward(self, q, k, v, padding_mask, attn_mask): - # merged_mask,_ = multihead_attn.merge_masks(attn_mask, padding_mask, X) - # attn_output, attn_output_weights = multihead_attn( - # X, X, X, #key_padding_mask=padding_mask, - # is_causal=True, attn_mask=attn_mask) - # # dropout attention_outputs = [] for q_weight, k_weight, v_weight in zip(self.q_weights, self.k_weights, self.v_weights): Q = q_weight(q) K = k_weight(k) # (B, L, E) K = K * padding_mask.unsqueeze(2) # (B, L) - logger.info(K) V = v_weight(v) attention_score = torch.matmul(Q, K.permute(0,2,1).contiguous()) # (B, L, L) attention_score /= math.sqrt(self.embed_size) - attention_score = attention_score * attn_mask.unsqueeze(0) - logger.info(attention_score) - attention_score = torch.nn.functional.softmax(attention_score) - logger.info(attention_score) + attention_score += attn_mask + attention_score = torch.nn.functional.softmax(attention_score, dim=-1) attention_score = torch.matmul(attention_score, V) # (B, L, E) - # torch.einsum('bi,bi->b', Q, K) # (batch, seq_len, embed_size) attention_outputs.append(attention_score) # (H, B, L, E) attention_scores = torch.cat(attention_outputs, dim=-1) # (B, L, E*H) return self.output(attention_scores) # (B, L, E) - \ No newline at end of file + From 322ea178567521e009aacd43f99267f59b822552 Mon Sep 17 00:00:00 2001 From: Judy Date: Thu, 29 Aug 2024 04:14:11 +0000 Subject: [PATCH 22/22] fix: fix the target of loss from only last item to all items along the sequence #21 --- data/datasets/s3rec_data_pipeline.py | 10 
+++++++--- data/datasets/s3rec_dataset.py | 7 ++++--- models/s3rec.py | 29 +++++++++++++++++++++++----- trainers/s3rec_trainer.py | 12 ++++++------ 4 files changed, 41 insertions(+), 17 deletions(-) diff --git a/data/datasets/s3rec_data_pipeline.py b/data/datasets/s3rec_data_pipeline.py index a3b7410..5b230ed 100644 --- a/data/datasets/s3rec_data_pipeline.py +++ b/data/datasets/s3rec_data_pipeline.py @@ -13,21 +13,25 @@ def __init__(self, cfg): def split(self, df: pd.DataFrame): # train X: [:-3] y: -3 train_df_X = df.behaviors.apply(lambda row: row[: -3]).rename('X') - train_df_Y = df.behaviors.apply(lambda row: row[-3]).rename('y') + train_df_Y = df.behaviors.apply(lambda row: row[1:-2]).rename('Y') # valid X: [:-2] y: -2 valid_df_X = df.behaviors.apply(lambda row: row[: -2]).rename('X') - valid_df_Y = df.behaviors.apply(lambda row: row[-2]).rename('y') + valid_df_Y = df.behaviors.apply(lambda row: row[2:-1]).rename('Y') # test X: [:-1] y: -1 test_df_X = df.behaviors.apply(lambda row: row[: -1]).rename('X') - test_df_Y = df.behaviors.apply(lambda row: row[-1]).rename('y') + test_df_Y = df.behaviors.apply(lambda row: row[3:]).rename('Y') # pre-padding for input sequence X train_df_X = self._adjust_seq_len(train_df_X) valid_df_X = self._adjust_seq_len(valid_df_X) test_df_X = self._adjust_seq_len(test_df_X) + train_df_Y = self._adjust_seq_len(train_df_Y) + valid_df_Y = self._adjust_seq_len(valid_df_Y) + test_df_Y = self._adjust_seq_len(test_df_Y) + return pd.concat([df, train_df_X, train_df_Y], axis=1),\ pd.concat([df, valid_df_X, valid_df_Y], axis=1),\ pd.concat([df, test_df_X, test_df_Y], axis=1) diff --git a/data/datasets/s3rec_dataset.py b/data/datasets/s3rec_dataset.py index 5ae2364..bb59bcd 100644 --- a/data/datasets/s3rec_dataset.py +++ b/data/datasets/s3rec_dataset.py @@ -30,7 +30,8 @@ def _negative_sampling(self, behaviors): def __getitem__(self, user_id): data = self.data.iloc[user_id,:] - pos_item = data['y'].astype('int64') + pos_items = np.array(data['Y'], dtype='int64') + pos_item = pos_items[-1] aap_actual = np.array([[1 if attriute in self.item2attribute[item]['categories'] else 0 \ for attriute in range(self.attributes_count)] for item in data['X']], dtype='float') mip_actual = np.zeros((len(data['X']), self.num_items+1), dtype='float') @@ -40,8 +41,8 @@ def __getitem__(self, user_id): return { 'user_id': user_id, 'X': np.array(data['X'], dtype='int64'), - 'pos_item': pos_item, - 'neg_item': self._negative_sampling(data['behaviors'])[0], + 'pos_items': pos_items, + 'neg_items': np.array([self._negative_sampling(data['behaviors'])[0] for _ in range(pos_items.shape[0])]), 'aap_actual': aap_actual, 'mip_actual': mip_actual, } diff --git a/models/s3rec.py b/models/s3rec.py index 33e624e..ee31d97 100644 --- a/models/s3rec.py +++ b/models/s3rec.py @@ -73,7 +73,20 @@ def _self_attention_block(self, X, padding_mask, attn_mask): def _prediction_layer(self, item, self_attn_output): return torch.einsum('bi,bi->b', (item, self_attn_output)) - def finetune(self, X, pos_item, neg_item): + def _sequence_prediction_layer(self, items, self_attn_outputs): + ''' + input: + items: batch, max_seq_len, embed_size + self_attn_outputs: batch, max_seq_len, embed_size + + return: + batch*max_seq_len, + ''' + return torch.einsum('bi,bi->b', + (items.view(-1, self.cfg.embed_size), self_attn_outputs.view(-1, self.cfg.embed_size))) + + + def finetune(self, X, pos_items, neg_items): # create padding mask padding_mask = (X <= 0).to(self.cfg.device) # create attn mask @@ -82,13 +95,19 @@ def finetune(self, 
X, pos_item, neg_item): diagonal=1).to(self.cfg.device) X = self._embedding_layer(X) X = self._self_attention_block(X, padding_mask, attn_mask) - pos_pred = self._prediction_layer(self.item_embedding(pos_item), X[:, -1]) - neg_pred = self._prediction_layer(self.item_embedding(neg_item), X[:, -1]) - return pos_pred, neg_pred + pos_preds = self._sequence_prediction_layer(self.item_embedding(pos_items), X) + neg_preds = self._sequence_prediction_layer(self.item_embedding(neg_items), X) + return pos_preds, neg_preds def evaluate(self, X, pos_item, neg_items): + # create padding mask + padding_mask = (X <= 0).to(self.cfg.device) + # create attn mask + attn_mask = torch.triu( + torch.zeros(self.cfg.max_seq_len, self.cfg.max_seq_len) + torch.finfo(torch.float32).min, + diagonal=1).to(self.cfg.device) X = self._embedding_layer(X) - X = self._self_attention_block(X) + X = self._self_attention_block(X, padding_mask, attn_mask) pos_pred = self._prediction_layer(self.item_embedding(pos_item), X[:, -1]).view(pos_item.size(0), -1) neg_preds = [self._prediction_layer( self.item_embedding(neg_items[:,i]), X[:, -1]).view(neg_items.size(0), -1) for i in range(neg_items.size(-1))] diff --git a/trainers/s3rec_trainer.py b/trainers/s3rec_trainer.py index 4df1d2c..5cfbfb5 100644 --- a/trainers/s3rec_trainer.py +++ b/trainers/s3rec_trainer.py @@ -234,11 +234,11 @@ def train(self, train_dataloader: DataLoader) -> float: self.model.train() train_loss = 0 for data in tqdm(train_dataloader): - X, pos_item, neg_item = data['X'].to(self.device), data['pos_item'].to(self.device), data['neg_item'].to(self.device) - pos_pred, neg_pred = self.model.finetune(X, pos_item, neg_item) + X, pos_items, neg_items = data['X'].to(self.device), data['pos_items'].to(self.device), data['neg_items'].to(self.device) + pos_preds, neg_preds = self.model.finetune(X, pos_items, neg_items) self.optimizer.zero_grad() - loss = self.loss(pos_pred, neg_pred) + loss = self.loss(pos_preds, neg_preds) # batch*max_length, 1 loss.backward() self.optimizer.step() @@ -251,11 +251,11 @@ def validate(self, valid_dataloader: DataLoader) -> tuple[float]: valid_loss = 0 # actual, predicted = [], [] for data in tqdm(valid_dataloader): - X, pos_item, neg_item = data['X'].to(self.device), data['pos_item'].to(self.device), data['neg_item'].to(self.device) - pos_pred, neg_pred = self.model.finetune(X, pos_item, neg_item) + X, pos_items, neg_items = data['X'].to(self.device), data['pos_items'].to(self.device), data['neg_items'].to(self.device) + pos_preds, neg_preds = self.model.finetune(X, pos_items, neg_items) self.optimizer.zero_grad() - loss = self.loss(pos_pred, neg_pred) + loss = self.loss(pos_preds, neg_preds) valid_loss += loss.item()
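A note on the loss change in the last commit: finetune() now flattens the per-position scores to shape (batch * max_seq_len,) and BPRLoss is applied over the whole vector, so, as the diff stands, pre-padded positions (item id 0, the same positions masked by (X <= 0) elsewhere) still contribute to the objective. The snippet below is only a sketch of a padding-aware variant of that idea, not code from the repository; the helper name sequence_bpr_loss and its argument names are illustrative, and it assumes BPRLoss is the usual averaged -log sigmoid(pos - neg).

import torch

def sequence_bpr_loss(pos_preds, neg_preds, X):
    # pos_preds / neg_preds: (batch * max_seq_len,) scores as returned by S3Rec.finetune
    # X: (batch, max_seq_len) item-id inputs, where 0 marks pre-padding
    mask = (X > 0).view(-1).float()                      # 1.0 at real items, 0.0 at padding
    gap = torch.nn.functional.logsigmoid(pos_preds - neg_preds)
    return -(gap * mask).sum() / mask.sum().clamp(min=1.0)

Averaging only over non-padded positions keeps short, heavily padded sequences from dominating the gradient; otherwise the behaviour matches the per-position BPR objective introduced in this patch.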