From d5da30fbe6e477be3a18e076c0f28a567987e792 Mon Sep 17 00:00:00 2001
From: Judy
Date: Wed, 22 May 2024 09:41:53 +0000
Subject: [PATCH] refactor: validate model and training process #16

---
 configs/train_config.yaml         |  8 +++---
 data/datasets/mf_data_pipeline.py | 10 ++++---
 models/dcn.py                     | 11 ++++----
 train.py                          |  2 ++
 trainers/dcn_trainer.py           | 43 ++++++++++++-------------------
 5 files changed, 34 insertions(+), 40 deletions(-)

diff --git a/configs/train_config.yaml b/configs/train_config.yaml
index ed519d2..e0b0bfe 100644
--- a/configs/train_config.yaml
+++ b/configs/train_config.yaml
@@ -8,16 +8,16 @@ log_dir: logs/
 sweep: False
 
 # wandb config
-wandb: False # True/ False
+wandb: True # True/ False
 project: YelpRecommendation
 notes: "..."
 tags: [test, yelp, cdae]
 
 # train config
 device: cuda # cpu
-epochs: 10
+epochs: 100
 batch_size: 32
-lr: 0.001
+lr: 0.0001
 optimizer: adam # adamw
 loss_name: bpr # bpr # pointwise # bce
 patience: 5
@@ -38,5 +38,5 @@ weight_decay: 0 #1e-5
 
 model_name: DCN
 hidden_dims: [1024, 1024]
-cross_orders: 6
+cross_orders: 1 #6
 embed_size: 64

diff --git a/data/datasets/mf_data_pipeline.py b/data/datasets/mf_data_pipeline.py
index fbd0580..076dcc0 100644
--- a/data/datasets/mf_data_pipeline.py
+++ b/data/datasets/mf_data_pipeline.py
@@ -24,11 +24,13 @@ def split(self, df):
         for _, user_df in df.groupby('user_id'):
             if self.cfg.loss_name == 'pointwise':
-                user_train_df, user_test_df = train_test_split(user_df, test_size=.2, stratify=user_df['rating'])
-                user_train_df, user_valid_df = train_test_split(user_train_df, test_size=.25, stratify=user_train_df['rating'])
+                user_train_df, user_test_df = train_test_split(
+                    user_df, test_size=.2, random_state=self.cfg.seed, stratify=user_df['rating'])
+                user_train_df, user_valid_df = train_test_split(
+                    user_train_df, test_size=.25, random_state=self.cfg.seed, stratify=user_train_df['rating'])
             else:
-                user_train_df, user_test_df = train_test_split(user_df, test_size=.2)
-                user_train_df, user_valid_df = train_test_split(user_train_df, test_size=.25)
+                user_train_df, user_test_df = train_test_split(user_df, test_size=.2, random_state=self.cfg.seed)
+                user_train_df, user_valid_df = train_test_split(user_train_df, test_size=.25, random_state=self.cfg.seed)
             train_df.append(user_train_df)
             valid_df.append(user_valid_df)
             test_df.append(user_test_df)
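The split change above threads cfg.seed through every train_test_split call, so the per-user 60/20/20 split (20% test, then 25% of the remainder as validation) is reproducible across runs. A minimal standalone sketch of that arithmetic, using a toy frame keyed by the user_id column named in the diff; the seed value here is illustrative:

    import pandas as pd
    from sklearn.model_selection import train_test_split

    df = pd.DataFrame({'user_id': [1] * 10, 'item_id': range(10)})
    seed = 42  # stands in for self.cfg.seed

    # 80/20 first, then 75/25 of the remainder -> 60/20/20 overall
    train, test = train_test_split(df, test_size=.2, random_state=seed)
    train, valid = train_test_split(train, test_size=.25, random_state=seed)
    assert (len(train), len(valid), len(test)) == (6, 2, 2)

Re-running with the same seed yields identical splits, which is what makes the validation runs in this patch comparable with one another.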
diff --git a/models/dcn.py b/models/dcn.py
index 2b980dd..522d90f 100644
--- a/models/dcn.py
+++ b/models/dcn.py
@@ -2,6 +2,7 @@
 import torch.nn as nn
 
 from models.base_model import BaseModel
+from loguru import logger
 
 class DCN(BaseModel):
     def __init__(self, cfg, num_users, num_items, attributes_count: list):
@@ -21,7 +22,7 @@ def __init__(self, cfg, num_users, num_items, attributes_count: list):
 
     def _deep(self):
         deep = nn.Sequential()
-        for idx in range(len(self.hidden_dims)-1): #
+        for idx in range(len(self.hidden_dims)-1):
             deep.append(nn.Linear(self.hidden_dims[idx], self.hidden_dims[idx+1]))
             deep.append(nn.ReLU())
         return deep
@@ -34,10 +35,10 @@ def _cross(self):
     def _init_weights(self):
         for child in self.children():
             if isinstance(child, nn.Embedding):
-                nn.init.xavier_uniform_(child.weight)
+                nn.init.kaiming_normal_(child.weight)
             elif isinstance(child, nn.Linear):
-                nn.init.xavier_uniform_(child.weight)
-                nn.init.uniform_(child.bias)
+                nn.init.kaiming_normal_(child.weight)
+                nn.init.zeros_(child.bias)
 
     def forward(self, user_id, item_id, *attributes):
         user_emb = self.user_embedding(user_id)
@@ -54,7 +55,7 @@ def forward(self, user_id, item_id, *attributes):
         input_x = torch.cat([self.deep(input_x), self._forward_cross(input_x)], dim=1)
         return torch.sigmoid(self.output_layer(input_x))
-    
+
     def _forward_cross(self, x):
         prev_x = x
         for weight, bias in zip(self.cross_weights, self.cross_bias):

diff --git a/train.py b/train.py
index 7a06fb1..d816585 100644
--- a/train.py
+++ b/train.py
@@ -81,11 +81,13 @@ def train(cfg, args):#train_dataset, valid_dataset, test_dataset, model_info):
     elif cfg.model_name in ('MF', ):
         trainer = MFTrainer(cfg, args.model_info['num_items'], args.model_info['num_users'])
         trainer.run(train_dataloader, valid_dataloader, args.valid_eval_data)
+        trainer.load_best_model()
         trainer.evaluate(args.test_eval_data, 'test')
     elif cfg.model_name in ('DCN', ):
         trainer = DCNTrainer(cfg, args.model_info['num_items'], args.model_info['num_users'],
                              args.data_pipeline.item2attributes, args.data_pipeline.attributes_count)
         trainer.run(train_dataloader, valid_dataloader, args.valid_eval_data)
+        trainer.load_best_model()
         trainer.evaluate(args.test_eval_data, 'test')
 
 @hydra.main(version_base=None, config_path="configs", config_name="train_config")
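The body of _forward_cross is cut off by the hunk boundary above, but the (cross_weights, cross_bias) pairs match the standard DCN cross layer of Wang et al. (2017): x_{l+1} = x_0 (x_l^T w_l) + b_l + x_l. A minimal sketch of that recurrence, assuming each weight has shape (dim, 1) and each bias shape (dim,); this is an assumption, since the parameter construction in _cross is not visible in this diff:

    import torch

    def forward_cross(x0: torch.Tensor, cross_weights, cross_bias) -> torch.Tensor:
        # x0: (batch, dim); one (weight, bias) pair per cross order
        prev_x = x0
        for weight, bias in zip(cross_weights, cross_bias):
            # x_{l+1} = x_0 * (x_l @ w_l) + b_l + x_l: an explicit feature
            # crossing of x_0 with x_l, plus a residual connection back to x_l
            prev_x = x0 * (prev_x @ weight) + bias + prev_x
        return prev_x

With the config's new cross_orders: 1 this loop runs once, so only second-order feature crossings survive; the trailing #6 in the config records the deeper setting this validation run backs out of.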
diff --git a/trainers/dcn_trainer.py b/trainers/dcn_trainer.py
index 9885217..131b4fa 100644
--- a/trainers/dcn_trainer.py
+++ b/trainers/dcn_trainer.py
@@ -82,18 +82,12 @@ def train(self, train_dataloader: DataLoader) -> float:
         for data in tqdm(train_dataloader):
             user_id, pos_item, neg_item = data['user_id'].to(self.device), data['pos_item'].to(self.device), \
                 data['neg_item'].to(self.device)
-            # pos_item_categories, pos_item_statecity, neg_item_categories, neg_item_statecity = \
-            #     data['pos_item_categories'].to(self.device), data['pos_item_statecity'].to(self.device), \
-            #     data['neg_item_categories'].to(self.device), data['neg_item_statecity'].to(self.device)
+
             pos_item_categories = torch.tensor([self.item2attributes[item.item()]['categories'] for item in data['pos_item']]).to(self.device)
             pos_item_statecity = torch.tensor([self.item2attributes[item.item()]['statecity'] for item in data['pos_item']]).to(self.device)
             neg_item_categories = torch.tensor([self.item2attributes[item.item()]['categories'] for item in data['neg_item']]).to(self.device)
             neg_item_statecity = torch.tensor([self.item2attributes[item.item()]['statecity'] for item in data['neg_item']]).to(self.device)
-            # logger.info(f"pos_categories: {torch.equal(pos_item_categories, torch.tensor([self.item2attributes[item.item()]['categories'] for item in data['pos_item']]).to(self.device))}")
-            # logger.info(f"pos_statecity: {torch.equal(pos_item_statecity, torch.tensor([self.item2attributes[item.item()]['statecity'] for item in data['pos_item']]).to(self.device))}")
-            # logger.info(f"neg_categories: {torch.equal(neg_item_categories, torch.tensor([self.item2attributes[item.item()]['categories'] for item in data['neg_item']]).to(self.device))}")
-            # logger.info(f"neg_statecity: {torch.equal(neg_item_statecity, torch.tensor([self.item2attributes[item.item()]['statecity'] for item in data['neg_item']]).to(self.device))}")
 
             pos_pred = self.model(user_id, pos_item, pos_item_categories, pos_item_statecity)
             neg_pred = self.model(user_id, neg_item, neg_item_categories, neg_item_statecity)
@@ -112,9 +106,6 @@ def validate(self, valid_dataloader: DataLoader) -> tuple[float]:
         for data in tqdm(valid_dataloader):
             user_id, pos_item, neg_item = data['user_id'].to(self.device), data['pos_item'].to(self.device), \
                 data['neg_item'].to(self.device)
-            # pos_item_categories, pos_item_statecity, neg_item_categories, neg_item_statecity = \
-            #     data['pos_item_categories'].to(self.device), data['pos_item_statecity'].to(self.device), \
-            #     data['neg_item_categories'].to(self.device), data['neg_item_statecity'].to(self.device)
             pos_item_categories = torch.tensor([self.item2attributes[item.item()]['categories'] for item in data['pos_item']]).to(self.device)
             pos_item_statecity = torch.tensor([self.item2attributes[item.item()]['statecity'] for item in data['pos_item']]).to(self.device)
             neg_item_categories = torch.tensor([self.item2attributes[item.item()]['categories'] for item in data['neg_item']]).to(self.device)
             neg_item_statecity = torch.tensor([self.item2attributes[item.item()]['statecity'] for item in data['neg_item']]).to(self.device)
@@ -132,33 +123,32 @@ def validate(self, valid_dataloader: DataLoader) -> tuple[float]:
     def evaluate(self, eval_data: pd.DataFrame, mode='valid') -> tuple:
         self.model.eval()
         actual, predicted = [], []
-        logger.info(f"Before inference #0: {torch.cuda.memory_allocated(self.device)} allocated and {torch.cuda.memory_reserved(self.device)} reserved")
         item_input = torch.tensor([item_id for item_id in range(self.num_items)], dtype=torch.int32).to(self.device)
-        # item_categories = torch.tensor([self.item2attributes[item]['categories'] for item in range(self.num_items)], dtype=torch.int32).to(self.device)
-        # item_statecity = torch.tensor([self.item2attributes[item]['statecity'] for item in range(self.num_items)], dtype=torch.int32).to(self.device)
-        chunk_size = 32 # self.cfg.batch_size
-        # logger.info(f"Before inference #1: {torch.cuda.memory_allocated(self.device)} allocated and {torch.cuda.memory_reserved(self.device)} reserved")
         torch.cuda.empty_cache()
-        # logger.info(f"Before inference #2: {torch.cuda.memory_allocated(self.device)} allocated and {torch.cuda.memory_reserved(self.device)} reserved")
-        for user_id, row in tqdm(eval_data[:10].iterrows(), total=eval_data.shape[0]):
+        chunk_size = self.cfg.batch_size
+
+        # evaluate on a subset of users during validation for speed
+        if mode == 'valid':
+            eval_data = eval_data[:1000]
+
+        for user_id, row in tqdm(eval_data.iterrows(), total=eval_data.shape[0]):
             pred = []
             for idx in range(0, eval_data.shape[0], chunk_size):
                 chunk_item_input = item_input[idx:idx+chunk_size]
-                chunk_item_categories = torch.tensor([self.item2attributes[item]['categories'] for item in range(idx, min(self.num_items, idx+chunk_size))], dtype=torch.int32).to(self.device)
-                chunk_item_statecity = torch.tensor([self.item2attributes[item]['statecity'] for item in range(idx, min(self.num_items, idx+chunk_size))], dtype=torch.int32).to(self.device)
-                # print(f"{chunk_size}, {chunk_item_input.size()}, {chunk_item_categories.size()}, {chunk_item_statecity.size()}")
-                # logger.info(f"{torch.cuda.memory_allocated(self.device)} allocated and {torch.cuda.memory_reserved(self.device)} reserved")
+                chunk_item_categories = torch.tensor([
+                    self.item2attributes[item]['categories'] for item in range(idx, min(self.num_items, idx+chunk_size))], dtype=torch.int32).to(self.device)
+                chunk_item_statecity = torch.tensor([
+                    self.item2attributes[item]['statecity'] for item in range(idx, min(self.num_items, idx+chunk_size))], dtype=torch.int32).to(self.device)
 
-                chunk_pred: Tensor = self.model(torch.tensor([user_id,]*len(chunk_item_input), dtype=torch.int32).to(self.device), chunk_item_input, chunk_item_categories, chunk_item_statecity)
+                chunk_pred: Tensor = self.model(
+                    torch.tensor([user_id,]*len(chunk_item_input), dtype=torch.int32).to(self.device), chunk_item_input, chunk_item_categories, chunk_item_statecity)
                 pred.extend(chunk_pred.detach().cpu().numpy())
-                # torch.cuda.empty_cache()
-            # pred = self.model(torch.tensor([user_id,]*self.num_items).to(self.device), item_input, item_categories, item_statecity)
             batch_predicted = \
                 self._generate_top_k_recommendation(np.array(pred).reshape(-1), row['mask_items'])
             actual.append(row['pos_items'])
             predicted.append(batch_predicted)
+        logger.info(f'user 0 predicted: {predicted[0]}, actual: {actual[0]}')
         test_precision_at_k = precision_at_k(actual, predicted, self.cfg.top_n)
         test_recall_at_k = recall_at_k(actual, predicted, self.cfg.top_n)
         test_map_at_k = map_at_k(actual, predicted, self.cfg.top_n)
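The rewritten evaluate scores each user against the item catalog in batch_size chunks, so only one chunk of inputs and scores is resident on the GPU at a time. One thing worth flagging: the inner loop's bound, eval_data.shape[0], is the number of evaluation rows rather than the item count, so it covers the full catalog only when the two happen to match. A standalone sketch of the intended chunking pattern, with score_fn and num_items as hypothetical stand-ins for the model call and catalog size:

    import torch

    @torch.no_grad()
    def score_all_items(score_fn, user_id: int, num_items: int,
                        chunk_size: int, device: str) -> torch.Tensor:
        # Score one user against every item, one fixed-size chunk at a time.
        preds = []
        for start in range(0, num_items, chunk_size):  # walk the catalog, not the eval frame
            items = torch.arange(start, min(num_items, start + chunk_size),
                                 dtype=torch.int32, device=device)
            users = torch.full_like(items, user_id)  # repeat the user id per item
            preds.append(score_fn(users, items).cpu())  # move scores off-GPU immediately
        return torch.cat(preds)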
@@ -178,8 +168,7 @@ def evaluate(self, eval_data: pd.DataFrame, mode='valid') -> tuple:
     def _generate_top_k_recommendation(self, pred: np.ndarray, mask_items) -> tuple[list]:
 
         # mask to train items
-        # pred = pred.cpu().detach().numpy()
-        pred[mask_items] = -3.40282e+38 # finfo(float32)
+        pred[mask_items] = 0 # sigmoid
 
         # find the largest topK item indexes by user
         topn_index = np.argpartition(pred, -self.cfg.top_n)[-self.cfg.top_n:]
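Swapping the float32 minimum for 0 as the mask value is valid here only because the model ends in a sigmoid, so every live score lies strictly inside (0, 1) and masked items can never outrank them; with raw logits the old sentinel would still be needed. A small self-contained sketch of the mask-then-argpartition pattern, under that same sigmoid-range assumption:

    import numpy as np

    def top_k_unseen(pred: np.ndarray, mask_items, k: int) -> np.ndarray:
        pred = pred.copy()    # keep the caller's scores intact
        pred[mask_items] = 0  # safe floor: sigmoid outputs are strictly positive
        top_idx = np.argpartition(pred, -k)[-k:]  # O(n) selection of the k largest
        return top_idx[np.argsort(pred[top_idx])[::-1]]  # order best-first

    scores = np.array([.9, .2, .8, .4, .7])
    print(top_k_unseen(scores, mask_items=[0], k=2))  # -> [2 4]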