diff --git a/core/data_utils/load_data_lp.py b/core/data_utils/load_data_lp.py
index f4d0386836..e3faa2559f 100644
--- a/core/data_utils/load_data_lp.py
+++ b/core/data_utils/load_data_lp.py
@@ -200,40 +200,60 @@ def load_taglp_citationv8(cfg: CN) -> Tuple[Dict[str, Data], List[str]]:
     splits, text, data = load_taglp_arxiv2023(args.data)
     print(f'directed: {data.is_directed()}')
     print(data)
-    print(type(text))
-
+    print(text[0])
+    print(f"train dataset: {splits['train'].pos_edge_label.shape[0]*2} edges.")
+    print(f"valid dataset: {splits['valid'].pos_edge_label.shape[0]*2} edges.")
+    print(f"test dataset: {splits['test'].pos_edge_label.shape[0]*2} edges.")
+
     print('citationv8')
     splits, text, data = load_taglp_citationv8(args.data)
     print(f'directed: {data.is_directed()}')
     print(data)
-    print(type(text))
+    print(text[0])
+    print(f"train dataset: {splits['train'].pos_edge_label.shape[0]*2} edges.")
+    print(f"valid dataset: {splits['valid'].pos_edge_label.shape[0]*2} edges.")
+    print(f"test dataset: {splits['test'].pos_edge_label.shape[0]*2} edges.")
-    exit(-1)
 
     print('cora')
     splits, text, data = load_taglp_cora(args.data)
     print(f'directed: {data.is_directed()}')
     print(data)
-    print(type(text))
-
-    print('product')
-    splits, text, data = load_taglp_product(args.data)
-    print(f'directed: {data.is_directed()}')
-    print(data)
-    print(type(text))
+    print(text[0])
+    print(f"train dataset: {splits['train'].pos_edge_label.shape[0]*2} edges.")
+    print(f"valid dataset: {splits['valid'].pos_edge_label.shape[0]*2} edges.")
+    print(f"test dataset: {splits['test'].pos_edge_label.shape[0]*2} edges.")
+
+    # print('product')
+    # splits, text, data = load_taglp_product(args.data)
+    # print(f'directed: {data.is_directed()}')
+    # print(data)
+    # print(text[0])
+    # print(f"train dataset: {splits['train'].pos_edge_label.shape[0]*2} edges.")
+    # print(f"valid dataset: {splits['valid'].pos_edge_label.shape[0]*2} edges.")
+    # print(f"test dataset: {splits['test'].pos_edge_label.shape[0]*2} edges.")
 
     print('pubmed')
     splits, text, data = load_taglp_pubmed(args.data)
     print(f'directed: {data.is_directed()}')
     print(data)
-    print(type(text))
-
-    splits, text, data = load_taglp_citeseer(args.data)
-    print(f'directed: {data.is_directed()}')
-    print(data)
-    print(type(text))
+    print(text[0])
+    print(f"train dataset: {splits['train'].pos_edge_label.shape[0]*2} edges.")
+    print(f"valid dataset: {splits['valid'].pos_edge_label.shape[0]*2} edges.")
+    print(f"test dataset: {splits['test'].pos_edge_label.shape[0]*2} edges.")
+
+    # splits, text, data = load_taglp_citeseer(args.data)
+    # print(f'directed: {data.is_directed()}')
+    # print(data)
+    # # print(text[0])
+    # print(f"train dataset: {splits['train'].pos_edge_label.shape[0]*2} edges.")
+    # print(f"valid dataset: {splits['valid'].pos_edge_label.shape[0]*2} edges.")
+    # print(f"test dataset: {splits['test'].pos_edge_label.shape[0]*2} edges.")
 
     print(args.data)
     splits, text, data = load_taglp_ogbn_arxiv(args.data)
     print(f'directed: {data.is_directed()}')
     print(data)
-    print(type(text))
+    print(text[0])
+    print(f"train dataset: {splits['train'].pos_edge_label.shape[0]*2} edges.")
+    print(f"valid dataset: {splits['valid'].pos_edge_label.shape[0]*2} edges.")
+    print(f"test dataset: {splits['test'].pos_edge_label.shape[0]*2} edges.")
diff --git a/core/embedding/line_tag.py b/core/embedding/line_tag.py
index 3615d24f56..84aaa7e09d 100644
--- a/core/embedding/line_tag.py
+++ b/core/embedding/line_tag.py
@@ -29,7 +29,7 @@ from heuristic.eval import get_metric_score
 from data_utils.load import load_data_lp
-from graphgps.visualization.adj import plot_coo_matrix, construct_sparse_adj
+from core.model_finetuning.adj import plot_coo_matrix, construct_sparse_adj
 from core.embedding.tune_utils import (
     get_git_repo_root_path,
     param_tune_acc_mrr
diff --git a/core/embedding/node2vec_tag.py b/core/embedding/node2vec_tag.py
index 4e894e1c40..001dc502f7 100644
--- a/core/embedding/node2vec_tag.py
+++ b/core/embedding/node2vec_tag.py
@@ -6,7 +6,7 @@ import scipy.sparse as ssp
 import torch
 import matplotlib.pyplot as plt
-from graphgps.visualization.adj import plot_coo_matrix, construct_sparse_adj
+from core.model_finetuning.adj import plot_coo_matrix, construct_sparse_adj
 from ogb.linkproppred import Evaluator
 from heuristic.eval import get_metric_score
 from embedding.ge.models import Node2Vec
diff --git a/core/graphgps/network/ncn.py b/core/graphgps/network/ncn.py
index 807d8eba26..10df404cd9 100644
--- a/core/graphgps/network/ncn.py
+++ b/core/graphgps/network/ncn.py
@@ -9,7 +9,7 @@ from torch_scatter import scatter_add
 from typing import Iterable, Final
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from graphgps.visualization.adj import (plot_coo_matrix,
+from core.model_finetuning.adj import (plot_coo_matrix,
                                        coo_matrix,
                                        construct_sparse_adj,
                                        coo_tensor_to_coo_matrix)
diff --git a/core/heuristic/arxiv2023_heuristic.py b/core/heuristic/arxiv2023_heuristic.py
index f13cc2d1bc..0526099a95 100644
--- a/core/heuristic/arxiv2023_heuristic.py
+++ b/core/heuristic/arxiv2023_heuristic.py
@@ -11,7 +11,7 @@ from heuristic.gsf import Ben_PPR, shortest_path, katz_apro, katz_close , SymPPR
 from data_utils.load_data_lp import get_raw_text_pubmed, get_pubmed_lp
 import matplotlib.pyplot as plt
-from core.graphgps.visualization.adj import construct_sparse_adj, plot_coo_matrix
+from core.model_finetuning.adj import construct_sparse_adj, plot_coo_matrix
 import scipy.sparse as ssp
 from core.graphgps.utility.utils import get_git_repo_root_path
 from typing import Dict
diff --git a/core/heuristic/cora_heuristic.py b/core/heuristic/cora_heuristic.py
index 1b711bd0ba..2d99409e4c 100644
--- a/core/heuristic/cora_heuristic.py
+++ b/core/heuristic/cora_heuristic.py
@@ -14,7 +14,7 @@ from heuristic.gsf import Ben_PPR, shortest_path, katz_apro, katz_close, SymPPR
 from textfeat.mlp_dot_product import pairwise_prediction
 import matplotlib.pyplot as plt
-from core.graphgps.visualization.adj import plot_coo_matrix, construct_sparse_adj
+from core.model_finetuning.adj import plot_coo_matrix, construct_sparse_adj
 from core.graphgps.utility.utils import get_git_repo_root_path, append_acc_to_excel, append_mrr_to_excel
 from ogb.linkproppred import PygLinkPropPredDataset, Evaluator
 from heuristic.eval import get_metric_score
diff --git a/core/heuristic/ogbn_products_heuristic.py b/core/heuristic/ogbn_products_heuristic.py
index e6a9bffd3f..3cbd62cba7 100644
--- a/core/heuristic/ogbn_products_heuristic.py
+++ b/core/heuristic/ogbn_products_heuristic.py
@@ -9,7 +9,7 @@ from heuristic.lsf import CN, AA, RA, InverseRA
 from heuristic.gsf import Ben_PPR, shortest_path, katz_apro, katz_close , SymPPR
 import matplotlib.pyplot as plt
-from core.graphgps.visualization.adj import construct_sparse_adj, plot_coo_matrix, plot_pos_neg_adj
+from core.model_finetuning.adj import construct_sparse_adj, plot_coo_matrix, plot_pos_neg_adj
 import scipy.sparse as ssp
 from ogb.linkproppred import PygLinkPropPredDataset, Evaluator
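The import rewrites above all point at the same pair of relocated helpers. For orientation, a minimal usage sketch — the signatures are inferred from the call sites in this diff (a 2×E integer edge array in, a `scipy.sparse` COO matrix out), so treat them as assumptions rather than the module's documented API:

```python
# Usage sketch for the relocated helpers. construct_sparse_adj is assumed to
# map a 2 x E integer edge array to a scipy.sparse COO matrix, and
# plot_coo_matrix to render its sparsity pattern -- both inferred from the
# call sites in this diff.
import numpy as np
import networkx as nx
from core.model_finetuning.adj import construct_sparse_adj, plot_coo_matrix

edge_index = np.array([[0, 1, 2, 3],
                       [1, 2, 3, 0]])        # 2 x E edge list (toy graph)
m = construct_sparse_adj(edge_index)         # sparse adjacency matrix
plot_coo_matrix(m, 'toy_data_edges.png')     # sparsity-pattern figure
G = nx.from_scipy_sparse_array(m)            # NetworkX view for statistics
print(G.number_of_nodes(), G.number_of_edges())
```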
diff --git a/core/heuristic/pubmed_heuristic.py b/core/heuristic/pubmed_heuristic.py
index 257bf5f2f4..d991ade01e 100644
--- a/core/heuristic/pubmed_heuristic.py
+++ b/core/heuristic/pubmed_heuristic.py
@@ -11,9 +11,8 @@ from torch_geometric.transforms import RandomLinkSplit
 import pandas as pd
 from torch_geometric.data import Data, InMemoryDataset
-from data_utils.load_data_lp import get_raw_text_pubmed, get_pubmed_lp
 import matplotlib.pyplot as plt
-from core.graphgps.visualization.adj import construct_sparse_adj, plot_coo_matrix, plot_pos_neg_adj
+from core.model_finetuning.adj import construct_sparse_adj, plot_coo_matrix, plot_pos_neg_adj
 import scipy.sparse as ssp
 from heuristic.lsf import CN, AA, RA, InverseRA
diff --git a/core/graphgps/visualization/adj.py b/core/model_finetuning/adj.py
similarity index 96%
rename from core/graphgps/visualization/adj.py
rename to core/model_finetuning/adj.py
index ddc3dfe6de..ef3c0e1966 100644
--- a/core/graphgps/visualization/adj.py
+++ b/core/model_finetuning/adj.py
@@ -12,7 +12,7 @@ from ogb.nodeproppred import NodePropPredDataset
 from scipy.sparse import csc_array
-from data_utils.load import load_data_nc
+from data_utils.load import load_data_nc, load_graph_lp
 import matplotlib.pyplot as plt
 from scipy.sparse import coo_matrix
@@ -237,49 +237,25 @@ def avg_degree(G):
     args = parser.parse_args()
 
     scale = 100000
-    name_list = ['ogbn-arxiv', 'arxiv_2023', 'cora', 'pubmed']
+    name_list = ['cora', 'pubmed', 'arxiv_2023', 'ogbn-arxiv', 'citationv8']
 
     for name in name_list:
+        if name == 'cora':
+            # LP-style loading for cora is still a placeholder
+            data, text = load_graph_lp[name](use_mask=False)
+            raise NotImplementedError
 
-        if name == 'ogbn-products':
-            dataset = NodePropPredDataset(name)
-            edge_index = dataset[0][0]['edge_index']
-
-            # edge index to sparse matrix
-            edge_index = edge_index[:, ::scale]
-            m = construct_sparse_adj(edge_index)
-
-            G = nx.from_scipy_sparse_array(m)
-            plot_coo_matrix(m, f'{name}_data_edges.png')
-
-            fig, ax = spy.spy_to_mpl(m)
-            fig.savefig(f"{name}_data_edges_spy.png", bbox_inches='tight')
-
-            data, text = load_data_nc[name]()
-            m = construct_sparse_adj(data.edge_index.coo())
-            plot_coo_matrix(m, f'{name}_data_index.png')
-
-            fig, ax = spy.spy_to_mpl(m)
-            fig.savefig(f"{name}_data_index_spy.png", bbox_inches='tight')
-
-            heterogeneity = calculate_heterogeneity(G)
-            print(f"{name}, heterogeneity: {heterogeneity}. num_node: {dataset[0].num_node}")
-
-        if name == 'ogbn-arxiv':
-            dataset = NodePropPredDataset(name)
-            edge_index = dataset[0][0]['edge_index']
-
-            m = construct_sparse_adj(edge_index[:, ::2])
-            G = nx.from_scipy_sparse_array(m)
-
-            plot_coo_matrix(m, f'{name}_data_edges.png')
-
-            fig, ax = spy.spy_to_mpl(m)
-            fig.savefig(f"{name}_data_edges_spy.png", bbox_inches='tight')
-
-            heterogeneity = calculate_heterogeneity(G)
-            num_nodes = dataset[0][0]['num_nodes']
-            num_edges = dataset[0][0]['edge_index'].shape[1]
+        if name == 'pubmed':
+            data, text = load_data_nc[name](use_mask=False)
+            compare_adj(name, data.edge_index.numpy())
+            # build the sparse adjacency before wrapping it as a NetworkX graph
+            m = construct_sparse_adj(data.edge_index.numpy())
+            G = nx.from_scipy_sparse_array(m)
+
+            fig, ax = spy.spy_to_mpl(m)
+            fig.savefig(f"{name}_data_index_spy.png", bbox_inches='tight')
+
+            heterogeneity = calculate_heterogeneity(G)
+            num_nodes = data.num_nodes
+            num_edges = data.edge_index.shape[1]
             avg_degree_arithmetic = int(num_edges / num_nodes)
             avg_degree_G, avg_degree_dict = avg_degree(G)
             avg_degree_G2 = avg_degree2(G, avg_degree_dict)
             print(f"{name}, heterogeneity: {heterogeneity}. num_node: {num_nodes}, num_edges: {num_edges}, \
                 avg degree arithmetic {avg_degree_arithmetic}, \
                 avg degree G {avg_degree_G}, avg degree G2 {avg_degree_G2}, clustering {nx.average_clustering(G)}.")
 
+
         if name == 'arxiv_2023':
             data, text = load_data_nc[name]()
             m = construct_sparse_adj(data.edge_index.numpy())
@@ -307,21 +284,50 @@ def avg_degree(G):
             avg degree arithmetic {avg_degree_arithmetic}, \
             avg degree G {avg_degree_G}, avg degree G2 {avg_degree_G2}, clustering {nx.average_clustering(G)}.")
 
-        if name == 'pubmed':
-            data, text = load_data_nc[name](use_mask=False)
-            G = nx.from_scipy_sparse_array(m)
-            compare_adj(name, data.edge_index.numpy())
-            m = construct_sparse_adj(data.edge_index.numpy())
+
+        if name == 'ogbn-arxiv':
+            dataset = NodePropPredDataset(name)
+            edge_index = dataset[0][0]['edge_index']
+
+            m = construct_sparse_adj(edge_index[:, ::2])
+            G = nx.from_scipy_sparse_array(m)
+
+            plot_coo_matrix(m, f'{name}_data_edges.png')
+
             fig, ax = spy.spy_to_mpl(m)
-            fig.savefig(f"{name}_data_index_spy.png", bbox_inches='tight')
+            fig.savefig(f"{name}_data_edges_spy.png", bbox_inches='tight')
 
             heterogeneity = calculate_heterogeneity(G)
-            num_nodes = data.num_nodes
-            num_edges = data.edge_index.shape[1]
+            num_nodes = dataset[0][0]['num_nodes']
+            num_edges = dataset[0][0]['edge_index'].shape[1]
             avg_degree_arithmetic = int(num_edges / num_nodes)
             avg_degree_G, avg_degree_dict = avg_degree(G)
             avg_degree_G2 = avg_degree2(G, avg_degree_dict)
             print(f"{name}, heterogeneity: {heterogeneity}. num_node: {num_nodes}, num_edges: {num_edges}, \
                 avg degree arithmetic {avg_degree_arithmetic}, \
-                avg degree G {avg_degree_G}, avg degree G2 {avg_degree_G2}, clustering {nx.average_clustering(G)}.")
\ No newline at end of file
+                avg degree G {avg_degree_G}, avg degree G2 {avg_degree_G2}, clustering {nx.average_clustering(G)}.")
+
+
+        if name == 'citationv8':
+            # citationv8 is not hosted on OGB, so it is loaded through the
+            # repo's own loader rather than NodePropPredDataset
+            data, text = load_data_nc[name]()
+
+            # the graph is large; subsample edges before building the adjacency
+            edge_index = data.edge_index.numpy()[:, ::scale]
+            m = construct_sparse_adj(edge_index)
+
+            G = nx.from_scipy_sparse_array(m)
+            plot_coo_matrix(m, f'{name}_data_edges.png')
+
+            fig, ax = spy.spy_to_mpl(m)
+            fig.savefig(f"{name}_data_edges_spy.png", bbox_inches='tight')
+
+            m = construct_sparse_adj(data.edge_index.numpy())
+            plot_coo_matrix(m, f'{name}_data_index.png')
+
+            fig, ax = spy.spy_to_mpl(m)
+            fig.savefig(f"{name}_data_index_spy.png", bbox_inches='tight')
+
+            heterogeneity = calculate_heterogeneity(G)
+            print(f"{name}, heterogeneity: {heterogeneity}. num_node: {data.num_nodes}")
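The degree and clustering statistics printed in this loop can be reproduced on a toy graph with plain NetworkX. A short sketch covering only the standard parts; `calculate_heterogeneity`, `avg_degree`, and `avg_degree2` are repo-local helpers not shown in this diff, so they are omitted:

```python
# Toy reproduction of the NetworkX-based statistics from adj.py's main loop.
# calculate_heterogeneity / avg_degree / avg_degree2 are repo-local helpers
# not shown in this diff and are therefore left out.
import numpy as np
import networkx as nx
from scipy.sparse import coo_matrix

edge_index = np.array([[0, 1, 2, 3, 0],
                       [1, 2, 3, 0, 2]])               # 2 x E edge list
vals = np.ones(edge_index.shape[1])
m = coo_matrix((vals, (edge_index[0], edge_index[1])), shape=(4, 4))

G = nx.from_scipy_sparse_array(m)
num_nodes, num_edges = m.shape[0], edge_index.shape[1]
avg_degree_arithmetic = int(num_edges / num_nodes)
print(f"num_node: {num_nodes}, num_edges: {num_edges}, "
      f"avg degree arithmetic {avg_degree_arithmetic}, "
      f"clustering {nx.average_clustering(G)}.")
```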
diff --git a/core/model_finetuning/create_dataset.py b/core/model_finetuning/create_dataset.py
index ec4ab2bf3e..ede46fa9de 100644
--- a/core/model_finetuning/create_dataset.py
+++ b/core/model_finetuning/create_dataset.py
@@ -134,25 +134,15 @@ def process_texts(pos_edge_index, neg_edge_index, text):
     return dataset, labels
 
-def main():
+def save_dataset(embedding_model_name, cfg, args):
     # create dataset with 3 seeds
-    embedding_model_name = "tfidf"
-    file_path = f'{get_git_repo_root_path()}/'
-    args = parse_args()
-    cfg = set_cfg(file_path, args.cfg_file)
-    cfg.merge_from_list(args.opts)
-    custom_set_out_dir(cfg, args.cfg_file, cfg.wandb.name_tag)
-    dump_cfg(cfg)
-    cfg = config_device(cfg)
-    torch.set_num_threads(cfg.run.num_threads)
-    cfg.data.name = args.data
-    cfg.seed = args.seed
+
     for run_id, seed, split_index in zip(*run_loop_settings(cfg, args)):
         print(f'run id : {run_id}, seed: {seed}, split_index: {split_index}')
         cfg.seed = seed
         cfg.run_id = run_id
         seed_everything(cfg.seed)
-        splits, text, data = load_data_lp[cfg.data.name](cfg.data)
+        splits, text, _ = load_data_lp[cfg.data.name](cfg.data)
 
         if embedding_model_name == "tfidf":
             train_dataset, train_labels = process_texts(
@@ -177,35 +167,16 @@ def main():
             start_time = time.time()
             train_dataset = vectorizer.fit_transform(train_dataset)
             print(f'fit_transform: {time.time() - start_time:.2f} seconds')
-            ssp.save_npz(f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_train_dataset.npz', train_dataset)
-            print(f'Saved train dataset to ./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_train_dataset.npz')
-            print(f'save data: {time.time() - start_time:.2f} seconds')
-            torch.save(train_labels, f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_train_labels.npz')
-            print(f'Saved train labels to ./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_train_labels.npz')
-            print(f'save label: {time.time() - start_time:.2f} seconds')
-
-            del train_dataset
+
+            # del train_dataset
 
             start_time = time.time()
             val_dataset = vectorizer.transform(val_dataset)
             print(f'fit_transform: {time.time() - start_time:.2f} seconds')
-            ssp.save_npz(f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_val_dataset.npz', val_dataset)
-            print(f'save data: {time.time() - start_time:.2f} seconds')
-            print(f'Saved validation dataset to ./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_val_dataset.npz')
-            torch.save(val_labels, f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_val_labels.npz')
-            print(f'save label: {time.time() - start_time:.2f} seconds')
-            print(f'Saved validation labels to ./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_val_labels.npz')
-            del val_dataset
-
+
             start_time = time.time()
             test_dataset = vectorizer.transform(test_dataset)
             print(f'fit_transform: {time.time() - start_time:.2f} seconds')
-            ssp.save_npz(f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_test_dataset.npz', test_dataset)
-            print(f'save data: {time.time() - start_time:.2f} seconds')
-            print(f'Saved test dataset to ./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_test_dataset.npz')
-            torch.save(test_labels, f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_test_labels.npz')
-            print(f'save label: {time.time() - start_time:.2f} seconds')
-            print(f'Saved test labels to ./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_test_labels.npz')
-            del test_dataset
+
 
         elif embedding_model_name == "word2vec":
             sentences = [text[i].split() for i in range(len(text))]
@@ -254,26 +225,33 @@ def main():
                 embedding_model,
                 "mpnet"
             )
+
+    return train_dataset, train_labels, val_dataset, val_labels, test_dataset, test_labels
+    # ssp.save_npz(f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_train_dataset.npz', train_dataset)
+    # print(f'Saved train dataset to ./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_train_dataset.npz')
+    # print(f'save data: {time.time() - start_time:.2f} seconds')
+    # torch.save(train_labels, f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_train_labels.npz')
+    # print(f'Saved train labels to ./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_train_labels.npz')
+    # print(f'save label: {time.time() - start_time:.2f} seconds')
+
+    # ssp.save_npz(f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_val_dataset.npz', val_dataset)
+    # print(f'save data: {time.time() - start_time:.2f} seconds')
+    # print(f'Saved validation dataset to ./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_val_dataset.npz')
+    # torch.save(val_labels, f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_val_labels.npz')
+    # print(f'save label: {time.time() - start_time:.2f} seconds')
+    # print(f'Saved validation labels to ./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_val_labels.npz')
+    # del val_dataset
+
+    # ssp.save_npz(f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_test_dataset.npz', test_dataset)
+    # print(f'save data: {time.time() - start_time:.2f} seconds')
+    # print(f'Saved test dataset to ./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_test_dataset.npz')
+    # torch.save(test_labels, f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_test_labels.npz')
+    # print(f'save label: {time.time() - start_time:.2f} seconds')
+    # print(f'Saved test labels to ./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_test_labels.npz')
+    # del test_dataset
 
-
-    # Convert to tensors
-    # train_dataset = torch.tensor(train_dataset, dtype=torch.float32)
-    # train_labels = torch.tensor(train_labels, dtype=torch.long)
-    # val_dataset = torch.tensor(val_dataset, dtype=torch.float32)
-    # val_labels = torch.tensor(val_labels, dtype=torch.long)
-    # test_dataset = torch.tensor(test_dataset, dtype=torch.float32)
-    # test_labels = torch.tensor(test_labels, dtype=torch.long)
-
-    # # Save datasets
-    # os.makedirs(f'./generated_dataset/{cfg.data.name}/', exist_ok=True)
-    # torch.save(train_dataset, f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_train_dataset.pt')
-    # torch.save(train_labels, f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_train_labels.pt')
-    # torch.save(val_dataset, f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_val_dataset.pt')
-    # torch.save(val_labels, f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_val_labels.pt')
-    # torch.save(test_dataset, f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_test_dataset.pt')
-    # torch.save(test_labels, f'./generated_dataset/{cfg.data.name}/{embedding_model_name}_{cfg.seed}_test_labels.pt')
-
+
+
 def list2csr(lst: List):
     # Identify non-zero values and their positions
     data = []
@@ -288,5 +266,55 @@ def list2csr(lst: List):
     return sparse_matrix
 
+def create_tfidf(cfg, seed):
+    seed_everything(seed)
+    splits, text, _ = load_data_lp[cfg.data.name](cfg.data)
+    if cfg.embedder.type == 'tfidf':
+        train_dataset, train_labels = process_texts(
+            splits['train'].pos_edge_label_index,
+            splits['train'].neg_edge_label_index,
+            text
+        )
+        val_dataset, val_labels = process_texts(
+            splits['valid'].pos_edge_label_index,
+            splits['valid'].neg_edge_label_index,
+            text
+        )
+        test_dataset, test_labels = process_texts(
+            splits['test'].pos_edge_label_index,
+            splits['test'].neg_edge_label_index,
+            text
+        )
+        vectorizer = TfidfVectorizer()
+
+        os.makedirs(f'./generated_dataset/{cfg.data.name}/', exist_ok=True)
+        start_time = time.time()
+        train_data = vectorizer.fit_transform(train_dataset)
+        print(f'fit_transform: {time.time() - start_time:.2f} seconds')
+
+        # del train_dataset
+        start_time = time.time()
+        val_data = vectorizer.transform(val_dataset)
+        print(f'transform: {time.time() - start_time:.2f} seconds')
+
+        start_time = time.time()
+        test_data = vectorizer.transform(test_dataset)
+        print(f'transform: {time.time() - start_time:.2f} seconds')
+
+    return train_data, train_labels, val_data, val_labels, test_data, test_labels
+
 
 if __name__ == "__main__":
-    main()
+
+    file_path = f'{get_git_repo_root_path()}/'
+    args = parse_args()
+    cfg = set_cfg(file_path, args.cfg_file)
+    cfg.merge_from_list(args.opts)
+    # custom_set_out_dir(cfg, args.cfg_file, cfg.wandb.name_tag)
+    cfg = config_device(cfg)
+    cfg.data.name = args.data
+    cfg.seed = args.seed
+
+    embedding_model_name = "tfidf"
+    train_dataset, train_labels, val_dataset, val_labels, test_dataset, test_labels = save_dataset(embedding_model_name, cfg, args)
diff --git a/core/model_finetuning/mlp.py b/core/model_finetuning/mlp.py
index c1dd7b5a0f..28cfd4e1cc 100644
--- a/core/model_finetuning/mlp.py
+++ b/core/model_finetuning/mlp.py
@@ -24,6 +24,7 @@ from tqdm import tqdm
 import torch.optim as optim
 import numpy as np
+from data_utils.load import load_data_nc, load_data_lp
 from ogb.linkproppred import PygLinkPropPredDataset, Evaluator
 from heuristic.eval import get_metric_score
 from graphgps.lm_trainer.tfidf_trainer import Trainer_TFIDF
@@ -31,8 +32,10 @@ from sklearn.neural_network import MLPClassifier
 import argparse
 import wandb
+from torch_geometric import seed_everything
 from pdb import set_trace as st
 import time
+from create_dataset import create_tfidf
 
 FILE_PATH = f'{get_git_repo_root_path()}/'
 
@@ -95,7 +98,8 @@ def parse_args() -> argparse.Namespace:
 def get_metrics(clf, dataset, labels, evaluator_hit, evaluator_mrr):
     # Predict and calculate accuracy
     pred = clf.predict(dataset)
-    acc = np.mean(np.asarray(labels) == pred)
+    labels = np.asarray(labels)
+    acc = np.mean(labels == pred)
 
     # Calculate positive and negative predictions
     y_pos_pred = torch.tensor(pred[labels == 1])
@@ -131,18 +135,25 @@ def project_main():
     for run_id, seed, split_index in zip(*run_loop_settings(cfg, args)):
         print(f'run id : {run_id}')
         # Set configurations for each run TODO clean code here
-        root = '/hkfs/work/workspace/scratch/cc7738-benchmark_tag/TAPE_chen/core/model_finetuning'
-        from scipy.sparse import load_npz
-        train_dataset = load_npz(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_train_dataset.npz')
-        # train_dataset = torch.load(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_train_dataset.npz')
-        train_labels = np.array(torch.load(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_train_labels.npz'))
-        # val_dataset = torch.load(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_val_dataset.npz')
-        val_dataset = load_npz(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_val_dataset.npz')
-        val_labels = np.array(torch.load(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_val_labels.npz'))
-        # test_dataset = torch.load(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_test_dataset.npz')
-        test_dataset = load_npz(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_test_dataset.npz')
-        test_labels = np.array(torch.load(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_test_labels.npz'))
+        # root = '/hkfs/work/workspace/scratch/cc7738-benchmark_tag/TAPE_chen/core/model_finetuning'
+        # from scipy.sparse import load_npz
+        # train_dataset = load_npz(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_train_dataset.npz')
+        # # train_dataset = torch.load(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_train_dataset.npz')
+        # train_labels = np.array(torch.load(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_train_labels.npz'))
+        # # val_dataset = torch.load(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_val_dataset.npz')
+        # val_dataset = load_npz(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_val_dataset.npz')
+        # val_labels = np.array(torch.load(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_val_labels.npz'))
+        # # test_dataset = torch.load(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_test_dataset.npz')
+        # test_dataset = load_npz(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_test_dataset.npz')
+        # test_labels = np.array(torch.load(f'{root}/generated_dataset/{cfg.data.name}/{cfg.embedder.type}_{seed}_test_labels.npz'))
+
+        print(f'run id : {run_id}, seed: {seed}, split_index: {split_index}')
+        cfg.seed = seed
+        cfg.run_id = run_id
+        seed_everything(cfg.seed)
+        train_dataset, train_labels, val_dataset, val_labels, test_dataset, test_labels = create_tfidf(cfg, seed)
+        print(f"loaded dataset")
 
         clf = MLPClassifier(random_state=run_id, max_iter=args.max_iter)
         print(f"created model")
diff --git a/core/textfeat/mlp_dot_product.py b/core/textfeat/mlp_dot_product.py
index 4c9587a7c6..226262ce2d 100644
--- a/core/textfeat/mlp_dot_product.py
+++ b/core/textfeat/mlp_dot_product.py
@@ -12,10 +12,10 @@ from ogb.linkproppred import PygLinkPropPredDataset, Evaluator
 
 from heuristic.eval import get_metric_score
 
-from data_utils.load import data_loader_nc
+#from data_utils.load import data_loader_nc
 from embedding.tune_utils import parse_args, param_tune_acc_mrr
 from core.graphgps.utility.utils import get_git_repo_root_path, append_acc_to_excel, append_mrr_to_excel, set_cfg
-from core.graphgps.visualization.adj import construct_sparse_adj, plot_coo_matrix, plot_pos_neg_adj
+from core.model_finetuning.adj import construct_sparse_adj, plot_coo_matrix, plot_pos_neg_adj
 
 method = 'nonlinear_mlp'
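Taken together, `create_tfidf` plus the new loading path in `project_main` reduce to a TF-IDF-then-MLPClassifier pipeline. A self-contained toy sketch using only public sklearn/numpy APIs: the node texts and edges are hypothetical, the edge featurisation (endpoint texts joined per edge) is an assumption about what `process_texts` does, and the hit/MRR part is omitted because `get_metric_score`'s signature is not shown in this diff:

```python
# Toy end-to-end sketch of the TF-IDF -> MLPClassifier link-prediction flow.
# Assumption: each candidate edge becomes one text sample by joining the
# texts of its two endpoint nodes (a hypothetical stand-in for process_texts).
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier

node_text = ["graph neural networks", "link prediction", "text attributes"]
pos_edges = [(0, 1), (1, 2)]                 # hypothetical positive pairs
neg_edges = [(0, 2), (2, 0)]                 # hypothetical negative pairs

edges = pos_edges + neg_edges
samples = [node_text[u] + " " + node_text[v] for u, v in edges]
labels = np.array([1] * len(pos_edges) + [0] * len(neg_edges))

vectorizer = TfidfVectorizer()               # fit on the train split only;
X = vectorizer.fit_transform(samples)        # val/test would reuse .transform()

clf = MLPClassifier(random_state=0, max_iter=500)
clf.fit(X, labels)

pred = clf.predict(X)                        # mirrors get_metrics in mlp.py
acc = np.mean(labels == pred)
y_pos_pred = pred[labels == 1]               # inputs for the ranking metrics
y_neg_pred = pred[labels == 0]
print(f"accuracy: {acc:.2f}")
```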