Skip to content

Commit

Permalink
optimize data pipeline with mlp tfidf
Browse files Browse the repository at this point in the history
  • Loading branch information
ChenS676 committed Jul 8, 2024
1 parent b00be64 commit 650b39f
Show file tree
Hide file tree
Showing 12 changed files with 203 additions and 139 deletions.
56 changes: 38 additions & 18 deletions core/data_utils/load_data_lp.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,40 +200,60 @@ def load_taglp_citationv8(cfg: CN) -> Tuple[Dict[str, Data], List[str]]:
splits, text, data = load_taglp_arxiv2023(args.data)
print(f'directed: {data.is_directed()}')
print(data)
print(type(text))

print(text[0])
print(f"train dataset: {splits['train'].pos_edge_label.shape[0]*2} edges.")
print(f"valid dataset: {splits['valid'].pos_edge_label.shape[0]*2} edges.")
print(f"test dataset: {splits['test'].pos_edge_label.shape[0]*2} edges.")

print('citationv8')
splits, text, data = load_taglp_citationv8(args.data)
print(f'directed: {data.is_directed()}')
print(data)
print(type(text))
print(text[0])
print(f"train dataset: {splits['train'].pos_edge_label.shape[0]*2} edges.")
print(f"valid dataset: {splits['valid'].pos_edge_label.shape[0]*2} edges.")
print(f"test dataset: {splits['test'].pos_edge_label.shape[0]*2} edges.")

exit(-1)
print('cora')
splits, text, data = load_taglp_cora(args.data)
print(f'directed: {data.is_directed()}')
print(data)
print(type(text))

print('product')
splits, text, data = load_taglp_product(args.data)
print(f'directed: {data.is_directed()}')
print(data)
print(type(text))
print(text[0])
print(f"train dataset: {splits['train'].pos_edge_label.shape[0]*2} edges.")
print(f"valid dataset: {splits['valid'].pos_edge_label.shape[0]*2} edges.")
print(f"test dataset: {splits['test'].pos_edge_label.shape[0]*2} edges.")

# print('product')
# splits, text, data = load_taglp_product(args.data)
# print(f'directed: {data.is_directed()}')
# print(data)
# print(text[0])
# print(f"train dataset: {splits['train'].pos_edge_label.shape[0]*2} edges.")
# print(f"valid dataset: {splits['valid'].pos_edge_label.shape[0]*2} edges.")
# print(f"test dataset: {splits['test'].pos_edge_label.shape[0]*2} edges.")

print('pubmed')
splits, text, data = load_taglp_pubmed(args.data)
print(f'directed: {data.is_directed()}')
print(data)
print(type(text))

splits, text, data = load_taglp_citeseer(args.data)
print(f'directed: {data.is_directed()}')
print(data)
print(type(text))
print(text[0])
print(f"train dataset: {splits['train'].pos_edge_label.shape[0]*2} edges.")
print(f"valid dataset: {splits['valid'].pos_edge_label.shape[0]*2} edges.")
print(f"test dataset: {splits['test'].pos_edge_label.shape[0]*2} edges.")

# splits, text, data = load_taglp_citeseer(args.data)
# print(f'directed: {data.is_directed()}')
# print(data)
# # print(text[0])
# print(f"train dataset: {splits['train'].pos_edge_label.shape[0]*2} edges.")
# print(f"valid dataset: {splits['valid'].pos_edge_label.shape[0]*2} edges.")
# print(f"test dataset: {splits['test'].pos_edge_label.shape[0]*2} edges.")

print(args.data)
splits, text, data = load_taglp_ogbn_arxiv(args.data)
print(f'directed: {data.is_directed()}')
print(data)
print(type(text))
print(text[0])
print(f"train dataset: {splits['train'].pos_edge_label.shape[0]*2} edges.")
print(f"valid dataset: {splits['valid'].pos_edge_label.shape[0]*2} edges.")
print(f"test dataset: {splits['test'].pos_edge_label.shape[0]*2} edges.")
2 changes: 1 addition & 1 deletion core/embedding/line_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

from heuristic.eval import get_metric_score
from data_utils.load import load_data_lp
from graphgps.visualization.adj import plot_coo_matrix, construct_sparse_adj
from core.model_finetuning.adj import plot_coo_matrix, construct_sparse_adj
from core.embedding.tune_utils import (
get_git_repo_root_path,
param_tune_acc_mrr
Expand Down
2 changes: 1 addition & 1 deletion core/embedding/node2vec_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import scipy.sparse as ssp
import torch
import matplotlib.pyplot as plt
from graphgps.visualization.adj import plot_coo_matrix, construct_sparse_adj
from core.model_finetuning.adj import plot_coo_matrix, construct_sparse_adj
from ogb.linkproppred import Evaluator
from heuristic.eval import get_metric_score
from embedding.ge.models import Node2Vec
Expand Down
2 changes: 1 addition & 1 deletion core/graphgps/network/ncn.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from torch_scatter import scatter_add
from typing import Iterable, Final
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from graphgps.visualization.adj import (plot_coo_matrix,
from core.model_finetuning.adj import (plot_coo_matrix,
coo_matrix,
construct_sparse_adj,
coo_tensor_to_coo_matrix)
Expand Down
2 changes: 1 addition & 1 deletion core/heuristic/arxiv2023_heuristic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from heuristic.gsf import Ben_PPR, shortest_path, katz_apro, katz_close , SymPPR
from data_utils.load_data_lp import get_raw_text_pubmed, get_pubmed_lp
import matplotlib.pyplot as plt
from core.graphgps.visualization.adj import construct_sparse_adj, plot_coo_matrix
from core.model_finetuning.adj import construct_sparse_adj, plot_coo_matrix
import scipy.sparse as ssp
from core.graphgps.utility.utils import get_git_repo_root_path
from typing import Dict
Expand Down
2 changes: 1 addition & 1 deletion core/heuristic/cora_heuristic.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from heuristic.gsf import Ben_PPR, shortest_path, katz_apro, katz_close, SymPPR
from textfeat.mlp_dot_product import pairwise_prediction
import matplotlib.pyplot as plt
from core.graphgps.visualization.adj import plot_coo_matrix, construct_sparse_adj
from core.model_finetuning.adj import plot_coo_matrix, construct_sparse_adj
from core.graphgps.utility.utils import get_git_repo_root_path, append_acc_to_excel, append_mrr_to_excel
from ogb.linkproppred import PygLinkPropPredDataset, Evaluator
from heuristic.eval import get_metric_score
Expand Down
2 changes: 1 addition & 1 deletion core/heuristic/ogbn_products_heuristic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from heuristic.lsf import CN, AA, RA, InverseRA
from heuristic.gsf import Ben_PPR, shortest_path, katz_apro, katz_close , SymPPR
import matplotlib.pyplot as plt
from core.graphgps.visualization.adj import construct_sparse_adj, plot_coo_matrix, plot_pos_neg_adj
from core.model_finetuning.adj import construct_sparse_adj, plot_coo_matrix, plot_pos_neg_adj
import scipy.sparse as ssp

from ogb.linkproppred import PygLinkPropPredDataset, Evaluator
Expand Down
3 changes: 1 addition & 2 deletions core/heuristic/pubmed_heuristic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@
from torch_geometric.transforms import RandomLinkSplit
import pandas as pd
from torch_geometric.data import Data, InMemoryDataset
from data_utils.load_data_lp import get_raw_text_pubmed, get_pubmed_lp
import matplotlib.pyplot as plt
from core.graphgps.visualization.adj import construct_sparse_adj, plot_coo_matrix, plot_pos_neg_adj
from core.model_finetuning.adj import construct_sparse_adj, plot_coo_matrix, plot_pos_neg_adj
import scipy.sparse as ssp

from heuristic.lsf import CN, AA, RA, InverseRA
Expand Down
92 changes: 49 additions & 43 deletions core/graphgps/visualization/adj.py → core/model_finetuning/adj.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from ogb.nodeproppred import NodePropPredDataset
from scipy.sparse import csc_array

from data_utils.load import load_data_nc
from data_utils.load import load_data_nc, load_graph_lp

import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix
Expand Down Expand Up @@ -237,56 +237,33 @@ def avg_degree(G):
args = parser.parse_args()

scale = 100000
name_list = ['ogbn-arxiv', 'arxiv_2023', 'cora', 'pubmed']
name_list = ['cora', 'pubmed', 'arxiv_2023', 'ogbn-arxiv', 'citationv8']

for name in name_list:
if name == 'cora':
data, text = load_graph_lp[name](use_mask=False)
raise NotImplementedError

if name == 'ogbn-products':
dataset = NodePropPredDataset(name)
edge_index = dataset[0][0]['edge_index']

# edge index to sparse matrix
edge_index = edge_index[:, ::scale]
m = construct_sparse_adj(edge_index)

if name == 'pubmed':
data, text = load_data_nc[name](use_mask=False)
G = nx.from_scipy_sparse_array(m)
plot_coo_matrix(m, f'{name}_data_edges.png')

fig, ax = spy.spy_to_mpl(m)
fig.savefig(f"{name}_data_edges_spy.png", bbox_inches='tight')

data, text = load_data_nc[name]()
m = construct_sparse_adj(data.edge_index.coo())
plot_coo_matrix(m, f'{name}_data_index.png')

compare_adj(name, data.edge_index.numpy())
m = construct_sparse_adj(data.edge_index.numpy())
fig, ax = spy.spy_to_mpl(m)
fig.savefig(f"{name}_data_index_spy.png", bbox_inches='tight')

heterogeneity = calculate_heterogeneity(G)
print(f"{name}, heterogeneity: {heterogeneity}. num_node: {dataset[0].num_node}")

if name == 'ogbn-arxiv':
dataset = NodePropPredDataset(name)
edge_index = dataset[0][0]['edge_index']

m = construct_sparse_adj(edge_index[:, ::2])
G = nx.from_scipy_sparse_array(m)

plot_coo_matrix(m, f'{name}_data_edges.png')

fig, ax = spy.spy_to_mpl(m)
fig.savefig(f"{name}_data_edges_spy.png", bbox_inches='tight')

heterogeneity = calculate_heterogeneity(G)
num_nodes = dataset[0][0]['num_nodes']
num_edges = dataset[0][0]['edge_index'].shape[1]
num_nodes = data.num_nodes
num_edges = data.edge_index.shape[1]
avg_degree_arithmetic = int(num_edges / num_nodes)
avg_degree_G, avg_degree_dict = avg_degree(G)
avg_degree_G2 = avg_degree2(G, avg_degree_dict)
print(f"{name}, heterogeneity: {heterogeneity}. num_node: {num_nodes}, num_edges: {num_edges}, \
avg degree arithmetic {avg_degree_arithmetic}, \
avg degree G {avg_degree_G}, avg degree G2 {avg_degree_G2}, clustering {nx.average_clustering(G)}.")


if name == 'arxiv_2023':
data, text = load_data_nc[name]()
m = construct_sparse_adj(data.edge_index.numpy())
Expand All @@ -307,21 +284,50 @@ def avg_degree(G):
avg degree arithmetic {avg_degree_arithmetic}, \
avg degree G {avg_degree_G}, avg degree G2 {avg_degree_G2}, clustering {nx.average_clustering(G)}.")

if name == 'pubmed':
data, text = load_data_nc[name](use_mask=False)

if name == 'ogbn-arxiv':
dataset = NodePropPredDataset(name)
edge_index = dataset[0][0]['edge_index']

m = construct_sparse_adj(edge_index[:, ::2])
G = nx.from_scipy_sparse_array(m)

compare_adj(name, data.edge_index.numpy())
m = construct_sparse_adj(data.edge_index.numpy())
plot_coo_matrix(m, f'{name}_data_edges.png')

fig, ax = spy.spy_to_mpl(m)
fig.savefig(f"{name}_data_index_spy.png", bbox_inches='tight')
fig.savefig(f"{name}_data_edges_spy.png", bbox_inches='tight')

heterogeneity = calculate_heterogeneity(G)
num_nodes = data.num_nodes
num_edges = data.edge_index.shape[1]
num_nodes = dataset[0][0]['num_nodes']
num_edges = dataset[0][0]['edge_index'].shape[1]
avg_degree_arithmetic = int(num_edges / num_nodes)
avg_degree_G, avg_degree_dict = avg_degree(G)
avg_degree_G2 = avg_degree2(G, avg_degree_dict)
print(f"{name}, heterogeneity: {heterogeneity}. num_node: {num_nodes}, num_edges: {num_edges}, \
avg degree arithmetic {avg_degree_arithmetic}, \
avg degree G {avg_degree_G}, avg degree G2 {avg_degree_G2}, clustering {nx.average_clustering(G)}.")
avg degree G {avg_degree_G}, avg degree G2 {avg_degree_G2}, clustering {nx.average_clustering(G)}.")


if name == 'citationv8':
dataset = NodePropPredDataset(name)
edge_index = dataset[0][0]['edge_index']

# edge index to sparse matrix
edge_index = edge_index[:, ::scale]
m = construct_sparse_adj(edge_index)

G = nx.from_scipy_sparse_array(m)
plot_coo_matrix(m, f'{name}_data_edges.png')

fig, ax = spy.spy_to_mpl(m)
fig.savefig(f"{name}_data_edges_spy.png", bbox_inches='tight')

data, text = load_data_nc[name]()
m = construct_sparse_adj(data.edge_index.coo())
plot_coo_matrix(m, f'{name}_data_index.png')

fig, ax = spy.spy_to_mpl(m)
fig.savefig(f"{name}_data_index_spy.png", bbox_inches='tight')

heterogeneity = calculate_heterogeneity(G)
print(f"{name}, heterogeneity: {heterogeneity}. num_node: {dataset[0].num_node}")
Loading

0 comments on commit 650b39f

Please sign in to comment.