diff --git a/graph_neural_network/.dockerignore b/graph_neural_network/.dockerignore
new file mode 100644
index 000000000..aa3d00cd0
--- /dev/null
+++ b/graph_neural_network/.dockerignore
@@ -0,0 +1 @@
+mlcube/workspace/*
\ No newline at end of file
diff --git a/graph_neural_network/Dockerfile_mlcube b/graph_neural_network/Dockerfile_mlcube
index 151052903..c4743f3e7 100644
--- a/graph_neural_network/Dockerfile_mlcube
+++ b/graph_neural_network/Dockerfile_mlcube
@@ -7,9 +7,10 @@ RUN pip install scikit-learn==0.24.2
 RUN pip install torch_geometric==2.4.0
 RUN pip install --no-index torch_scatter==2.1.1 torch_sparse==0.6.17 -f https://data.pyg.org/whl/torch-1.13.0+cu117.html
 RUN pip install graphlearn-torch==0.2.2
+RUN pip install numpy==1.26.4
 
 RUN apt update
-RUN apt install -y git wget
+RUN apt install -y git wget unzip
 RUN pip install git+https://github.com/mlcommons/logging.git
 
 # TF32 instead of FP32 for faster compute
diff --git a/graph_neural_network/compress_graph_demo.py b/graph_neural_network/compress_graph_demo.py
new file mode 100644
index 000000000..37cdc445b
--- /dev/null
+++ b/graph_neural_network/compress_graph_demo.py
@@ -0,0 +1,82 @@
+import argparse, datetime, os
+import numpy as np
+import torch
+import os.path as osp
+
+import graphlearn_torch as glt
+
+from dataset import float2half
+from download import download_dataset
+from torch_geometric.utils import add_self_loops, remove_self_loops
+from typing import Literal
+
+
+class IGBHeteroDatasetCompress(object):
+  def __init__(self,
+               path,
+               dataset_size,
+               layout: Literal['CSC', 'CSR'] = 'CSC',):
+    self.dir = path
+    self.dataset_size = dataset_size
+    self.layout = layout
+
+    self.ntypes = ['paper']
+    self.etypes = None
+    self.edge_dict = {}
+    self.paper_nodes_num = {'tiny':100000, 'small':1000000, 'medium':10000000, 'large':100000000, 'full':269346174}
+    if not osp.exists(osp.join(path, self.dataset_size, 'processed')):
+      download_dataset(path, 'heterogeneous', dataset_size)
+    self.process()
+
+  def process(self):
+    paper_paper_edges = torch.from_numpy(np.load(osp.join(self.dir, self.dataset_size, 'processed',
+                                                          'paper__cites__paper', 'edge_index.npy'))).t()
+    cites_edge = add_self_loops(remove_self_loops(paper_paper_edges)[0])[0]
+    self.edge_dict = {
+        ('paper', 'cites', 'paper'): (torch.cat([cites_edge[1, :], cites_edge[0, :]]), torch.cat([cites_edge[0, :], cites_edge[1, :]])),
+    }
+    self.etypes = list(self.edge_dict.keys())
+
+    # init graphlearn_torch Dataset.
+    edge_dir = 'out' if self.layout == 'CSR' else 'in'
+    glt_dataset = glt.data.Dataset(edge_dir=edge_dir)
+    glt_dataset.init_graph(
+      edge_index=self.edge_dict,
+      graph_mode='CPU',
+    )
+
+    # save the corresponding csr or csc file
+    compress_edge_dict = {}
+    compress_edge_dict[('paper', 'cites', 'paper')] = 'paper__cites__paper'
+
+    for etype in self.etypes:
+      graph = glt_dataset.get_graph(etype)
+      indptr, indices, _ = graph.export_topology()
+      path = os.path.join(self.dir, self.dataset_size, 'processed', self.layout, compress_edge_dict[etype])
+      if not os.path.exists(path):
+        os.makedirs(path)
+      torch.save(indptr, os.path.join(path, 'indptr.pt'))
+      torch.save(indices, os.path.join(path, 'indices.pt'))
+    path = os.path.join(self.dir, self.dataset_size, 'processed', self.layout)
+    print(f"The {self.layout} graph has been persisted in path: {path}")
+
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  root = osp.join(osp.dirname(osp.dirname(osp.dirname(osp.realpath(__file__)))), 'data', 'igbh')
+  glt.utils.ensure_dir(root)
+  parser.add_argument('--path', type=str, default=root,
+                      help='path containing the datasets')
+  parser.add_argument('--dataset_size', type=str, default='full',
+                      choices=['tiny', 'small', 'medium', 'large', 'full'],
+                      help='size of the datasets')
+  parser.add_argument("--layout", type=str, default='CSC')
+  parser.add_argument('--use_fp16', action="store_true",
+                      help="convert the node/edge feature into fp16 format")
+  args = parser.parse_args()
+  print(f"Start constructing the {args.layout} graph...")
+  igbh_dataset = IGBHeteroDatasetCompress(args.path, args.dataset_size, args.layout)
+  if args.use_fp16:
+    base_path = osp.join(args.path, args.dataset_size, 'processed')
+    float2half(base_path, args.dataset_size)
diff --git a/graph_neural_network/download_demo.sh b/graph_neural_network/download_demo.sh
new file mode 100644
index 000000000..1bbfd45e9
--- /dev/null
+++ b/graph_neural_network/download_demo.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+DATA_DIR="./igbh/full/processed"
+
+# Capture MLCube parameter
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --data_dir=*)
+      DATA_DIR="${1#*=}"
+      ;;
+    *) ;;
+  esac
+  shift
+done
+
+echo "Minified dataset download starting ..."
+mkdir -p "$DATA_DIR"
+cd "$DATA_DIR" || exit 1
+
+wget https://mlcube.mlcommons-storage.org/minibenchmarks/gnn.zip
+unzip -o gnn.zip
+rm gnn.zip
+echo "completed!"
\ No newline at end of file
diff --git a/graph_neural_network/mlcube/.DS_Store b/graph_neural_network/mlcube/.DS_Store
new file mode 100644
index 000000000..b48f21201
Binary files /dev/null and b/graph_neural_network/mlcube/.DS_Store differ
diff --git a/graph_neural_network/mlcube/dockerignore b/graph_neural_network/mlcube/dockerignore
deleted file mode 100644
index 948de618c..000000000
--- a/graph_neural_network/mlcube/dockerignore
+++ /dev/null
@@ -1 +0,0 @@
-workspace/*
\ No newline at end of file
diff --git a/graph_neural_network/mlcube/mlcube.yaml b/graph_neural_network/mlcube/mlcube.yaml
index 2773910d2..cdb6a50cd 100644
--- a/graph_neural_network/mlcube/mlcube.yaml
+++ b/graph_neural_network/mlcube/mlcube.yaml
@@ -1,4 +1,4 @@
-name: graph_nn
+name: graph_nn9
 description: Graph Neural Network
 authors:
   - { name: "MLCommons Best Practices Working Group" }
@@ -34,3 +34,15 @@ tasks:
       data_dir: data/
     outputs:
       log_dir: logs/
+  download_demo:
+    entrypoint: ./download_demo.sh -a
+    parameters:
+      outputs:
+        data_dir: data/
+  demo:
+    entrypoint: ./run_demo.sh -a
+    parameters:
+      inputs:
+        data_dir: data/
+      outputs:
+        log_dir: logs/
diff --git a/graph_neural_network/run_demo.sh b/graph_neural_network/run_demo.sh
new file mode 100644
index 000000000..39619d58b
--- /dev/null
+++ b/graph_neural_network/run_demo.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+set +x
+set -e
+
+# start timing
+start=$(date +%s)
+start_fmt=$(date +%Y-%m-%d\ %r)
+echo "STARTING TIMING RUN AT $start_fmt"
+
+# Set variables
+: "${DATA_DIR:=./igbh/full/processed}"
+: "${LOG_DIR:=./workspace/logs}"
+
+# Handle MLCube parameters
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --data_dir=*)
+      DATA_DIR="${1#*=}"
+      ;;
+    --log_dir=*)
+      LOG_DIR="${1#*=}"
+      ;;
+    *) ;;
+  esac
+  shift
+done
+
+# run benchmark
+echo "running benchmark"
+
+python compress_graph_demo.py --path "$DATA_DIR" \
+  --dataset_size='tiny' \
+  --layout='CSC' |& tee "$LOG_DIR/train_console.log"
+
+# end timing
+end=$(date +%s)
+end_fmt=$(date +%Y-%m-%d\ %r)
+echo "ENDING TIMING RUN AT $end_fmt"
\ No newline at end of file
diff --git a/graph_neural_network/split_seeds.py b/graph_neural_network/split_seeds.py
index c8675ba92..a01540b5e 100644
--- a/graph_neural_network/split_seeds.py
+++ b/graph_neural_network/split_seeds.py
@@ -1,4 +1,5 @@
 import argparse
+import os
 import os.path as osp
 
 import torch
@@ -34,6 +35,7 @@ def process(self):
     val_idx = shuffled_index[n_train : n_train + n_val]
 
     path = osp.join(self.path, self.dataset_size, 'processed')
+    os.makedirs(path, exist_ok=True)
     torch.save(train_idx, osp.join(path, 'train_idx.pt'))
     torch.save(val_idx, osp.join(path, 'val_idx.pt'))
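
Usage sketch (not part of the patch): a minimal way to exercise the two tasks this change adds to mlcube.yaml. It assumes the MLCube CLI and Docker are installed and that the commands run from graph_neural_network/mlcube; the task names come from the diff above, while the exact flags (--mlcube, --platform) follow standard MLCube usage and may need adjusting for a local setup.

    # fetch the minified dataset (runs download_demo.sh; writes into the data/ workspace dir)
    mlcube run --mlcube=mlcube.yaml --task=download_demo --platform=docker

    # build the tiny CSC graph (runs run_demo.sh; console output lands in logs/train_console.log)
    mlcube run --mlcube=mlcube.yaml --task=demo --platform=docker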