diff --git a/experiments/reproduction.py b/experiments/reproduction.py
index 414f36b..35064ff 100644
--- a/experiments/reproduction.py
+++ b/experiments/reproduction.py
@@ -23,7 +23,7 @@ def train():
     tb_logger = pl_loggers.TensorBoardLogger(save_dir="logs/")
     trainer = pl.Trainer(
         accelerator=device,
-        max_steps=100000,
+        max_steps=10000, # 100000,
         limit_val_batches=1, # TODO Debugging
         gradient_clip_val=5.0, # TODO There was something about this in the code.
         logger=tb_logger,
diff --git a/quarto/notes.qmd b/quarto/notes.qmd
index e374448..473c3c0 100644
--- a/quarto/notes.qmd
+++ b/quarto/notes.qmd
@@ -400,6 +400,30 @@ Not sure off the top of my head where the massive memory footprint comes from, m
 - [ ] Visualise the adaptive adjacency evolution using nx in tensorboard.
 - [ ] Add PEMS-BAY dataset.
 
+It looks like the dense_to_sparse function takes up a lot of time.
+Potentially I could try to optimise that operation myself.
+The original implementation doesn't work with sparse matrices, so it doesn't have the issue I'm having.
+But sparse matrices are central to working with PyG, and to the selling point of scalability.
+It would be fun to try building a CUDA module for sparsification, but for now the easiest solution might be to construct the sparse matrix manually.
+Notice that I'm sparsifying a matrix of size (batch_size * N) x (batch_size * N), so the number of zero entries scales as batch_size^2.
+OK, so there is a PyTorch issue (https://github.com/pytorch/pytorch/issues/31942) about block_diag support for sparse tensors.
+It would likely be better to work on that instead, and it should make my implementation faster anyway.
+No way forward without fixing that issue!
+
+Actually, couldn't you:
+- Take powers before block-diagonalisation, then block-diag it.
+  - Yes, but you still have to sparsify a dense block-diag matrix, which is the bottleneck.
+- Use torch.sparse.mm to compute the powers after sparsification?
+  - Yes, but that doesn't solve constructing the sparse block-diag.
+
+How about using the PyTorch Geometric collation function to build the sparse matrix as a batch?
+
+Well, it turns out the issue is not with sparsifying the matrix.
+It's the 207x207 adjacency itself: softmax maps the 0s to nonzero values, so I've effectively been working with a dense matrix.
+Swapping softmax for normalize (row-wise L1 norm), and initialising the embeddings from a Gaussian, leaves roughly 50% of the entries at zero (the ReLU kills the negative dot products) and speeds up the computation.
+The question is whether the adaptive adjacency tends towards sparsity on its own.
+
+Well, it now runs at a more acceptable rate, and it outperforms the no-adjacency case.
 
 ### References
 
diff --git a/src/gwnet/model/gwnet.py b/src/gwnet/model/gwnet.py
index 74b044f..7129fba 100644
--- a/src/gwnet/model/gwnet.py
+++ b/src/gwnet/model/gwnet.py
@@ -223,7 +223,7 @@ def __init__(
                 raise Exception(adp_err_msg)
 
             self.node_embeddings = torch.nn.Parameter(
-                torch.rand(n_nodes, adaptive_embedding_dim)
+                torch.randn(n_nodes, adaptive_embedding_dim)
             )
             adp = True
 
@@ -310,8 +310,8 @@ def _update_adp_adj(self, batch_size: int, k_hops: int) -> None:
            self.global_elements["adj_weights"] = {}
 
        # (N, C) @ (C, N) -> (N, N)
-        adp_adj = F.softmax(
-            F.relu(self.node_embeddings @ self.node_embeddings.T), dim=1
+        adp_adj = F.normalize(
+            F.relu(self.node_embeddings @ self.node_embeddings.T), dim=1, p=1
         )
 
        adp_adj_dense_batch = torch.block_diag(*[adp_adj] * batch_size)
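
The notes in quarto/notes.qmd above suggest constructing the sparse block-diagonal adjacency manually instead of densifying with torch.block_diag and re-sparsifying. A minimal sketch of that idea, assuming a dense (N, N) adaptive adjacency and a PyG-style (edge_index, edge_weight) output; the helper name sparse_block_diag is hypothetical, not part of the repo:

```python
import torch


def sparse_block_diag(adj: torch.Tensor, batch_size: int):
    """Build a block-diagonal batch of a dense (N, N) adjacency as
    (edge_index, edge_weight), without materialising the dense
    (batch_size * N, batch_size * N) matrix."""
    n = adj.size(0)
    idx = adj.nonzero(as_tuple=False).T               # (2, E) nonzero coordinates
    weights = adj[idx[0], idx[1]]                     # (E,) matching weights
    # Offset the node indices by i * N for the i-th graph in the batch.
    offsets = torch.arange(batch_size, device=adj.device) * n    # (B,)
    edge_index = (idx.unsqueeze(-1) + offsets)        # (2, E, B)
    edge_index = edge_index.permute(0, 2, 1).reshape(2, -1)      # (2, B * E)
    edge_weight = weights.repeat(batch_size)          # (B * E,)
    return edge_index, edge_weight
```

This mirrors what PyG batch collation does for graphs: each copy of the graph has its node indices shifted by i * N, so the block-diagonal structure exists only implicitly in the index tensor.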
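
The softmax-to-normalize observation can be checked in isolation. A small, self-contained sketch; the 207 nodes and embedding size 10 are illustrative values (207 matches the adjacency size mentioned in the notes), not taken from the model config:

```python
import torch
import torch.nn.functional as F

emb = torch.randn(207, 10)          # Gaussian init, as in the torch.randn change above
scores = F.relu(emb @ emb.T)        # ReLU zeroes roughly half of the off-diagonal entries

soft_adj = F.softmax(scores, dim=1)          # softmax: every entry becomes strictly positive
norm_adj = F.normalize(scores, p=1, dim=1)   # row-wise L1 norm: zeros stay zero

print((soft_adj == 0).float().mean())  # ~0.0, fully dense
print((norm_adj == 0).float().mean())  # ~0.5, about half the entries remain zero
```

Whether this sparsity survives training is the open question from the notes: nothing forces the zeroed dot products to stay negative as the embeddings are updated.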
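
The notes also consider computing the k-hop powers after sparsification with torch.sparse.mm. A sketch of that route under the assumption of COO inputs; the helper name sparse_k_hops is made up for illustration, and sparse-sparse matmul support in torch.sparse.mm should be verified against the PyTorch version in use:

```python
import torch


def sparse_k_hops(edge_index, edge_weight, num_nodes, k):
    """Return [A, A^2, ..., A^k] as sparse COO tensors, never densifying."""
    adj = torch.sparse_coo_tensor(
        edge_index, edge_weight, (num_nodes, num_nodes)
    ).coalesce()
    powers = [adj]
    for _ in range(k - 1):
        # sparse @ sparse -> sparse; avoids dense (N, N) intermediates
        powers.append(torch.sparse.mm(powers[-1], adj).coalesce())
    return powers
```

As the notes point out, this only helps once the sparse block-diagonal adjacency can be built cheaply in the first place.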