Merge branch 'pg2455-big_graphs'

eliorc · Dec 29, 2018 · 8408324 · 8408324
2 parents 2ccc817 + 3091c63
commit 8408324
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 31 deletions.
diff --git a/README.md b/README.md
@@ -1,15 +1,16 @@
-# Node2Vec 
+# Node2Vec
 [![Downloads](http://pepy.tech/badge/node2vec)](http://pepy.tech/project/node2vec)
 
 Python3 implementation of the node2vec algorithm by Aditya Grover, Jure Leskovec and Vid Kocijan.
 [node2vec: Scalable Feature Learning for Networks. A. Grover, J. Leskovec. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2016.](https://snap.stanford.edu/node2vec/)
 
 ## Changes:
 
-New in `0.2.2`:
+New in `0.3.0`:
 
-Added edge embedding functionality. Module `node2vec.edges`.
-(Fixed error upon installation)
+Added support for big graphs which cannot fit into memory during algorithm execution (causing OOM errors).
+
+Thanks to [`@pg2455`](https://github.com/pg2455) for contributing this feature.
 
 ## Installation
 
@@ -24,7 +25,7 @@ from node2vec import Node2Vec
 graph = nx.fast_gnp_random_graph(n=100, p=0.5)
 
 # Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
-node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4) 
+node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)  # Use temp_folder for big graphs
 
 # Embed nodes
 model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)
@@ -79,7 +80,8 @@ edges_kv.save_word2vec_format(EDGES_EMBEDDING_FILENAME)
     9. `sampling_strategy`: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'.
         Use these keys exactly. If not set, will use the global ones which were passed on the object initialization
     10. `quiet`: Boolean controlling the verbosity. (default: False)
-
+    11. `temp_folder`: String path pointing to folder to save a shared memory copy of the graph - Supply when working on graphs that are too big to fit in memory during algorithm execution.
+
 - `Node2Vec.fit` method:
     Accepts any key word argument acceptable by gensim.Word2Vec
 
@@ -106,6 +108,3 @@ Notice that edge embeddings are defined for any pair of nodes, connected or not
 ## TODO
 - [x] Parallel implementation for walk generation
 - [ ] Parallel implementation for probability precomputation
-
-## Contributing
-I will probably not be maintaining this package actively, if someone wants to contribute and maintain, please contact me.
diff --git a/example.py b/example.py
@@ -11,6 +11,10 @@
 # Precompute probabilities and generate walks
 node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)
 
+## if d_graph is too big to fit in memory, pass a temp_folder which has enough disk space
+# Note: It will trigger "sharedmem" in Parallel, which will be slow on smaller graphs
+#node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4, temp_folder="/mnt/tmp_data")
+
 # Embed
 model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)
 
@@ -21,4 +25,4 @@
 model.wv.save_word2vec_format(EMBEDDING_FILENAME)
 
 # Save model for later use
-model.save(EMBEDDING_MODEL_FILENAME)
+model.save(EMBEDDING_MODEL_FILENAME)
diff --git a/node2vec/node2vec.py b/node2vec/node2vec.py
@@ -1,7 +1,7 @@
 from collections import defaultdict
 import numpy as np
-import gensim
-from joblib import Parallel, delayed
+import gensim, os
+from joblib import Parallel, delayed, load, dump
 from tqdm import tqdm
 from .parallel import parallel_generate_walks
 
@@ -17,9 +17,10 @@ class Node2Vec:
     Q_KEY = 'q'
 
     def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1, weight_key='weight',
-                 workers=1, sampling_strategy=None, quiet=False):
+                 workers=1, sampling_strategy=None, quiet=False, temp_folder=None):
         """
         Initiates the Node2Vec object, precomputes walking probabilities and generates the walks.
+
         :param graph: Input graph
         :type graph: Networkx Graph
         :param dimensions: Embedding dimensions (default: 128)
@@ -38,7 +39,10 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1
         :type workers: int
         :param sampling_strategy: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'.
         Use these keys exactly. If not set, will use the global ones which were passed on the object initialization
+        :param temp_folder: Path to folder with enough space to hold the memory map of self.d_graph (for big graphs); passed to joblib.Parallel as its temp_folder argument
+        :type temp_folder: str
         """
+
         self.graph = graph
         self.dimensions = dimensions
         self.walk_length = walk_length
@@ -48,20 +52,30 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1
         self.weight_key = weight_key
         self.workers = workers
         self.quiet = quiet
+        self.d_graph = defaultdict(dict)
 
         if sampling_strategy is None:
             self.sampling_strategy = {}
         else:
             self.sampling_strategy = sampling_strategy
 
-        self.d_graph = self._precompute_probabilities()
+        self.temp_folder, self.require = None, None
+        if temp_folder:
+            if not os.path.isdir(temp_folder):
+                raise NotADirectoryError("temp_folder does not exist or is not a directory. ({})".format(temp_folder))
+
+            self.temp_folder = temp_folder
+            self.require = "sharedmem"
+
+        self._precompute_probabilities()
         self.walks = self._generate_walks()
 
     def _precompute_probabilities(self):
         """
         Precomputes transition probabilities for each node.
         """
-        d_graph = defaultdict(dict)
+
+        d_graph = self.d_graph
         first_travel_done = set()
 
         nodes_generator = self.graph.nodes() if self.quiet \
@@ -117,8 +131,6 @@ def _precompute_probabilities(self):
                 # Save neighbors
                 d_graph[current_node][self.NEIGHBORS_KEY] = d_neighbors
 
-        return d_graph
-
     def _generate_walks(self):
         """
         Generates the random walks which will be used as the skip-gram input.
@@ -130,19 +142,20 @@ def _generate_walks(self):
         # Split num_walks for each worker
         num_walks_lists = np.array_split(range(self.num_walks), self.workers)
 
-        walk_results = Parallel(n_jobs=self.workers)(delayed(parallel_generate_walks)(self.d_graph,
-                                                                                      self.walk_length,
-                                                                                      len(num_walks),
-                                                                                      idx,
-                                                                                      self.sampling_strategy,
-                                                                                      self.NUM_WALKS_KEY,
-                                                                                      self.WALK_LENGTH_KEY,
-                                                                                      self.NEIGHBORS_KEY,
-                                                                                      self.PROBABILITIES_KEY,
-                                                                                      self.FIRST_TRAVEL_KEY,
-                                                                                      self.quiet) for
-                                                     idx, num_walks
-                                                     in enumerate(num_walks_lists, 1))
+        walk_results = Parallel(n_jobs=self.workers, temp_folder=self.temp_folder, require=self.require)(
+            delayed(parallel_generate_walks)(self.d_graph,
+                                             self.walk_length,
+                                             len(num_walks),
+                                             idx,
+                                             self.sampling_strategy,
+                                             self.NUM_WALKS_KEY,
+                                             self.WALK_LENGTH_KEY,
+                                             self.NEIGHBORS_KEY,
+                                             self.PROBABILITIES_KEY,
+                                             self.FIRST_TRAVEL_KEY,
+                                             self.quiet) for
+            idx, num_walks
+            in enumerate(num_walks_lists, 1))
 
         walks = flatten(walk_results)
 

diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
     name='node2vec',
     packages=['node2vec'],
-    version='0.2.2',
+    version='0.3.0',
     description='Implementation of the node2vec algorithm.',
     author='Elior Cohen',
     author_email='[email protected]',