From 5052015c5f8e6953a7563ef499785a7e8c3fa414 Mon Sep 17 00:00:00 2001
From: pg2455 <pg2455@columbia.edu>
Date: Tue, 18 Dec 2018 13:33:31 +0000
Subject: [PATCH 1/8] add mem map for reading and writing bigger graphs

---
 example.py           |  4 +++-
 node2vec/node2vec.py | 30 ++++++++++++++++++++++++------
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/example.py b/example.py
index 180fe12..335ff60 100644
--- a/example.py
+++ b/example.py
@@ -10,6 +10,8 @@
 
 # Precompute probabilities and generate walks
 node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)
+## if d_graph is big enough to fit in the memory, pass tmp_dir which has enough disk space
+node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4, tmp_dir="/mnt/tmp_data")
 
 # Embed
 model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)
@@ -21,4 +23,4 @@
 model.wv.save_word2vec_format(EMBEDDING_FILENAME)
 
 # Save model for later use
-model.save(EMBEDDING_MODEL_FILENAME)
\ No newline at end of file
+model.save(EMBEDDING_MODEL_FILENAME)
diff --git a/node2vec/node2vec.py b/node2vec/node2vec.py
index 6c74ef5..fc90bb3 100644
--- a/node2vec/node2vec.py
+++ b/node2vec/node2vec.py
@@ -1,7 +1,7 @@
 from collections import defaultdict
 import numpy as np
-import gensim
-from joblib import Parallel, delayed
+import gensim, os, shutil, tempfile
+from joblib import Parallel, delayed, load, dump
 from tqdm import tqdm
 from .parallel import parallel_generate_walks
 
@@ -17,7 +17,7 @@ class Node2Vec:
     Q_KEY = 'q'
 
     def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1, weight_key='weight',
-                 workers=1, sampling_strategy=None, quiet=False):
+                 workers=1, sampling_strategy=None, quiet=False, tmp_dir=""):
         """
         Initiates the Node2Vec object, precomputes walking probabilities and generates the walks.
         :param graph: Input graph
@@ -38,6 +38,8 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1
         :type workers: int
         :param sampling_strategy: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'.
         Use these keys exactly. If not set, will use the global ones which were passed on the object initialization
+        :param tmp_dir: directory with enough space to hold the temp_d_graph
+        :type tmp_dir: str
         """
         self.graph = graph
         self.dimensions = dimensions
@@ -54,14 +56,24 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1
         else:
             self.sampling_strategy = sampling_strategy
 
-        self.d_graph = self._precompute_probabilities()
+        self.d_graph = defaultdict(dict)
+        if tmp_dir != "":
+            assert os.path.isdir(tmp_dir), "tmp_dir does not exists"
+            self.temp_folder = tempfile.mkdtemp(dir=tmp_dir)
+            filename = os.path.join(self.temp_folder, "d_graph.json")
+            if os.path.exists(filename): os.unlink(filename)
+            dump(self.d_graph, filename)
+            self.d_graph = load(filename, mmap_mode="r+")
+            print("Memory map of d_graph: {}".format(filename))
+
+        self._precompute_probabilities()
         self.walks = self._generate_walks()
 
     def _precompute_probabilities(self):
         """
         Precomputes transition probabilities for each node.
         """
-        d_graph = defaultdict(dict)
+        d_graph = self.d_graph
         first_travel_done = set()
 
         nodes_generator = self.graph.nodes() if self.quiet \
@@ -117,7 +129,6 @@ def _precompute_probabilities(self):
                 # Save neighbors
                 d_graph[current_node][self.NEIGHBORS_KEY] = d_neighbors
 
-        return d_graph
 
     def _generate_walks(self):
         """
@@ -163,3 +174,10 @@ def fit(self, **skip_gram_params):
             skip_gram_params['size'] = self.dimensions
 
         return gensim.models.Word2Vec(self.walks, **skip_gram_params)
+
+    def __del__(self):
+        if "temp_folder" in self.__dict__:
+            try:
+                shutil.rmtree(self.temp_folder)
+            except OSError:
+                print("Unable to clean the temporary folder!!!")

From e52affd00b4cbea6bc5711ef84c4225fde73e6e1 Mon Sep 17 00:00:00 2001
From: pg2455 <pg2455@columbia.edu>
Date: Sun, 23 Dec 2018 11:37:55 +0000
Subject: [PATCH 2/8] add require and temp folder to Parallel

---
 node2vec/node2vec.py | 35 +++++++++++++----------------------
 1 file changed, 13 insertions(+), 22 deletions(-)

diff --git a/node2vec/node2vec.py b/node2vec/node2vec.py
index fc90bb3..d991fd8 100644
--- a/node2vec/node2vec.py
+++ b/node2vec/node2vec.py
@@ -1,9 +1,10 @@
 from collections import defaultdict
 import numpy as np
-import gensim, os, shutil, tempfile
+import gensim, os
 from joblib import Parallel, delayed, load, dump
 from tqdm import tqdm
-from .parallel import parallel_generate_walks
+# from .parallel import parallel_generate_walks
+from parallel import parallel_generate_walks
 
 
 class Node2Vec:
@@ -17,7 +18,7 @@ class Node2Vec:
     Q_KEY = 'q'
 
     def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1, weight_key='weight',
-                 workers=1, sampling_strategy=None, quiet=False, tmp_dir=""):
+                 workers=1, sampling_strategy=None, quiet=False, temp_folder=None):
         """
         Initiates the Node2Vec object, precomputes walking probabilities and generates the walks.
         :param graph: Input graph
@@ -38,8 +39,8 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1
         :type workers: int
         :param sampling_strategy: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'.
         Use these keys exactly. If not set, will use the global ones which were passed on the object initialization
-        :param tmp_dir: directory with enough space to hold the temp_d_graph
-        :type tmp_dir: str
+        :param temp_folder: folder with enough space to hold the memory map of self.d_graph (for bigger graphs); to be passed joblib.Parallel.temp_folder
+        :type temp_folder: str
         """
         self.graph = graph
         self.dimensions = dimensions
@@ -50,21 +51,18 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1
         self.weight_key = weight_key
         self.workers = workers
         self.quiet = quiet
+        self.d_graph = defaultdict(dict)
 
         if sampling_strategy is None:
             self.sampling_strategy = {}
         else:
             self.sampling_strategy = sampling_strategy
 
-        self.d_graph = defaultdict(dict)
-        if tmp_dir != "":
-            assert os.path.isdir(tmp_dir), "tmp_dir does not exists"
-            self.temp_folder = tempfile.mkdtemp(dir=tmp_dir)
-            filename = os.path.join(self.temp_folder, "d_graph.json")
-            if os.path.exists(filename): os.unlink(filename)
-            dump(self.d_graph, filename)
-            self.d_graph = load(filename, mmap_mode="r+")
-            print("Memory map of d_graph: {}".format(filename))
+        self.temp_folder, self.require = None, None
+        if temp_folder:
+            assert os.path.isdir(temp_folder), "tmp_dir does not exists"
+            self.temp_folder = temp_folder
+            self.require = "sharedmem"
 
         self._precompute_probabilities()
         self.walks = self._generate_walks()
@@ -141,7 +139,7 @@ def _generate_walks(self):
         # Split num_walks for each worker
         num_walks_lists = np.array_split(range(self.num_walks), self.workers)
 
-        walk_results = Parallel(n_jobs=self.workers)(delayed(parallel_generate_walks)(self.d_graph,
+        walk_results = Parallel(n_jobs=self.workers, temp_folder = self.temp_folder, require = self.require)(delayed(parallel_generate_walks)(self.d_graph,
                                                                                       self.walk_length,
                                                                                       len(num_walks),
                                                                                       idx,
@@ -174,10 +172,3 @@ def fit(self, **skip_gram_params):
             skip_gram_params['size'] = self.dimensions
 
         return gensim.models.Word2Vec(self.walks, **skip_gram_params)
-
-    def __del__(self):
-        if "temp_folder" in self.__dict__:
-            try:
-                shutil.rmtree(self.temp_folder)
-            except OSError:
-                print("Unable to clean the temporary folder!!!")

From 55e945ff40db689ba035ed6c12e1514354477317 Mon Sep 17 00:00:00 2001
From: pg2455 <pg2455@columbia.edu>
Date: Sun, 23 Dec 2018 11:45:01 +0000
Subject: [PATCH 3/8] add relevant example

---
 example.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/example.py b/example.py
index 335ff60..c0f5966 100644
--- a/example.py
+++ b/example.py
@@ -10,8 +10,10 @@
 
 # Precompute probabilities and generate walks
 node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)
-## if d_graph is big enough to fit in the memory, pass tmp_dir which has enough disk space
-node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4, tmp_dir="/mnt/tmp_data")
+
+## if d_graph is big enough to fit in the memory, pass temp_folder which has enough disk space
+# Note: It will trigger "sharedmem" in Parallel, which will be slow on smaller graphs
+#node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4, temp_folder="/mnt/tmp_data")
 
 # Embed
 model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)

From 2f4c71fda1dc23327847b7adbfb19c6435381035 Mon Sep 17 00:00:00 2001
From: pg2455 <pg2455@columbia.edu>
Date: Sun, 23 Dec 2018 11:46:23 +0000
Subject: [PATCH 4/8] edit readme

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 2e5216c..b972b6d 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Node2Vec 
+# Node2Vec
 [![Downloads](http://pepy.tech/badge/node2vec)](http://pepy.tech/project/node2vec)
 
 Python3 implementation of the node2vec algorithm Aditya Grover, Jure Leskovec and Vid Kocijan.
@@ -24,7 +24,7 @@ from node2vec import Node2Vec
 graph = nx.fast_gnp_random_graph(n=100, p=0.5)
 
 # Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
-node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4) 
+node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)  # pass temp_folder for the big graphs
 
 # Embed nodes
 model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)
@@ -79,7 +79,7 @@ edges_kv.save_word2vec_format(EDGES_EMBEDDING_FILENAME)
     9. `sampling_strategy`: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'.
         Use these keys exactly. If not set, will use the global ones which were passed on the object initialization`
     10. `quiet`: Boolean controlling the verbosity. (default: False)
-    
+
 - `Node2Vec.fit` method:
     Accepts any key word argument acceptable by gensim.Word2Vec
 

From 49ca2a67466c4dab9941461dca8c7e2c777da6fd Mon Sep 17 00:00:00 2001
From: pg2455 <pg2455@columbia.edu>
Date: Sun, 23 Dec 2018 11:59:38 +0000
Subject: [PATCH 5/8] chnage import of parallel

---
 node2vec/node2vec.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/node2vec/node2vec.py b/node2vec/node2vec.py
index d991fd8..ddbd898 100644
--- a/node2vec/node2vec.py
+++ b/node2vec/node2vec.py
@@ -1,10 +1,10 @@
+
 from collections import defaultdict
 import numpy as np
 import gensim, os
 from joblib import Parallel, delayed, load, dump
 from tqdm import tqdm
-# from .parallel import parallel_generate_walks
-from parallel import parallel_generate_walks
+from .parallel import parallel_generate_walks
 
 
 class Node2Vec:

From 4c3064e7909d66b069f1c2c1cec01351ec16d185 Mon Sep 17 00:00:00 2001
From: Elior Cohen <elior.cohen.p@gmail.com>
Date: Sat, 29 Dec 2018 14:45:31 +0200
Subject: [PATCH 6/8] Added documentation for temp_folder parameter

---
 README.md | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index b972b6d..dbaf183 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,11 @@ Python3 implementation of the node2vec algorithm Aditya Grover, Jure Leskovec an
 
 ## Changes:
 
-New in `0.2.2`:
+New in `0.3.0`:
 
-Added edge embedding functionality. Module `node2vec.edges`.
-(Fixed error upon installation)
+Added support for big graphs which cannot be fit into memory during algorithm execution (causing OOM errors).
+
+Thanks  [`@pg2455`](https://github.com/pg2455) for the contribution of this feature.
 
 ## Installation
 
@@ -24,7 +25,7 @@ from node2vec import Node2Vec
 graph = nx.fast_gnp_random_graph(n=100, p=0.5)
 
 # Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
-node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)  # pass temp_folder for the big graphs
+node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)  # Use temp_folder for big graphs
 
 # Embed nodes
 model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)
@@ -79,6 +80,7 @@ edges_kv.save_word2vec_format(EDGES_EMBEDDING_FILENAME)
     9. `sampling_strategy`: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'.
         Use these keys exactly. If not set, will use the global ones which were passed on the object initialization`
     10. `quiet`: Boolean controlling the verbosity. (default: False)
+    11. `temp_folder`: String path pointing to folder to save a shared memory copy of the graph - Supply when working on graphs that are too big to fit in memory during algorithm execution.
 
 - `Node2Vec.fit` method:
     Accepts any key word argument acceptable by gensim.Word2Vec
@@ -106,6 +108,3 @@ Notice that edge embeddings are defined for any pair of nodes, connected or not
 ## TODO
 - [x] Parallel implementation for walk generation
 - [ ] Parallel implementation for probability precomputation
-
-## Contributing
-I will probably not be maintaining this package actively, if someone wants to contribute and maintain, please contact me.

From f525b23f91591491b1c196a55663c4cca5c68082 Mon Sep 17 00:00:00 2001
From: Elior Cohen <elior.cohen.p@gmail.com>
Date: Sat, 29 Dec 2018 14:46:46 +0200
Subject: [PATCH 7/8] Minor changes in code (no functionality changes) and
 pydoc

---
 node2vec/node2vec.py | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/node2vec/node2vec.py b/node2vec/node2vec.py
index ddbd898..0ee0ada 100644
--- a/node2vec/node2vec.py
+++ b/node2vec/node2vec.py
@@ -1,4 +1,3 @@
-
 from collections import defaultdict
 import numpy as np
 import gensim, os
@@ -21,6 +20,7 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1
                  workers=1, sampling_strategy=None, quiet=False, temp_folder=None):
         """
         Initiates the Node2Vec object, precomputes walking probabilities and generates the walks.
+
         :param graph: Input graph
         :type graph: Networkx Graph
         :param dimensions: Embedding dimensions (default: 128)
@@ -39,9 +39,10 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1
         :type workers: int
         :param sampling_strategy: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'.
         Use these keys exactly. If not set, will use the global ones which were passed on the object initialization
-        :param temp_folder: folder with enough space to hold the memory map of self.d_graph (for bigger graphs); to be passed joblib.Parallel.temp_folder
+        :param temp_folder: Path to folder with enough space to hold the memory map of self.d_graph (for big graphs); to be passed joblib.Parallel.temp_folder
         :type temp_folder: str
         """
+
         self.graph = graph
         self.dimensions = dimensions
         self.walk_length = walk_length
@@ -60,7 +61,9 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1
 
         self.temp_folder, self.require = None, None
         if temp_folder:
-            assert os.path.isdir(temp_folder), "tmp_dir does not exists"
+            if not os.path.isdir(temp_folder):
+                raise NotADirectoryError("temp_folder does not exist or is not a directory. ({})".format(temp_folder))
+
             self.temp_folder = temp_folder
             self.require = "sharedmem"
 
@@ -71,6 +74,7 @@ def _precompute_probabilities(self):
         """
         Precomputes transition probabilities for each node.
         """
+
         d_graph = self.d_graph
         first_travel_done = set()
 
@@ -127,7 +131,6 @@ def _precompute_probabilities(self):
                 # Save neighbors
                 d_graph[current_node][self.NEIGHBORS_KEY] = d_neighbors
 
-
     def _generate_walks(self):
         """
         Generates the random walks which will be used as the skip-gram input.
@@ -139,19 +142,20 @@ def _generate_walks(self):
         # Split num_walks for each worker
         num_walks_lists = np.array_split(range(self.num_walks), self.workers)
 
-        walk_results = Parallel(n_jobs=self.workers, temp_folder = self.temp_folder, require = self.require)(delayed(parallel_generate_walks)(self.d_graph,
-                                                                                      self.walk_length,
-                                                                                      len(num_walks),
-                                                                                      idx,
-                                                                                      self.sampling_strategy,
-                                                                                      self.NUM_WALKS_KEY,
-                                                                                      self.WALK_LENGTH_KEY,
-                                                                                      self.NEIGHBORS_KEY,
-                                                                                      self.PROBABILITIES_KEY,
-                                                                                      self.FIRST_TRAVEL_KEY,
-                                                                                      self.quiet) for
-                                                     idx, num_walks
-                                                     in enumerate(num_walks_lists, 1))
+        walk_results = Parallel(n_jobs=self.workers, temp_folder=self.temp_folder, require=self.require)(
+            delayed(parallel_generate_walks)(self.d_graph,
+                                             self.walk_length,
+                                             len(num_walks),
+                                             idx,
+                                             self.sampling_strategy,
+                                             self.NUM_WALKS_KEY,
+                                             self.WALK_LENGTH_KEY,
+                                             self.NEIGHBORS_KEY,
+                                             self.PROBABILITIES_KEY,
+                                             self.FIRST_TRAVEL_KEY,
+                                             self.quiet) for
+            idx, num_walks
+            in enumerate(num_walks_lists, 1))
 
         walks = flatten(walk_results)
 

From 3091c634ed1334571db3f143b84778484a972148 Mon Sep 17 00:00:00 2001
From: Elior Cohen <elior.cohen.p@gmail.com>
Date: Sat, 29 Dec 2018 14:47:08 +0200
Subject: [PATCH 8/8] Version 0.3.0

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1f262d6..15e8d4d 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
     name='node2vec',
     packages=['node2vec'],
-    version='0.2.2',
+    version='0.3.0',
     description='Implementation of the node2vec algorithm.',
     author='Elior Cohen',
     author_email='elior.cohen.p@gmail.com',