From 5052015c5f8e6953a7563ef499785a7e8c3fa414 Mon Sep 17 00:00:00 2001 From: pg2455 Date: Tue, 18 Dec 2018 13:33:31 +0000 Subject: [PATCH 1/8] add mem map for reading and writing bigger graphs --- example.py | 4 +++- node2vec/node2vec.py | 30 ++++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/example.py b/example.py index 180fe12..335ff60 100644 --- a/example.py +++ b/example.py @@ -10,6 +10,8 @@ # Precompute probabilities and generate walks node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4) +## if d_graph is big enough to fit in the memory, pass tmp_dir which has enough disk space +node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4, tmp_dir="/mnt/tmp_data") # Embed model = node2vec.fit(window=10, min_count=1, batch_words=4) # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor) @@ -21,4 +23,4 @@ model.wv.save_word2vec_format(EMBEDDING_FILENAME) # Save model for later use -model.save(EMBEDDING_MODEL_FILENAME) \ No newline at end of file +model.save(EMBEDDING_MODEL_FILENAME) diff --git a/node2vec/node2vec.py b/node2vec/node2vec.py index 6c74ef5..fc90bb3 100644 --- a/node2vec/node2vec.py +++ b/node2vec/node2vec.py @@ -1,7 +1,7 @@ from collections import defaultdict import numpy as np -import gensim -from joblib import Parallel, delayed +import gensim, os, shutil, tempfile +from joblib import Parallel, delayed, load, dump from tqdm import tqdm from .parallel import parallel_generate_walks @@ -17,7 +17,7 @@ class Node2Vec: Q_KEY = 'q' def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1, weight_key='weight', - workers=1, sampling_strategy=None, quiet=False): + workers=1, sampling_strategy=None, quiet=False, tmp_dir=""): """ Initiates the Node2Vec object, precomputes walking probabilities and generates the walks. :param graph: Input graph @@ -38,6 +38,8 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1 :type workers: int :param sampling_strategy: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'. Use these keys exactly. If not set, will use the global ones which were passed on the object initialization + :param tmp_dir: directory with enough space to hold the temp_d_graph + :type tmp_dir: str """ self.graph = graph self.dimensions = dimensions @@ -54,14 +56,24 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1 else: self.sampling_strategy = sampling_strategy - self.d_graph = self._precompute_probabilities() + self.d_graph = defaultdict(dict) + if tmp_dir != "": + assert os.path.isdir(tmp_dir), "tmp_dir does not exists" + self.temp_folder = tempfile.mkdtemp(dir=tmp_dir) + filename = os.path.join(self.temp_folder, "d_graph.json") + if os.path.exists(filename): os.unlink(filename) + dump(self.d_graph, filename) + self.d_graph = load(filename, mmap_mode="r+") + print("Memory map of d_graph: {}".format(filename)) + + self._precompute_probabilities() self.walks = self._generate_walks() def _precompute_probabilities(self): """ Precomputes transition probabilities for each node. """ - d_graph = defaultdict(dict) + d_graph = self.d_graph first_travel_done = set() nodes_generator = self.graph.nodes() if self.quiet \ @@ -117,7 +129,6 @@ def _precompute_probabilities(self): # Save neighbors d_graph[current_node][self.NEIGHBORS_KEY] = d_neighbors - return d_graph def _generate_walks(self): """ @@ -163,3 +174,10 @@ def fit(self, **skip_gram_params): skip_gram_params['size'] = self.dimensions return gensim.models.Word2Vec(self.walks, **skip_gram_params) + + def __del__(self): + if "temp_folder" in self.__dict__: + try: + shutil.rmtree(self.temp_folder) + except OSError: + print("Unable to clean the temporary folder!!!") From e52affd00b4cbea6bc5711ef84c4225fde73e6e1 Mon Sep 17 00:00:00 2001 From: pg2455 Date: Sun, 23 Dec 2018 11:37:55 +0000 Subject: [PATCH 2/8] add require and temp folder to Parallel --- node2vec/node2vec.py | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/node2vec/node2vec.py b/node2vec/node2vec.py index fc90bb3..d991fd8 100644 --- a/node2vec/node2vec.py +++ b/node2vec/node2vec.py @@ -1,9 +1,10 @@ from collections import defaultdict import numpy as np -import gensim, os, shutil, tempfile +import gensim, os from joblib import Parallel, delayed, load, dump from tqdm import tqdm -from .parallel import parallel_generate_walks +# from .parallel import parallel_generate_walks +from parallel import parallel_generate_walks class Node2Vec: @@ -17,7 +18,7 @@ class Node2Vec: Q_KEY = 'q' def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1, weight_key='weight', - workers=1, sampling_strategy=None, quiet=False, tmp_dir=""): + workers=1, sampling_strategy=None, quiet=False, temp_folder=None): """ Initiates the Node2Vec object, precomputes walking probabilities and generates the walks. :param graph: Input graph @@ -38,8 +39,8 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1 :type workers: int :param sampling_strategy: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'. Use these keys exactly. If not set, will use the global ones which were passed on the object initialization - :param tmp_dir: directory with enough space to hold the temp_d_graph - :type tmp_dir: str + :param temp_folder: folder with enough space to hold the memory map of self.d_graph (for bigger graphs); to be passed joblib.Parallel.temp_folder + :type temp_folder: str """ self.graph = graph self.dimensions = dimensions @@ -50,21 +51,18 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1 self.weight_key = weight_key self.workers = workers self.quiet = quiet + self.d_graph = defaultdict(dict) if sampling_strategy is None: self.sampling_strategy = {} else: self.sampling_strategy = sampling_strategy - self.d_graph = defaultdict(dict) - if tmp_dir != "": - assert os.path.isdir(tmp_dir), "tmp_dir does not exists" - self.temp_folder = tempfile.mkdtemp(dir=tmp_dir) - filename = os.path.join(self.temp_folder, "d_graph.json") - if os.path.exists(filename): os.unlink(filename) - dump(self.d_graph, filename) - self.d_graph = load(filename, mmap_mode="r+") - print("Memory map of d_graph: {}".format(filename)) + self.temp_folder, self.require = None, None + if temp_folder: + assert os.path.isdir(temp_folder), "tmp_dir does not exists" + self.temp_folder = temp_folder + self.require = "sharedmem" self._precompute_probabilities() self.walks = self._generate_walks() @@ -141,7 +139,7 @@ def _generate_walks(self): # Split num_walks for each worker num_walks_lists = np.array_split(range(self.num_walks), self.workers) - walk_results = Parallel(n_jobs=self.workers)(delayed(parallel_generate_walks)(self.d_graph, + walk_results = Parallel(n_jobs=self.workers, temp_folder = self.temp_folder, require = self.require)(delayed(parallel_generate_walks)(self.d_graph, self.walk_length, len(num_walks), idx, @@ -174,10 +172,3 @@ def fit(self, **skip_gram_params): skip_gram_params['size'] = self.dimensions return gensim.models.Word2Vec(self.walks, **skip_gram_params) - - def __del__(self): - if "temp_folder" in self.__dict__: - try: - shutil.rmtree(self.temp_folder) - except OSError: - print("Unable to clean the temporary folder!!!") From 55e945ff40db689ba035ed6c12e1514354477317 Mon Sep 17 00:00:00 2001 From: pg2455 Date: Sun, 23 Dec 2018 11:45:01 +0000 Subject: [PATCH 3/8] add relevant example --- example.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/example.py b/example.py index 335ff60..c0f5966 100644 --- a/example.py +++ b/example.py @@ -10,8 +10,10 @@ # Precompute probabilities and generate walks node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4) -## if d_graph is big enough to fit in the memory, pass tmp_dir which has enough disk space -node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4, tmp_dir="/mnt/tmp_data") + +## if d_graph is big enough to fit in the memory, pass temp_folder which has enough disk space +# Note: It will trigger "sharedmem" in Parallel, which will be slow on smaller graphs +#node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4, temp_folder="/mnt/tmp_data") # Embed model = node2vec.fit(window=10, min_count=1, batch_words=4) # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor) From 2f4c71fda1dc23327847b7adbfb19c6435381035 Mon Sep 17 00:00:00 2001 From: pg2455 Date: Sun, 23 Dec 2018 11:46:23 +0000 Subject: [PATCH 4/8] edit readme --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2e5216c..b972b6d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Node2Vec +# Node2Vec [![Downloads](http://pepy.tech/badge/node2vec)](http://pepy.tech/project/node2vec) Python3 implementation of the node2vec algorithm Aditya Grover, Jure Leskovec and Vid Kocijan. @@ -24,7 +24,7 @@ from node2vec import Node2Vec graph = nx.fast_gnp_random_graph(n=100, p=0.5) # Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1** -node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4) +node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4) # pass temp_folder for the big graphs # Embed nodes model = node2vec.fit(window=10, min_count=1, batch_words=4) # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor) @@ -79,7 +79,7 @@ edges_kv.save_word2vec_format(EDGES_EMBEDDING_FILENAME) 9. `sampling_strategy`: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'. Use these keys exactly. If not set, will use the global ones which were passed on the object initialization` 10. `quiet`: Boolean controlling the verbosity. (default: False) - + - `Node2Vec.fit` method: Accepts any key word argument acceptable by gensim.Word2Vec From 49ca2a67466c4dab9941461dca8c7e2c777da6fd Mon Sep 17 00:00:00 2001 From: pg2455 Date: Sun, 23 Dec 2018 11:59:38 +0000 Subject: [PATCH 5/8] chnage import of parallel --- node2vec/node2vec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node2vec/node2vec.py b/node2vec/node2vec.py index d991fd8..ddbd898 100644 --- a/node2vec/node2vec.py +++ b/node2vec/node2vec.py @@ -1,10 +1,10 @@ + from collections import defaultdict import numpy as np import gensim, os from joblib import Parallel, delayed, load, dump from tqdm import tqdm -# from .parallel import parallel_generate_walks -from parallel import parallel_generate_walks +from .parallel import parallel_generate_walks class Node2Vec: From 4c3064e7909d66b069f1c2c1cec01351ec16d185 Mon Sep 17 00:00:00 2001 From: Elior Cohen Date: Sat, 29 Dec 2018 14:45:31 +0200 Subject: [PATCH 6/8] Added documentation for temp_folder parameter --- README.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index b972b6d..dbaf183 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,11 @@ Python3 implementation of the node2vec algorithm Aditya Grover, Jure Leskovec an ## Changes: -New in `0.2.2`: +New in `0.3.0`: -Added edge embedding functionality. Module `node2vec.edges`. -(Fixed error upon installation) +Added support for big graphs which cannot be fit into memory during algorithm execution (causing OOM errors). + +Thanks [`@pg2455`](https://github.com/pg2455) for the contribution of this feature. ## Installation @@ -24,7 +25,7 @@ from node2vec import Node2Vec graph = nx.fast_gnp_random_graph(n=100, p=0.5) # Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1** -node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4) # pass temp_folder for the big graphs +node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4) # Use temp_folder for big graphs # Embed nodes model = node2vec.fit(window=10, min_count=1, batch_words=4) # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor) @@ -79,6 +80,7 @@ edges_kv.save_word2vec_format(EDGES_EMBEDDING_FILENAME) 9. `sampling_strategy`: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'. Use these keys exactly. If not set, will use the global ones which were passed on the object initialization` 10. `quiet`: Boolean controlling the verbosity. (default: False) + 11. `temp_folder`: String path pointing to folder to save a shared memory copy of the graph - Supply when working on graphs that are too big to fit in memory during algorithm execution. - `Node2Vec.fit` method: Accepts any key word argument acceptable by gensim.Word2Vec @@ -106,6 +108,3 @@ Notice that edge embeddings are defined for any pair of nodes, connected or not ## TODO - [x] Parallel implementation for walk generation - [ ] Parallel implementation for probability precomputation - -## Contributing -I will probably not be maintaining this package actively, if someone wants to contribute and maintain, please contact me. From f525b23f91591491b1c196a55663c4cca5c68082 Mon Sep 17 00:00:00 2001 From: Elior Cohen Date: Sat, 29 Dec 2018 14:46:46 +0200 Subject: [PATCH 7/8] Minor changes in code (no functionality changes) and pydoc --- node2vec/node2vec.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/node2vec/node2vec.py b/node2vec/node2vec.py index ddbd898..0ee0ada 100644 --- a/node2vec/node2vec.py +++ b/node2vec/node2vec.py @@ -1,4 +1,3 @@ - from collections import defaultdict import numpy as np import gensim, os @@ -21,6 +20,7 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1 workers=1, sampling_strategy=None, quiet=False, temp_folder=None): """ Initiates the Node2Vec object, precomputes walking probabilities and generates the walks. + :param graph: Input graph :type graph: Networkx Graph :param dimensions: Embedding dimensions (default: 128) @@ -39,9 +39,10 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1 :type workers: int :param sampling_strategy: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'. Use these keys exactly. If not set, will use the global ones which were passed on the object initialization - :param temp_folder: folder with enough space to hold the memory map of self.d_graph (for bigger graphs); to be passed joblib.Parallel.temp_folder + :param temp_folder: Path to folder with enough space to hold the memory map of self.d_graph (for big graphs); to be passed joblib.Parallel.temp_folder :type temp_folder: str """ + self.graph = graph self.dimensions = dimensions self.walk_length = walk_length @@ -60,7 +61,9 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1 self.temp_folder, self.require = None, None if temp_folder: - assert os.path.isdir(temp_folder), "tmp_dir does not exists" + if not os.path.isdir(temp_folder): + raise NotADirectoryError("temp_folder does not exist or is not a directory. ({})".format(temp_folder)) + self.temp_folder = temp_folder self.require = "sharedmem" @@ -71,6 +74,7 @@ def _precompute_probabilities(self): """ Precomputes transition probabilities for each node. """ + d_graph = self.d_graph first_travel_done = set() @@ -127,7 +131,6 @@ def _precompute_probabilities(self): # Save neighbors d_graph[current_node][self.NEIGHBORS_KEY] = d_neighbors - def _generate_walks(self): """ Generates the random walks which will be used as the skip-gram input. @@ -139,19 +142,20 @@ def _generate_walks(self): # Split num_walks for each worker num_walks_lists = np.array_split(range(self.num_walks), self.workers) - walk_results = Parallel(n_jobs=self.workers, temp_folder = self.temp_folder, require = self.require)(delayed(parallel_generate_walks)(self.d_graph, - self.walk_length, - len(num_walks), - idx, - self.sampling_strategy, - self.NUM_WALKS_KEY, - self.WALK_LENGTH_KEY, - self.NEIGHBORS_KEY, - self.PROBABILITIES_KEY, - self.FIRST_TRAVEL_KEY, - self.quiet) for - idx, num_walks - in enumerate(num_walks_lists, 1)) + walk_results = Parallel(n_jobs=self.workers, temp_folder=self.temp_folder, require=self.require)( + delayed(parallel_generate_walks)(self.d_graph, + self.walk_length, + len(num_walks), + idx, + self.sampling_strategy, + self.NUM_WALKS_KEY, + self.WALK_LENGTH_KEY, + self.NEIGHBORS_KEY, + self.PROBABILITIES_KEY, + self.FIRST_TRAVEL_KEY, + self.quiet) for + idx, num_walks + in enumerate(num_walks_lists, 1)) walks = flatten(walk_results) From 3091c634ed1334571db3f143b84778484a972148 Mon Sep 17 00:00:00 2001 From: Elior Cohen Date: Sat, 29 Dec 2018 14:47:08 +0200 Subject: [PATCH 8/8] Version 0.3.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1f262d6..15e8d4d 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name='node2vec', packages=['node2vec'], - version='0.2.2', + version='0.3.0', description='Implementation of the node2vec algorithm.', author='Elior Cohen', author_email='elior.cohen.p@gmail.com',