diff --git a/README.md b/README.md index 2e5216c..dbaf183 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Node2Vec +# Node2Vec [![Downloads](http://pepy.tech/badge/node2vec)](http://pepy.tech/project/node2vec) Python3 implementation of the node2vec algorithm Aditya Grover, Jure Leskovec and Vid Kocijan. @@ -6,10 +6,11 @@ Python3 implementation of the node2vec algorithm Aditya Grover, Jure Leskovec an ## Changes: -New in `0.2.2`: +New in `0.3.0`: -Added edge embedding functionality. Module `node2vec.edges`. -(Fixed error upon installation) +Added support for big graphs which cannot fit into memory during algorithm execution (causing OOM errors). + +Thanks [`@pg2455`](https://github.com/pg2455) for the contribution of this feature. ## Installation @@ -24,7 +25,7 @@ from node2vec import Node2Vec graph = nx.fast_gnp_random_graph(n=100, p=0.5) # Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1** -node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4) +node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4) # Use temp_folder for big graphs # Embed nodes model = node2vec.fit(window=10, min_count=1, batch_words=4) # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor) @@ -79,7 +80,8 @@ edges_kv.save_word2vec_format(EDGES_EMBEDDING_FILENAME) 9. `sampling_strategy`: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'. Use these keys exactly. If not set, will use the global ones which were passed on the object initialization` 10. `quiet`: Boolean controlling the verbosity. (default: False) - + 11. `temp_folder`: String path pointing to folder to save a shared memory copy of the graph - Supply when working on graphs that are too big to fit in memory during algorithm execution. 
+ - `Node2Vec.fit` method: Accepts any key word argument acceptable by gensim.Word2Vec @@ -106,6 +108,3 @@ Notice that edge embeddings are defined for any pair of nodes, connected or not ## TODO - [x] Parallel implementation for walk generation - [ ] Parallel implementation for probability precomputation - -## Contributing -I will probably not be maintaining this package actively, if someone wants to contribute and maintain, please contact me. diff --git a/example.py b/example.py index 180fe12..c0f5966 100644 --- a/example.py +++ b/example.py @@ -11,6 +11,10 @@ # Precompute probabilities and generate walks node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4) +## if d_graph is too big to fit in memory, pass temp_folder which has enough disk space +# Note: It will trigger "sharedmem" in Parallel, which will be slow on smaller graphs +#node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4, temp_folder="/mnt/tmp_data") + # Embed model = node2vec.fit(window=10, min_count=1, batch_words=4) # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor) @@ -21,4 +25,4 @@ model.wv.save_word2vec_format(EMBEDDING_FILENAME) # Save model for later use -model.save(EMBEDDING_MODEL_FILENAME) \ No newline at end of file +model.save(EMBEDDING_MODEL_FILENAME) diff --git a/node2vec/node2vec.py b/node2vec/node2vec.py index 6c74ef5..0ee0ada 100644 --- a/node2vec/node2vec.py +++ b/node2vec/node2vec.py @@ -1,7 +1,7 @@ from collections import defaultdict import numpy as np -import gensim -from joblib import Parallel, delayed +import gensim, os +from joblib import Parallel, delayed, load, dump from tqdm import tqdm from .parallel import parallel_generate_walks @@ -17,9 +17,10 @@ class Node2Vec: Q_KEY = 'q' def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1, weight_key='weight', - workers=1, sampling_strategy=None, 
quiet=False): + workers=1, sampling_strategy=None, quiet=False, temp_folder=None): """ Initiates the Node2Vec object, precomputes walking probabilities and generates the walks. + :param graph: Input graph :type graph: Networkx Graph :param dimensions: Embedding dimensions (default: 128) @@ -38,7 +39,10 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1 :type workers: int :param sampling_strategy: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'. Use these keys exactly. If not set, will use the global ones which were passed on the object initialization + :param temp_folder: Path to folder with enough space to hold the memory map of self.d_graph (for big graphs); passed to joblib.Parallel as temp_folder + :type temp_folder: str """ + self.graph = graph self.dimensions = dimensions self.walk_length = walk_length @@ -48,20 +52,30 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1 self.weight_key = weight_key self.workers = workers self.quiet = quiet + self.d_graph = defaultdict(dict) if sampling_strategy is None: self.sampling_strategy = {} else: self.sampling_strategy = sampling_strategy - self.d_graph = self._precompute_probabilities() + self.temp_folder, self.require = None, None + if temp_folder: + if not os.path.isdir(temp_folder): + raise NotADirectoryError("temp_folder does not exist or is not a directory. ({})".format(temp_folder)) + + self.temp_folder = temp_folder + self.require = "sharedmem" + + self._precompute_probabilities() self.walks = self._generate_walks() def _precompute_probabilities(self): """ Precomputes transition probabilities for each node. 
""" - d_graph = defaultdict(dict) + + d_graph = self.d_graph first_travel_done = set() nodes_generator = self.graph.nodes() if self.quiet \ @@ -117,8 +131,6 @@ def _precompute_probabilities(self): # Save neighbors d_graph[current_node][self.NEIGHBORS_KEY] = d_neighbors - return d_graph - def _generate_walks(self): """ Generates the random walks which will be used as the skip-gram input. @@ -130,19 +142,20 @@ def _generate_walks(self): # Split num_walks for each worker num_walks_lists = np.array_split(range(self.num_walks), self.workers) - walk_results = Parallel(n_jobs=self.workers)(delayed(parallel_generate_walks)(self.d_graph, - self.walk_length, - len(num_walks), - idx, - self.sampling_strategy, - self.NUM_WALKS_KEY, - self.WALK_LENGTH_KEY, - self.NEIGHBORS_KEY, - self.PROBABILITIES_KEY, - self.FIRST_TRAVEL_KEY, - self.quiet) for - idx, num_walks - in enumerate(num_walks_lists, 1)) + walk_results = Parallel(n_jobs=self.workers, temp_folder=self.temp_folder, require=self.require)( + delayed(parallel_generate_walks)(self.d_graph, + self.walk_length, + len(num_walks), + idx, + self.sampling_strategy, + self.NUM_WALKS_KEY, + self.WALK_LENGTH_KEY, + self.NEIGHBORS_KEY, + self.PROBABILITIES_KEY, + self.FIRST_TRAVEL_KEY, + self.quiet) for + idx, num_walks + in enumerate(num_walks_lists, 1)) walks = flatten(walk_results) diff --git a/setup.py b/setup.py index 1f262d6..15e8d4d 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name='node2vec', packages=['node2vec'], - version='0.2.2', + version='0.3.0', description='Implementation of the node2vec algorithm.', author='Elior Cohen', author_email='elior.cohen.p@gmail.com',