Skip to content

Commit

Permalink
Merge branch 'pg2455-big_graphs'
Browse files Browse the repository at this point in the history
  • Loading branch information
eliorc committed Dec 29, 2018
2 parents 2ccc817 + 3091c63 commit 8408324
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 31 deletions.
17 changes: 8 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
# Node2Vec
# Node2Vec
[![Downloads](http://pepy.tech/badge/node2vec)](http://pepy.tech/project/node2vec)

Python3 implementation of the node2vec algorithm by Aditya Grover, Jure Leskovec and Vid Kocijan.
[node2vec: Scalable Feature Learning for Networks. A. Grover, J. Leskovec. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2016.](https://snap.stanford.edu/node2vec/)

## Changes:

New in `0.2.2`:
New in `0.3.0`:

Added edge embedding functionality. Module `node2vec.edges`.
(Fixed error upon installation)
Added support for big graphs which cannot fit into memory during algorithm execution (causing OOM errors).

Thanks [`@pg2455`](https://github.com/pg2455) for the contribution of this feature.

## Installation

Expand All @@ -24,7 +25,7 @@ from node2vec import Node2Vec
graph = nx.fast_gnp_random_graph(n=100, p=0.5)

# Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)
node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4) # Use temp_folder for big graphs

# Embed nodes
model = node2vec.fit(window=10, min_count=1, batch_words=4) # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)
Expand Down Expand Up @@ -79,7 +80,8 @@ edges_kv.save_word2vec_format(EDGES_EMBEDDING_FILENAME)
9. `sampling_strategy`: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'.
Use these keys exactly. If not set, will use the global ones which were passed on the object initialization
10. `quiet`: Boolean controlling the verbosity. (default: False)

11. `temp_folder`: String path pointing to folder to save a shared memory copy of the graph - Supply when working on graphs that are too big to fit in memory during algorithm execution.

- `Node2Vec.fit` method:
Accepts any key word argument acceptable by gensim.Word2Vec

Expand All @@ -106,6 +108,3 @@ Notice that edge embeddings are defined for any pair of nodes, connected or not
## TODO
- [x] Parallel implementation for walk generation
- [ ] Parallel implementation for probability precomputation

## Contributing
I will probably not be maintaining this package actively, if someone wants to contribute and maintain, please contact me.
6 changes: 5 additions & 1 deletion example.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
# Precompute probabilities and generate walks
node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)

## If d_graph is too big to fit in memory, pass a temp_folder which has enough disk space
# Note: It will trigger "sharedmem" in Parallel, which will be slow on smaller graphs
#node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4, temp_folder="/mnt/tmp_data")

# Embed
model = node2vec.fit(window=10, min_count=1, batch_words=4) # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)

Expand All @@ -21,4 +25,4 @@
model.wv.save_word2vec_format(EMBEDDING_FILENAME)

# Save model for later use
model.save(EMBEDDING_MODEL_FILENAME)
model.save(EMBEDDING_MODEL_FILENAME)
53 changes: 33 additions & 20 deletions node2vec/node2vec.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from collections import defaultdict
import numpy as np
import gensim
from joblib import Parallel, delayed
import gensim, os
from joblib import Parallel, delayed, load, dump
from tqdm import tqdm
from .parallel import parallel_generate_walks

Expand All @@ -17,9 +17,10 @@ class Node2Vec:
Q_KEY = 'q'

def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1, weight_key='weight',
workers=1, sampling_strategy=None, quiet=False):
workers=1, sampling_strategy=None, quiet=False, temp_folder=None):
"""
Initiates the Node2Vec object, precomputes walking probabilities and generates the walks.
:param graph: Input graph
:type graph: Networkx Graph
:param dimensions: Embedding dimensions (default: 128)
Expand All @@ -38,7 +39,10 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1
:type workers: int
:param sampling_strategy: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'.
Use these keys exactly. If not set, will use the global ones which were passed on the object initialization
:param temp_folder: Path to folder with enough space to hold the memory map of self.d_graph (for big graphs); to be passed joblib.Parallel.temp_folder
:type temp_folder: str
"""

self.graph = graph
self.dimensions = dimensions
self.walk_length = walk_length
Expand All @@ -48,20 +52,30 @@ def __init__(self, graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1
self.weight_key = weight_key
self.workers = workers
self.quiet = quiet
self.d_graph = defaultdict(dict)

if sampling_strategy is None:
self.sampling_strategy = {}
else:
self.sampling_strategy = sampling_strategy

self.d_graph = self._precompute_probabilities()
self.temp_folder, self.require = None, None
if temp_folder:
if not os.path.isdir(temp_folder):
raise NotADirectoryError("temp_folder does not exist or is not a directory. ({})".format(temp_folder))

self.temp_folder = temp_folder
self.require = "sharedmem"

self._precompute_probabilities()
self.walks = self._generate_walks()

def _precompute_probabilities(self):
"""
Precomputes transition probabilities for each node.
"""
d_graph = defaultdict(dict)

d_graph = self.d_graph
first_travel_done = set()

nodes_generator = self.graph.nodes() if self.quiet \
Expand Down Expand Up @@ -117,8 +131,6 @@ def _precompute_probabilities(self):
# Save neighbors
d_graph[current_node][self.NEIGHBORS_KEY] = d_neighbors

return d_graph

def _generate_walks(self):
"""
Generates the random walks which will be used as the skip-gram input.
Expand All @@ -130,19 +142,20 @@ def _generate_walks(self):
# Split num_walks for each worker
num_walks_lists = np.array_split(range(self.num_walks), self.workers)

walk_results = Parallel(n_jobs=self.workers)(delayed(parallel_generate_walks)(self.d_graph,
self.walk_length,
len(num_walks),
idx,
self.sampling_strategy,
self.NUM_WALKS_KEY,
self.WALK_LENGTH_KEY,
self.NEIGHBORS_KEY,
self.PROBABILITIES_KEY,
self.FIRST_TRAVEL_KEY,
self.quiet) for
idx, num_walks
in enumerate(num_walks_lists, 1))
walk_results = Parallel(n_jobs=self.workers, temp_folder=self.temp_folder, require=self.require)(
delayed(parallel_generate_walks)(self.d_graph,
self.walk_length,
len(num_walks),
idx,
self.sampling_strategy,
self.NUM_WALKS_KEY,
self.WALK_LENGTH_KEY,
self.NEIGHBORS_KEY,
self.PROBABILITIES_KEY,
self.FIRST_TRAVEL_KEY,
self.quiet) for
idx, num_walks
in enumerate(num_walks_lists, 1))

walks = flatten(walk_results)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
setup(
name='node2vec',
packages=['node2vec'],
version='0.2.2',
version='0.3.0',
description='Implementation of the node2vec algorithm.',
author='Elior Cohen',
author_email='[email protected]',
Expand Down

0 comments on commit 8408324

Please sign in to comment.