diff --git a/.gitignore b/.gitignore index 3b8d9dacf..05bb781c1 100644 --- a/.gitignore +++ b/.gitignore @@ -115,3 +115,5 @@ data/ \.DS_Store + +*.ipynb diff --git a/.pyup.yml b/.pyup.yml new file mode 100644 index 000000000..f9f59f8a1 --- /dev/null +++ b/.pyup.yml @@ -0,0 +1,33 @@ +#Taken from https://pyup.io/docs/bot/config/ + +# configure updates globally +# default: all +# allowed: all, insecure, False +update: all + +# configure dependency pinning globally +# default: True +# allowed: True, False +pin: True + +# set the default branch +# default: empty, the default branch on GitHub +branch: develop + +# update schedule +# default: empty +# allowed: "every day", "every week", .. +schedule: "every day" + +# search for requirement files +# default: True +# allowed: True, False +search: True + +# configure the branch prefix the bot is using +# default: pyup- +branch_prefix: pyup/ + +# allow to close stale PRs +# default: True +close_prs: True diff --git a/AUTHORS.rst b/AUTHORS.rst index 8f0028ac2..efb6e7642 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -9,8 +9,3 @@ Development Lead * Bertrand Charpentier * Nathan De Lara * Quentin Lutz - -Contributors ------------- - -* Alexandre Hollocou diff --git a/README.rst b/README.rst index 934678476..9a02650bc 100644 --- a/README.rst +++ b/README.rst @@ -1,3 +1,8 @@ +.. figure:: ../logo_sknetwork.png + :align: right + :width: 100px + :alt: logo sknetwork + ============== scikit-network ============== @@ -25,11 +30,25 @@ Graph algorithms * Documentation: https://scikit-network.readthedocs.io. -Features --------- +How to use scikit-network? +-------------------------- + +Graphs have a unified format, namely ``scipy.sparse.csr_matrix``. + + +About the documentation +----------------------- + +We use the following notations in the documentation: + +* :math:`A` denotes the adjacency matrix for undirected and directed graphs. + +* :math:`B` denotes the biadjacency matrix for bipartite graphs (possibly non-square). + +* :math:`d = A1` or :math:`B1` is the out-degree vector and :math:`D = \text{diag}(d)` the associated diagonal matrix. + +* :math:`f = A^T1` or :math:`B^T1` is the in-degree vector and :math:`F = \text{diag}(f)` the associated diagonal matrix. -* Embedding -* Clustering -* Hierarchy +* :math:`w = 1^TA1` or :math:`1 ^TB1` is the total weight of the graph. diff --git a/docs/conf.py b/docs/conf.py index dbce19d43..21e51db8a 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -58,8 +58,8 @@ # General information about the project. project = u'scikit-network' -copyright = u"2018, Bertrand Charpentier" -author = u"Bertrand Charpentier" +copyright = u"2019, scikit-network" +author = u"scikit-network team" # The version info for the project you're documenting, acts as replacement # for |version| and |release|, also used in various other places throughout @@ -94,7 +94,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -#html_theme = 'alabaster' +html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the @@ -105,7 +105,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ['_static'] +html_static_path = [] # -- Options for HTMLHelp output --------------------------------------- diff --git a/docs/index.rst b/docs/index.rst index fe513ea85..1179bef21 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,3 +1,8 @@ +.. figure:: ../sknetwork/../logo_sknetwork.png + :align: right + :width: 200px + :alt: logo sknetwork + Welcome to scikit-network's documentation! ========================================== @@ -9,8 +14,6 @@ Welcome to scikit-network's documentation! installation usage reference/index - reference2 - modules contributing authors history diff --git a/docs/reference/clustering.rst b/docs/reference/clustering.rst index 1bd1f6fbb..2f2eb2ad7 100644 --- a/docs/reference/clustering.rst +++ b/docs/reference/clustering.rst @@ -7,18 +7,38 @@ Clustering Louvain ------- -.. automodule:: sknetwork.clustering.louvain - :toctree: generated/ +.. automodule:: sknetwork.clustering -.. autoclass:: Louvain +.. autoclass:: sknetwork.clustering.Louvain :members: -.. autoclass:: GreedyModularity +.. autoclass:: sknetwork.clustering.GreedyModularity :members: -.. autoclass:: GreedyModularityJiT +.. autoclass:: sknetwork.clustering.GreedyModularityNumba :members: +Louvain for bipartite graphs +---------------------------- + +.. autoclass:: sknetwork.clustering.BiLouvain + :members: + +.. autoclass:: sknetwork.clustering.GreedyBipartite + :members: + +.. autoclass:: sknetwork.clustering.GreedyBipartiteNumba + :members: + + Metrics ------- +.. autofunction:: sknetwork.clustering.modularity + +.. autofunction:: sknetwork.clustering.bimodularity + +.. autofunction:: sknetwork.clustering.cocitation_modularity + +.. autofunction:: sknetwork.clustering.performance +.. autofunction:: sknetwork.clustering.cocitation_performance diff --git a/docs/reference/embedding.rst b/docs/reference/embedding.rst index 2a1bbb77c..ec04509ac 100644 --- a/docs/reference/embedding.rst +++ b/docs/reference/embedding.rst @@ -6,17 +6,30 @@ Embedding .. currentmodule:: sknetwork -Spectral embeddings -------------------- -.. automodule:: sknetwork.embedding.forwardbackward_embedding - :toctree: generated/ +Spectral +-------- +.. automodule:: sknetwork.embedding -.. autoclass:: ForwardBackwardEmbedding +.. autoclass:: sknetwork.embedding.SpectralEmbedding :members: -.. automodule:: sknetwork.embedding.spectral_embedding - :toctree: generated/ +SVD +--- -.. autoclass:: SpectralEmbedding +.. autoclass:: sknetwork.embedding.GSVDEmbedding :members: +Metrics +------- +.. autofunction:: sknetwork.embedding.dot_modularity + +.. autofunction:: sknetwork.embedding.hscore + + +Randomized methods +------------------ +.. autofunction:: sknetwork.embedding.randomized_range_finder + +.. autofunction:: sknetwork.embedding.randomized_svd + +.. autofunction:: sknetwork.embedding.randomized_eig diff --git a/docs/reference/hierarchical_clustering.rst b/docs/reference/hierarchical_clustering.rst deleted file mode 100644 index 199860609..000000000 --- a/docs/reference/hierarchical_clustering.rst +++ /dev/null @@ -1,25 +0,0 @@ -.. _hierarchical_clustering: - -Hierarchical Clustering -*********************** - -.. currentmodule:: sknetwork - -Agglomerative approach ----------------------- -.. automodule:: sknetwork.hierarhcical_clustering.agglomerative_clustering -.. autosummary:: - :toctree: generated/ - -.. autofunction:: sknetwork.linkage_clustering - -Divisive approach ------------------ - -Metrics -------- -.. automodule:: sknetwork.hierarhcical_clustering.metrics -.. 
autosummary:: - :toctree: generated/ - -.. autofunction:: sknetwork.hierarhcical_clustering.metrics.hierarchical_cost diff --git a/docs/reference/hierarchy.rst b/docs/reference/hierarchy.rst new file mode 100644 index 000000000..101983db4 --- /dev/null +++ b/docs/reference/hierarchy.rst @@ -0,0 +1,22 @@ +.. _hierarchical_clustering: + +Hierarchical clustering +*********************** + +.. currentmodule:: sknetwork + +Paris +----- +.. automodule:: sknetwork.hierarchy +.. autosummary:: + :toctree: generated/ + +.. autoclass:: sknetwork.hierarchy.paris.Paris + :members: + + +Metrics +------- +.. autofunction:: sknetwork.hierarchy.metrics.dasgupta_cost + +.. autofunction:: sknetwork.hierarchy.metrics.tree_sampling_divergence diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 64217f278..db97d28f0 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -6,5 +6,7 @@ Reference .. toctree:: embedding - flat_clustering - hierarchical_clustering + clustering + hierarchy + loader + toy_graphs diff --git a/docs/reference/loader.rst b/docs/reference/loader.rst new file mode 100644 index 000000000..07620aa90 --- /dev/null +++ b/docs/reference/loader.rst @@ -0,0 +1,9 @@ +.. _loader: + +Loader +****** + +.. currentmodule:: sknetwork + +.. autoclass:: sknetwork.loader.Parser + :members: diff --git a/docs/reference/toy_graphs.rst b/docs/reference/toy_graphs.rst new file mode 100644 index 000000000..a5f70dbc9 --- /dev/null +++ b/docs/reference/toy_graphs.rst @@ -0,0 +1,12 @@ +.. _toy_graphs: + +Toy graphs +********** + +.. currentmodule:: sknetwork + +.. autofunction:: sknetwork.toy_graphs.house_graph + +.. autofunction:: sknetwork.toy_graphs.karate_club_graph + +.. autofunction:: sknetwork.toy_graphs.star_wars_villains_graph diff --git a/logo_sknetwork.png b/logo_sknetwork.png new file mode 100644 index 000000000..7966d5380 Binary files /dev/null and b/logo_sknetwork.png differ diff --git a/make.bat b/make.bat new file mode 100644 index 000000000..543c6b13b --- /dev/null +++ b/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/sknetwork/__init__.py b/sknetwork/__init__.py index 893f60b6b..c100aa26b 100644 --- a/sknetwork/__init__.py +++ b/sknetwork/__init__.py @@ -1,7 +1,14 @@ +#!/usr/bin/env python3 # -*- coding: utf-8 -*- +"""Top-level package for scikit-network""" -"""Top-level package for scikit-network.""" - -__author__ = """Bertrand Charpentier""" -__email__ = 'bertrand.charpentier@live.fr' +__author__ = """scikit-network team""" +__email__ = "bonald@enst.fr" __version__ = '0.2.0' + +from scipy.sparse.csgraph import * +from sknetwork.toy_graphs import * +from sknetwork.loader import * +from sknetwork.clustering import * +from sknetwork.hierarchy import * +from sknetwork.embedding import * \ No newline at end of file diff --git a/sknetwork/clustering/__init__.py b/sknetwork/clustering/__init__.py index e69de29bb..422a59ca1 100644 --- a/sknetwork/clustering/__init__.py +++ b/sknetwork/clustering/__init__.py @@ -0,0 +1,3 @@ +from sknetwork.clustering.louvain import * +from sknetwork.clustering.bilouvain import * +from sknetwork.clustering.metrics import * diff --git a/sknetwork/clustering/bilouvain.py b/sknetwork/clustering/bilouvain.py new file mode 100644 index 000000000..7e078293d --- /dev/null +++ b/sknetwork/clustering/bilouvain.py @@ -0,0 +1,464 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mar 3, 2019 +@author: Nathan de Lara +""" + +try: + from numba import njit, prange + default = 'numba' +except ImportError: + def njit(func): + return func + prange = range + default = 'python' + +import numpy as np +from scipy import sparse +from typing import Union + + +class BipartiteGraph: + """ + A class of graphs suitable for the BiLouvain algorithm. + + Attributes + ---------- + n_samples: int, + the cardinal of V1 (the set of samples) + n_features: int, + the cardinal of V2 (the set of features) + sample_weights: np.ndarray, + the normalized degree vector of the samples + feature_weights: np.ndarray, + the normalized degree vector of the features + + """ + + def __init__(self, biadjacency: sparse.csr_matrix): + """ + + Parameters + ---------- + biadjacency: biadjacency matrix of the graph as SciPy sparse matrix + """ + self.n_samples: int = biadjacency.shape[0] + self.n_features: int = biadjacency.shape[1] + self.norm_adjacency: sparse.csr_matrix = biadjacency / biadjacency.data.sum() + self.sample_weights: np.ndarray = self.norm_adjacency.dot(np.ones(self.n_features)) + self.feature_weights: np.ndarray = self.norm_adjacency.T.dot(np.ones(self.n_samples)) + + def aggregate(self, sample_membership: sparse.csr_matrix, feature_membership: sparse.csr_matrix): + """ + Aggregates nodes belonging to the same clusters while keeping the bipartite structure. 
+ + Parameters + ---------- + sample_membership: + matrix of shape n_samples x n_clusters, + row number i is the one-hot cluster vector of sample i + feature_membership: + matrix of shape n_features x n_clusters, + row number i is the one-hot cluster vector of feature i + + Returns + ------- + self: :class:`BipartiteGraph` + """ + self.norm_adjacency = sample_membership.T.dot(self.norm_adjacency.dot(feature_membership)).tocsr() + self.sample_weights = np.array(sample_membership.T.dot(self.sample_weights).T) + self.feature_weights = np.array(feature_membership.T.dot(self.feature_weights).T) + self.n_samples, self.n_features = self.norm_adjacency.shape + return self + + +class BiOptimizer: + """ + A generic optimization algorithm. + + Attributes + ---------- + score_: float + Total increase of the objective function. + sample_labels_: np.ndarray + Cluster index of each node in V1. + feature_labels_: np.ndarray + Cluster index of each node in V2. + """ + def __init__(self): + self.score_ = None + self.sample_labels_ = None + self.feature_labels_ = None + + def fit(self, graph: BipartiteGraph): + """Fit the clusters to the objective function. + + Parameters + ---------- + graph: + Graph to cluster. + + Returns + ------- + self: :class:`BiOptimizer` + + """ + return self + + +class GreedyBipartite(BiOptimizer): + """ + A greedy bipartite modularity optimizer. + + Attributes + ---------- + resolution: + modularity resolution + tol: + minimum bimodularity increase to enter a new optimization pass + """ + + def __init__(self, resolution: float = 1, tol: float = 1e-3): + BiOptimizer.__init__(self) + self.resolution = resolution + self.tol = tol + + def fit(self, graph: BipartiteGraph): + """Iterates over the nodes of the graph and moves them to the cluster of highest increase among their neighbors. + + Parameters + ---------- + graph: + the graph to cluster + + Returns + ------- + self: :class:`BiOptimizer` + """ + increase: bool = True + total_increase: float = 0. + + sample_labels: np.ndarray = np.arange(graph.n_samples) + feature_labels: np.ndarray = graph.n_samples + np.arange(graph.n_features) + sample_clusters_proba: np.ndarray = np.hstack((graph.sample_weights.copy(), np.zeros(graph.n_features))) + feature_clusters_proba: np.ndarray = np.hstack((np.zeros(graph.n_samples), graph.feature_weights.copy())) + + sample_indptr: np.ndarray = graph.norm_adjacency.indptr + sample_indices: np.ndarray = graph.norm_adjacency.indices + sample_data: np.ndarray = graph.norm_adjacency.data + + transposed_adjacency = graph.norm_adjacency.T.tocsr() + feature_indptr: np.ndarray = transposed_adjacency.indptr + feature_indices: np.ndarray = transposed_adjacency.indices + feature_data: np.ndarray = transposed_adjacency.data + del transposed_adjacency + + while increase: + increase = False + pass_increase: float = 0. 
+ + sample_args = (graph.n_samples, graph.sample_weights, sample_labels, sample_clusters_proba, + sample_indptr, sample_indices, sample_data) + feature_args = (graph.n_features, graph.feature_weights, feature_labels, feature_clusters_proba, + feature_indptr, feature_indices, feature_data) + + for source, target in [(sample_args, feature_args), (feature_args, sample_args)]: + n_nodes, source_weights, source_labels, source_cluster_proba, indptr, indices, data = source + target_labels, target_cluster_proba = target[2], target[3] + + for node in range(n_nodes): + node_cluster: int = source_labels[node] + target_weights: np.ndarray = data[indptr[node]:indptr[node + 1]] + neighbors: np.ndarray = indices[indptr[node]:indptr[node + 1]] + neighbors_clusters: np.ndarray = target_labels[neighbors] + unique_clusters: list = list(set(neighbors_clusters.tolist()) - {node_cluster}) + n_clusters: int = len(unique_clusters) + + if n_clusters > 0: + node_proba: float = source_weights[node] + node_ratio: float = self.resolution * node_proba + + # total weight of edges to neighbors in the same cluster + out_delta: float = target_weights[neighbors_clusters == node_cluster].sum() + # minus the probability to choose a neighbor in the same cluster (with resolution factor) + out_delta -= node_ratio * target_cluster_proba[node_cluster] + + local_delta: np.ndarray = np.full(n_clusters, -out_delta) + + for index_cluster, cluster in enumerate(unique_clusters): + # total weight of edges to neighbors in the candidate cluster + in_delta: float = target_weights[neighbors_clusters == cluster].sum() + # minus the probability to choose a neighbor in the candidate cluster + in_delta -= node_ratio * target_cluster_proba[cluster] + + local_delta[index_cluster] += in_delta + + delta_argmax: int = local_delta.argmax() + best_delta: float = local_delta[delta_argmax] + if best_delta > 0: + pass_increase += best_delta + best_cluster = unique_clusters[delta_argmax] + + source_cluster_proba[node_cluster] -= node_proba + source_cluster_proba[best_cluster] += node_proba + source_labels[node] = best_cluster + + total_increase += pass_increase + if pass_increase > self.tol: + increase = True + + self.score_ = total_increase + _, self.sample_labels_ = np.unique(sample_labels, return_inverse=True) + _, self.feature_labels_ = np.unique(feature_labels, return_inverse=True) + + return self + + +@njit +def fit_core(sample_args, feature_args, resolution, tol): + n_samples, n_features = sample_args[0], feature_args[0] + sample_weights, feature_weights = sample_args[1], feature_args[1] + + increase = True + total_increase = 0. + + sample_labels: np.ndarray = np.arange(n_samples) + feature_labels: np.ndarray = n_samples + np.arange(n_features) + + sample_clusters_proba: np.ndarray = np.hstack((sample_weights.copy(), np.zeros(n_features))) + feature_clusters_proba: np.ndarray = np.hstack((np.zeros(n_samples), feature_weights.copy())) + + while increase: + increase = False + pass_increase = 0. 
+ + all_sample_args = (sample_args, sample_labels, sample_clusters_proba) + all_feature_args = (feature_args, feature_labels, feature_clusters_proba) + + for source, target in [(all_sample_args, all_feature_args), (all_feature_args, all_sample_args)]: + source_args, source_labels, source_clusters_proba = source + target_args, target_labels, target_clusters_proba = target + + n_nodes, source_weights, indptr, indices, data = source_args + for node in prange(n_nodes): + node_cluster: int = source_labels[node] + unique_clusters = set(target_labels[indices[indptr[node]:indptr[node + 1]]]) + unique_clusters.discard(node_cluster) + n_clusters = len(unique_clusters) + + if n_clusters > 0: + node_proba: float = source_weights[node] + out_delta: float = resolution * node_proba * target_clusters_proba[node_cluster] + + unique_clusters_list = list(unique_clusters) + neighbor_cluster_weights = np.full(n_clusters, 0.0) + + for ix in range(indptr[node], indptr[node + 1]): + neighbor_cluster = target_labels[indices[ix]] + if neighbor_cluster == node_cluster: + out_delta -= data[ix] + else: + neighbor_cluster_ix = unique_clusters_list.index(neighbor_cluster) + neighbor_cluster_weights[neighbor_cluster_ix] += data[ix] + + best_delta = 0.0 + best_cluster = node_cluster + + for ix, cluster in enumerate(unique_clusters): + in_delta: float = neighbor_cluster_weights[ix] + in_delta -= resolution * node_proba * target_clusters_proba[cluster] + + local_delta = out_delta + in_delta + if local_delta > best_delta: + best_delta = local_delta + best_cluster = cluster + + if best_delta > 0: + pass_increase += best_delta + source_clusters_proba[node_cluster] -= node_proba + source_clusters_proba[best_cluster] += node_proba + source_labels[node] = best_cluster + + total_increase += pass_increase + if pass_increase > tol: + increase = True + + return sample_labels, feature_labels, total_increase + + +class GreedyBipartiteNumba(BiOptimizer): + """ + A greedy modularity optimizer using Numba for enhanced performance. + + Attributes + ---------- + resolution: + bimodularity resolution + tol: + minimum bimodularity increase to enter a new optimization pass + + Notes + ----- + Tested with Numba v0.42.0. + """ + + def __init__(self, resolution: float = 1., tol: float = 1e-3): + BiOptimizer.__init__(self) + self.resolution = resolution + self.tol = tol + + def fit(self, graph: BipartiteGraph): + """Iterates over the nodes of the graph and moves them to the cluster of highest increase among their neighbors. + + Parameters + ---------- + graph: + the graph to cluster + + Returns + ------- + self: :class:`BiOptimizer` + + """ + transposed_adjacency = graph.norm_adjacency.T.tocsr() + sample_args = (graph.n_samples, graph.sample_weights, + graph.norm_adjacency.indptr, graph.norm_adjacency.indices, graph.norm_adjacency.data) + feature_args = (graph.n_features, graph.feature_weights, + transposed_adjacency.indptr, transposed_adjacency.indices, transposed_adjacency.data) + + sample_labels, feature_labels, total_increase = fit_core(sample_args, feature_args, + self.resolution, self.tol) + + self.score_ = total_increase + _, self.sample_labels_ = np.unique(sample_labels, return_inverse=True) + _, self.feature_labels_ = np.unique(feature_labels, return_inverse=True) + + return self + + +class BiLouvain: + """ + BiLouvain algorithm for graph clustering in Python (default) and Numba. + + Attributes + ---------- + sample_labels_: np.ndarray + Cluster index of each node in V1. + feature_labels_: np.ndarray + Cluster index of each node in V2. 
+ iteration_count_: int + Total number of aggregations performed. + aggregate_graph_: sparse.csr_matrix + Aggregated graph at the end of the algorithm. + score_: float + objective function value after fit + n_clusters_: int + number of clusters after fit + """ + + def __init__(self, algorithm: Union[str, BiOptimizer] = default, resolution: float = 1, tol: float = 1e-3, + agg_tol: float = 1e-3, max_agg_iter: int = -1, verbose: bool = False): + """ + Parameters + ---------- + algorithm: + The optimization algorithm. + Requires a fit method. + Requires score_, sample_labels_, and labels_ attributes. + resolution: + Resolution parameter. + tol: + Minimum increase in the objective function to enter a new optimization pass. + agg_tol: + Minimum increase in the objective function to enter a new aggregation pass. + max_agg_iter: + Maximum number of aggregations. + A negative value is interpreted as no limit. + verbose: + Verbose mode. + """ + if type(algorithm) == str: + if algorithm == "numba": + self.algorithm = GreedyBipartiteNumba(resolution, tol) + elif algorithm == "python": + self.algorithm = GreedyBipartite(resolution, tol) + else: + raise ValueError('Unknown algorithm name.') + elif isinstance(algorithm, BiOptimizer): + self.algorithm = algorithm + else: + raise TypeError('Algorithm must be a string ("numba" or "python") or a valid algorithm.') + + if type(max_agg_iter) != int: + raise TypeError('The maximum number of iterations must be an integer.') + self.agg_tol = agg_tol + self.max_agg_iter = max_agg_iter + self.verbose = verbose + self.sample_labels_ = None + self.feature_labels_ = None + self.iteration_count_ = None + self.aggregate_graph_ = None + self.score_ = None + self.n_clusters_ = None + + def fit(self, biadjacency: sparse.csr_matrix): + """Alternates local optimization and aggregation until convergence. + + Parameters + ---------- + biadjacency: + adjacency matrix of the graph to cluster, treated as a biadjacency matrix + + Returns + ------- + self: :class:`BiLouvain` + """ + if type(biadjacency) != sparse.csr_matrix: + raise TypeError('The adjacency matrix must be in a scipy compressed sparse row (csr) format.') + graph = BipartiteGraph(biadjacency) + sample_membership = sparse.identity(graph.n_samples, format='csr') + feature_membership = sparse.identity(graph.n_features, format='csr') + increase: bool = True + iteration_count: int = 0 + if self.verbose: + print("Starting with", biadjacency.shape, "nodes") + + self.score_ = 0. 
+ while increase: + iteration_count += 1 + self.algorithm.fit(graph) + if self.algorithm.score_ - self.score_ <= self.agg_tol: + increase = False + else: + self.score_ = self.algorithm.score_ + + sample_row = np.arange(graph.n_samples) + sample_col = self.algorithm.sample_labels_ + sample_data = np.ones(graph.n_samples) + sample_agg_membership = sparse.csr_matrix((sample_data, (sample_row, sample_col))) + sample_membership = sample_membership.dot(sample_agg_membership) + + feature_row = np.arange(graph.n_features) + feature_col = self.algorithm.feature_labels_ + feature_data = np.ones(graph.n_features) + feature_agg_membership = sparse.csr_matrix((feature_data, (feature_row, feature_col))) + feature_membership = feature_membership.dot(feature_agg_membership) + + graph.aggregate(sample_agg_membership, feature_agg_membership) + + if self.verbose: + print("Iteration", iteration_count, "completed with", + graph.n_samples, graph.n_features, "clusters") + print(self.algorithm.score_) + if iteration_count == self.max_agg_iter: + break + + self.iteration_count_ = iteration_count + self.sample_labels_ = sample_membership.indices + _, self.sample_labels_ = np.unique(self.sample_labels_, return_inverse=True) + self.feature_labels_ = feature_membership.indices + _, self.feature_labels_ = np.unique(self.feature_labels_, return_inverse=True) + self.n_clusters_ = max(max(self.sample_labels_), max(self.feature_labels_)) + 1 + self.aggregate_graph_ = graph.norm_adjacency * biadjacency.data.sum() + return self diff --git a/sknetwork/clustering/louvain.py b/sknetwork/clustering/louvain.py index 6d9e59ca9..a7132e32a 100644 --- a/sknetwork/clustering/louvain.py +++ b/sknetwork/clustering/louvain.py @@ -4,122 +4,142 @@ Created on Nov 2, 2018 @author: Nathan de Lara @author: Quentin Lutz +@author: Thomas Bonald """ try: from numba import njit + default = 'numba' except ImportError: def njit(func): return func + default = 'python' import numpy as np from scipy import sparse +from typing import Union class NormalizedGraph: """ - A class of graphs suitable for the Louvain algorithm. + A class of graphs suitable for the Louvain algorithm. Each node represents a cluster. Attributes ---------- n_nodes: int - the number of nodes in the graph - norm_adj: normalized adjacency matrix (summing to 1) - node_weights: vector of node weights + Number of nodes. + norm_adjacency: sparse.csr_matrix + Normalized adjacency matrix (sums to 1). + node_probs : np.ndarray + Distribution of node weights (sums to 1). """ - def __init__(self, adj_matrix, node_weights='degree'): + def __init__(self, adjacency: sparse.csr_matrix, node_probs: np.ndarray): """ - Parameters ---------- - adj_matrix: adjacency matrix of the graph as SciPy sparse matrix - node_weights: node weights to be used in the second term of the modularity + adjacency : + Adjacency matrix of the graph. + node_probs : + Distribution of node weights (sums to 1), used in the second term of modularity. 
""" - self.n_nodes = adj_matrix.shape[0] - self.norm_adj = adj_matrix / adj_matrix.data.sum() - if type(node_weights) == np.ndarray: - if len(node_weights) != self.n_nodes: - raise ValueError('The number of node weights must match the number of nodes.') - if any(node_weights < np.zeros(self.n_nodes)): - raise ValueError('All node weights must be non-negative.') - else: - self.node_weights = node_weights - elif type(node_weights) == str: - if node_weights == 'degree': - self.node_weights = self.norm_adj.dot(np.ones(self.n_nodes)) - elif node_weights == 'uniform': - self.node_weights = np.ones(self.n_nodes) / self.n_nodes - else: - raise ValueError('Unknown distribution type.') - else: - raise TypeError( - 'Node weights must be a known distribution ("degree" or "uniform" string) or a custom NumPy array.') + self.n_nodes = adjacency.shape[0] + self.norm_adjacency = adjacency / adjacency.data.sum() + self.node_probs = node_probs.copy() + + def aggregate(self, membership: sparse.csr_matrix): + """Aggregates nodes belonging to the same cluster. - def aggregate(self, membership): - """ - Aggregates nodes belonging to the same clusters. Parameters ---------- - membership: scipy sparse matrix of shape n_nodes x n_clusters + membership: + Entry i,j equals to 1 if node i belongs to cluster j (0 otherwise). Returns ------- - the aggregated graph + The aggregated graph """ - self.norm_adj = membership.T.dot(self.norm_adj.dot(membership)).tocsr() - self.node_weights = np.array(membership.T.dot(self.node_weights).T) - self.n_nodes = self.norm_adj.shape[0] + self.norm_adjacency = membership.T.dot(self.norm_adjacency.dot(membership)).tocsr() + self.node_probs = np.array(membership.T.dot(self.node_probs).T) + self.n_nodes = self.norm_adjacency.shape[0] return self -class GreedyModularity: - """ - A greedy modularity optimizer. +class Optimizer: + """A generic optimization algorithm. Attributes ---------- - score_: total increase of modularity after fitting - labels_: partition of the nodes. labels[node] = cluster_index + score_: float + Total increase of the objective function. + labels_: np.ndarray + Cluster index of each node. + """ + def __init__(self): + self.score_ = None + self.labels_ = None + + def fit(self, graph: NormalizedGraph): + """Fit the clusters to the objective function. + + Parameters + ---------- + graph: + Graph to cluster. + + Returns + ------- + self: :class:Ě€Optimizer` + + """ + return self + + +class GreedyModularity(Optimizer): + """ + A greedy modularity optimizer. """ - def __init__(self, resolution=1., tol=0., shuffle_nodes=False): + def __init__(self, resolution: float = 1, tol: float = 1e-3, shuffle_nodes: bool = False): """ Parameters ---------- - resolution: modularity resolution - tol: minimum modularity increase to enter a new optimization pass - shuffle_nodes: whether to shuffle the nodes before beginning an optimization pass + resolution: + Resolution parameter (positive) + tol: + Minimum increase in modularity to enter a new optimization pass. + shuffle_nodes: + If true, shuffle the nodes before starting a new optimization pass. """ + Optimizer.__init__(self) self.resolution = resolution self.tol = tol self.shuffle_nodes = shuffle_nodes - self.score_ = None - self.labels_ = None def fit(self, graph: NormalizedGraph): - """ - Iterates over the nodes of the graph and moves them to the cluster of highest increase among their neighbors. - Parameters - ---------- - graph: the graph to cluster + """Iterate over the nodes to increase modularity. 
- Returns - ------- - self + Parameters + ---------- + graph: + Graph to cluster. - """ + Returns + ------- + self: :class:`Optimizer` + """ increase = True - total_increase = 0. + total_increase: float = 0 labels: np.ndarray = np.arange(graph.n_nodes) - clusters_proba: np.ndarray = graph.node_weights.copy() - self_loops: np.ndarray = graph.norm_adj.diagonal() + node_probs: np.ndarray = graph.node_probs + cluster_probs = node_probs.copy() + self_loops: np.ndarray = graph.norm_adjacency.diagonal() while increase: increase = False - pass_increase = 0. + pass_increase: float = 0 if self.shuffle_nodes: nodes = np.random.permutation(np.arange(graph.n_nodes)) @@ -128,40 +148,41 @@ def fit(self, graph: NormalizedGraph): for node in nodes: node_cluster: int = labels[node] - node_weights: np.ndarray = graph.norm_adj.data[ - graph.norm_adj.indptr[node]:graph.norm_adj.indptr[node + 1]] - neighbors: np.ndarray = graph.norm_adj.indices[ - graph.norm_adj.indptr[node]:graph.norm_adj.indptr[node + 1]] - neighbors_clusters: np.ndarray = labels[neighbors] - unique_clusters: list = list(set(neighbors_clusters.tolist()) - {node_cluster}) - n_clusters: int = len(unique_clusters) + neighbor_weights: np.ndarray = graph.norm_adjacency.data[ + graph.norm_adjacency.indptr[node]:graph.norm_adjacency.indptr[node + 1]] + neighbors: np.ndarray = graph.norm_adjacency.indices[ + graph.norm_adjacency.indptr[node]:graph.norm_adjacency.indptr[node + 1]] + neighbor_clusters: np.ndarray = labels[neighbors] + unique_clusters: list = list(set(neighbor_clusters.tolist()) - {node_cluster}) + n_clusters = len(unique_clusters) - if n_clusters > 0: - node_proba: float = graph.node_weights[node] - node_ratio: float = self.resolution * node_proba + if n_clusters: + node_prob: float = node_probs[node] + node_prob_res: float = self.resolution * node_prob - # node_weights of connections to all other nodes in original cluster - out_delta: float = (self_loops[node] - node_weights.dot(neighbors_clusters == node_cluster)) - # proba to choose (node, other_neighbor) among original cluster - out_delta += node_ratio * (clusters_proba[node_cluster] - node_proba) + # total weight of edges to neighbors in the same cluster + out_delta: float = neighbor_weights[neighbor_clusters == node_cluster].sum() - self_loops[node] + # minus the probability to choose a neighbor in the same cluster (with resolution factor) + out_delta -= node_prob_res * (cluster_probs[node_cluster] - node_prob) - local_delta: np.ndarray = np.full(n_clusters, out_delta) + local_delta: np.ndarray = np.full(n_clusters, -out_delta) for index_cluster, cluster in enumerate(unique_clusters): - # node_weights of connections to all other nodes in candidate cluster - in_delta: float = node_weights.dot(neighbors_clusters == cluster) - # proba to choose (node, other_neighbor) among new cluster - in_delta -= node_ratio * clusters_proba[cluster] + # total weight of edges to neighbors in the candidate cluster + in_delta: float = neighbor_weights[neighbor_clusters == cluster].sum() + # minus the probability to choose a neighbor in the candidate cluster (with resolution factor) + in_delta -= node_prob_res * cluster_probs[cluster] local_delta[index_cluster] += in_delta - best_delta: float = 2 * max(local_delta) + best_ix: int = local_delta.argmax() + best_delta: float = 2 * local_delta[best_ix] if best_delta > 0: pass_increase += best_delta - best_cluster = unique_clusters[local_delta.argmax()] + best_cluster = unique_clusters[best_ix] - clusters_proba[node_cluster] -= node_proba - 
clusters_proba[best_cluster] += node_proba + cluster_probs[node_cluster] -= node_prob + cluster_probs[best_cluster] += node_prob labels[node] = best_cluster total_increase += pass_increase @@ -175,38 +196,48 @@ def fit(self, graph: NormalizedGraph): @njit -def fit_core(shuffle_nodes, n_nodes, node_weights, resolution, self_loops, tol, indptr, indices, weights): +def fit_core(resolution: float, tol: float, shuffle_nodes: bool, n_nodes: int, node_probs: np.ndarray, + self_loops: np.ndarray, data: np.ndarray, indices: np.ndarray, indptr: np.ndarray) -> (np.ndarray, float): """ Parameters ---------- - shuffle_nodes: if True, a random permutation of the node is done. The natural order is used otherwise - n_nodes: number of nodes in the graph - node_weights: the node weights in the graph - resolution: the resolution for the Louvain modularity - self_loops: the weights of the self loops for each node - tol: the minimum desired increase for each maximization pass - indptr: the indptr array from the Scipy CSR adjacency matrix - indices: the indices array from the Scipy CSR adjacency matrix - weights: the data array from the Scipy CSR adjacency matrix + resolution: + Resolution parameter (positive). + tol: + Minimum increase in modularity to enter a new optimization pass. + shuffle_nodes: + If True, shuffle the nodes before starting a new optimization pass. + n_nodes: + Number of nodes. + node_probs: + Distribution of node weights (sums to 1). + self_loops: + Weights of self loops. + data: + CSR format data array of the normalized adjacency matrix. + indices: + CSR format index array of the normalized adjacency matrix. + indptr: + CSR format index pointer array of the normalized adjacency matrix. Returns ------- - a tuple consisting of: - -the labels found by the algorithm - -the score of the algorithm (total modularity increase) + labels: + Cluster index of each node + total_increase: + Score of the clustering (total increase in modularity) """ - increase = True - total_increase = 0 + increase: bool = True + total_increase: float = 0 labels: np.ndarray = np.arange(n_nodes) - clusters_proba: np.ndarray = node_weights.copy() + cluster_probs: np.ndarray = node_probs.copy() - local_cluster_weights = np.full(n_nodes, 0.0) nodes = np.arange(n_nodes) while increase: increase = False - pass_increase = 0. 
+ pass_increase: float = 0 if shuffle_nodes: nodes = np.random.permutation(np.arange(n_nodes)) @@ -214,43 +245,41 @@ def fit_core(shuffle_nodes, n_nodes, node_weights, resolution, self_loops, tol, for node in nodes: node_cluster = labels[node] - for k in range(indptr[node], indptr[node + 1]): - local_cluster_weights[labels[indices[k]]] += weights[k] + neighbor_cluster_weights = np.zeros(n_nodes) + for i in range(indptr[node], indptr[node + 1]): + neighbor_cluster_weights[labels[indices[i]]] += data[i] unique_clusters = set(labels[indices[indptr[node]:indptr[node + 1]]]) unique_clusters.discard(node_cluster) if len(unique_clusters): - node_proba = node_weights[node] - node_ratio = resolution * node_proba + node_prob = node_probs[node] + node_prob_res = resolution * node_prob - # neighbors_weights of connections to all other nodes in original cluster - out_delta = self_loops[node] - local_cluster_weights[node_cluster] + # total weight of edges to neighbors in the same cluster + out_delta: float = neighbor_cluster_weights[node_cluster] - self_loops[node] + # minus the probability to choose a neighbor in the same cluster (with resolution factor) + out_delta -= node_prob_res * (cluster_probs[node_cluster] - node_prob) - # proba to choose (node, other_neighbor) among original cluster - out_delta += node_ratio * (clusters_proba[node_cluster] - node_proba) - - best_delta = 0.0 + best_delta: float = 0 best_cluster = node_cluster for cluster in unique_clusters: - # neighbors_weights of connections to all other nodes in candidate cluster - in_delta = local_cluster_weights[ - cluster] # np.sum(neighbors_weights[neighbors_clusters == cluster]) - local_cluster_weights[cluster] = 0.0 - # proba to choose (node, other_neighbor) among new cluster - in_delta -= node_ratio * clusters_proba[cluster] - local_delta = 2 * (out_delta + in_delta) + # total weight of edges to neighbors in the candidate cluster + in_delta: float = neighbor_cluster_weights[cluster] + # minus the probability to choose a neighbor in the candidate cluster (with resolution factor) + in_delta -= node_prob_res * cluster_probs[cluster] + + local_delta = 2 * (in_delta - out_delta) if local_delta > best_delta: best_delta = local_delta best_cluster = cluster if best_delta > 0: pass_increase += best_delta - clusters_proba[node_cluster] -= node_proba - clusters_proba[best_cluster] += node_proba + cluster_probs[node_cluster] -= node_prob + cluster_probs[best_cluster] += node_prob labels[node] = best_cluster - local_cluster_weights[node_cluster] = 0.0 total_increase += pass_increase if pass_increase > tol: @@ -258,75 +287,71 @@ def fit_core(shuffle_nodes, n_nodes, node_weights, resolution, self_loops, tol, return labels, total_increase -class GreedyModularityJiT: - """ - A greedy modularity optimizer using Numba for enhanced performance. - - Tested with Numba v0.40.1. +class GreedyModularityNumba(Optimizer): + """A greedy modularity optimizer using Numba for enhanced performance. Attributes ---------- - score_: total increase of modularity after fitting - labels_: partition of the nodes. labels[node] = cluster_index + labels_: + Cluster index of each node + score_: + Score of the clustering (total increase in modularity) + + Notes + ----- + Tested with Numba v0.40.1. 
""" - def __init__(self, resolution=1., tol=0., shuffle_nodes=False): + def __init__(self, resolution: float = 1, tol: float = 1e-3, shuffle_nodes: bool = False): """ Parameters ---------- - resolution: modularity resolution - tol: minimum modularity increase to enter a new optimization pass - shuffle_nodes: whether to shuffle the nodes before beginning an optimization pass + resolution: + Modularity resolution. + tol: + Minimum increase in modularity to enter a new optimization pass. + shuffle_nodes: + If True, shuffle the nodes before each optimization pass. """ + Optimizer.__init__(self) self.resolution = resolution self.tol = tol self.shuffle_nodes = shuffle_nodes - self.score_ = None - self.labels_ = None def fit(self, graph: NormalizedGraph): - """ - Iterates over the nodes of the graph and moves them to the cluster of highest increase among their neighbors. - Parameters - ---------- - graph: the graph to cluster + """Iterate over the nodes to increase modularity. - Returns - ------- - self + Parameters + ---------- + graph: + Graph to cluster. - """ - self_loops: np.ndarray = graph.norm_adj.diagonal() - - res_labels, total_increase = fit_core(self.shuffle_nodes, - graph.n_nodes, - graph.node_weights, - self.resolution, - self_loops, - self.tol, - graph.norm_adj.indptr, - graph.norm_adj.indices, - graph.norm_adj.data) + Returns + ------- + self: :class:`Optimizer` + """ + labels, total_increase = fit_core(self.resolution, self.tol, self.shuffle_nodes, graph.n_nodes, + graph.node_probs, graph.norm_adjacency.diagonal(), graph.norm_adjacency.data, + graph.norm_adjacency.indices, graph.norm_adjacency.indptr) self.score_ = total_increase - _, self.labels_ = np.unique(res_labels, return_inverse=True) + _, self.labels_ = np.unique(labels, return_inverse=True) return self class Louvain: - """ - Macro algorithm for Louvain clustering. - - Several versions of the Greedy Modularity Maximization are available. - Those include a pure Python version which is used by default. - A Numba version named 'GreedyModularityJiT' is also available. + """Louvain algorithm for graph clustering in Python (default) and Numba. Attributes ---------- - labels_: partition of the nodes. labels[node] = cluster_index - iteration_count_: number of aggregations performed during the last run of the "fit" method + labels_: np.ndarray + Cluster index of each node. + iteration_count_: int + Total number of aggregations performed. + aggregate_graph_: sparse.csr_matrix + Aggregated graph at the end of the algorithm. Example ------- @@ -334,8 +359,8 @@ class Louvain: >>> graph = sparse.identity(3, format='csr') >>> (louvain.fit(graph).labels_ == np.array([0, 1, 2])).all() True - >>> louvain_jit = Louvain(algorithm=GreedyModularityJiT()) - >>> (louvain_jit.fit(graph).labels_ == np.array([0, 1, 2])).all() + >>> louvain_numba = Louvain(algorithm=GreedyModularityNumba()) + >>> (louvain_numba.fit(graph).labels_ == np.array([0, 1, 2])).all() True References @@ -345,54 +370,105 @@ class Louvain: Journal of statistical mechanics: theory and experiment, 2008 """ - def __init__(self, algorithm=GreedyModularity(), tol=0., max_agg_iter: int = -1, verbose=0): + def __init__(self, algorithm: Union[str, Optimizer] = default, resolution: float = 1, tol: float = 1e-3, + shuffle_nodes: bool = False, agg_tol: float = 1e-3, max_agg_iter: int = -1, verbose: bool = False): """ Parameters ---------- - algorithm: the fixed level optimization algorithm, requires a fit method and score_ and labels_ attributes. 
- tol: the minimum modularity increase to keep aggregating. - max_agg_iter: the maximum number of aggregations to perform, a negative value is interpreted as no limit - verbose: enables verbosity + algorithm: + The optimization algorithm. + Requires a fit method. + Requires score_ and labels_ attributes. + resolution: + Resolution parameter. + tol: + Minimum increase in the objective function to enter a new optimization pass. + shuffle_nodes: + If True, shuffle the nodes before each optimization pass. + agg_tol: + Minimum increase in the objective function to enter a new aggregation pass. + max_agg_iter: + Maximum number of aggregations. + A negative value is interpreted as no limit. + verbose: + Verbose mode. """ - self.algorithm = algorithm - self.tol = tol + if type(algorithm) == str: + if algorithm == "numba": + self.algorithm = GreedyModularityNumba(resolution, tol, shuffle_nodes) + elif algorithm == "python": + self.algorithm = GreedyModularity(resolution, tol, shuffle_nodes) + else: + raise ValueError('Unknown algorithm name.') + elif isinstance(algorithm, Optimizer): + self.algorithm = algorithm + else: + raise TypeError('Algorithm must be a string ("numba" or "python") or a valid algorithm.') + if type(max_agg_iter) != int: - raise TypeError('The maximum number of iterations must be a integer') + raise TypeError('The maximum number of iterations must be an integer.') + self.agg_tol = agg_tol self.max_agg_iter = max_agg_iter self.verbose = verbose self.labels_ = None self.iteration_count_ = None + self.aggregate_graph_ = None + + def fit(self, adjacency: sparse.csr_matrix, node_weights: Union[str, np.ndarray] = 'degree') -> 'Louvain': + """Clustering using chosen Optimizer. - def fit(self, adj_matrix: sparse.csr_matrix, node_weights="degree"): - """ - Alternates local optimization and aggregation until convergence. Parameters ---------- - adj_matrix: adjacency matrix of the graph to cluster - node_weights: node node_weights distribution to be used in the second term of the modularity + adjacency : + Adjacency matrix of the graph to cluster. + node_weights : + Node weights used in the second term of modularity. Returns ------- - self + self: :class:`Louvain` """ - if type(adj_matrix) != sparse.csr_matrix: + if type(adjacency) != sparse.csr_matrix: raise TypeError('The adjacency matrix must be in a scipy compressed sparse row (csr) format.') - # check that the graph is not directed - if adj_matrix.shape[0] != adj_matrix.shape[1]: + if adjacency.shape[0] != adjacency.shape[1]: raise ValueError('The adjacency matrix must be square.') - if (adj_matrix != adj_matrix.T).nnz != 0: - raise ValueError('The graph must not be directed. Please fit a symmetric adjacency matrix.') - graph = NormalizedGraph(adj_matrix, node_weights) + if (adjacency != adjacency.T).nnz != 0: + raise ValueError('The graph cannot be directed. 
Please fit a symmetric adjacency matrix.') + + n_nodes = adjacency.shape[0] + + if type(node_weights) == np.ndarray: + if len(node_weights) != n_nodes: + raise ValueError('The number of node weights must match the number of nodes.') + else: + node_weights_vec = node_weights + elif type(node_weights) == str: + if node_weights == 'degree': + node_weights_vec = adjacency.dot(np.ones(n_nodes)) + elif node_weights == 'uniform': + node_weights_vec = np.ones(n_nodes) + else: + raise ValueError('Unknown distribution of node weights.') + else: + raise TypeError( + 'Node weights must be a known distribution ("degree" or "uniform" string) or a custom NumPy array.') + + if np.any(node_weights_vec <= 0): + raise ValueError('All node weights must be positive.') + else: + node_weights_vec = node_weights_vec / np.sum(node_weights_vec) + + graph = NormalizedGraph(adjacency, node_weights_vec) membership = sparse.identity(graph.n_nodes, format='csr') increase = True iteration_count = 0 if self.verbose: - print("Starting with", graph.n_nodes, "nodes") + print("Starting with", graph.n_nodes, "nodes.") while increase: iteration_count += 1 self.algorithm.fit(graph) - if self.algorithm.score_ <= self.tol: + if self.algorithm.score_ <= self.agg_tol: increase = False else: row = np.arange(graph.n_nodes) @@ -412,4 +488,5 @@ def fit(self, adj_matrix: sparse.csr_matrix, node_weights="degree"): self.iteration_count_ = iteration_count self.labels_ = membership.indices _, self.labels_ = np.unique(self.labels_, return_inverse=True) + self.aggregate_graph_ = graph.norm_adjacency * adjacency.data.sum() return self diff --git a/sknetwork/clustering/metrics.py b/sknetwork/clustering/metrics.py index 620c25893..50e8b3521 100644 --- a/sknetwork/clustering/metrics.py +++ b/sknetwork/clustering/metrics.py @@ -8,32 +8,36 @@ import numpy as np from itertools import combinations -from scipy import errstate, sqrt, isinf, sparse +from scipy import sparse +from typing import Union -def modularity(adjacency_matrix, partition, resolution=1.0) -> float: +def modularity(adjacency: Union[sparse.csr_matrix, np.ndarray], partition: Union[dict, np.ndarray], + resolution: float = 1) -> float: """ Compute the modularity of a node partition. + + :math:`Q = \\sum_{ij}(\\dfrac{A_{ij}}{w} - \\gamma \\dfrac{d_id_j}{w^2})\\delta_{ij}` + Parameters ---------- - partition: dict or np.ndarray - The partition of the nodes. - The keys of the dictionary correspond to the nodes and the values to the communities. - adjacency_matrix: scipy.csr_matrix or np.ndarray + partition : dict or np.ndarray + The partition of the nodes. The keys of the dictionary correspond to the nodes and the values to the communities. + adjacency : scipy.csr_matrix or np.ndarray The adjacency matrix of the graph (sparse or dense). - resolution: double, optional - The resolution parameter in the modularity function (default=1.). + resolution : float, optional (default=1.) + The resolution parameter in the modularity function. Returns ------- modularity : float - The modularity. + The modularity. 
""" - if type(adjacency_matrix) == sparse.csr_matrix: - adj_matrix = adjacency_matrix - elif sparse.isspmatrix(adjacency_matrix) or type(adjacency_matrix) == np.ndarray: - adj_matrix = sparse.csr_matrix(adjacency_matrix) + if type(adjacency) == sparse.csr_matrix: + adj_matrix = adjacency + elif sparse.isspmatrix(adjacency) or type(adjacency) == np.ndarray: + adj_matrix = sparse.csr_matrix(adjacency) else: raise TypeError( "The argument must be a NumPy array or a SciPy Sparse matrix.") @@ -54,84 +58,131 @@ def modularity(adjacency_matrix, partition, resolution=1.0) -> float: data = np.ones(n_nodes) membership = sparse.csr_matrix((data, (row, col))) - pos = ((membership.multiply(norm_adj.dot(membership))).dot(np.ones(membership.shape[1]))).sum() - neg = np.linalg.norm(membership.T.dot(probs)) ** 2 - return float(pos - resolution * neg) + fit = ((membership.multiply(norm_adj.dot(membership))).dot(np.ones(membership.shape[1]))).sum() + diversity = np.linalg.norm(membership.T.dot(probs)) ** 2 + return float(fit - resolution * diversity) + +def bimodularity(biadjacency: sparse.csr_matrix, sample_labels: np.ndarray, feature_labels: np.ndarray, + resolution: float = 1) -> float: + """ + Modularity for bipartite graphs. + + :math:`Q = \\sum_{ij}(\\dfrac{B_{ij}}{w} - \\gamma \\dfrac{d_if_j}{w^2})\\delta_{ij}` + + Parameters + ---------- + biadjacency: + matrix of shape n1 x n2 + sample_labels: + partition of the samples, sample_labels[node] = cluster_index + feature_labels: + partition of the features, feature_labels[node] = cluster_index + resolution: + scaling for the second term of the bimodularity -def cocitation_modularity(adjacency_matrix, partition, resolution=1.0) -> float: + Returns + ------- + bimodularity: float + The bimodularity + """ + n_samples, n_features = biadjacency.shape + one_samples, one_features = np.ones(n_samples), np.ones(n_features) + total_weight: float = biadjacency.data.sum() + sample_weights = biadjacency.dot(one_features) / total_weight + features_weights = biadjacency.T.dot(one_samples) / total_weight + + _, sample_labels = np.unique(sample_labels, return_inverse=True) + _, feature_labels = np.unique(feature_labels, return_inverse=True) + n_clusters = max(len(set(sample_labels)), len(set(feature_labels))) + + sample_membership = sparse.csr_matrix((one_samples, (np.arange(n_samples), sample_labels)), + shape=(n_samples, n_clusters)) + feature_membership = sparse.csc_matrix((one_features, (np.arange(n_features), feature_labels)), + shape=(n_features, n_clusters)) + + fit: float = sample_membership.multiply(biadjacency.dot(feature_membership)).data.sum() / total_weight + div: float = (sample_membership.T.dot(sample_weights)).dot(feature_membership.T.dot(features_weights)) + + return fit - resolution * div + + +def cocitation_modularity(adjacency, partition, resolution: float = 1) -> float: """ Compute the modularity of a node partition on the normalized cocitation graph associated to a network without explicit computation of the cocitation graph. + + :math:`Q = \\sum_{ij}(\\dfrac{(AF^{-1}A^T)_{ij}}{w} - \\gamma \\dfrac{d_id_j}{w^2})\\delta_{ij}` + Parameters ---------- partition: dict or np.ndarray - The partition of the nodes. - The keys of the dictionary correspond to the nodes and the values to the communities. - adjacency_matrix: scipy.csr_matrix or np.ndarray + The partition of the nodes. The keys of the dictionary correspond to the nodes and the values to the communities. 
+ adjacency: scipy.csr_matrix or np.ndarray The adjacency matrix of the graph (sparse or dense). - resolution: double, optional - The resolution parameter in the modularity function (default=1.). + resolution: float (default=1.) + The resolution parameter in the modularity function. Returns ------- - modularity : float - The modularity. + modularity: float + The modularity on the normalized cocitation graph. """ - if type(adjacency_matrix) == sparse.csr_matrix: - adj_matrix = adjacency_matrix - elif sparse.isspmatrix(adjacency_matrix) or type(adjacency_matrix) == np.ndarray: - adj_matrix = sparse.csr_matrix(adjacency_matrix) + if type(adjacency) == sparse.csr_matrix: + adj_matrix = adjacency + elif sparse.isspmatrix(adjacency) or type(adjacency) == np.ndarray: + adj_matrix = sparse.csr_matrix(adjacency) else: raise TypeError( "The argument must be a NumPy array or a SciPy Sparse matrix.") - n_nodes = adj_matrix.shape[0] - out_degree = adj_matrix.dot(np.ones(adj_matrix.shape[0])) - in_degree = adj_matrix.T.dot(np.ones(adj_matrix.shape[1])) - total_weight = out_degree.sum() + n_samples, n_features = adj_matrix.shape + total_weight = adj_matrix.data.sum() + pou = adj_matrix.dot(np.ones(n_samples)) / total_weight + din = adj_matrix.T.dot(np.ones(n_features)) + + # pseudo inverse square-root in-degree matrix + dhin = sparse.diags(np.sqrt(din), shape=(n_features, n_features), format='csr') + dhin.data = 1 / dhin.data - with errstate(divide='ignore'): - in_degree_sqrt = 1.0 / sqrt(in_degree) - in_degree_sqrt[isinf(in_degree_sqrt)] = 0 - in_degree_sqrt = sparse.spdiags(in_degree_sqrt, [0], adj_matrix.shape[1], adj_matrix.shape[1], format='csr') - normalized_adjacency = (adj_matrix.dot(in_degree_sqrt)).T + normalized_adjacency = (adj_matrix.dot(dhin)).T.tocsr() if type(partition) == dict: - labels = np.array([partition[i] for i in range(n_nodes)]) + labels = np.array([partition[i] for i in range(n_samples)]) elif type(partition) == np.ndarray: labels = partition.copy() else: raise TypeError('The partition must be a dictionary or a NumPy array.') - clusters_indices = set(labels.tolist()) - mod = 0. - for cluster in clusters_indices: - indicator_vector = (labels == cluster) - mod += np.linalg.norm(normalized_adjacency.dot(indicator_vector)) ** 2 - mod -= (resolution / total_weight) * (np.dot(out_degree, indicator_vector)) ** 2 + membership = sparse.csc_matrix((np.ones(n_samples), (np.arange(n_samples), labels)), + shape=(n_samples, labels.max() + 1)) + fit = ((normalized_adjacency.dot(membership)).data ** 2).sum() / total_weight + diversity = np.linalg.norm(membership.T.dot(pou)) ** 2 - return float(mod / total_weight) + return float(fit - resolution * diversity) -def performance(adjacency_matrix: sparse.csr_matrix, labels: np.ndarray) -> float: - """ - The performance is the ratio of the total number of intra-cluster edges plus the total number of inter-cluster +def performance(adjacency: sparse.csr_matrix, labels: np.ndarray) -> float: + """The performance is the ratio of the total number of intra-cluster edges plus the total number of inter-cluster non-edges with the number of potential edges in the graph. + Parameters ---------- - adjacency_matrix: the adjacency matrix of the graph - labels: the cluster indices, labels[node] = index of the cluster of node. + adjacency: + the adjacency matrix of the graph + labels: + the cluster indices, labels[node] = index of the cluster of node. 
Returns ------- - the performance metric + : float + the performance metric """ - n_nodes, m_nodes = adjacency_matrix.shape + n_nodes, m_nodes = adjacency.shape if n_nodes != m_nodes: raise ValueError('The adjacency matrix is not square.') - bool_adj = abs(adjacency_matrix.sign()) + bool_adj = abs(adjacency.sign()) clusters = set(labels.tolist()) cluster_indicator = {cluster: (labels == cluster) for cluster in clusters} @@ -148,34 +199,35 @@ def performance(adjacency_matrix: sparse.csr_matrix, labels: np.ndarray) -> floa return perf / n_nodes**2 -def cocitation_performance(adjacency_matrix: sparse.csr_matrix, labels: np.ndarray) -> float: - """ - The performance of the clustering on the normalized cocitation graph associated to the provided adjacency +def cocitation_performance(adjacency: sparse.csr_matrix, labels: np.ndarray) -> float: + """The performance of the clustering on the normalized cocitation graph associated to the provided adjacency matrix without explicit computation of the graph. + Parameters ---------- - adjacency_matrix: the adjacency matrix of the graph - labels: the cluster indices, labels[node] = index of the cluster of node. + adjacency: + the adjacency matrix of the graph + labels: + the cluster indices, labels[node] = index of the cluster of node. Returns ------- - The performance metric. + : float + The performance metric on the normalized cocitation graph. """ - n_nodes, m_nodes = adjacency_matrix.shape - if n_nodes != m_nodes: + n_samples, n_features = adjacency.shape + if n_samples != n_features: raise ValueError('The adjacency matrix is not square.') - bool_adj = abs(adjacency_matrix.sign()) + bool_adj = abs(adjacency.sign()) clusters = set(labels.tolist()) cluster_indicator = {cluster: (labels == cluster) for cluster in clusters} cluster_sizes = {cluster: cluster_indicator[cluster].sum() for cluster in clusters} - din = bool_adj.T.dot(np.ones(m_nodes)) - with errstate(divide='ignore'): - din_sqrt = 1.0 / sqrt(din) - din_sqrt[isinf(din_sqrt)] = 0 + din = bool_adj.T.dot(np.ones(n_features)) # pseudo inverse square-root in-degree matrix - dhin = sparse.spdiags(din_sqrt, [0], m_nodes, m_nodes, format='csr') + dhin = sparse.diags(np.sqrt(din), shape=(n_features, n_features), format='csr') + dhin.data = 1 / dhin.data norm_backward_adj = (dhin.dot(bool_adj.T)).tocsr() @@ -188,4 +240,4 @@ def cocitation_performance(adjacency_matrix: sparse.csr_matrix, labels: np.ndarr perf -= 2 * cluster_indicator[cluster_i].dot( bool_adj.dot(norm_backward_adj.dot(cluster_indicator[cluster_j]))) - return perf / n_nodes**2 + return perf / n_samples**2 diff --git a/sknetwork/clustering/test/test_bilouvain.py b/sknetwork/clustering/test/test_bilouvain.py new file mode 100644 index 000000000..d31872cb1 --- /dev/null +++ b/sknetwork/clustering/test/test_bilouvain.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""tests for bilouvain.py""" + + +import unittest +from sknetwork.clustering import * +from sknetwork.toy_graphs import * + + +class TestBiLouvainClustering(unittest.TestCase): + + def setUp(self): + self.bilouvain = BiLouvain() + self.bilouvain_high_resolution = BiLouvain(resolution=2) + self.star_wars_graph = star_wars_villains_graph() + + def test_star_wars_graph(self): + labels = self.bilouvain.fit(self.star_wars_graph).sample_labels_ + self.assertEqual(labels.shape, (4,)) + labels = self.bilouvain_high_resolution.fit(self.star_wars_graph).sample_labels_ + self.assertEqual(labels.shape, (4,)) diff --git a/sknetwork/clustering/test/test_louvain.py 
b/sknetwork/clustering/test/test_louvain.py index 08b941fe3..4965bf261 100644 --- a/sknetwork/clustering/test/test_louvain.py +++ b/sknetwork/clustering/test/test_louvain.py @@ -1,31 +1,40 @@ +#!/usr/bin/env python3 # -*- coding: utf-8 -*- -# test for louvain.py -# -# Copyright 2018 Scikit-network Developers. -# Copyright 2018 Quentin Lutz -# -# This file is part of Scikit-network. +"""tests for louvain.py""" + import unittest -from sknetwork.clustering.louvain import Louvain +from sknetwork.clustering import * +from sknetwork.toy_graphs import * from scipy.sparse import identity class TestLouvainClustering(unittest.TestCase): def setUp(self): - self.louvain_basic = Louvain() + self.louvain = Louvain() + self.louvain_high_resolution = Louvain(resolution=2) + self.karate_club_graph = karate_club_graph() def test_unknown_types(self): with self.assertRaises(TypeError): - self.louvain_basic.fit(identity(1)) + self.louvain.fit(identity(1)) with self.assertRaises(TypeError): - self.louvain_basic.fit(identity(2, format='csr'), node_weights=1) + self.louvain.fit(identity(2, format='csr'), node_weights=1) def test_unknown_options(self): with self.assertRaises(ValueError): - self.louvain_basic.fit(identity(2, format='csr'), node_weights='unknown') + self.louvain.fit(identity(2, format='csr'), node_weights='unknown') def test_single_node_graph(self): - self.assertEqual(self.louvain_basic.fit(identity(1, format='csr')).labels_, [0]) + self.assertEqual(self.louvain.fit(identity(1, format='csr')).labels_, [0]) + + def test_karate_club_graph(self): + labels = self.louvain.fit(self.karate_club_graph).labels_ + self.assertEqual(labels.shape, (34,)) + self.assertAlmostEqual(modularity(self.karate_club_graph, labels), 0.42, 2) + labels = self.louvain_high_resolution.fit(self.karate_club_graph).labels_ + self.assertEqual(labels.shape, (34,)) + self.assertAlmostEqual(modularity(self.karate_club_graph, labels), 0.34, 2) + diff --git a/sknetwork/clustering/test/test_metrics.py b/sknetwork/clustering/test/test_metrics.py index 686fc5564..d1f643b47 100644 --- a/sknetwork/clustering/test/test_metrics.py +++ b/sknetwork/clustering/test/test_metrics.py @@ -1,15 +1,10 @@ # -*- coding: utf-8 -*- # test for metrics.py -# -# Copyright 2018 Scikit-network Developers. -# Copyright 2018 Nathan de Lara -# -# This file is part of Scikit-network. +"""tests for clustering metrics""" import unittest -import numpy as np -from scipy import sparse -from sknetwork.clustering.metrics import modularity, cocitation_modularity, performance, cocitation_performance +from sknetwork.clustering import * +from sknetwork.toy_graphs import * class TestClusteringMetrics(unittest.TestCase): @@ -19,6 +14,7 @@ def setUp(self): [1, 0, 0, 0], [1, 0, 0, 1], [1, 0, 1, 0]])) + self.star_wars_graph = star_wars_villains_graph() self.labels = np.array([0, 1, 0, 0]) self.unique_cluster = np.zeros(4, dtype=int) @@ -26,6 +22,9 @@ def test_modularity(self): self.assertAlmostEqual(modularity(self.graph, self.labels), -0.0312, 3) self.assertAlmostEqual(modularity(self.graph, self.unique_cluster), 0.) + def test_bimodularity(self): + self.assertAlmostEqual(bimodularity(self.graph, self.unique_cluster, self.unique_cluster), 0.) + def test_cocitation_modularity(self): self.assertAlmostEqual(cocitation_modularity(self.graph, self.labels), 0.0521, 3) self.assertAlmostEqual(cocitation_modularity(self.graph, self.unique_cluster), 0.)
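For reference, the clustering workflow exercised by the tests above can be sketched as a short standalone script. This is an illustrative sketch only: it reuses the names that appear in test_louvain.py (karate_club_graph, Louvain, modularity), and the value near 0.42 is the one asserted by the test rather than a guaranteed output.

from sknetwork.toy_graphs import karate_club_graph
from sknetwork.clustering import Louvain, modularity

# toy adjacency matrix used by the tests above (34 nodes)
adjacency = karate_club_graph()

# Louvain with default resolution, as in test_karate_club_graph
louvain = Louvain()
labels = louvain.fit(adjacency).labels_   # one cluster label per node, shape (34,)

# modularity of the resulting partition; the test asserts a value close to 0.42
print(modularity(adjacency, labels))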
diff --git a/sknetwork/embedding/__init__.py b/sknetwork/embedding/__init__.py index e69de29bb..617853707 100644 --- a/sknetwork/embedding/__init__.py +++ b/sknetwork/embedding/__init__.py @@ -0,0 +1,4 @@ +from sknetwork.embedding.gsvd import * +from sknetwork.embedding.spectral import * +from sknetwork.embedding.metrics import * +from sknetwork.embedding.randomized_matrix_factorization import * diff --git a/sknetwork/embedding/gsvd.py b/sknetwork/embedding/gsvd.py new file mode 100755 index 000000000..32990bc97 --- /dev/null +++ b/sknetwork/embedding/gsvd.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu May 31 17:16:22 2018 +@author: Nathan de Lara +""" + +import numpy as np + +from sknetwork.embedding.randomized_matrix_factorization import randomized_svd +from scipy import sparse, linalg +from typing import Union + + +class GSVDEmbedding: + """Generalized Singular Value Decomposition for non-linear dimensionality reduction. + + Parameters + ----------- + embedding_dimension: int, optional + The dimension of the projected subspace (default=2). + + Attributes + ---------- + embedding_ : array, shape = (n_samples, embedding_dimension) + Embedding of the samples (rows of the training matrix) + features_ : array, shape = (n_features, embedding_dimension) + Embedding of the features (columns of the training matrix) + singular_values_ : array, shape = (embedding_dimension) + Singular values of the training matrix + + References + ---------- + - Bonald, De Lara. "Interpretable Graph Embedding using Generalized SVD." + """ + + def __init__(self, embedding_dimension=2): + self.embedding_dimension = embedding_dimension + self.embedding_ = None + self.features_ = None + self.singular_values_ = None + + def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray], randomized_decomposition: bool = True, + n_iter='auto', power_iteration_normalizer: Union[str, None] = 'auto', random_state=None) -> 'GSVDEmbedding': + """Fits the model from data in adjacency_matrix. + + Parameters + ---------- + adjacency: array-like, shape = (n, m) + Adjacency matrix, where n = m is the number of nodes for a standard directed or undirected graph, + n is the cardinal of V1 and m is the cardinal of V2 for a bipartite graph. + randomized_decomposition: + whether to use a randomized (and faster) svd method or the standard scipy one. 
+ n_iter: int or ``'auto'`` (default is ``'auto'``) + See :meth:`sknetwork.embedding.randomized_matrix_factorization.randomized_range_finder` + power_iteration_normalizer: ``'auto'`` (default), ``'QR'``, ``'LU'``, ``None`` + See :meth:`sknetwork.embedding.randomized_matrix_factorization.randomized_range_finder` + random_state: int, RandomState instance or ``None``, optional (default= ``None``) + See :meth:`sknetwork.embedding.randomized_matrix_factorization.randomized_range_finder` + + Returns + ------- + self: :class:`GSVDEmbedding` + """ + if type(adjacency) == sparse.csr_matrix: + adjacency = adjacency + elif type(adjacency) == np.ndarray: + adjacency = sparse.csr_matrix(adjacency) + else: + raise TypeError( + "The argument must be a NumPy array or a SciPy Compressed Sparse Row matrix.") + n_nodes, m_nodes = adjacency.shape + total_weight = adjacency.data.sum() + # out-degree vector + dou = adjacency.dot(np.ones(m_nodes)) + # in-degree vector + din = adjacency.T.dot(np.ones(n_nodes)) + + # pseudo inverse square-root out-degree matrix + dhou = sparse.diags(np.sqrt(dou), shape=(n_nodes, n_nodes), format='csr') + dhou.data = 1 / dhou.data + # pseudo inverse square-root in-degree matrix + dhin = sparse.diags(np.sqrt(din), shape=(m_nodes, m_nodes), format='csr') + dhin.data = 1 / dhin.data + + laplacian = dhou.dot(adjacency.dot(dhin)) + + if randomized_decomposition: + u, sigma, vt = randomized_svd(laplacian, self.embedding_dimension, + n_iter=n_iter, + power_iteration_normalizer=power_iteration_normalizer, + random_state=random_state) + else: + u, sigma, vt = sparse.linalg.svds(laplacian, self.embedding_dimension) + + self.singular_values_ = sigma + self.embedding_ = np.sqrt(total_weight) * dhou.dot(u) * sigma + self.features_ = np.sqrt(total_weight) * dhin.dot(vt.T) + # shift the center of mass + self.embedding_ -= np.ones((n_nodes, 1)).dot(self.embedding_.T.dot(dou)[:, np.newaxis].T) / total_weight + self.features_ -= np.ones((m_nodes, 1)).dot(self.features_.T.dot(din)[:, np.newaxis].T) / total_weight + + return self diff --git a/sknetwork/embedding/metrics.py b/sknetwork/embedding/metrics.py index 831b26488..bb38948af 100644 --- a/sknetwork/embedding/metrics.py +++ b/sknetwork/embedding/metrics.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -# coding: utf-8 - +# -*- coding: utf-8 -*- """ Created on Nov 6 2018 @@ -13,23 +12,95 @@ import numpy as np from scipy import sparse +from scipy.stats import hmean -def cosine_modularity(adjacency_matrix, embedding: np.ndarray, return_all: bool=False): +def dot_modularity(adjacency_matrix, embedding: np.ndarray, features=None, resolution=1., weights='degree', + return_all: bool=False): """ - Computes the difference of the weighted average cosine similarity between pairs of neighbors in the graph - and pairs of nodes in the graph. + Difference of the weighted average dot product between embeddings of pairs of neighbors in the graph + (fit term) and pairs of nodes in the graph (diversity term). + + :math:`Q = \\sum_{ij}(\\dfrac{A_{ij}}{w} - \\gamma \\dfrac{d_id_j}{w^2})x_i^Tx_j` + + This metric is normalized to lie between -1 and 1. + If the embeddings are normalized, this reduces to the cosine modularity.
+ Parameters ---------- adjacency_matrix: sparse.csr_matrix or np.ndarray - the adjacency matrix of the graph + the adjacency matrix of the graph embedding: np.ndarray - the embedding to evaluate, embedding[i] must represent the embedding of node i - return_all: whether to return both means besides their difference, or only the difference + the embedding to evaluate, embedding[i] must represent the embedding of node i + features: None or np.ndarray + For bipartite graphs, features should be the embedding of the second part + resolution: float + scaling for first-order approximation + weights: ``'degree'`` or ``'uniform'`` + prior distribution on the nodes + + return_all: bool (default = ``False``) + whether to return (fit, diversity) or fit - diversity Returns ------- - the difference of means or (the difference of means, mean_neighbors, mean_pairs) + dot_modularity: a float or a tuple of floats. + """ + if type(adjacency_matrix) == sparse.csr_matrix: + adj_matrix = adjacency_matrix + elif sparse.isspmatrix(adjacency_matrix) or type(adjacency_matrix) == np.ndarray: + adj_matrix = sparse.csr_matrix(adjacency_matrix) + else: + raise TypeError( + "The argument must be a NumPy array or a SciPy Sparse matrix.") + n_nodes, m_nodes = adj_matrix.shape + total_weight: float = adjacency_matrix.data.sum() + + if features is None: + if n_nodes != m_nodes: + raise ValueError('feature cannot be None for non-square adjacency matrices.') + else: + normalization = np.linalg.norm(embedding) ** 2 / np.sqrt(n_nodes * m_nodes) + features = embedding + else: + normalization = np.linalg.norm(embedding.dot(features.T)) / np.sqrt(n_nodes * m_nodes) + + if weights == 'degree': + wou = adj_matrix.dot(np.ones(m_nodes)) / total_weight + win = adj_matrix.T.dot(np.ones(n_nodes)) / total_weight + elif weights == 'uniform': + wou = np.ones(n_nodes) / n_nodes + win = np.ones(m_nodes) / m_nodes + else: + raise ValueError('weights must be degree or uniform.') + + fit = (np.multiply(embedding, adjacency_matrix.dot(features))).sum() / (total_weight * normalization) + diversity = (embedding.T.dot(wou)).dot(features.T.dot(win)) / normalization + + if return_all: + return fit, resolution * diversity + else: + return fit - resolution * diversity + + +def hscore(adjacency_matrix, embedding: np.ndarray, order='second', return_all: bool=False): + """Harmonic mean of fit and diversity with respect to first or second order node similarity. + + Parameters + ---------- + adjacency_matrix: sparse.csr_matrix or np.ndarray + the adjacency matrix of the graph + embedding: np.ndarray + the embedding to evaluate, embedding[i] must represent the embedding of node i + order: \'first\' or \'second\'. + The order of the node similarity metric to use. First-order corresponds to edges weights while second-order + corresponds to the weights of the edges in the normalized cocitation graph. + return_all: bool (default = ``False``) + whether to return (fit, diversity) or hmean(fit, diversity) + + Returns + ------- + hscore: a float or a tuple of floats. 
""" if type(adjacency_matrix) == sparse.csr_matrix: @@ -40,19 +111,33 @@ def cosine_modularity(adjacency_matrix, embedding: np.ndarray, return_all: bool= raise TypeError( "The argument must be a NumPy array or a SciPy Sparse matrix.") n_nodes, m_nodes = adj_matrix.shape - if n_nodes != m_nodes: - raise ValueError("The adjacency matrix must be a square matrix.") - if (adj_matrix != adj_matrix.maximum(adj_matrix.T)).nnz != 0: - raise ValueError("The adjacency matrix is not symmetric.") + if order == 'first' and (n_nodes != m_nodes): + raise ValueError('For fist order similarity, the adjacency matrix must be square.') + total_weight = adj_matrix.data.sum() + # out-degree vector + dou = adj_matrix.dot(np.ones(m_nodes)) + # in-degree vector + din = adj_matrix.T.dot(np.ones(n_nodes)) - total_weight: float = adjacency_matrix.data.sum() - normalized_embedding = embedding / embedding.sum(axis=1)[:, np.newaxis] + pdhou, pdhin = np.zeros(n_nodes), np.zeros(m_nodes) + pdhou[dou.nonzero()] = 1 / np.sqrt(dou[dou.nonzero()]) + pdhin[din.nonzero()] = 1 / np.sqrt(din[din.nonzero()]) - neighbors_mean = (np.multiply(normalized_embedding, adjacency_matrix.dot(normalized_embedding))).sum() - neighbors_mean /= total_weight - pairs_mean = np.linalg.norm(normalized_embedding.mean(axis=0))**2 + normalization = np.linalg.norm(embedding.T * np.sqrt(dou)) ** 2 + if order == 'first': + fit = (np.multiply(embedding, adjacency_matrix.dot(embedding))).sum() + fit /= total_weight * (np.linalg.norm(embedding) ** 2 / n_nodes) + elif order == 'second': + fit = np.linalg.norm(adj_matrix.T.dot(embedding).T * pdhin) ** 2 / normalization + else: + raise ValueError('The similarity order should be \'first\' or \'second\'.') + diversity = (np.linalg.norm(embedding.T.dot(dou))) ** 2 / total_weight + diversity = 1 - diversity / normalization if return_all: - return neighbors_mean - pairs_mean, neighbors_mean, pairs_mean + return fit, diversity else: - return neighbors_mean - pairs_mean + if np.isclose(fit, 0.) or np.isclose(diversity, 0.): + return 0. + else: + return hmean([fit, diversity]) diff --git a/sknetwork/embedding/randomized_matrix_factorization.py b/sknetwork/embedding/randomized_matrix_factorization.py index 06c1d64c6..6f2427744 100644 --- a/sknetwork/embedding/randomized_matrix_factorization.py +++ b/sknetwork/embedding/randomized_matrix_factorization.py @@ -2,30 +2,30 @@ # -*- coding: utf-8 -*- """ Created on Oct 12 2018 - @author: Nathan de Lara - -Part of this code was adapted from the scikit-learn project. +Part of this code was adapted from the scikit-learn project: https://scikit-learn.org/stable/. """ import numpy as np from numpy.random import mtrand from scipy import sparse, linalg from scipy.sparse import linalg as splinalg +from typing import Union def safe_sparse_dot(a, b, dense_output=False): """ - Dot product that handle the sparse matrix case correctly - Uses BLAS GEMM as replacement for numpy.dot where possible + Dot product that handle the sparse matrix case correctly. Uses BLAS GEMM as replacement for numpy.dot where possible to avoid unnecessary copies. + Parameters ---------- - a : array or sparse matrix - b : array or sparse matrix - dense_output : boolean, default False - When False, either ``a`` or ``b`` being sparse will yield sparse - output. When True, output will always be an array. + a: array or sparse matrix + b: array or sparse matrix + dense_output: bool (default= ``False``) + When False, either ``a`` or ``b`` being sparse will yield sparse output. 
+ When True, output will always be an array. + Returns ------- dot_product : array or sparse matrix @@ -42,6 +42,7 @@ def safe_sparse_dot(a, b, dense_output=False): def check_random_state(seed): """Turn seed into a np.random.RandomState instance + Parameters ---------- seed : None | int | instance of RandomState @@ -51,6 +52,7 @@ def check_random_state(seed): Otherwise raise ValueError. """ if seed is None or seed is np.random: + # noinspection PyProtectedMember return mtrand._rand if isinstance(seed, np.integer) or isinstance(seed, int): return np.random.RandomState(seed) @@ -63,44 +65,46 @@ def check_random_state(seed): def randomized_range_finder(matrix, size, n_iter, power_iteration_normalizer='auto', random_state=None, return_all: bool = False) -> [np.ndarray, tuple]: """Computes an orthonormal matrix whose range approximates the range of A. + Parameters ---------- matrix : two dimensional array The input data matrix size : integer Size of the return array - n_iter : integer - Number of power iterations used to stabilize the result - power_iteration_normalizer : 'auto' (default), 'QR', 'LU', 'none' - Whether the power iterations are normalized with step-by-step - QR factorization (the slowest but most accurate), 'none' - (the fastest but numerically unstable when `n_iter` is large, e.g. - typically 5 or larger), or 'LU' factorization (numerically stable - but can lose slightly in accuracy). The 'auto' mode applies no - normalization if `n_iter`<=2 and switches to LU otherwise. - .. versionadded:: 0.18 - random_state : int, RandomState instance or None, optional (default=None) + n_iter : int + Number of power iterations. It can be used to deal with very noisy + problems. When 'auto', it is set to 4, unless ``size`` is small + (< .1 * min(matrix.shape)) in which case ``n_iter`` is set to 7. + This improves precision with few components. + power_iteration_normalizer: ``'auto'`` (default), ``'QR'``, ``'LU'``, ``None`` + Whether the power iterations are normalized with step-by-step + QR factorization (the slowest but most accurate), ``None`` + (the fastest but numerically unstable when ``n_iter`` is large, e.g. + typically 5 or larger), or ``'LU'`` factorization (numerically stable + but can lose slightly in accuracy). The ``'auto'`` mode applies no + normalization if ``n_iter`` <= 2 and switches to ``'LU'`` otherwise. + random_state: int, RandomState instance or ``None``, optional (default= ``None``) The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState + generator; If ``None``, the random number generator is the RandomState instance used by `np.random`. return_all : if True, returns (range_matrix, random_matrix, random_proj) else returns range_matrix. + Returns ------- range_matrix : 2D array matrix (size x size) projection matrix, the range of which approximates well the range of the input matrix. + Notes ----- Follows Algorithm 4.3 of - Finding structure with randomness: Stochastic algorithms for constructing - approximate matrix decompositions + + Finding structure with randomness: Stochastic algorithms for constructing approximate matrix decompositions Halko, et al., 2009 (arXiv:909) http://arxiv.org/pdf/0909.4061 - An implementation of a randomized algorithm for principal component - analysis - A. Szlam et al. 
2014 """ random_state = check_random_state(random_state) @@ -149,19 +153,19 @@ def svd_flip(u, v, u_based_decision=True): """Sign correction to ensure deterministic output from SVD. Adjusts the columns of u and the rows of v such that the loadings in the columns in u that are largest in absolute value are always positive. + Parameters ---------- - u, v : ndarray - u and v are the output of `linalg.svd` or - `sklearn.utils.extmath.randomized_svd`, with matching inner dimensions - so one can compute `np.dot(u * s, v)`. - u_based_decision : boolean, (default=True) + u, v: np.ndarray + u and v are the output of the svd, with matching inner dimensions so one can compute `np.dot(u * s, v)`. + u_based_decision : bool, (default=True) If True, use the columns of u as the basis for sign flipping. - Otherwise, use the rows of v. The choice of which variable to base the - decision on is generally algorithm dependent. + Otherwise, use the rows of v. + The choice of which variable to base the decision on is generally algorithm dependent. + Returns ------- - u_adjusted, v_adjusted : arrays with the same dimensions as the input. + u_adjusted, v_adjusted: arrays with the same dimensions as the input. """ if u_based_decision: # columns of u, rows of v @@ -180,65 +184,53 @@ def svd_flip(u, v, u_based_decision=True): def randomized_svd(matrix, n_components: int, n_oversamples: int = 10, n_iter='auto', transpose='auto', - power_iteration_normalizer='auto', flip_sign: bool = True, random_state=0): + power_iteration_normalizer: Union[str, None] = 'auto', flip_sign: bool = True, random_state=0): """Computes a truncated randomized SVD + Parameters ---------- matrix : ndarray or sparse matrix Matrix to decompose n_components : int Number of singular values and vectors to extract. - n_oversamples : int (default is 10) + n_oversamples : int (default=10) Additional number of random vectors to sample the range of M so as to ensure proper conditioning. The total number of random vectors used to find the range of M is embedding_dimension + n_oversamples. Smaller number can improve speed but can negatively impact the quality of approximation of singular vectors and singular values. n_iter : int or 'auto' (default is 'auto') - Number of power iterations. It can be used to deal with very noisy - problems. When 'auto', it is set to 4, unless `embedding_dimension` is small - (< .1 * min(X.shape)) `n_iter` in which case is set to 7. - This improves precision with few components. - .. versionchanged:: 0.18 - power_iteration_normalizer : 'auto' (default), 'QR', 'LU', 'none' - Whether the power iterations are normalized with step-by-step - QR factorization (the slowest but most accurate), 'none' - (the fastest but numerically unstable when `n_iter` is large, e.g. - typically 5 or larger), or 'LU' factorization (numerically stable - but can lose slightly in accuracy). The 'auto' mode applies no - normalization if `n_iter`<=2 and switches to LU otherwise. - .. versionadded:: 0.18 + See :meth:`randomized_range_finder` + power_iteration_normalizer : ``'auto'`` (default), ``'QR'``, ``'LU'``, ``None`` + See :meth:`randomized_range_finder` transpose : True, False or 'auto' (default) - Whether the algorithm should be applied to M.T instead of M. The + Whether the algorithm should be applied to ``matrix.T`` instead of ``matrix``. The result should approximately be the same. The 'auto' mode will - trigger the transposition if M.shape[1] > M.shape[0] since this - implementation of randomized SVD tend to be a little faster in that - case. 
- .. versionchanged:: 0.18 - flip_sign : boolean, (True by default) + trigger the transposition if ``matrix.shape[1] > matrix.shape[0]`` since this + implementation of randomized SVD tends to be a little faster in that case. + flip_sign : boolean, (default=True) The output of a singular value decomposition is only unique up to a permutation of the signs of the singular vectors. If `flip_sign` is set to `True`, the sign ambiguity is resolved by making the largest loadings for each component in the left singular vectors positive. random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. + See :meth:`randomized_range_finder` + Notes ----- This algorithm finds a (usually very good) approximate truncated singular value decomposition using randomization to speed up the computations. It is particularly fast on large matrices on which you wish to extract only a small number of components. In order to - obtain further speed up, `n_iter` can be set <=2 (at the cost of + obtain further speed up, ``n_iter`` can be set <=2 (at the cost of loss of precision). + References ---------- * Finding structure with randomness: Stochastic algorithms for constructing approximate matrix decompositions Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061 + (algorithm 5.1) * A randomized algorithm for the decomposition of matrices Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert * An implementation of a randomized algorithm for principal component @@ -294,46 +286,40 @@ def randomized_svd(matrix, n_components: int, n_oversamples: int = 10, n_iter='a def randomized_eig(matrix, n_components: int, which='LM', n_oversamples: int = 10, n_iter='auto', - power_iteration_normalizer='auto', random_state=0): + power_iteration_normalizer: Union[str, None] = 'auto', random_state=0, one_pass: bool = False): """Computes a randomized eigenvalue decomposition. + Parameters ---------- matrix: ndarray or sparse matrix - Matrix to decompose + Matrix to decompose n_components: int - Number of singular values and vectors to extract. - which: which eigenvalues to compute. 'SM' for Smallest Magnitude, - any other entry will result in Smallest Magnitude. - n_oversamples : int (default is 10) - Additional number of random vectors to sample the range of M so as - to ensure proper conditioning. The total number of random vectors - used to find the range of M is embedding_dimension + n_oversamples. Smaller - number can improve speed but can negatively impact the quality of - approximation of singular vectors and singular values. + Number of singular values and vectors to extract. + which: str + which eigenvalues to compute. ``'LM'`` for Largest Magnitude and ``'SM'`` for Smallest Magnitude. + Any other entry will result in Largest Magnitude. + n_oversamples : int (default=10) + Additional number of random vectors to sample the range of ``matrix`` so as + to ensure proper conditioning. The total number of random vectors + used to find the range of ``matrix`` is ``n_components + n_oversamples``. Smaller number can improve speed + but can negatively impact the quality of approximation of singular vectors and singular values. n_iter: int or 'auto' (default is 'auto') - Number of power iterations. 
It can be used to deal with very noisy - problems. When 'auto', it is set to 4, unless `embedding_dimension` is small - (< .1 * min(X.shape)) `n_iter` in which case is set to 7. - This improves precision with few components. - .. versionchanged:: 0.18 - power_iteration_normalizer: 'auto' (default), 'QR', 'LU', 'none' - Whether the power iterations are normalized with step-by-step - QR factorization (the slowest but most accurate), 'none' - (the fastest but numerically unstable when `n_iter` is large, e.g. - typically 5 or larger), or 'LU' factorization (numerically stable - but can lose slightly in accuracy). The 'auto' mode applies no - normalization if `n_iter`<=2 and switches to LU otherwise. - .. versionadded:: 0.18 + See :meth:`randomized_range_finder` + power_iteration_normalizer: ``'auto'`` (default), ``'QR'``, ``'LU'``, ``None`` + See :meth:`randomized_range_finder` random_state: int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. + See :meth:`randomized_range_finder` + one_pass: bool (default=False) + whether to use algorithm 5.6 instead of 5.3. 5.6 requires less access to the original matrix, + while 5.3 is more accurate. Returns ------- - + References + ---------- + Finding structure with randomness: Stochastic algorithms for constructing + approximate matrix decompositions + Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061 """ random_state = check_random_state(random_state) @@ -356,7 +342,10 @@ def randomized_eig(matrix, n_components: int, which='LM', n_oversamples: int = 1 range_matrix, random_matrix, random_proj = randomized_range_finder(matrix, n_random, n_iter, power_iteration_normalizer, random_state, True) - approx_matrix = np.linalg.lstsq(random_matrix.T.dot(range_matrix), random_proj.T.dot(range_matrix), None)[0].T + if one_pass: + approx_matrix = np.linalg.lstsq(random_matrix.T.dot(range_matrix), random_proj.T.dot(range_matrix), None)[0].T + else: + approx_matrix = (matrix.dot(range_matrix)).T.dot(range_matrix) eigenvalues, eigenvectors = np.linalg.eig(approx_matrix) diff --git a/sknetwork/embedding/spectral.py b/sknetwork/embedding/spectral.py index 02e920291..df61bc1ac 100644 --- a/sknetwork/embedding/spectral.py +++ b/sknetwork/embedding/spectral.py @@ -1,36 +1,34 @@ #!/usr/bin/env python3 # coding: utf-8 - """ Created on Thu Sep 13 2018 Authors: Thomas Bonald Nathan De Lara - -Spectral embedding by decomposition of the normalized graph Laplacian. 
""" import numpy as np from sknetwork.embedding.randomized_matrix_factorization import randomized_eig -from scipy import sparse, errstate, sqrt, isinf +from scipy import sparse from scipy.sparse import csgraph from scipy.sparse.linalg import eigsh +from typing import Union class SpectralEmbedding: """Weighted spectral embedding of a graph - Attributes + Parameters ---------- embedding_dimension : int, optional - Dimension of the embedding space (default=100) - eigenvalue_normalization : bool, optional - Whether to normalize the embedding by the pseudo-inverse square roots of laplacian eigenvalues - (default=True) - node_weights : {'uniform', 'degree', array of length n_nodes with positive entries}, optional + Dimension of the embedding space + node_weights : {``'uniform'``, ``'degree'``, array of length n_nodes with positive entries}, optional Weights used for the normalization for the laplacian, :math:`W^{-1/2} L W^{-1/2}` + + Attributes + ---------- embedding_ : array, shape = (n_nodes, embedding_dimension) Embedding matrix of the nodes eigenvalues_ : array, shape = (embedding_dimension) @@ -42,57 +40,58 @@ class SpectralEmbedding: * Laplacian Eigenmaps for Dimensionality Reduction and Data Representation, M. Belkin, P. Niyogi """ - def __init__(self, embedding_dimension: int = 2, node_weights='degree', eigenvalue_normalization: bool = True): + def __init__(self, embedding_dimension: int = 2, node_weights='degree'): self.embedding_dimension = embedding_dimension self.node_weights = node_weights - self.eigenvalue_normalization = eigenvalue_normalization self.embedding_ = None self.eigenvalues_ = None - def fit(self, adjacency_matrix, node_weights=None, randomized_decomposition: bool = True): + def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray], node_weights=None, + randomized_decomposition: bool = True) -> 'SpectralEmbedding': """Fits the model from data in adjacency_matrix Parameters ---------- - adjacency_matrix : Scipy csr matrix or numpy ndarray + adjacency : array-like, shape = (n, n) Adjacency matrix of the graph - randomized_decomposition: whether to use a randomized (and faster) decomposition method or - the standard scipy one. - node_weights : {'uniform', 'degree', array of length n_nodes with positive entries} + randomized_decomposition: bool (default=True) + whether to use a randomized (and faster) decomposition method or the standard scipy one. 
+ node_weights : {``'uniform'``, ``'degree'``, array of length n_nodes with positive entries} Node weights + + Returns + ------- + self: :class:`SpectralEmbedding` """ - if type(adjacency_matrix) == sparse.csr_matrix: - adj_matrix = adjacency_matrix - elif sparse.isspmatrix(adjacency_matrix) or type(adjacency_matrix) == np.ndarray: - adj_matrix = sparse.csr_matrix(adjacency_matrix) + if type(adjacency) == sparse.csr_matrix: + adjacency = adjacency + elif sparse.isspmatrix(adjacency) or type(adjacency) == np.ndarray: + adjacency = sparse.csr_matrix(adjacency) else: raise TypeError( "The argument must be a NumPy array or a SciPy Sparse matrix.") - n_nodes, m_nodes = adj_matrix.shape + n_nodes, m_nodes = adjacency.shape if n_nodes != m_nodes: raise ValueError("The adjacency matrix must be a square matrix.") - if csgraph.connected_components(adj_matrix, directed=False)[0] > 1: + if csgraph.connected_components(adjacency, directed=False)[0] > 1: raise ValueError("The graph must be connected.") - if (adj_matrix != adj_matrix.maximum(adj_matrix.T)).nnz != 0: + if (adjacency != adjacency.maximum(adjacency.T)).nnz != 0: raise ValueError("The adjacency matrix is not symmetric.") # builds standard laplacian - degrees = adj_matrix.dot(np.ones(n_nodes)) + degrees = adjacency.dot(np.ones(n_nodes)) degree_matrix = sparse.diags(degrees, format='csr') - laplacian = degree_matrix - adj_matrix + laplacian = degree_matrix - adjacency # applies normalization by node weights if node_weights is None: node_weights = self.node_weights if type(node_weights) == str: if node_weights == 'uniform': - weight_matrix = sparse.identity(n_nodes) + weights = np.ones(n_nodes) elif node_weights == 'degree': - with errstate(divide='ignore'): - degrees_inv_sqrt = 1.0 / sqrt(degrees) - degrees_inv_sqrt[isinf(degrees_inv_sqrt)] = 0 - weight_matrix = sparse.diags(degrees_inv_sqrt, format='csr') + weights = degrees else: raise ValueError('Unknown weighting policy. 
Try \'degree\' or \'uniform\'.') else: @@ -101,11 +100,10 @@ def fit(self, adjacency_matrix, node_weights=None, randomized_decomposition: boo elif min(self.node_weights) < 0: raise ValueError('node_weights must be positive.') else: - with errstate(divide='ignore'): - weights_inv_sqrt = 1.0 / sqrt(self.node_weights) - weights_inv_sqrt[isinf(weights_inv_sqrt)] = 0 - weight_matrix = sparse.diags(weights_inv_sqrt, format='csr') + weights = self.node_weights + weight_matrix = sparse.diags(np.sqrt(weights), format='csr') + weight_matrix.data = 1 / weight_matrix.data laplacian = weight_matrix.dot(laplacian.dot(weight_matrix)) # spectral decomposition @@ -116,10 +114,5 @@ def fit(self, adjacency_matrix, node_weights=None, randomized_decomposition: boo eigenvalues, eigenvectors = eigsh(laplacian, n_components, which='SM') self.eigenvalues_ = eigenvalues[1:] - self.embedding_ = np.array(weight_matrix.dot(eigenvectors[:, 1:])) - if self.eigenvalue_normalization: - eigenvalues_inv_sqrt = 1.0 / sqrt(eigenvalues[1:]) - self.embedding_ = eigenvalues_inv_sqrt * self.embedding_ - return self diff --git a/sknetwork/embedding/svd.py b/sknetwork/embedding/svd.py deleted file mode 100755 index fcefd6ed7..000000000 --- a/sknetwork/embedding/svd.py +++ /dev/null @@ -1,159 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu May 31 17:16:22 2018 - -@author: Nathan de Lara -""" - -import numpy as np - -from sknetwork.embedding.randomized_matrix_factorization import randomized_svd -from scipy import sparse, errstate, sqrt, isinf, linalg - - -def normalized_adjacency(adjacency_matrix): - """Normalized adjacency matrix :math:`(D_{in}^{+})^{1/2} A (D_{out}^{+})^{1/2}`. - Parameters - ---------- - adjacency_matrix: sparse.csr_matrix or np.ndarray - - Returns - ------- - The normalized adjacency matrix. - """ - - if type(adjacency_matrix) == sparse.csr_matrix: - adj_matrix = adjacency_matrix - elif type(adjacency_matrix) == np.ndarray: - adj_matrix = sparse.csr_matrix(adjacency_matrix) - else: - raise TypeError( - "The argument must be a NumPy array or a SciPy Compressed Sparse Row matrix.") - n_nodes, m_nodes = adj_matrix.shape - - # out-degree vector - dou = adj_matrix.dot(np.ones(n_nodes)) - # in-degree vector - din = adj_matrix.T.dot(np.ones(m_nodes)) - - with errstate(divide='ignore'): - dou_sqrt = 1.0 / sqrt(dou) - din_sqrt = 1.0 / sqrt(din) - dou_sqrt[isinf(dou_sqrt)] = 0 - din_sqrt[isinf(din_sqrt)] = 0 - # pseudo inverse square-root out-degree matrix - dhou = sparse.spdiags(dou_sqrt, [0], n_nodes, n_nodes, format='csr') - # pseudo inverse square-root in-degree matrix - dhin = sparse.spdiags(din_sqrt, [0], m_nodes, m_nodes, format='csr') - - return dhou.dot(adj_matrix.dot(dhin)) - - -class ForwardBackwardEmbedding: - """Forward and Backward embeddings for non-linear dimensionality reduction. - - Parameters - ----------- - embedding_dimension: int, optional - The dimension of the projected subspace (default=2). - - Attributes - ---------- - embedding_ : array, shape = (n_samples, embedding_dimension) - Forward embedding of the training matrix. - backward_embedding_ : array, shape = (n_samples, embedding_dimension) - Backward embedding of the training matrix. - singular_values_ : array, shape = (embedding_dimension) - Singular values of the training matrix - - References - ---------- - - Bonald, De Lara. "The Forward-Backward Embedding of Directed Graphs." 
- """ - - def __init__(self, embedding_dimension=2): - self.embedding_dimension = embedding_dimension - self.embedding_ = None - self.backward_embedding_ = None - self.singular_values_ = None - - def fit(self, adjacency_matrix, randomized_decomposition: bool = True, tol=1e-6, n_iter='auto', - power_iteration_normalizer='auto', random_state=None): - """Fits the model from data in adjacency_matrix. - - Parameters - ---------- - adjacency_matrix: array-like, shape = (n, m) - Adjacency matrix, where n = m = |V| for a standard graph, - n = |V1|, m = |V2| for a bipartite graph. - randomized_decomposition: whether to use a randomized (and faster) svd method or - the standard scipy one. - tol: float, optional - Tolerance for pseudo-inverse of singular values (default=1e-6). - n_iter: int or 'auto' (default is 'auto') - Number of power iterations. It can be used to deal with very noisy - problems. When 'auto', it is set to 4, unless `embedding_dimension` is small - (< .1 * min(X.shape)) `n_iter` in which case is set to 7. - This improves precision with few components. - power_iteration_normalizer: 'auto' (default), 'QR', 'LU', 'none' - Whether the power iterations are normalized with step-by-step - QR factorization (the slowest but most accurate), 'none' - (the fastest but numerically unstable when `n_iter` is large, e.g. - typically 5 or larger), or 'LU' factorization (numerically stable - but can lose slightly in accuracy). The 'auto' mode applies no - normalization if `n_iter`<=2 and switches to LU otherwise. - random_state: int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. 
- - Returns - ------- - self - - """ - if type(adjacency_matrix) == sparse.csr_matrix: - adj_matrix = adjacency_matrix - elif type(adjacency_matrix) == np.ndarray: - adj_matrix = sparse.csr_matrix(adjacency_matrix) - else: - raise TypeError( - "The argument must be a NumPy array or a SciPy Compressed Sparse Row matrix.") - n_nodes, m_nodes = adj_matrix.shape - - # out-degree vector - dou = adj_matrix.dot(np.ones(n_nodes)) - # in-degree vector - din = adj_matrix.T.dot(np.ones(m_nodes)) - - with errstate(divide='ignore'): - dou_sqrt = 1.0 / sqrt(dou) - din_sqrt = 1.0 / sqrt(din) - dou_sqrt[isinf(dou_sqrt)] = 0 - din_sqrt[isinf(din_sqrt)] = 0 - # pseudo inverse square-root out-degree matrix - dhou = sparse.spdiags(dou_sqrt, [0], n_nodes, n_nodes, format='csr') - # pseudo inverse square-root in-degree matrix - dhin = sparse.spdiags(din_sqrt, [0], m_nodes, m_nodes, format='csr') - - laplacian = dhou.dot(adj_matrix.dot(dhin)) - - if randomized_decomposition: - u, sigma, vt = randomized_svd(laplacian, self.embedding_dimension, - n_iter=n_iter, - power_iteration_normalizer=power_iteration_normalizer, - random_state=random_state) - else: - u, sigma, vt = linalg.svds(laplacian, self.embedding_dimension) - - self.singular_values_ = sigma - - gamma = 1 - sigma ** 2 - gamma_sqrt = np.diag(np.piecewise(gamma, [gamma > tol, gamma <= tol], [lambda x: 1 / np.sqrt(x), 0])) - self.embedding_ = dhou.dot(u).dot(gamma_sqrt) - self.backward_embedding_ = dhin.dot(vt.T).dot(gamma_sqrt) - - return self diff --git a/sknetwork/embedding/tests/test_embeddings.py b/sknetwork/embedding/tests/test_embeddings.py index c553c6dd9..699d29367 100644 --- a/sknetwork/embedding/tests/test_embeddings.py +++ b/sknetwork/embedding/tests/test_embeddings.py @@ -1,13 +1,9 @@ +#!/usr/bin/env python3 # -*- coding: utf-8 -*- -# test for metrics.py -# -# Copyright 2018 Scikit-network Developers. -# Copyright 2018 Nathan de Lara -# -# This file is part of Scikit-network. 
+"""tests for embeddings""" import unittest -from sknetwork.embedding.svd import ForwardBackwardEmbedding +from sknetwork.embedding.gsvd import GSVDEmbedding from sknetwork.embedding.spectral import SpectralEmbedding from scipy import sparse import numpy as np @@ -20,13 +16,22 @@ def setUp(self): [1, 0, 0, 0], [1, 0, 0, 1], [1, 0, 1, 0]])) + self.bipartite = sparse.csr_matrix(np.array([[1, 0, 1], + [1, 0, 0], + [1, 1, 1], + [0, 1, 1]])) - def test_forwardbackward(self): - fb = ForwardBackwardEmbedding(2) - fb.fit(self.graph) - self.assertTrue(fb.embedding_.shape == (4, 2)) - self.assertTrue(fb.backward_embedding_.shape == (4, 2)) - self.assertTrue(type(fb.singular_values_) == np.ndarray and len(fb.singular_values_) == 2) + def test_gsvd(self): + gsvd = GSVDEmbedding(2) + gsvd.fit(self.graph) + self.assertTrue(gsvd.embedding_.shape == (4, 2)) + self.assertTrue(gsvd.features_.shape == (4, 2)) + self.assertTrue(type(gsvd.singular_values_) == np.ndarray and len(gsvd.singular_values_) == 2) + + gsvd.fit(self.bipartite) + self.assertTrue(gsvd.embedding_.shape == (4, 2)) + self.assertTrue(gsvd.features_.shape == (3, 2)) + self.assertTrue(type(gsvd.singular_values_) == np.ndarray and len(gsvd.singular_values_) == 2) def test_spectral(self): sp = SpectralEmbedding(2) diff --git a/sknetwork/embedding/tests/test_metrics.py b/sknetwork/embedding/tests/test_metrics.py index 102332bf8..26d6a841f 100644 --- a/sknetwork/embedding/tests/test_metrics.py +++ b/sknetwork/embedding/tests/test_metrics.py @@ -1,15 +1,12 @@ +#!/usr/bin/env python3 # -*- coding: utf-8 -*- -# test for metrics.py -# -# Copyright 2018 Scikit-network Developers. -# Copyright 2018 Nathan de Lara -# -# This file is part of Scikit-network. +"""tests for embeddings metrics""" import unittest import numpy as np from scipy import sparse -from sknetwork.embedding.metrics import cosine_modularity +from sknetwork.embedding.metrics import dot_modularity, hscore +from sknetwork import star_wars_villains_graph class TestClusteringMetrics(unittest.TestCase): @@ -19,7 +16,23 @@ def setUp(self): [1, 0, 0, 0], [1, 0, 0, 1], [1, 0, 1, 0]])) + self.bipartite = star_wars_villains_graph() self.embedding = np.ones((4, 2)) + self.features = np.ones((3, 2)) - def test_cosine_modularity(self): - self.assertAlmostEqual(cosine_modularity(self.graph, self.embedding), 0.) + def test_dot_modularity(self): + self.assertAlmostEqual(dot_modularity(self.graph, self.embedding), 0.) + fit, diversity = dot_modularity(self.graph, self.embedding, return_all=True) + self.assertAlmostEqual(fit, 1.) + self.assertAlmostEqual(diversity, 1.) + + self.assertAlmostEqual(dot_modularity(self.bipartite, self.embedding, self.features), 0.) + fit, diversity = dot_modularity(self.bipartite, self.embedding, self.features, return_all=True) + self.assertAlmostEqual(fit, 1.) + self.assertAlmostEqual(diversity, 1.) + + def test_hscore(self): + self.assertAlmostEqual(hscore(self.graph, self.embedding), 0.) + fit, diversity = hscore(self.graph, self.embedding, return_all=True) + self.assertAlmostEqual(fit, 1.) + self.assertAlmostEqual(diversity, 0.) 
diff --git a/sknetwork/embedding/tests/test_randomized_matrix_factorization.py b/sknetwork/embedding/tests/test_randomized_matrix_factorization.py index ef025c0c8..3940856c7 100644 --- a/sknetwork/embedding/tests/test_randomized_matrix_factorization.py +++ b/sknetwork/embedding/tests/test_randomized_matrix_factorization.py @@ -1,10 +1,6 @@ +#!/usr/bin/env python3 # -*- coding: utf-8 -*- -# test for metrics.py -# -# Copyright 2018 Scikit-network Developers. -# Copyright 2018 Nathan de Lara -# -# This file is part of Scikit-network. +"""tests for randomized matrix factorization""" import unittest import numpy as np diff --git a/sknetwork/hierarchy/__init__.py b/sknetwork/hierarchy/__init__.py index e69de29bb..acef6c2e1 100644 --- a/sknetwork/hierarchy/__init__.py +++ b/sknetwork/hierarchy/__init__.py @@ -0,0 +1,2 @@ +from sknetwork.hierarchy.paris import * +from sknetwork.hierarchy.metrics import * diff --git a/sknetwork/hierarchy/metrics.py b/sknetwork/hierarchy/metrics.py index eea85de18..eec029bc7 100644 --- a/sknetwork/hierarchy/metrics.py +++ b/sknetwork/hierarchy/metrics.py @@ -1,137 +1,184 @@ +#!/usr/bin/env python3 # -*- coding: utf-8 -*- -# metrics.py - functions for computing metrics on hierarchy -# -# Copyright 2018 Scikit-network Developers. -# Copyright 2018 Bertrand Charpentier -# -# This file is part of Scikit-network. -# -# NetworkX is distributed under a BSD license; see LICENSE.txt for more information. """ -Metrics for hierarchy +Created on March 2019 +@author: Thomas Bonald """ import numpy as np -import networkx as nx - -_AFFINITY = {'unitary', 'weighted'} -_LINKAGE = {'classic'} -_ADMISSIBLE_FUNCTION = {'dasgupta'} - - -def hierarchical_cost(graph, dendrogram, affinity='weighted', linkage='classic', g=lambda a, b: a + b, check=True): - """Compute the hierarchichal cost of an undirected graph hierarchy given a linkage and a admissible function g. - The graph can be weighted or unweighted - - Parameters - ---------- - graph : NetworkX graph - An undirected graph. - dendrogram : numpy array - A dendrogram - affinity : string (default: 'weighted') - The affinity can be either 'weighted' or 'unitary'. - Value 'weighted' takes the attribute 'weight' on the edges. - Value 'unitary' consider that the edges have a weight equal to 1 - linkage : string, optional (default: 'modular') - The parameter linkage can be 'single', 'average', 'complete' or 'modular'. - - =============== ======================================== - Value Linkage - =============== ======================================== - 'single' max wij - 'average' min wij - 'complete' n^2/w*wij/(|i||j|) - 'modular' w wij/(wi wj) - =============== ======================================== - - g : function, optional (default: lambda a, b: a + b) - The function g must be admissible. The classic choice is the dasgupta function. - check : bool, optional (default: True) - If True, reorder the node labels and check that the edges have the - 'weight' attribute corresponding to the given affinity. - - Returns - ------- - dendrogram : numpy array - dendrogram. - - Raises - ------ - ValueError - If the affinity or the linkage is not known. - KeyError - If all the edges do not have the 'weight' attribute with the 'weighted' affinity. 
- - Notes - ----- - This is a classic implementation of the hierarchical cost function - - See Also - -------- - laplacian_matrix +from scipy import sparse +from sknetwork.hierarchy import AggregateGraph +from typing import Union + + +def dasgupta_cost(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, + node_weights: Union[str, np.ndarray] = 'uniform', normalized: bool = True) -> float: + """Dasgupta's cost of a hierarchy (cost metric) + + Parameters + ---------- + adjacency : + Adjacency matrix of the graph. + dendrogram : + Each row contains the two merged nodes, the height in the dendrogram, and the size of the corresponding cluster + node_weights : + Vector of node weights. Default = 'uniform', weight 1 for each node. + normalized: + If true, normalized by the number of nodes of the graph. + + Returns + ------- + cost : float + Dasgupta's cost of the hierarchy. + Normalized by the number of nodes to get a value between 0 and 1. + + References + ---------- + S. Dasgupta. A cost function for similarity-based hierarchical clustering. + In Proceedings of ACM symposium on Theory of Computing, 2016. + """ - if affinity not in _AFFINITY: - raise ValueError("Unknown affinity type %s." - "Valid options are %s" % (affinity, _AFFINITY)) + if type(adjacency) != sparse.csr_matrix: + raise TypeError('The adjacency matrix must be in a scipy compressed sparse row (csr) format.') + # check that the graph is not directed + if adjacency.shape[0] != adjacency.shape[1]: + raise ValueError('The adjacency matrix must be square.') + if (adjacency != adjacency.T).nnz != 0: + raise ValueError('The graph cannot be directed. Please fit a symmetric adjacency matrix.') + + n_nodes = adjacency.shape[0] + + if type(node_weights) == np.ndarray: + if len(node_weights) != n_nodes: + raise ValueError('The number of node weights must match the number of nodes.') + else: + node_weights_vec = node_weights + elif type(node_weights) == str: + if node_weights == 'degree': + node_weights_vec = adjacency.dot(np.ones(n_nodes)) + elif node_weights == 'uniform': + node_weights_vec = np.ones(n_nodes) + else: + raise ValueError('Unknown distribution of node weights.') + else: + raise TypeError( + 'Node weights must be a known distribution ("degree" or "uniform" string) or a custom NumPy array.') - if linkage not in _LINKAGE: - raise ValueError("Unknown linkage type %s."
- "Valid options are %s" % (linkage, _LINKAGE)) + if np.any(node_weights_vec <= 0): + raise ValueError('All node weights must be positive.') + else: + node_weights_vec = node_weights_vec / np.sum(node_weights_vec) - graph_copy = graph.copy() + aggregate_graph = AggregateGraph(adjacency, node_weights_vec) - if check: + height = np.zeros(n_nodes - 1) + edge_sampling = np.zeros(n_nodes - 1) + cluster_weight = np.zeros(n_nodes - 1) + for t in range(n_nodes - 1): + node1 = int(dendrogram[t][0]) + node2 = int(dendrogram[t][1]) + if node1 >= n_nodes and height[node1 - n_nodes] == dendrogram[t][2]: + edge_sampling[t] = edge_sampling[node1 - n_nodes] + edge_sampling[node1 - n_nodes] = 0 + elif node2 >= n_nodes and height[node2 - n_nodes] == dendrogram[t][2]: + edge_sampling[t] = edge_sampling[node2 - n_nodes] + edge_sampling[node2 - n_nodes] = 0 + height[t] = dendrogram[t][2] + edge_sampling[t] += 2 * aggregate_graph.graph[node1][node2] + cluster_weight[t] = aggregate_graph.cluster_probs[node1] + aggregate_graph.cluster_probs[node2] + aggregate_graph.merge(node1, node2) + + cost: float = (edge_sampling * cluster_weight).sum() + if not normalized: + cost *= node_weights_vec.sum() + return cost - graph_copy = nx.convert_node_labels_to_integers(graph_copy) - if affinity == 'unitary': - for e in graph_copy.edges: - graph_copy.add_edge(e[0], e[1], weight=1) +def tree_sampling_divergence(adj_matrix: sparse.csr_matrix, dendrogram: np.ndarray, + node_weights: Union[str, np.ndarray] = 'degree', normalized: bool = True) -> float: + """Tree sampling divergence of a hierarchy (quality metric) - n_edges = len(list(graph_copy.edges())) - n_weighted_edges = len(nx.get_edge_attributes(graph_copy, 'weight')) - if affinity == 'weighted' and not n_weighted_edges == n_edges: - raise KeyError("%s edges among %s do not have the attribute/key \'weigth\'." - % (n_edges - n_weighted_edges, n_edges)) + Parameters + ---------- + adj_matrix : + Adjacency matrix of the graph. + dendrogram : + Each row contains the two merged nodes, the height in the dendrogram, and the size of the corresponding cluster + node_weights : + Vector of node weights. Default = 'degree', weight of each node in the graph. + normalized: + If true, normalized by the mutual information of the graph. - if linkage == 'classic': - cost = classic_linkage_hierarchical_cost(graph_copy, dendrogram, g) - else: - cost = 0. + Returns + ------- + quality : float + The tree sampling divergence of the hierarchy (quality metric). + Normalized by the mutual information to get a value between 0 and 1. - return cost + References + ---------- + T. Bonald, B. Charpentier (2018), Learning Graph Representations by Dendrograms, https://arxiv.org/abs/1807.05087 + """ -def classic_linkage_hierarchical_cost(graph, dendrogram, g): - n_nodes = np.shape(dendrogram)[0] + 1 + if type(adj_matrix) != sparse.csr_matrix: + raise TypeError('The adjacency matrix must be in a scipy compressed sparse row (csr) format.') + # check that the graph is not directed + if adj_matrix.shape[0] != adj_matrix.shape[1]: + raise ValueError('The adjacency matrix must be square.') + if (adj_matrix != adj_matrix.T).nnz != 0: + raise ValueError('The graph cannot be directed. 
Please fit a symmetric adjacency matrix.') + + n_nodes = adj_matrix.shape[0] + + if type(node_weights) == np.ndarray: + if len(node_weights) != n_nodes: + raise ValueError('The number of node weights must match the number of nodes.') + else: + node_weights_vec = node_weights + elif type(node_weights) == str: + if node_weights == 'degree': + node_weights_vec = adj_matrix.dot(np.ones(n_nodes)) + elif node_weights == 'uniform': + node_weights_vec = np.ones(n_nodes) + else: + raise ValueError('Unknown distribution of node weights.') + else: + raise TypeError( + 'Node weights must be a known distribution ("degree" or "uniform" string) or a custom NumPy array.') - cost = 0 + if np.any(node_weights_vec <= 0): + raise ValueError('All node weights must be positive.') + else: + node_weights_vec = node_weights_vec / np.sum(node_weights_vec) - u = n_nodes + aggregate_graph = AggregateGraph(adj_matrix, node_weights_vec) - cluster_size = {t: 1 for t in range(n_nodes)} + height = np.zeros(n_nodes - 1) + edge_sampling = np.zeros(n_nodes - 1) + node_sampling = np.zeros(n_nodes - 1) for t in range(n_nodes - 1): - a = int(dendrogram[t][0]) - b = int(dendrogram[t][1]) - - linkage = graph[a][b]['weight'] - cost += linkage * g(cluster_size[a], cluster_size[b]) - - graph.add_node(u) - neighbors_a = list(graph.neighbors(a)) - neighbors_b = list(graph.neighbors(b)) - for v in neighbors_a: - graph.add_edge(u, v, weight=graph[a][v]['weight']) - for v in neighbors_b: - if graph.has_edge(u, v): - graph[u][v]['weight'] += graph[b][v]['weight'] - else: - graph.add_edge(u, v, weight=graph[b][v]['weight']) - graph.remove_node(a) - graph.remove_node(b) - cluster_size[u] = cluster_size.pop(a) + cluster_size.pop(b) - u += 1 - - return cost + node1 = int(dendrogram[t][0]) + node2 = int(dendrogram[t][1]) + if node1 >= n_nodes and height[node1 - n_nodes] == dendrogram[t][2]: + edge_sampling[t] = edge_sampling[node1 - n_nodes] + edge_sampling[node1 - n_nodes] = 0 + node_sampling[t] = node_sampling[node1 - n_nodes] + elif node2 >= n_nodes and height[node2 - n_nodes] == dendrogram[t][2]: + edge_sampling[t] = edge_sampling[node2 - n_nodes] + edge_sampling[node2 - n_nodes] = 0 + node_sampling[t] = node_sampling[node2 - n_nodes] + edge_sampling[t] += 2 * aggregate_graph.graph[node1][node2] + node_sampling[t] += aggregate_graph.cluster_probs[node1] * aggregate_graph.cluster_probs[node2] + height[t] = dendrogram[t][2] + aggregate_graph.merge(node1, node2) + + index = np.where(edge_sampling)[0] + quality: float = np.sum(edge_sampling[index] * np.log(edge_sampling[index] / node_sampling[index])) + if normalized: + inv_node_weights = sparse.diags(1 / node_weights_vec, shape=(n_nodes, n_nodes), format='csr') + sampling_ratio = inv_node_weights.dot(adj_matrix.dot(inv_node_weights)) / adj_matrix.data.sum() + mutual_information = np.sum(adj_matrix.data / adj_matrix.data.sum() * np.log(2 * sampling_ratio.data)) + quality /= mutual_information + return quality diff --git a/sknetwork/hierarchy/paris.py b/sknetwork/hierarchy/paris.py index dc25e2457..9cee3d6df 100644 --- a/sknetwork/hierarchy/paris.py +++ b/sknetwork/hierarchy/paris.py @@ -1,151 +1,106 @@ +#!/usr/bin/env python3 # -*- coding: utf-8 -*- -# paris.py - agglomerative clustering algorithm -# -# Copyright 2018 Scikit-network Developers. -# Copyright 2018 Thomas Bonald -# Copyright 2018 Bertrand Charpentier -# -# This file is part of Scikit-network. -# -# NetworkX is distributed under a BSD license; see LICENSE.txt for more information. 
- -try: - # noinspection PyPackageRequirements - from numba import njit -except ImportError: - def njit(func): - return func +""" +Created on March 2019 +@author: Thomas Bonald +@author: Bertrand Charpentier +""" + import numpy as np from scipy import sparse +from typing import Union -class SimilarityGraph: +class AggregateGraph: """ - A class of similarity graph suitable for aggregation. + A class of graph suitable for aggregation. Each node represents a cluster. Attributes ---------- - current_node: int - current node index - neighbor_sim: np.ndarray of lists of tuples - list of (neighbor, similarity) for each node - active_nodes: set - set of active nodes - node_sizes: np.ndarray(dtype=int) - vector of node sizes - + graph : dict[dict] + Dictionary of dictionary of edge weights. + next_cluster : int + Index of the next cluster (resulting from aggregation). + cluster_sizes : dict + Dictionary of cluster sizes. + cluster_probs : dict + Dictionary of cluster probabilities. """ - def __init__(self, adj_matrix, node_weights='degree'): + def __init__(self, adjacency: sparse.csr_matrix, node_probs: np.ndarray): """ Parameters ---------- - adj_matrix: scipy.csr_matrix - adjacency matrix of the graph - node_weights: Union[str,np.ndarray(dtype=float)] - vector of node weights + adjacency : + Adjacency matrix of the graph. + node_probs : + Distribution of node weights. """ - n_nodes = adj_matrix.shape[0] - if type(node_weights) == np.ndarray: - if len(node_weights) != n_nodes: - raise ValueError('The number of node weights must match the number of nodes.') - if np.any(node_weights <= 0): - raise ValueError('All node weights must be positive.') - else: - node_weights_vec = node_weights - elif type(node_weights) == str: - if node_weights == 'degree': - node_weights_vec = adj_matrix.dot(np.ones(n_nodes)) - elif node_weights == 'uniform': - node_weights_vec = np.ones(n_nodes) / n_nodes - else: - raise ValueError('Unknown distribution type.') - else: + n_nodes = adjacency.shape[0] + total_weight = adjacency.data.sum() - raise TypeError( - 'Node weights must be a known distribution ("degree" or "uniform" string) or a custom NumPy array.') - - self.current_node = n_nodes - # arrays have size 2 * n_nodes - 1 for the future nodes (resulting from merges) - self.neighbor_sim = np.empty((2 * n_nodes - 1,), dtype=object) - inv_weights_diag = sparse.spdiags(1 / node_weights_vec, [0], n_nodes, n_nodes, format='csr') - sim_matrix = inv_weights_diag.dot(adj_matrix.dot(inv_weights_diag)) + self.next_cluster = n_nodes + self.graph = {} for node in range(n_nodes): - start = sim_matrix.indptr[node] - end = sim_matrix.indptr[node + 1] - self.neighbor_sim[node] = [(sim_matrix.indices[i], sim_matrix.data[i]) for i in range(start, end) if - sim_matrix.indices[i] != node] - self.active_nodes = set(range(n_nodes)) - self.node_sizes = np.zeros(2 * n_nodes - 1, dtype=int) - self.node_sizes[:n_nodes] = np.ones(n_nodes, dtype=int) - self.node_weights = np.zeros(2 * n_nodes - 1, dtype=float) - self.node_weights[:n_nodes] = node_weights_vec - - def merge(self, nodes): - """ - Merges two nodes. 
+ # normalize so that the total weight is equal to 1 + # remove self-loops + self.graph[node] = {adjacency.indices[i]: adjacency.data[i] / total_weight for i in + range(adjacency.indptr[node], adjacency.indptr[node + 1]) + if adjacency.indices[i] != node} + self.cluster_sizes = {node: 1 for node in range(n_nodes)} + self.cluster_probs = {node: node_probs[node] for node in range(n_nodes)} + + def merge(self, node1: int, node2: int) -> 'AggregateGraph': + """Merges two nodes. Parameters ---------- - nodes: tuple - The two nodes + node1, node2 : + The two nodes to merge. + Returns ------- - The aggregated graph + self: :class:`AggregateGraph` + The aggregated graph (without self-loop). """ - first_node = nodes[0] - second_node = nodes[1] - first_weight = self.node_weights[first_node] - second_weight = self.node_weights[second_node] - new_weight = first_weight + second_weight - first_list = self.neighbor_sim[first_node] - second_list = self.neighbor_sim[second_node] - new_list = [] - while first_list and second_list: - if first_list[-1][0] > second_list[-1][0]: - node, sim = first_list.pop() - if node != second_node: - new_sim = sim * first_weight / new_weight - new_list.append((node, new_sim)) - self.neighbor_sim[node].append((self.current_node, new_sim)) - elif first_list[-1][0] < second_list[-1][0]: - node, sim = second_list.pop() - if node != first_node: - new_sim = sim * second_weight / new_weight - new_list.append((node, new_sim)) - self.neighbor_sim[node].append((self.current_node, new_sim)) - else: - node, first_sim = first_list.pop() - _, second_sim = second_list.pop() - # weighted sum of similarities - new_sim = (first_sim * first_weight + second_sim * second_weight) / new_weight - new_list.append((node, new_sim)) - self.neighbor_sim[node].append((self.current_node, new_sim)) - while first_list: - node, sim = first_list.pop() - if node != second_node: - new_sim = sim * first_weight / new_weight - new_list.append((node, new_sim)) - self.neighbor_sim[node].append((self.current_node, new_sim)) - while second_list: - node, sim = second_list.pop() - if node != first_node: - new_sim = sim * second_weight / new_weight - new_list.append((node, new_sim)) - self.neighbor_sim[node].append((self.current_node, new_sim)) - self.neighbor_sim[self.current_node] = new_list - self.active_nodes -= set(nodes) - self.active_nodes.add(self.current_node) - self.node_sizes[self.current_node] = self.node_sizes[[nodes]].sum() - self.node_sizes[[nodes]] = (0, 0) - self.node_weights[self.current_node] = self.node_weights[[nodes]].sum() - self.current_node += 1 + new_node = self.next_cluster + self.graph[new_node] = {} + common_neighbors = set(self.graph[node1]) & set(self.graph[node2]) - {node1, node2} + for node in common_neighbors: + self.graph[new_node][node] = self.graph[node1][node] + self.graph[node2][node] + self.graph[node][new_node] = self.graph[node].pop(node1) + self.graph[node].pop(node2) + node1_neighbors = set(self.graph[node1]) - set(self.graph[node2]) - {node2} + for node in node1_neighbors: + self.graph[new_node][node] = self.graph[node1][node] + self.graph[node][new_node] = self.graph[node].pop(node1) + node2_neighbors = set(self.graph[node2]) - set(self.graph[node1]) - {node1} + for node in node2_neighbors: + self.graph[new_node][node] = self.graph[node2][node] + self.graph[node][new_node] = self.graph[node].pop(node2) + del self.graph[node1] + del self.graph[node2] + self.cluster_sizes[new_node] = self.cluster_sizes.pop(node1) + self.cluster_sizes.pop(node2) + self.cluster_probs[new_node] = 
self.cluster_probs.pop(node1) + self.cluster_probs.pop(node2) + self.next_cluster += 1 return self -def reorder_dendrogram(dendrogram: np.ndarray): +def reorder_dendrogram(dendrogram: np.ndarray) -> np.ndarray: + """ + Get the dendrogram in increasing order of height. + + Parameters + ---------- + dendrogram: + Original dendrogram. + + Returns + ------- + dendrogram: np.ndarray + Reordered dendrogram. + """ n_nodes = np.shape(dendrogram)[0] + 1 order = np.zeros((2, n_nodes - 1), float) order[0] = np.arange(n_nodes - 1) @@ -159,13 +114,39 @@ def reorder_dendrogram(dendrogram: np.ndarray): class Paris: - """ - Agglomerative algorithm. + """Agglomerative algorithm. Attributes ---------- - dendrogram_: numpy array of shape (n_nodes - 1) x 4 - dendrogram of the nodes + dendrogram_ : numpy array of shape (n_nodes - 1, 4) + Dendrogram. + + Examples + -------- + >>> import numpy as np + >>> from scipy import sparse + + >>> # House graph + >>> row = np.array([0, 0, 1, 1, 2, 3]) + >>> col = np.array([1, 4, 2, 4, 3, 4]) + >>> adjacency = sparse.csr_matrix((np.ones(len(row), dtype=int), (row, col)), shape=(5, 5)) + >>> adjacency = adjacency + adjacency.T + + >>> paris = Paris() + >>> paris.fit(adjacency).predict() + array([1, 1, 0, 0, 1]) + + Notes + ----- + Each row of the dendrogram = i, j, height, size of cluster i + j. + + The similarity between clusters i,j is :math:`\\dfrac{w_{ij}}{w_i w_j}` where + + * :math:`w_{ij}` is the weight of edge i,j, if any, and 0 otherwise, + + * :math:`w_{i}` is the weight of cluster i + + * :math:`w_{j}` is the weight of cluster j See Also -------- @@ -175,56 +156,81 @@ class Paris: ---------- T. Bonald, B. Charpentier, A. Galland, A. Hollocou (2018). Hierarchical Graph Clustering using Node Pair Sampling. - KDD Workshop, 2018. + Workshop on Mining and Learning with Graphs. + https://arxiv.org/abs/1806.01664 + """ def __init__(self): self.dendrogram_ = None - def fit(self, adj_matrix: sparse.csr_matrix, node_weights="degree", reorder=False): + def fit(self, adjacency: sparse.csr_matrix, node_weights: Union[str, np.ndarray] = 'degree', reorder: bool = True): """ - Agglomerative clustering using the nearest neighbor chain + Agglomerative clustering using the nearest neighbor chain. Parameters ---------- - adj_matrix: scipy.csr_matrix - adjacency matrix of the graph to cluster - node_weights: np.ndarray(dtype=float) - node weights to be used in the linkage - reorder: boolean - reorder the dendrogram in increasing order of heights + adjacency : + Adjacency matrix of the graph to cluster. + node_weights : + Node weights used in the linkage. + reorder : + If True, reorder the dendrogram in increasing order of heights. Returns ------- - self + self: :class:`Paris` """ - if type(adj_matrix) != sparse.csr_matrix: + if type(adjacency) != sparse.csr_matrix: raise TypeError('The adjacency matrix must be in a scipy compressed sparse row (csr) format.') - # check that the graph is not directed - if adj_matrix.shape[0] != adj_matrix.shape[1]: + if adjacency.shape[0] != adjacency.shape[1]: raise ValueError('The adjacency matrix must be square.') - if (adj_matrix != adj_matrix.T).nnz != 0: + if adjacency.shape[0] <= 1: + raise ValueError('The graph must contain at least two nodes.') + if (adjacency != adjacency.T).nnz != 0: raise ValueError('The graph cannot be directed. 
Please fit a symmetric adjacency matrix.') - sim_graph = SimilarityGraph(adj_matrix, node_weights) + n_nodes = adjacency.shape[0] + + if type(node_weights) == np.ndarray: + if len(node_weights) != n_nodes: + raise ValueError('The number of node weights must match the number of nodes.') + else: + node_probs = node_weights + elif type(node_weights) == str: + if node_weights == 'degree': + node_probs = adjacency.dot(np.ones(n_nodes)) + elif node_weights == 'uniform': + node_probs = np.ones(n_nodes) + else: + raise ValueError('Unknown distribution of node weights.') + else: + raise TypeError( + 'Node weights must be a known distribution ("degree" or "uniform" string) or a custom NumPy array.') + + if np.any(node_probs <= 0): + raise ValueError('All node weights must be positive.') + else: + node_probs = node_probs / np.sum(node_probs) + + aggregate_graph = AggregateGraph(adjacency, node_probs) connected_components = [] dendrogram = [] - while len(sim_graph.active_nodes) > 0: + while len(aggregate_graph.cluster_sizes) > 0: node = None - for node in sim_graph.active_nodes: + for node in aggregate_graph.cluster_sizes: break chain = [node] while chain: node = chain.pop() - # filter active nodes - sim_graph.neighbor_sim[node] = [element for element in sim_graph.neighbor_sim[node] if - (sim_graph.node_sizes[element[0]] > 0)] - if sim_graph.neighbor_sim[node]: + if aggregate_graph.graph[node]: max_sim = -float("inf") nearest_neighbor = None - for neighbor, sim in sim_graph.neighbor_sim[node]: + for neighbor in aggregate_graph.graph[node]: + sim = aggregate_graph.graph[node][neighbor] / aggregate_graph.cluster_probs[node] / \ + aggregate_graph.cluster_probs[neighbor] if sim > max_sim: nearest_neighbor = neighbor max_sim = sim @@ -234,8 +240,9 @@ def fit(self, adj_matrix: sparse.csr_matrix, node_weights="degree", reorder=Fals nearest_neighbor_last = chain.pop() if nearest_neighbor_last == nearest_neighbor: dendrogram.append([node, nearest_neighbor, 1. 
/ max_sim, - sim_graph.node_sizes[node] + sim_graph.node_sizes[nearest_neighbor]]) - sim_graph.merge((node, nearest_neighbor)) + aggregate_graph.cluster_sizes[node] + + aggregate_graph.cluster_sizes[nearest_neighbor]]) + aggregate_graph.merge(node, nearest_neighbor) else: chain.append(nearest_neighbor_last) chain.append(node) @@ -244,16 +251,15 @@ def fit(self, adj_matrix: sparse.csr_matrix, node_weights="degree", reorder=Fals chain.append(node) chain.append(nearest_neighbor) else: - connected_components.append((node, sim_graph.node_sizes[node])) - sim_graph.active_nodes -= {node} - sim_graph.node_sizes[node] = 0 + connected_components.append((node, aggregate_graph.cluster_sizes[node])) + del aggregate_graph.cluster_sizes[node] - node, node_size = connected_components.pop() - for next_node, next_node_size in connected_components: - node_size += next_node_size - dendrogram.append([node, next_node, float("inf"), node_size]) - node = sim_graph.current_node - sim_graph.current_node += 1 + node, cluster_size = connected_components.pop() + for next_node, next_cluster_size in connected_components: + cluster_size += next_cluster_size + dendrogram.append([node, next_node, float("inf"), cluster_size]) + node = aggregate_graph.next_cluster + aggregate_graph.next_cluster += 1 dendrogram = np.array(dendrogram) if reorder: @@ -262,3 +268,37 @@ def fit(self, adj_matrix: sparse.csr_matrix, node_weights="degree", reorder=Fals self.dendrogram_ = dendrogram return self + + def predict(self, n_clusters: int = 2, sorted_clusters: bool = False) -> np.ndarray: + """ + Extract the clustering with given number of clusters from the dendrogram. + + Parameters + ---------- + n_clusters : + Number of clusters. + sorted_clusters : + If True, sort labels in decreasing order of cluster size. + + Returns + ------- + labels : np.ndarray + Cluster index of each node. + """ + if self.dendrogram_ is None: + raise ValueError("First fit data using the fit method.") + n_nodes = np.shape(self.dendrogram_)[0] + 1 + if n_clusters < 1 or n_clusters > n_nodes: + raise ValueError("The number of clusters must be between 1 and the number of nodes.") + + labels = np.zeros(n_nodes, dtype=int) + clusters = {node: [node] for node in range(n_nodes)} + for t in range(n_nodes - n_clusters): + clusters[n_nodes + t] = clusters.pop(int(self.dendrogram_[t][0])) + \ + clusters.pop(int(self.dendrogram_[t][1])) + clusters = list(clusters.values()) + if sorted_clusters: + clusters = sorted(clusters, key=len, reverse=True) + for label, cluster in enumerate(clusters): + labels[cluster] = label + return labels diff --git a/sknetwork/hierarchy/tests/test_metrics.py b/sknetwork/hierarchy/tests/test_metrics.py index 7635f1b5f..9856a6099 100644 --- a/sknetwork/hierarchy/tests/test_metrics.py +++ b/sknetwork/hierarchy/tests/test_metrics.py @@ -1,27 +1,27 @@ +#!/usr/bin/env python3 # -*- coding: utf-8 -*- -# test for metrics.py -# -# Copyright 2018 Scikit-network Developers. -# Copyright 2018 Bertrand Charpentier -# -# This file is part of Scikit-network. -# -# NetworkX is distributed under a BSD license; see LICENSE.txt for more information. 
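For reference, a short sketch of the new Paris.predict method (illustration only, not taken from the patch; it uses the toy graphs introduced later in this diff):

from sknetwork.hierarchy import Paris
from sknetwork.toy_graphs import karate_club_graph

paris = Paris()
paris.fit(karate_club_graph())

# cut the dendrogram into two clusters; with sorted_clusters=True the largest cluster gets label 0
labels = paris.predict(n_clusters=2, sorted_clusters=True)
# labels has one entry per node (34 here), with values in {0, 1}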
+""" +Created on March 2019 +@author: Thomas Bonald +""" import unittest -from sknetwork.hierarchy.paris import Paris -#from sknetwork.hierarchy.metrics import hierarchical_cost +from sknetwork.hierarchy import Paris, tree_sampling_divergence, dasgupta_cost +from sknetwork.toy_graphs import karate_club_graph class TestMetrics(unittest.TestCase): def setUp(self): self.paris = Paris() + self.karate_club_graph = karate_club_graph() - #def test_unknown_options(self): - #with self.assertRaises(ValueError): - #hierarchical_cost(self.weighted_graph, self.d_w, affinity='unknown') + def test_karate_club_graph(self): + adjacency = self.karate_club_graph + dendrogram = self.paris.fit(adjacency).dendrogram_ + tsd = tree_sampling_divergence(adjacency, dendrogram, normalized=True) + self.assertAlmostEqual(tsd, .65, 2) + dc = dasgupta_cost(adjacency, dendrogram, normalized=True) + self.assertAlmostEqual(dc, .33, 2) - #with self.assertRaises(ValueError): - #hierarchical_cost(self.weighted_graph, self.dendrogram, linkage='unknown') diff --git a/sknetwork/hierarchy/tests/test_paris.py b/sknetwork/hierarchy/tests/test_paris.py index 0613a0480..02c16547b 100644 --- a/sknetwork/hierarchy/tests/test_paris.py +++ b/sknetwork/hierarchy/tests/test_paris.py @@ -1,20 +1,23 @@ +#!/usr/bin/env python3 # -*- coding: utf-8 -*- -# test for louvain.py -# -# Copyright 2018 Scikit-network Developers. -# Copyright 2018 Thomas Bonald -# -# This file is part of Scikit-network. +""" +Created on March 2019 +@author: Thomas Bonald +""" import unittest -from sknetwork.hierarchy.paris import Paris +import numpy as np from scipy.sparse import identity +from sknetwork.hierarchy import Paris +from sknetwork.toy_graphs import house_graph, karate_club_graph class TestParis(unittest.TestCase): def setUp(self): self.paris = Paris() + self.house_graph = house_graph() + self.karate_club_graph = karate_club_graph() def test_unknown_types(self): with self.assertRaises(TypeError): @@ -27,5 +30,14 @@ def test_unknown_options(self): with self.assertRaises(ValueError): self.paris.fit(identity(2, format='csr'), node_weights='unknown') - def test_single_node_graph(self): - self.assertEqual(self.paris.fit(identity(1, format='csr')).dendrogram_.shape[0], 0) + def test_house_graph(self): + self.paris.fit(self.house_graph) + self.assertEqual(self.paris.dendrogram_.shape[0], 4) + labels = self.paris.predict(sorted_clusters=True) + self.assertTrue(np.array_equal(labels, np.array([0, 0, 1, 1, 0]))) + + def test_karate_club_graph(self): + self.paris.fit(self.karate_club_graph) + self.assertEqual(self.paris.dendrogram_.shape[0], 33) + labels = self.paris.predict() + self.assertEqual(np.max(labels), 1) diff --git a/sknetwork/loader/__init__.py b/sknetwork/loader/__init__.py index e69de29bb..6f74000bc 100644 --- a/sknetwork/loader/__init__.py +++ b/sknetwork/loader/__init__.py @@ -0,0 +1 @@ +from sknetwork.loader.parser import * diff --git a/sknetwork/loader/parser.py b/sknetwork/loader/parser.py index d0569f3fb..d80dfa734 100644 --- a/sknetwork/loader/parser.py +++ b/sknetwork/loader/parser.py @@ -2,11 +2,12 @@ # -*- coding: utf-8 -*- """ Created on Dec 5, 2018 -@author: Quentin Lutz +@author: Quentin Lutz , Nathan de Lara """ -from scipy import sparse import numpy as np +from scipy import sparse +from typing import Union class Parser: @@ -14,23 +15,34 @@ class Parser: A collection of parsers to import datasets into a SciPy format """ + @staticmethod - def parse_tsv(path, directed=False, weighted=False, labeled=False, offset=1): + def parse_tsv(path, 
directed: bool = False, bipartite: bool = False, weighted: bool = False, labeled: bool = False, + offset: int = 1) -> Union[sparse.csr_matrix, tuple]: """ A parser for Tabulation-Separated Values (TSV) datasets. Parameters ---------- - path : str, the path to the dataset in TSV format - directed : bool, ensures the adjacency matrix is symmetric - weighted : bool, retrieves the weights in the third field of the file, raises an error if no such field exists - labeled : bool, retrieves the names given to the nodes and renumbers them. Returns an additional array - offset : int, renumbers the nodes (useful if the nodes are indexed from a given value other than 0) + path : str + the path to the dataset in TSV format + directed : bool + ensures the adjacency matrix is symmetric + bipartite : bool + if True, returns the biadjacency matrix of shape (n1, n2) + weighted : bool + retrieves the weights in the third field of the file, raises an error if no such field exists + labeled : bool + retrieves the names given to the nodes and renumbers them. Returns an additional array + offset : int + renumbers the nodes (useful if the nodes are indexed from a given value other than 0) Returns ------- - adj_matrix : csr_matrix, the adjacency_matrix of the graph - labels : numpy.array, optional, an array such that labels[k] is the label given to the k-th node + adj_matrix : csr_matrix + the adjacency_matrix of the graph + labels : numpy.array + optional, an array such that labels[k] is the label given to the k-th node """ parsed_file = np.loadtxt(path, dtype=str, unpack=True, comments='%') @@ -43,6 +55,7 @@ def parse_tsv(path, directed=False, weighted=False, labeled=False, offset=1): row = new_nodes[:n_edges] col = new_nodes[n_edges:] else: + labels = None row = parsed_file[0].astype(int) - offset * np.ones(n_edges, dtype=int) col = parsed_file[1].astype(int) - offset * np.ones(n_edges, dtype=int) if weighted: @@ -51,12 +64,21 @@ def parse_tsv(path, directed=False, weighted=False, labeled=False, offset=1): else: data = parsed_file[2].astype(float) else: - data = np.ones(n_edges, dtype=int) - adj_matrix = sparse.csr_matrix((data, (row, col))) - n_nodes = max(adj_matrix.shape) - adj_matrix.resize((n_nodes, n_nodes)) - if not directed: - adj_matrix += adj_matrix.transpose() + data = np.ones(n_edges, dtype=bool) + + n_nodes = max(max(row), max(col)) + 1 + if n_nodes < 2 * 10e9: + dtype = np.int32 + else: + dtype = np.int64 + + if bipartite: + adj_matrix = sparse.csr_matrix((data, (row, col)), dtype=dtype) + else: + adj_matrix = sparse.csr_matrix((data, (row, col)), shape=(n_nodes, n_nodes), dtype=dtype) + if not directed: + adj_matrix += adj_matrix.transpose() if labeled: return adj_matrix, labels - return adj_matrix + else: + return adj_matrix diff --git a/sknetwork/toy_graphs/__init__.py b/sknetwork/toy_graphs/__init__.py index e69de29bb..866fd1552 100644 --- a/sknetwork/toy_graphs/__init__.py +++ b/sknetwork/toy_graphs/__init__.py @@ -0,0 +1 @@ +from sknetwork.toy_graphs.graph_data import * diff --git a/sknetwork/toy_graphs/graph_data.py b/sknetwork/toy_graphs/graph_data.py index bd2c52845..4e71833bf 100644 --- a/sknetwork/toy_graphs/graph_data.py +++ b/sknetwork/toy_graphs/graph_data.py @@ -3,41 +3,82 @@ """ Created on Nov 29, 2018 @author: Quentin Lutz +@author: Nathan de Lara +@author: Thomas Bonald """ import numpy as np from scipy import sparse -class GraphConstants: +def house_graph(): """ - This class is meant to make a few toy graphs easily available to the user. 
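For reference, a usage sketch of parse_tsv above (illustration only, not taken from the patch; the file names are hypothetical):

from sknetwork.loader import Parser

# plain undirected edge list: one "source<TAB>target" pair per line
adjacency = Parser.parse_tsv('edges.tsv', directed=False)

# weighted edge list with node names in the first two fields and weights in the third
adjacency, labels = Parser.parse_tsv('weighted_edges.tsv', weighted=True, labeled=True)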
+ House graph + + 5 nodes, 6 edges + + Returns + ------- + adjacency: sparse.csr_matrix + Adjacency matrix of the graph. + """ + row = np.array([0, 0, 1, 1, 2, 3]) + col = np.array([1, 4, 2, 4, 3, 4]) + adjacency = sparse.csr_matrix((np.ones(len(row), dtype=int), (row, col)), shape=(5, 5)) + return adjacency + adjacency.T + + +def karate_club_graph(): + """ + Zachary's Karate Club Graph + + Data file from: http://vlado.fmf.uni-lj.si/pub/networks/data/Ucinet/UciData.htm + + 34 nodes, 78 edges + + Returns + ------- + adjacency: sparse.csr_matrix + Adjacency matrix of the graph. + """ + row = np.array( + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, + 3, 4, 4, 5, 5, 5, 6, 8, 8, 8, 9, 13, 14, 14, 15, 15, 18, + 18, 19, 20, 20, 22, 22, 23, 23, 23, 23, 23, 24, 24, 24, 25, 26, 26, + 27, 28, 28, 29, 29, 30, 30, 31, 31, 32]) + col = np.array( + [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 2, + 3, 7, 13, 17, 19, 21, 30, 3, 7, 8, 9, 13, 27, 28, 32, 7, 12, + 13, 6, 10, 6, 10, 16, 16, 30, 32, 33, 33, 33, 32, 33, 32, 33, 32, + 33, 33, 32, 33, 32, 33, 25, 27, 29, 32, 33, 25, 27, 31, 31, 29, 33, + 33, 31, 33, 32, 33, 32, 33, 32, 33, 33]) + adjacency = sparse.csr_matrix((np.ones(len(row), dtype=int), (row, col)), shape=(34, 34)) + return adjacency + adjacency.T + + +def star_wars_villains_graph(return_labels=False): + """ + Bipartite graph connecting some Star Wars villains to the movies in which they appear.\n + 7 nodes (4 villains, 3 movies), 8 edges + + Parameters + ---------- + return_labels: bool + whether to return the labels of the nodes as dictionaries. + + Returns + ------- + biadjacency: sparse.csr_matrix + Biadjacency matrix of the graph. """ + row = np.array([0, 0, 1, 2, 2, 2, 3, 3]) + col = np.array([0, 2, 0, 0, 1, 2, 1, 2]) + biadjacency = sparse.csr_matrix((np.ones(len(row), dtype=int), (row, col))) - @staticmethod - def karate_club_graph(): - """ - Zachary's Karate Club Graph - Data file from: http://vlado.fmf.uni-lj.si/pub/networks/data/Ucinet/UciData.htm - 34 nodes, 78 edges - - Returns - ------- - The adjacency matrix of Zachary's Karate Club Graph in SciPy CSR format - """ - row = np.array( - [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, 21, 30, 0, 1, 3, 7, 8, 9, - 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, 6, 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, - 33, 0, 4, 5, 0, 0, 3, 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, 33, - 25, 27, 29, 32, 33, 25, 27, 31, 23, 24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, 32, 33, 1, 8, 32, - 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, 33, 8, 9, 13, 14, 15, 18, 19, 20, - 22, 23, 26, 27, 28, 29, 30, 31, 32]) - col = np.array( - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 10, 10, 10, 11, 12, - 12, 13, 13, 13, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 19, 20, 20, 21, 21, 22, 22, 23, - 23, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 27, 27, 27, 27, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, - 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, - 33, 33, 33, 33, 33, 33, 33, 33, 33, 33]) - karate_club = sparse.csr_matrix((np.ones(len(row), dtype=int), (row, col))) - return karate_club + if return_labels: + row_labels = {0: 'Jabba', 1: 'Greedo', 2: 'Vador', 3: 'Boba'} + 
col_labels = {0: 'A_New_Hope', 1: 'The_Empire_Strikes_Back', 2: 'Return_Of_The_Jedi'} + return biadjacency, row_labels, col_labels + else: + return biadjacency diff --git a/sknetwork/toy_graphs/test/test_graph_imports.py b/sknetwork/toy_graphs/test/test_graph_imports.py index 311d0c95b..8f8b0125a 100644 --- a/sknetwork/toy_graphs/test/test_graph_imports.py +++ b/sknetwork/toy_graphs/test/test_graph_imports.py @@ -1,14 +1,9 @@ # -*- coding: utf-8 -*- # test for graph_data.py -# -# Copyright 2018 Scikit-network Developers. -# Copyright 2018 Quentin Lutz -# -# This file is part of Scikit-network. +# authors: Quentin Lutz , Nathan de Lara import unittest -from sknetwork.toy_graphs.graph_data import GraphConstants - +from sknetwork.toy_graphs.graph_data import karate_club_graph, star_wars_villains_graph class TestGraphImport(unittest.TestCase): @@ -16,5 +11,8 @@ def setUp(self): pass def test_available(self): - graph = GraphConstants.karate_club_graph() + graph = karate_club_graph() self.assertEqual(graph.shape[0], 34) + + graph = star_wars_villains_graph() + self.assertEqual(graph.shape, (4, 3))
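Finally, the new toy-graph helpers can be used as follows (illustration only, not taken from the patch):

from sknetwork.toy_graphs import house_graph, karate_club_graph, star_wars_villains_graph

house = house_graph()          # 5 x 5 adjacency matrix, 6 edges
karate = karate_club_graph()   # 34 x 34 adjacency matrix, 78 edges

# bipartite graph: 4 villains x 3 movies, with optional label dictionaries
biadjacency, villains, movies = star_wars_villains_graph(return_labels=True)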