diff --git a/.gitignore b/.gitignore
index e6e88f5..91b1e48 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,12 +2,15 @@
.idea/
.~*#
*_doc/
-
+*.zip
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
+# local metadata files (not tracked)
+FGFP_metadata.tsv
+FGFP_metadata_ready.tsv
# C extensions
*.so
diff --git a/setup.py b/setup.py
index 025bb85..aa66165 100644
--- a/setup.py
+++ b/setup.py
@@ -16,11 +16,12 @@
'tqdm',
'scikit-learn>=0.19.1',
'matplotlib>=2.2.2',
- 'networkx==1.11',
+ 'networkx>=2.1',
'pandas>=0.23.0',
'numpy>=1.10.4',
'scipy',
-
+ 'matplotlib!=3.0.0rc2',
+# 'scikit-bio>=0.5.2'
],
zip_safe=False,
extras_require={'alldeps': ('numpy>=1.10.4', 'scipy',)}
diff --git a/tmap/netx/SAFE.py b/tmap/netx/SAFE.py
index 496704f..71bfbbc 100644
--- a/tmap/netx/SAFE.py
+++ b/tmap/netx/SAFE.py
@@ -1,9 +1,12 @@
import networkx as nx
-import pandas as pd
import numpy as np
+import pandas as pd
from statsmodels.sandbox.stats.multicomp import multipletests
from tqdm import tqdm
+from tmap.tda.utils import construct_node_data
+
+
def nodes_pairwise_dist(graph):
"""
get all pairwise node distance, including self-distance of 0
@@ -13,111 +16,174 @@ def nodes_pairwise_dist(graph):
G = nx.Graph()
G.add_nodes_from(graph['nodes'].keys())
G.add_edges_from(graph['edges'])
+
all_pairs_dist = nx.all_pairs_shortest_path_length(G)
+ if not isinstance(all_pairs_dist, dict):
+ all_pairs_dist = dict(all_pairs_dist)
return all_pairs_dist
-def construct_weighted_node_data(graph,data):
- nodes = graph['nodes']
- node_data = {k: data.iloc[v, :].sum() for k, v in nodes.items()}
- node_data = pd.DataFrame.from_dict(node_data,orient='index')
- return node_data
-
-def nodes_neighborhood(all_pairs_dist,graph, threshold=0.5):
+def nodes_neighborhood(graph, all_pairs_dist, nr_threshold=0.5):
"""
generate neighborhoods from the graph for all nodes
:param graph:
- :param threshold: Float in range of [0,100]. The threshold is used to cut path distance with percentiles
+    :param nr_threshold: Float in the range [0, 100]. Percentile used to cut the path-distance distribution ("nr" stands for neighbourhood).
:return: return a dict with keys of nodes, values is a list of tuple (another node id, its sizes).
"""
- # all_pairs_dist = nodes_pairwise_dist(graph)
node_sizes = dict(zip(graph['node_keys'], graph['node_sizes']))
# generate all pairwise shortest path length (duplicated!!! but is OK for percentile statistics)
all_length = [_ for it in all_pairs_dist.values() for _ in it.values()]
# remove self-distance (that is 0)
all_length = [_ for _ in all_length if _ > 0]
- length_threshold = np.percentile(all_length, threshold)
+ length_threshold = np.percentile(all_length, nr_threshold)
# print('Maximum path length threshold is set to be %s' % (length_threshold,))
neighborhoods = {}
for node_id in graph['nodes']:
pairs = all_pairs_dist[node_id]
- # node neighborhood include also the center node
- # neighbors = [n for n, l in pairs.items() if (l <= length_threshold) and (n != node_id)]
- neighbors = [n for n, l in pairs.items() if l <= length_threshold]
- # neighbors = dict([(neighbor_id, node_sizes[neighbor_id]) for neighbor_id in neighbors])
+        # a node's neighborhood also includes the node itself.
+ neighbors = [n for n, dis in pairs.items() if dis <= length_threshold]
+ # neighbors.remove(node_id)
+ neighbors = dict([(neighbor_id, node_sizes[neighbor_id][0]) for neighbor_id in neighbors])
neighborhoods[node_id] = neighbors
return neighborhoods
-def nodes_neighborhood_score(neighborhoods, node_data):
+def nodes_neighborhood_score(neighborhoods, node_data, cal_mode="df", _mode='sum'):
"""
calculate neighborhood scores for each node from node associated data
:param neighborhoods: result from nodes_neighborhood
:param node_data: node associated values
+    :param cal_mode: hidden parameter ("df" or "dict"). For data with many features (>=100), the pandas-based calculation is faster than the dict-based one.
+    :param _mode: aggregation mode, one of ["sum", "mean", "weighted_sum", "weighted_mean"].
:return: return a dict with keys of center nodes, value is a float
"""
+ map_fun = {'sum': np.sum,
+ 'weighted_sum': np.sum,
+ 'weighted_mean': np.mean,
+ "mean": np.mean}
+ if _mode not in ["sum", "mean", "weighted_sum", "weighted_mean"]:
+        raise SyntaxError('_mode must be one of [sum, mean, weighted_sum, weighted_mean].')
+ else:
+ aggregated_fun = map_fun[_mode]
+ if 'weighted_' in _mode:
+            if isinstance(node_data, dict):
+                node_data = {k: v * neighborhoods[k][k] for k, v in node_data.items()}
+            else:
+                weight = [neighborhoods[n][n] for n in node_data.index]
+                node_data = node_data.multiply(weight, axis='index')
+
# weighted neighborhood scores by node size
- # neighborhood_scores = {k: np.sum([node_data[n_k]*n_v for n_k,n_v in neighbors.items()])
- # for k, neighbors in neighborhoods.items()}
- neighborhood_scores = {k: np.sum([node_data[n_k] for n_k in neighbors])
- for k, neighbors in neighborhoods.items()}
+ if cal_mode == "dict":
+ neighborhood_scores = {k: aggregated_fun([node_data[n_k] for n_k in neighbors.keys()])
+ for k, neighbors in neighborhoods.items()}
+
+ else:
+ neighborhood_scores = {k: aggregated_fun(node_data.values[list(neighbors.keys())], 0)
+ for k, neighbors in neighborhoods.items()}
+ neighborhood_scores = pd.DataFrame.from_dict(neighborhood_scores, orient="index", columns=node_data.columns)
+ # neighborhood_scores = neighborhood_scores.reindex(node_data.index)
return neighborhood_scores
-def SAFE(graph, node_data, n_iter=1000, threshold=0.5,all_dist = None,neighborhoods=None):
+def convertor(compared_count, node_data, n_iter, cal_mode="df"):
+ """
+    Convert the counts obtained by comparing observed neighborhood scores against shuffled scores into SAFE scores
+    (multiple-testing corrected with the Benjamini-Hochberg FDR procedure).
+ :param compared_count:
+ :param node_data:
+ :param n_iter:
+ :return:
+ """
+ min_p_value = 1.0 / (n_iter + 1.0)
+ if cal_mode == "df":
+ neighborhood_count_df = pd.DataFrame(compared_count, columns=node_data.columns, index=node_data.index)
+ elif cal_mode == 'dict':
+ neighborhood_count_df = pd.DataFrame.from_dict(compared_count, orient="index")
+ # index is node id, only one column.
+ else:
+        raise SyntaxError("cal_mode must be one of ['df', 'dict'].")
+
+ p_value_df = neighborhood_count_df.div(n_iter)
+ p_value_df = p_value_df.where(p_value_df >= min_p_value, min_p_value)
+
+ # todo: allow user to specify a multi-test correction method?
+ p_values_fdr_bh = p_value_df.apply(lambda col: multipletests(col, method='fdr_bh')[1], axis=0)
+ safe_scores = p_values_fdr_bh.apply(lambda col: np.log10(col) / np.log10(min_p_value), axis=0)
+ safe_scores = safe_scores.to_dict(orient="dict")
+
+ return safe_scores
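+
+# Illustrative sketch (not part of the API): how `convertor` maps permutation counts to
+# SAFE scores. With n_iter = 1000, min_p = 1/1001; a node whose observed neighborhood
+# score was matched or beaten only 2 times gets p = 2/1000, hence (before FDR correction)
+# SAFE = log10(0.002) / log10(1/1001) ~= 0.90.
+#
+# n_iter = 1000
+# min_p = 1.0 / (n_iter + 1.0)
+# p = max(2.0 / n_iter, min_p)
+# safe = np.log10(p) / np.log10(min_p)  # ~0.90; values close to 1 indicate strong enrichment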
+
+
+def _SAFE(graph, node_data, n_iter=1000, nr_threshold=0.5, all_dist=None, neighborhoods=None, _cal_type="dict", _mode='enrich', agg_mode='sum', verbose=1):
"""
perform SAFE analysis by node permutations
:param graph:
- :param node_data: node associated values (a dictionary)
+ :param node_data: node associated values
:param n_iter: number of permutations
- :param threshold: Float in range of [0,100]. The threshold is used to cut path distance with percentiles
+    :param nr_threshold: Float in the range [0, 100]. Percentile used to cut the path-distance distribution when defining neighbourhoods.
+    :param _cal_type: hidden parameter ("auto", "df" or "dict"). For data with many features (>=100), the pandas-based calculation is faster than the dict-based one.
:return: return dict with keys of nodes ID, values are normalized and multi-test corrected p values.
"""
- if all_dist is None:
- all_pairs_dist = nodes_pairwise_dist(graph)
- else:
- all_pairs_dist = all_dist
+ if _mode not in ['enrich', 'decline', 'both']:
+        raise SyntaxError('_mode must be one of [enrich, decline, both].')
+
+ all_pairs_dist = nodes_pairwise_dist(graph) if all_dist is None else all_dist
+ neighborhoods = nodes_neighborhood(graph, all_pairs_dist, nr_threshold=nr_threshold) if neighborhoods is None else neighborhoods
- if neighborhoods is None:
- neighborhoods = nodes_neighborhood(all_pairs_dist,graph, threshold=threshold)
+ neighborhood_scores = nodes_neighborhood_score(neighborhoods, node_data=node_data, cal_mode=_cal_type, _mode=agg_mode)
+
+ if verbose == 0:
+ iter_obj = range(n_iter)
else:
- neighborhoods = neighborhoods
-
- neighborhood_scores = nodes_neighborhood_score(neighborhoods, node_data=node_data)
-
- # enrichment (p-value) as a rank in the permutation scores (>=, ordered)
- neighborhood_enrichments = {k: 0 for k in neighborhood_scores.keys()}
- for _ in range(n_iter):
- # permute the node attributes, with the network structure kept
- p_data = dict(zip(node_data.keys(), np.random.permutation(list(node_data.values()))))
- p_neighborhood_scores = nodes_neighborhood_score(neighborhoods, p_data)
- for k in neighborhood_enrichments.keys():
- if p_neighborhood_scores[k] >= neighborhood_scores[k]:
- neighborhood_enrichments[k] += 1
-
- # with no multiple test correction
- # min_p_value = 1.0/(n_iter+1.0)
- # # bound the p-value to (0,1]
- # get_p_value = lambda k: max(float(neighborhood_enrichments[k])/n_iter, min_p_value)
- # safe_scores = {k: np.log10(get_p_value(k))/np.log10(min_p_value)
- # for k in neighborhood_enrichments.keys()}
-
- # perform multiple test correction using the 'fdr_bh' method
- min_p_value = 1.0/(n_iter+1.0)
- nodes_keys = neighborhood_enrichments.keys()
- get_p_value = lambda k: max(float(neighborhood_enrichments[k])/n_iter, min_p_value)
- p_values = [get_p_value(k) for k in nodes_keys]
- # todo: allow user to specify a multi-test correction method?
- p_values_fdr_bh = multipletests(p_values, method='fdr_bh')[1]
- p_values_fdr_bh = dict(zip(nodes_keys, p_values_fdr_bh))
- safe_scores = {k: np.log10(p_values_fdr_bh[k])/np.log10(min_p_value) for k in nodes_keys}
+ iter_obj = tqdm(range(n_iter))
- return safe_scores
+ cal_mode = cal_type_define(node_data, _cal_type)
+ if cal_mode == "df":
+ # enrichment (p-value) as a rank in the permutation scores (>=, ordered)
+ neighborhood_enrichments = np.zeros(node_data.shape)
+ neighborhood_decline = np.zeros(node_data.shape)
+ p_data = node_data.copy() # deep copy is important
+ for _ in iter_obj:
+ # permute the node attributes, with the network structure kept
+            # reassign the shuffled columns; p_data keeps accumulating permutations across iterations
+ p_data = p_data.apply(lambda col: np.random.permutation(col), axis=0)
+ p_neighborhood_scores = nodes_neighborhood_score(neighborhoods, p_data, cal_mode=cal_mode)
-def SAFE_batch(graph, meta_data, n_iter=1000, threshold=0.5):
+ neighborhood_enrichments[p_neighborhood_scores >= neighborhood_scores] += 1
+ neighborhood_decline[p_neighborhood_scores <= neighborhood_scores] += 1
+
+ safe_scores_enrich = convertor(neighborhood_enrichments, node_data, n_iter, cal_mode=cal_mode)
+ safe_scores_decline = convertor(neighborhood_decline, node_data, n_iter, cal_mode=cal_mode)
+
+ else:
+ # enrichment (p-value) as a rank in the permutation scores (>=, ordered)
+ neighborhood_enrichments = {k: 0 for k in neighborhood_scores.keys()}
+ neighborhood_decline = {k: 0 for k in neighborhood_scores.keys()}
+ node_data = node_data.to_dict()
+ for _ in iter_obj:
+ # permute the node attributes, with the network structure kept
+ p_data = dict(zip(node_data.keys(), np.random.permutation(list(node_data.values()))))
+ p_neighborhood_scores = nodes_neighborhood_score(neighborhoods, p_data, cal_mode=cal_mode)
+ for k in neighborhood_enrichments.keys():
+ if p_neighborhood_scores[k] >= neighborhood_scores[k]:
+ neighborhood_enrichments[k] += 1
+ elif p_neighborhood_scores[k] <= neighborhood_scores[k]:
+ neighborhood_decline[k] += 1
+
+ safe_scores_enrich = convertor(neighborhood_enrichments, node_data, n_iter, cal_mode=cal_mode)[0]
+ safe_scores_decline = convertor(neighborhood_decline, node_data, n_iter, cal_mode=cal_mode)[0]
+
+ if _mode == 'both':
+ return safe_scores_enrich, safe_scores_decline
+ elif _mode == 'enrich':
+ return safe_scores_enrich
+ elif _mode == 'decline':
+ return safe_scores_decline
+
+
+def SAFE_batch(graph, meta_data, n_iter=1000, nr_threshold=0.5, shuffle_obj="node", _cal_type="auto", _mode='enrich', agg_mode='sum', verbose=1):
"""
Map sample meta-data to node associated values (using means),
and perform SAFE batch analysis for multiple features
@@ -127,48 +193,96 @@ def SAFE_batch(graph, meta_data, n_iter=1000, threshold=0.5):
:param dict graph:
:param np.ndarry/pd.DataFrame meta_data:
:param int n_iter: Permutation times. For some features with skewness values, it should be higher in order to stabilize the resulting SAFE score.
- :param float threshold: Float in range of [0,100]. The threshold is used to cut path distance with percentiles
+    :param float nr_threshold: Float in the range [0, 100]. Percentile used to cut the path-distance distribution when defining neighbourhoods.
:return: return dict ``{feature: {node_ID:p-values(fdr)} }`` .
"""
all_pairs_dist = nodes_pairwise_dist(graph)
- neighborhoods = nodes_neighborhood(all_pairs_dist, graph, threshold=threshold)
- nodes = graph['nodes']
- all_safe_scores = {}
- if "columns" not in dir(meta_data):
- meta_data = pd.DataFrame(meta_data)
- for feature in tqdm(meta_data.columns):
- node_data = {k: meta_data.iloc[v, meta_data.columns.get_loc(feature)].mean() for k, v in nodes.items()}
- safe_scores = SAFE(graph, node_data, n_iter=n_iter, threshold=threshold,all_dist=all_pairs_dist,neighborhoods=neighborhoods)
- all_safe_scores[feature] = safe_scores
-
- return all_safe_scores
-
-
-def SAFE_single(graph, sample_data, n_iter=1000, threshold=0.5):
- """
- map sample meta-data to node associated values (using means),
- and perform SAFE analysis for a single feature
- :param graph:
- :param sample_data:
- :param n_iter:
- :param threshold:
- :return:
- """
- nodes = graph['nodes']
- node_data = {k: np.mean([sample_data[idx] for idx in v]) for k, v in nodes.items()}
- safe_scores = SAFE(graph, node_data, n_iter=n_iter, threshold=threshold)
- return safe_scores
+ neighborhoods = nodes_neighborhood(graph, all_pairs_dist, nr_threshold=nr_threshold)
+ if meta_data.shape[0] != len(graph["nodes"]):
+ node_data = construct_node_data(graph, meta_data)
+ else:
+ node_data = meta_data
-def get_enriched_nodes(safe_scores, threshold):
+ if verbose == 0:
+ iter_obj = meta_data.columns
+ else:
+ iter_obj = tqdm(meta_data.columns)
+
+ cal_mode = cal_type_define(meta_data, _cal_type)
+ if cal_mode == "df":
+ all_safe_scores = _SAFE(graph, node_data,
+ n_iter=n_iter,
+ nr_threshold=nr_threshold,
+ all_dist=all_pairs_dist,
+ neighborhoods=neighborhoods,
+ _cal_type="df",
+ _mode=_mode,
+ agg_mode=agg_mode,
+ verbose=verbose)
+ return all_safe_scores
+ else:
+ _safe = {}
+ for feature in iter_obj:
+ safe_scores = _SAFE(graph,
+                                node_data[feature],
+ n_iter=n_iter,
+ nr_threshold=nr_threshold,
+ all_dist=all_pairs_dist,
+ neighborhoods=neighborhoods,
+ _cal_type="dict",
+ _mode=_mode,
+ agg_mode=agg_mode,
+ verbose=verbose)
+ _safe[feature] = safe_scores
+ if _mode == 'both':
+ enriched_safe, decline_safe = {}, {}
+ for fea in _safe:
+ enriched_safe[fea] = _safe[fea][0]
+ decline_safe[fea] = _safe[fea][1]
+ return enriched_safe, decline_safe
+ else:
+ return _safe
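+
+# Example usage (sketch; `graph` is assumed to come from Mapper.map and `metadata` to be a
+# samples-by-features DataFrame aligned with the data used to build the graph):
+#
+# safe_scores = SAFE_batch(graph, metadata, n_iter=1000, nr_threshold=0.5, _mode='enrich')
+# # on the per-feature ('dict') path, safe_scores is {feature: {node_id: SAFE score}}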
+
+
+# def SAFE_single(graph, sample_data, n_iter=1000, nr_threshold=0.5):
+# """
+# map sample meta-data to node associated values (using means),
+# and perform SAFE analysis for a single feature
+# :param graph:
+# :param sample_data:
+# :param n_iter:
+# :param threshold:
+# :return:
+# """
+# nodes = graph['nodes']
+# node_data = {k: np.mean([sample_data[idx] for idx in v]) for k, v in nodes.items()}
+# safe_scores = _SAFE(graph, node_data, n_iter=n_iter, nr_threshold=nr_threshold, _cal_type="dict")
+# return safe_scores
+
+
+def get_enriched_nodes(graph, safe_scores, SAFE_pvalue, nr_threshold=0.5, centroids=False):
"""
get significantly enriched nodes (>= threshold)
:param safe_scores:
:param threshold:
:return:
"""
- node_ids = safe_scores.columns
- return {feature: list(node_ids[safe_scores.loc[feature,:] >= threshold]) for feature in safe_scores.index}
+ all_pairs_dist = nodes_pairwise_dist(graph)
+ neighborhoods = nodes_neighborhood(graph, all_pairs_dist, nr_threshold=nr_threshold)
+ if 'columns' in dir(safe_scores):
+ node_ids = safe_scores.columns
+ else:
+ safe_scores = pd.DataFrame.from_dict(safe_scores, orient='index')
+ node_ids = safe_scores.columns
+
+ enriched_centroides = {feature: list(node_ids[safe_scores.loc[feature, :] >= SAFE_pvalue]) for feature in
+ safe_scores.index}
+    enriched_nodes = {f: list(set([nb for n in nodes for nb in neighborhoods[n]])) for f, nodes in enriched_centroides.items()}
+ if centroids:
+ return enriched_centroides, enriched_nodes
+ else:
+ return enriched_nodes
def get_enriched_samples(enriched_nodes, nodes):
@@ -184,7 +298,7 @@ def get_enriched_samples(enriched_nodes, nodes):
for feature, node_ids in enriched_nodes.items()}
-def get_SAFE_summary(graph, meta_data, safe_scores, n_iter_value, p_value=0.01,_output_details=False):
+def get_SAFE_summary(graph, meta_data, safe_scores, n_iter_value, nr_threshold=0.5, p_value=0.01, _output_details=False):
"""
summary the SAFE scores for feature enrichment results
:param graph:
@@ -196,40 +310,51 @@ def get_SAFE_summary(graph, meta_data, safe_scores, n_iter_value, p_value=0.01,_
"""
# todo: refactor into a SAFE summary class?
- min_p_value = 1.0 / (n_iter_value+1.0)
- threshold = np.log10(p_value) / np.log10(min_p_value)
- if isinstance(safe_scores,dict):
+ min_p_value = 1.0 / (n_iter_value + 1.0)
+ SAFE_pvalue = np.log10(p_value) / np.log10(min_p_value)
+ if isinstance(safe_scores, dict):
safe_scores = pd.DataFrame.from_dict(safe_scores, orient='index')
else:
- safe_scores = safe_scores.T
+        if set(safe_scores.index) != set(meta_data.columns):
+ safe_scores = safe_scores.T
feature_names = safe_scores.index
- safe_total_score = safe_scores.apply(lambda x: np.sum(x), axis=1)
+ safe_total_score = safe_scores.sum(1)
- safe_enriched_nodes = get_enriched_nodes(safe_scores=safe_scores, threshold=threshold)
+ safe_enriched_centroides, safe_enriched_nodes = get_enriched_nodes(safe_scores=safe_scores, SAFE_pvalue=SAFE_pvalue, nr_threshold=nr_threshold, graph=graph, centroids=True)
safe_enriched_nodes_n = {feature: len(node_ids) for feature, node_ids in safe_enriched_nodes.items()}
- safe_enriched_samples = get_enriched_samples(enriched_nodes=safe_enriched_nodes, nodes=graph['nodes'])
- safe_enriched_samples_n = {feature: len(sample_ids) for feature, sample_ids in safe_enriched_samples.items()}
- safe_enriched_score = {feature: np.sum(safe_scores.loc[feature, safe_enriched_nodes[feature]])
+ if meta_data.shape[0] != len(graph["nodes"]):
+        # input meta_data is sample-level, i.e. (samples, features), so enriched nodes can be mapped back to samples.
+ safe_enriched_samples = get_enriched_samples(enriched_nodes=safe_enriched_nodes, nodes=graph['nodes'])
+ safe_enriched_samples_n = {feature: len(sample_ids) for feature, sample_ids in safe_enriched_samples.items()}
+ else:
+ safe_enriched_samples_n = {feature: "Unknown" for feature, sample_ids in safe_enriched_nodes.items()}
+
+ safe_enriched_score = {feature: np.sum(safe_scores.loc[feature, safe_enriched_centroides[feature]])
for feature in feature_names}
if _output_details:
- safe_summary = {'enriched_nodes':safe_enriched_nodes,
- 'enriched_score':safe_enriched_score,}
+ safe_summary = {'enriched_nodes': safe_enriched_nodes,
+ 'enriched_score': safe_enriched_score, }
return safe_summary
# calculate enriched ratios ('enriched abundance' / 'total abundance')
feature_abundance = meta_data.sum(axis=0)
- enriched_abundance_ratio = \
- {feature: np.sum(meta_data.iloc[safe_enriched_samples[feature], meta_data.columns.get_loc(feature)])/feature_abundance[feature]
- for feature in feature_names}
+
+ if meta_data.shape[0] != len(graph["nodes"]):
+ enriched_abundance_ratio = {feature: np.sum(meta_data.iloc[safe_enriched_samples[feature], meta_data.columns.get_loc(feature)]) / feature_abundance[feature]
+ for feature in feature_names}
+ else:
+ enriched_abundance_ratio = \
+ {feature: np.sum(meta_data.iloc[safe_enriched_nodes[feature], meta_data.columns.get_loc(feature)]) / feature_abundance[feature]
+ for feature in feature_names}
# helper for safe division for integer and divide_by zero
def _safe_div(x, y):
if y == 0.0:
return np.nan
else:
- return x*1.0/y
+ return x * 1.0 / y
enriched_safe_ratio = {feature: _safe_div(safe_enriched_score[feature], safe_total_score[feature])
for feature in feature_names}
@@ -244,3 +369,12 @@ def _safe_div(x, y):
safe_summary.index.name = 'name'
return safe_summary
+
+def cal_type_define(node_data, _cal_type):
+ # todo: ensure all node_data must have .shape.
+ if (_cal_type == "auto" and node_data.shape[1] >= 10) or _cal_type == "df":
+ return "df"
+ elif (_cal_type == "auto" and node_data.shape[1] < 10) or _cal_type == "dict":
+ return "dict"
+ else:
+        raise SyntaxError("_cal_type must be one of ['auto', 'df', 'dict'].")
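+
+
+# Note (sketch): in `get_SAFE_summary` the SAFE score cut-off is derived from the raw
+# p-value as SAFE_pvalue = log10(p_value) / log10(min_p_value). For example, with
+# n_iter_value = 1000 and p_value = 0.01:
+#
+# min_p_value = 1.0 / (1000 + 1.0)
+# cutoff = np.log10(0.01) / np.log10(min_p_value)  # ~0.67; nodes scoring above it count as enriched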
diff --git a/tmap/netx/coenrich.py b/tmap/netx/coenrich.py
deleted file mode 100644
index f429a16..0000000
--- a/tmap/netx/coenrich.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import itertools
-from scipy.stats import pearsonr
-from statsmodels.stats.multitest import fdrcorrection
-from tqdm import tqdm
-
-def coenrich(graph, safe_scores):
- """
- Giving graph and safe_scores calculated by ``SAFE_batch``
-
- Currently, we using itertools.combinations for iterating all possibles combinations of each features and perform pearson test with corresponding SAFE scores. Then, it will perform multiple test correction with fdr_correction.
-
- Finally, it will output a dict with different keys.
-
- * associated_pairs: tuple of associated features.
- * association_coeffient: coeffient produced by pearson test
- * association_p_values: p-values produced by pearson test
- * association_p_values(fdr): p-values after multiple test correction
-
- SAFE score is a metric trying to capture the variation of corresponding feature and so it will result much 0 values for among the network.
-
- For pearson test, two array of values with lots of zeros will result some pseudo association. It should be careful it there are negative association with two features.
- """
- nodes = graph['nodes']
- overall_coenrich = {}
- n_feas = len(safe_scores.keys())
- # iterative all fea1,fea2 without repeat.
- for fea1, fea2 in tqdm(itertools.combinations(safe_scores.keys(), 2), total=(n_feas ** 2 - n_feas) / 2):
- if sum(list(safe_scores[fea1].values())) != 0 or sum(list(safe_scores[fea2].values())) != 0:
- _fea1_vals = [safe_scores[fea1][_] for _ in nodes]
- _fea2_vals = [safe_scores[fea2][_] for _ in nodes]
-
- fea1_vals = [_fea1_vals[idx] for idx in range(len(nodes))]
- fea2_vals = [_fea2_vals[idx] for idx in range(len(nodes))]
- p_test = pearsonr(fea1_vals,
- fea2_vals)
- overall_coenrich[(fea1, fea2)] = p_test
-
- graph_coenrich = {}
- graph_coenrich['associated_pairs'] = [k for k, v in overall_coenrich.items()]
- graph_coenrich['association_coeffient'] = dict([(k, v[0]) for k, v in overall_coenrich.items()])
- graph_coenrich['association_p_values'] = dict([(k, v[1]) for k, v in overall_coenrich.items()])
- graph_coenrich['association_p_values(fdr)'] = dict([(k, v) for k, v in zip(overall_coenrich.keys(),
- fdrcorrection([_[1] for _ in overall_coenrich.values()])[1])])
- return graph_coenrich
diff --git a/tmap/netx/coenrichment_analysis.py b/tmap/netx/coenrichment_analysis.py
new file mode 100644
index 0000000..e30a779
--- /dev/null
+++ b/tmap/netx/coenrichment_analysis.py
@@ -0,0 +1,331 @@
+import networkx as nx
+import numpy as np
+import pandas as pd
+import scipy.stats as scs
+from tqdm import tqdm
+
+from tmap.netx.SAFE import get_enriched_nodes
+
+
+def get_component_nodes(graph, enriched_nodes):
+ """
+    Given a list of enriched nodes from ``get_enriched_nodes`` (normally the enriched centroids rather than the surrounding nodes), split them into connected components.
+    :param list enriched_nodes: list of node IDs enriched for the given feature and threshold.
+    :param graph:
+    :return: A list of node sets, one per connected component (the components are not connected to each other).
+ """
+ sub_nodes = list(enriched_nodes)
+ sub_edges = [edge for edge in graph['edges'] if edge[0] in sub_nodes and edge[1] in sub_nodes]
+
+ G = nx.Graph()
+ G.add_nodes_from(sub_nodes)
+ G.add_edges_from(sub_edges)
+
+ comp_nodes = [nodes for nodes in nx.algorithms.components.connected_components(G)]
+ return comp_nodes
+
+
+def is_enriched(s1, s2, s3, s4):
+ """
+    Helper used after the Fisher exact test to check whether the association is an enrichment (the enriched ratio is larger) rather than a depletion.
+ """
+ total_1 = len(s1) + len(s2)
+ total_2 = len(s3) + len(s4)
+ if total_1 == 0 or total_2 == 0:
+ return False
+ if len(s1) / total_1 > len(s3) / total_2:
+ return True
+ else:
+ return False
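+
+# Toy illustration (sketch, made-up numbers): the kind of 2x2 table built below and how it
+# is combined with `is_enriched` to call a co-enrichment.
+#
+# s1, s2, s3, s4 = set(range(12)), set(range(3)), set(range(4)), set(range(40))
+# oddsratio, pvalue = scs.fisher_exact([[len(s1), len(s2)], [len(s3), len(s4)]])
+# co_enriched = pvalue <= 0.05 and is_enriched(s1, s2, s3, s4)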
+
+
+def coenrichment_for_nodes(graph, nodes, fea, enriched_centroid, SAFE_pvalue=None, safe_scores=None, _filter=True, mode='both'):
+ """
+ Coenrichment main function
+    Given a feature and its enriched nodes, a contingency table is constructed against every other feature and its enriched nodes.
+    To test the association between features, a Fisher exact test is applied to each contingency table. The Fisher exact test only considers the association between the two classifications, not the direction of the ratio, so the accessory function ``is_enriched`` further checks that the enriched ratio is larger than the non-enriched one.
+
+    Besides the global co-enrichment, several local enrichments may be observed. Using the ``networkx`` connected-components algorithm, each local enrichment can be extracted from the global enrichment.
+
+    Because of the many possible comparisons between local enrichments, two different contingency tables are constructed.
+
+    The contingency table comparing the enriched nodes of a single component against the enriched nodes of the other components, split by the enriched/non-enriched nodes of the other feature, is shown below.
+
+ ======================== ================================= =================================
+ fea fea this comp enriched nodes fea other comp enriched nodes
+ ======================== ================================= =================================
+ o_f_enriched_nodes s1 s2
+ o_f_non-enriched_nodes s3 s4
+ ======================== ================================= =================================
+
+    The other contingency table compares the enriched nodes within a specific component against the non-enriched nodes, again split by the enriched/non-enriched nodes of the other feature, as shown below.
+
+ ======================== ================================= =================================
+ fea fea this comp enriched nodes fea non-enriched nodes
+ ======================== ================================= =================================
+ o_f_enriched_nodes s1 s2
+ o_f_non-enriched_nodes s3 s4
+ ======================== ================================= =================================
+
+    For convenience, one of three modes [both|global|local] can be chosen.
+
+    The output contains two kinds of dictionaries. One records the correlation information between features; the other (metainfo) records the raw contingency table of each comparison as the node sets s1, s2, s3, s4.
+
+    If mode is 'both', it returns the global correlative dict, the local correlative dict and metainfo.
+    If mode is 'global', it returns the global correlative dict and metainfo.
+    If mode is 'local', it returns the local correlative dict and metainfo.
+
+    For the global correlative dict, keys are the compared features and values are (oddsratio, pvalue) tuples from the Fisher exact test.
+    For the local correlative dict, keys are tuples of (component index, component size, feature) and values are as in the global correlative dict.
+    metainfo is a dictionary sharing the same keys as the global/local correlative dicts but holding the contingency table information.
+
+ :param graph: tmap constructed graph
+    :param list nodes: a list of nodes to process for the given feature
+    :param str fea: feature name; it does not need to exist in enriched_centroid
+    :param dict enriched_centroid: enriched centroids output by ``get_enriched_nodes``
+    :param float SAFE_pvalue: None or a SAFE score threshold. If None, the nodes are not filtered.
+    :param dict safe_scores: A dictionary of SAFE scores used to filter the nodes. To filter, both safe_scores and SAFE_pvalue must be given.
+    :param str mode: one of [both|global|local]
+ :return:
+ """
+
+ if mode not in ['both', 'global', 'local']:
+        print("Invalid mode: it should be one of [both|global|local]")
+ return
+
+ total_nodes = set(list(graph["nodes"].keys()))
+ comp_nodes = get_component_nodes(graph, nodes)
+ metainfo = {}
+ global_correlative_feas = {}
+ sub_correlative_feas = {}
+ if SAFE_pvalue is not None and safe_scores is not None:
+ fea_enriched_nodes = set([_ for _ in nodes if safe_scores.get(_, 0) >= SAFE_pvalue])
+ else:
+ fea_enriched_nodes = set(nodes[::])
+ fea_nonenriched_nodes = total_nodes.difference(fea_enriched_nodes)
+ metainfo[fea] = fea_enriched_nodes, comp_nodes
+ for o_f in enriched_centroid.keys():
+ if o_f != fea:
+ o_f_enriched_nodes = set(enriched_centroid[o_f])
+ o_f_nonenriched_nodes = total_nodes.difference(o_f_enriched_nodes)
+            if mode in ('both', 'global'):
+ # contingency table
+ # fea enriched nodes, fea non-enriched_nodes
+ # o_f_enriched_nodes s1 s2
+ # o_f_non-enriched_nodes s3 s4
+ s1 = o_f_enriched_nodes.intersection(fea_enriched_nodes)
+ s2 = o_f_enriched_nodes.intersection(fea_nonenriched_nodes)
+ s3 = o_f_nonenriched_nodes.intersection(fea_enriched_nodes)
+ s4 = o_f_nonenriched_nodes.intersection(fea_nonenriched_nodes)
+ oddsratio, pvalue = scs.fisher_exact([[len(s1), len(s2)],
+ [len(s3), len(s4)]])
+ if _filter:
+ if pvalue <= 0.05 and is_enriched(s1, s2, s3, s4):
+ global_correlative_feas[o_f] = (oddsratio, pvalue)
+ metainfo[o_f] = (s1, s2, s3, s4)
+ else:
+ global_correlative_feas[o_f] = (oddsratio, pvalue)
+ metainfo[o_f] = (s1, s2, s3, s4)
+
+            if mode in ('both', 'local'):
+ for idx, nodes in enumerate(comp_nodes):
+ # contingency table
+ # fea this comp enriched nodes, fea other comp enriched nodes
+ # o_f_enriched_nodes s1 s2
+ # o_f_non-enriched_nodes s3 s4
+ nodes = set(nodes)
+ _s1 = o_f_enriched_nodes.intersection(nodes)
+ _s2 = o_f_enriched_nodes.intersection(fea_enriched_nodes.difference(nodes))
+ _s3 = o_f_nonenriched_nodes.intersection(nodes)
+ _s4 = o_f_nonenriched_nodes.intersection(fea_enriched_nodes.difference(nodes))
+ oddsratio, pvalue1 = scs.fisher_exact([[len(_s1), len(_s2)],
+ [len(_s3), len(_s4)]])
+ # contingency table
+ # fea this comp enriched nodes, fea non-enriched nodes
+ # o_f_enriched_nodes s1 s2
+ # o_f_non-enriched_nodes s3 s4
+ s1 = o_f_enriched_nodes.intersection(nodes)
+ s2 = o_f_enriched_nodes.intersection(fea_nonenriched_nodes)
+ s3 = o_f_nonenriched_nodes.intersection(nodes)
+ s4 = o_f_nonenriched_nodes.intersection(fea_nonenriched_nodes)
+ oddsratio, pvalue2 = scs.fisher_exact([[len(s1), len(s2)],
+ [len(s3), len(s4)]])
+ if _filter:
+ if pvalue1 <= 0.05 and pvalue2 <= 0.05 and is_enriched(s1, s2, s3, s4) and is_enriched(_s1, _s2, _s3, _s4):
+ sub_correlative_feas[(idx, len(nodes), o_f)] = (pvalue1, pvalue2)
+ metainfo[(idx, len(nodes), o_f)] = (_s1, _s2, s2)
+ else:
+ sub_correlative_feas[(idx, len(nodes), o_f)] = (pvalue1, pvalue2)
+ metainfo[(idx, len(nodes), o_f)] = (_s1, _s2, s2)
+
+ if mode == 'both':
+ return global_correlative_feas, sub_correlative_feas, metainfo
+ elif mode == 'global':
+ return global_correlative_feas, metainfo
+    elif mode == 'local':
+ return sub_correlative_feas, metainfo
+ else:
+ return
+
+
+def batch_coenrichment(fea, graph, safe_scores, n_iter=5000, p_value=0.05, _pre_cal_enriched=None):
+ """
+    Batch co-enrichment of the given feature(s) against all features found in safe_scores.
+    If _pre_cal_enriched is given, n_iter and p_value are ignored; otherwise set n_iter and p_value to the values you passed to the ``SAFE`` algorithm.
+
+    :param str/list fea: A single feature or a list of features to which the SAFE algorithm has already been applied.
+    :param graph:
+    :param dict safe_scores: SAFE scores output by ``SAFE_batch``; must contain all features occurring in fea.
+    :param int n_iter: Number of permutations used in ``SAFE_batch``.
+    :param float p_value: The p-value used to determine the enriched nodes.
+    :param dict _pre_cal_enriched: A pre-calculated enriched_centroid containing all necessary features; providing it saves time.
+ :return:
+ """
+ global_correlative_feas = {}
+ sub_correlative_feas = {}
+ metainfo = {}
+ print('building network...')
+
+ if '__iter__' in dir(fea):
+ fea_batch = list(fea)[::]
+ elif type(fea) == str:
+ fea_batch = [fea]
+ else:
+ raise SyntaxError
+
+ for fea in fea_batch:
+ if fea in safe_scores.keys():
+ if not _pre_cal_enriched:
+ min_p_value = 1.0 / (n_iter + 1.0)
+ threshold = np.log10(p_value) / np.log10(min_p_value)
+                enriched_centroid, enriched_nodes = get_enriched_nodes(graph=graph, safe_scores=pd.DataFrame.from_dict(safe_scores, orient='index'), SAFE_pvalue=threshold, centroids=True)
+ else:
+ enriched_centroid = _pre_cal_enriched
+ _global, _local, _meta = coenrichment_for_nodes(graph, enriched_centroid[fea], fea, enriched_centroid, mode='both')
+ global_correlative_feas.update(_global)
+ sub_correlative_feas.update(_local)
+ metainfo.update(_meta)
+ return global_correlative_feas, sub_correlative_feas, metainfo
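+
+# Example usage (sketch; 'Bacteroides' is a made-up feature name, `graph` comes from
+# Mapper.map and `safe_scores` from SAFE_batch run with the same n_iter/p_value as here):
+#
+# glob, local, meta = batch_coenrichment('Bacteroides', graph, safe_scores, n_iter=5000, p_value=0.05)
+# # glob: {other_feature: (oddsratio, pvalue)} for globally co-enriched features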
+
+
+def construct_correlative_metadata(fea, global_correlative_feas, sub_correlative_feas, metainfo, node_data, verbose=1):
+ """
+ Down-stream transformation of ``batch_coenrichment``.
+ :param fea:
+ :param global_correlative_feas:
+ :param sub_correlative_feas:
+ :param metainfo:
+ :param node_data:
+ :return:
+ """
+ if verbose:
+ print('Transforming given correlative relationship into human-readable DataFrame......')
+ # processing global correlative feas
+ global_headers = ['other feature',
+ 'fisher-exact test pvalue',
+ 'ranksum in co-enriched nodes',
+ 'ranksum in others nodes',
+ 'coverage/%', ]
+
+ sub_headers = ['n_comps',
+ 'comps_size',
+ 'other feature',
+ 'Fisher test pvalue(co-enriched,enriched)',
+ 'Fisher test pvalue(co-enriched,others)',
+ 'coenriched-enriched pvalue',
+ 'coenriched-others pvalue',
+ 'enriched-others pvalue',
+ 'coverage/%',
+ ]
+ global_corr_df = pd.DataFrame(columns=global_headers)
+ for o_f in list(global_correlative_feas.keys()):
+ _1, f_p = global_correlative_feas[o_f]
+ s1, s2, s3, s4 = metainfo[o_f]
+
+ if o_f in node_data.columns:
+ _data = node_data
+ else:
+ print('error feature %s' % o_f)
+ return
+ y1 = _data.loc[s1, o_f]
+ y2 = _data.loc[set.union(s2, s3, s4), o_f]
+
+ if fea in node_data.columns:
+ _data = node_data
+ else:
+            print('error feature %s' % fea)
+ return
+ _y1 = _data.loc[s1, fea]
+ _y2 = _data.loc[set.union(s2, s3, s4), fea]
+ ranksum_p1 = scs.ranksums(y1, y2)[1]
+ ranksum_p2 = scs.ranksums(_y1, _y2)[1]
+ if not len(s1) + len(s3):
+ coverage = np.nan
+ else:
+ coverage = ((len(s1) + len(s2)) / (len(s1) + len(s3))) * 100
+
+ global_corr_df = global_corr_df.append(pd.DataFrame([[o_f, f_p, ranksum_p1, ranksum_p2, coverage]], columns=global_headers))
+
+ # processing subgraph correlative feas
+ sub_corr_df = pd.DataFrame(columns=sub_headers)
+ for n_c, size, o_f in sub_correlative_feas.keys():
+ if o_f in node_data.columns:
+ _data = node_data
+ else:
+ print('error feature %s' % o_f)
+ return
+ coenriched_nodes, enriched_nodes_o_f_enriched, nonenriched_nodes_o_f_enriched = metainfo[(n_c, size, o_f)]
+ f_p1, f_p2 = sub_correlative_feas[(n_c, size, o_f)]
+ y1 = _data.loc[coenriched_nodes, o_f]
+ y2 = _data.loc[enriched_nodes_o_f_enriched, o_f]
+ y3 = _data.loc[nonenriched_nodes_o_f_enriched, o_f]
+ p1 = scs.ranksums(y1, y2)[1]
+ p2 = scs.ranksums(y1, y3)[1]
+ p3 = scs.ranksums(y2, y3)[1]
+ coverage = len(coenriched_nodes) / len(set.union(coenriched_nodes, enriched_nodes_o_f_enriched, nonenriched_nodes_o_f_enriched)) * 100
+ sub_corr_df = sub_corr_df.append(pd.DataFrame([['comps%s' % n_c, size, o_f, f_p1, f_p2, p1, p2, p3, coverage]], columns=sub_headers))
+
+ return global_corr_df, sub_corr_df
+
+
+def pairwise_coenrichment(graph, safe_scores, n_iter=5000, p_value=0.05, _pre_cal_enriched=None, verbose=1):
+ """
+    Pairwise calculation of co-enrichment for every feature found in safe_scores.
+    If _pre_cal_enriched is given, n_iter and p_value are ignored;
+    otherwise set n_iter and p_value to the values you passed to the ``SAFE`` algorithm.
+
+ :param graph:
+    :param dict safe_scores: SAFE scores output by ``SAFE_batch``; must contain all features to be compared.
+    :param int n_iter: Number of permutations used in ``SAFE_batch``.
+    :param float p_value: The p-value used to determine the enriched nodes.
+    :param dict _pre_cal_enriched: A pre-calculated enriched_centroid containing all necessary features; providing it saves time.
+ :param verbose:
+ :return:
+ """
+ dist_matrix = pd.DataFrame(data=np.nan, index=safe_scores.keys(), columns=safe_scores.keys())
+ if verbose:
+ print('building network...')
+ iter_obj = tqdm(safe_scores.keys())
+ else:
+ iter_obj = safe_scores.keys()
+
+ if not _pre_cal_enriched:
+ min_p_value = 1.0 / (n_iter + 1.0)
+ SAFE_pvalue = np.log10(p_value) / np.log10(min_p_value)
+        enriched_centroid, enriched_nodes = get_enriched_nodes(graph=graph, safe_scores=pd.DataFrame.from_dict(safe_scores, orient='index'), SAFE_pvalue=SAFE_pvalue, centroids=True)
+ else:
+ enriched_centroid = _pre_cal_enriched
+
+ for fea in iter_obj:
+ _global, _meta = coenrichment_for_nodes(graph, enriched_centroid[fea], fea, enriched_centroid, mode='global', _filter=False)
+ # _filter to fetch raw fisher-exact test result without any cut-off values.
+ for o_f in safe_scores.keys():
+ if fea != o_f:
+ s1, s2, s3, s4 = _meta[o_f]
+ oddsratio, pvalue = _global[o_f]
+ if is_enriched(s1, s2, s3, s4):
+ dist_matrix.loc[fea, o_f] = pvalue
+ else:
+ dist_matrix.loc[fea, o_f] = 1
+ dist_matrix.loc[fea, fea] = 0
+ return dist_matrix
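+
+
+# Example usage (sketch; `graph` and `safe_scores` are assumed to come from Mapper.map and
+# SAFE_batch respectively):
+#
+# dist = pairwise_coenrichment(graph, safe_scores, n_iter=5000, p_value=0.05)
+# stacked = dist.stack()
+# pairs = stacked[stacked < 0.05]  # feature pairs with significant co-enrichment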
diff --git a/tmap/netx/driver_detect_beta.py b/tmap/netx/driver_detect_beta.py
new file mode 100644
index 0000000..89efd72
--- /dev/null
+++ b/tmap/netx/driver_detect_beta.py
@@ -0,0 +1,254 @@
+####
+# test pipeline that includes SHAP and xgboost.
+# upcoming pipeline; not yet implemented.
+# lth 2018-12-10
+####
+
+import itertools
+import os
+import pickle
+import time
+
+import numpy as np
+import pandas as pd
+import scipy
+import shap
+import xgboost as xgb
+from scipy.spatial.distance import squareform, pdist
+from sklearn.cluster import DBSCAN
+from sklearn.metrics import r2_score, auc, roc_curve, average_precision_score
+from sklearn.preprocessing import MinMaxScaler
+from tqdm import tqdm
+
+from tmap.tda import mapper, filter
+from tmap.tda.cover import Cover
+from tmap.tda.metric import Metric
+from tmap.tda.utils import construct_node_data
+from tmap.tda.utils import optimize_dbscan_eps, cover_ratio, optimal_r
+
+global_verbose = 1
+
+
+def generate_graph(input_data, dis, _eu_dm=None, eps_threshold=95, overlap_params=0.75, min_samples=3, resolution_params="auto", filter_=filter.PCOA):
+ tm = mapper.Mapper(verbose=1)
+ # TDA Step2. Projection
+ t1 = time.time()
+ metric = Metric(metric="precomputed")
+ lens = [filter_(components=[0, 1], metric=metric, random_state=100)]
+ projected_X = tm.filter(dis, lens=lens)
+ if global_verbose:
+ print("projection takes: ", time.time() - t1)
+ ###
+ t1 = time.time()
+ eps = optimize_dbscan_eps(input_data, threshold=eps_threshold, dm=_eu_dm)
+ clusterer = DBSCAN(eps=eps, min_samples=min_samples)
+ if resolution_params == "auto":
+ r = optimal_r(input_data, projected_X, clusterer, 40, overlap_params)
+ else:
+ r = resolution_params
+ cover = Cover(projected_data=MinMaxScaler().fit_transform(projected_X), resolution=r, overlap=overlap_params)
+ graph = tm.map(data=input_data, cover=cover, clusterer=clusterer)
+ if global_verbose:
+ print('Graph covers %.2f percentage of samples.' % cover_ratio(graph, input_data))
+ print("graph time: ", time.time() - t1)
+
+ graph_name = "{eps}_{overlap}_{r}_{filter}.graph".format(eps=eps_threshold, overlap=overlap_params, r=r, filter=lens[0].__class__.__name__)
+ return graph, graph_name, projected_X
+
+
+def read_graph(path):
+ graph = pickle.load(open(path, 'rb'))
+ return graph
+
+
+def dump_graph(graph, path):
+ pickle.dump(graph, open(path, "wb"))
+
+
+def generate_XY(graph, input_data, center=False, weighted=True, beta=1):
+    # If the params passed to the graph were taken into account, a more robust and faithful X could be built.
+ t1 = time.time()
+
+ def DiffusionKernel(AdjMatrix, beta):
+        # 1. Compute the degree matrix: a diagonal matrix whose entries are the row sums of the adjacency matrix
+ DegreeMatrix = np.diag(np.sum(AdjMatrix, axis=1))
+ # 2. Computes negative Laplacian H = AdjMatrix - DegreeMatrix
+ H = np.subtract(AdjMatrix, DegreeMatrix)
+ # 3. Computes matrix exponential: exp(beta*H)
+ K = scipy.linalg.expm(beta * H)
+ return K
+
+ node_data = construct_node_data(graph, input_data)
+ if "_raw_nodes" in graph["params"]:
+ raw_nodes = graph['params']['_raw_nodes']
+ node_ids = list(graph["nodes"].keys())
+ adj_matrix = pd.DataFrame(data=np.nan, index=node_ids, columns=node_ids)
+ for k1, k2 in itertools.combinations(node_ids, 2):
+ if np.any(raw_nodes[k1] & raw_nodes[k2]):
+ adj_matrix.loc[k1, k2] = 1
+ adj_matrix.loc[k2, k1] = 1
+ mask_array = (adj_matrix == 1)
+ else:
+ rng = np.arange(node_data.shape[0])
+ mask_array = rng[:, None] < rng
+
+ # prepare X and y
+ if weighted:
+ y = DiffusionKernel(graph["adj_matrix"].fillna(0).values, beta)[mask_array]
+ else:
+ y = graph["adj_matrix"].fillna(0).values[mask_array]
+
+ edge_idx = graph["adj_matrix"].fillna(0).values[mask_array] == 1
+ edge_data = np.ndarray((len(y), input_data.shape[1]))
+ count_ = 0
+
+ if global_verbose:
+ _iter = tqdm(input_data.columns)
+ else:
+ _iter = input_data.columns
+
+ for feature in _iter:
+ one_di_data = np.abs(node_data.loc[:, feature].values - node_data.loc[:, feature].values.reshape(-1, 1))
+ edge_data[:, count_] = one_di_data[mask_array]
+ count_ += 1
+    features = node_data.columns[edge_data.sum(0) != 0]
+
+ edge_data = edge_data[:, edge_data.sum(0) != 0]
+ if center:
+ X = np.divide(edge_data, edge_data.std(axis=0))
+ else:
+ X = edge_data
+ if global_verbose: print("Preparing X and y: ", time.time() - t1)
+    return X, y, features, edge_idx
+
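+# Note (sketch): `generate_XY` frames edge prediction as a supervised problem. For every
+# candidate node pair, X holds |node_value_i - node_value_j| per feature, while y is either
+# the 0/1 adjacency or the diffusion-kernel weight K = expm(beta * (A - D)), where A is the
+# adjacency matrix and D its degree matrix.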
+
+############################################################
+def learn_rules(X, y, weighted=True, params=None):
+ t1 = time.time()
+ data = xgb.DMatrix(X, label=y)
+    default_params = {"seed": 123,
+                      "max_depth": 5,
+                      "silent": 1,
+                      "tree_method": "auto",
+                      }
+    num_boost_round = params.get("round", 100) if params is not None else 100
+    if params is None:
+        params = default_params
+    else:
+        default_params.update(params)
+        params = default_params
+ if weighted:
+ xgb_params = {"objective": "reg:logistic",
+ "booster": "gbtree",
+ "eval_metric": "rmse",
+
+ # "subsample":0.3,
+ # "scale_pos_weight":pos_weight,
+ }
+ xgb_params.update(params)
+ else:
+ pos_weight = len(y[y == 0]) / len(y[y == 1])
+ xgb_params = {"objective": "binary:logistic",
+ "booster": "gbtree",
+ "scale_pos_weight": pos_weight,
+ }
+ xgb_params.update(params)
+ bst = xgb.train(xgb_params, data,
+ evals=[(data, "self")],
+ num_boost_round=num_boost_round,
+ early_stopping_rounds=5,
+ verbose_eval=False)
+ if global_verbose: print("xgboost taking: ", time.time() - t1)
+ explainer = shap.TreeExplainer(bst)
+ shap_values = explainer.shap_values(X)
+ return shap_values, bst
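+
+# Example usage (sketch; X and y come from generate_XY above, and the `params` keys shown
+# here ("round", "max_depth") follow the defaults defined in this function):
+#
+# shap_values, bst = learn_rules(X, y, weighted=True, params={"round": 200, "max_depth": 4})
+# # shap_values: per-edge, per-feature SHAP contributions of the fitted booster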
+
+
+def record_log(bst, graph_name, X, y, path, type="genera"):
+ if os.path.isfile(path):
+ f1 = open(path, mode="a")
+
+ else:
+ f1 = open(path, mode="w")
+ f1.write("dtype\teps\toverlap\tr\tfilter\tr^2 score\n")
+
+ y_predict = bst.predict(xgb.DMatrix(X, label=y))
+ r2 = r2_score(y, y_predict)
+ eps, overlap, r, filter_name = graph_name.strip(".graph").split("_")
+ f1.write("\t".join([str(_) for _ in [type, eps, overlap, r, filter_name, r2]]) + '\n')
+
+
+def eval_perform(y_true, y_pred, type="all"):
+ """
+ data = []
+ fpr, tpr, th = precision_recall_curve(y_true, y_pred)
+ data.append(go.Scatter(x=fpr,y=tpr))
+ plotly.offline.plot(data)
+ :param y_true:
+ :param y_pred:
+ :param type:
+ :return:
+ """
+ if type == "pr":
+ result = average_precision_score(y_true, y_pred)
+ elif type == "auc":
+ fpr, tpr, th = roc_curve(y_true, y_pred)
+ result = auc(fpr, tpr, reorder=True)
+ # elif type == "f1":
+ # result = f1_score(y_true, y_pred)
+ elif type == "all":
+ result = {}
+ for i in ["pr", "auc", ]:
+ result[i] = eval_perform(y_true, y_pred, type=i)
+ else:
+ result = ''
+ return result
+
+
+def cal_contri(shap_values, features_name):
+ """
+    contr_s = cal_contri(shap_values, features_name)
+    :param shap_values:
+    :param features_name: feature names matching the columns of shap_values
+    :return:
+ """
+ pos_sum = np.apply_along_axis(lambda x: x[x > 0].sum(), 1, shap_values)
+ neg_sum = np.apply_along_axis(lambda x: x[x < 0].sum(), 1, shap_values)
+
+ t1 = np.apply_along_axis(lambda x: x / pos_sum, 0, shap_values)
+ t2 = np.apply_along_axis(lambda x: x / neg_sum, 0, shap_values)
+ result = np.where(t1 > 0, t1, t2) / 2.0
+
+ contr_s = pd.DataFrame(result, columns=features_name)
+ return contr_s.mean(0).to_dict()
+
+
+def full_pipelines(input_data,
+ weighted=True,
+ metric="braycurtis",
+ eps_threshold=95,
+ overlap_params=0.75,
+ resolution_params="auto",
+ filter_=filter.PCOA):
+ if type(metric) == str:
+ dis = squareform(pdist(input_data, metric))
+ elif "shape" in dir(metric):
+ dis = metric
+ else:
+ dis = squareform(pdist(input_data))
+
+ graph, graph_name, projected_X = generate_graph(input_data,
+ dis,
+ eps_threshold=eps_threshold,
+ overlap_params=overlap_params,
+ resolution_params=resolution_params,
+ filter_=filter_)
+ X, y, features, edge_idx = generate_XY(graph, input_data, center=True, weighted=weighted)
+ shap_values, bst = learn_rules(X, y, weighted=weighted)
+ return shap_values, bst, features
+
+
+if __name__ == '__main__':
+ path = "../test/test_data/FGFP_genus_data.csv"
+ genus_table = pd.read_csv(path, sep=',', index_col=0)
+ shap_values, bst, features = full_pipelines(genus_table, weighted=True, resolution_params=35)
+ shap_df = pd.DataFrame(shap_values, columns=features)
diff --git a/tmap/tda/basic.py b/tmap/tda/basic.py
new file mode 100644
index 0000000..54e2fc7
--- /dev/null
+++ b/tmap/tda/basic.py
@@ -0,0 +1,37 @@
+import time
+
+from sklearn.cluster import DBSCAN
+from sklearn.preprocessing import MinMaxScaler
+
+from tmap.tda import mapper, filter
+from tmap.tda.cover import Cover
+from tmap.tda.metric import Metric
+from tmap.tda.utils import optimize_dbscan_eps, cover_ratio, optimal_r
+
+global_verbose = 1
+
+def generate_graph(input_data, dis, _eu_dm=None, eps_threshold=95, overlap_params=0.75, min_samples=3,resolution_params="auto", filter_=filter.PCOA):
+ tm = mapper.Mapper(verbose=1)
+ # TDA Step2. Projection
+ t1 = time.time()
+ metric = Metric(metric="precomputed")
+ lens = [filter_(components=[0, 1], metric=metric, random_state=100)]
+ projected_X = tm.filter(dis, lens=lens)
+ if global_verbose:
+ print("projection takes: ", time.time() - t1)
+ ###
+ t1 = time.time()
+ eps = optimize_dbscan_eps(input_data, threshold=eps_threshold, dm=_eu_dm)
+ clusterer = DBSCAN(eps=eps, min_samples=min_samples)
+ if resolution_params == "auto":
+ r = optimal_r(input_data, projected_X, clusterer, 40, overlap_params)
+ else:
+ r = resolution_params
+ cover = Cover(projected_data=MinMaxScaler().fit_transform(projected_X), resolution=r, overlap=overlap_params)
+ graph = tm.map(data=input_data, cover=cover, clusterer=clusterer)
+ if global_verbose:
+ print('Graph covers %.2f percentage of samples.' % cover_ratio(graph, input_data))
+ print("graph time: ", time.time() - t1)
+
+ graph_name = "{eps}_{overlap}_{r}_{filter}.graph".format(eps=eps_threshold, overlap=overlap_params, r=r, filter=lens[0].__class__.__name__)
+ return graph, graph_name, projected_X
diff --git a/tmap/tda/cover.py b/tmap/tda/cover.py
index 72e1d1d..9c6388e 100644
--- a/tmap/tda/cover.py
+++ b/tmap/tda/cover.py
@@ -56,7 +56,6 @@ def _get_hypercubes(self,output_bounds=False):
hypercubes = np.zeros((n_bins, self.n_points), dtype=bool)
bounds_with_overlap = []
bounds_without_overlap = []
- # todo: improve the iterations?
for i, bin in enumerate(bins):
lower_bound = self.floor + bin * self.chunk_width
upper_bound = lower_bound + self.chunk_width + self.overlap_width
diff --git a/tmap/tda/filter.py b/tmap/tda/filter.py
index 87f9c72..53ee9df 100644
--- a/tmap/tda/filter.py
+++ b/tmap/tda/filter.py
@@ -2,6 +2,9 @@
import numpy as np
from sklearn import decomposition, manifold
from tmap.tda.metric import Metric
+import umap
+from scipy.spatial.distance import pdist,squareform
+from skbio.stats import ordination
_METRIC_BUILT_IN = ["braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean",
@@ -110,10 +113,10 @@ class PCA(Filters):
components: The axis of projection. If you use components 0 and 1, this is [0, 1].
"""
- def __init__(self, components=[0, 1],random_state=None):
+ def __init__(self, components=[0, 1],random_state=None, **kwds):
# PCA only accept raw data and calculate euclidean distance "internally"
super(PCA, self).__init__(components=components, metric=None)
- self.pca = decomposition.PCA(n_components=max(self.components) + 1,random_state=random_state)
+ self.pca = decomposition.PCA(n_components=max(self.components) + 1,random_state=random_state, **kwds)
def fit_transform(self, data):
"""
@@ -135,13 +138,13 @@ class TSNE(Filters):
components: The axis of projection. If you use components 0 and 1, this is [0, 1].
"""
- def __init__(self, components=[0, 1], metric=Metric(metric="euclidean")):
+ def __init__(self, components=[0, 1], metric=Metric(metric="euclidean"), **kwds):
super(TSNE, self).__init__(components=components, metric=metric)
if self.metric.name in _METRIC_BUILT_IN:
- self.tsne = manifold.TSNE(n_components=max(self.components) + 1, metric=self.metric.name)
+ self.tsne = manifold.TSNE(n_components=max(self.components) + 1, metric=self.metric.name, **kwds)
else:
- self.tsne = manifold.TSNE(n_components=max(self.components) + 1, metric="precomputed")
+ self.tsne = manifold.TSNE(n_components=max(self.components) + 1, metric="precomputed", **kwds)
def fit_transform(self, data):
"""
@@ -159,22 +162,72 @@ def fit_transform(self, data):
class MDS(Filters):
"""
MDS filters.
+    Implemented with sklearn.manifold.MDS
Params:
components: The axis of projection. If you use components 0 and 1, this is [0, 1].
"""
- def __init__(self, components=[0, 1], metric=Metric(metric="euclidean"),random_state=None):
+ def __init__(self, components=[0, 1], metric=Metric(metric="euclidean"), **kwds):
super(MDS, self).__init__(components=components, metric=metric)
+ if self.metric.name == "euclidean":
+ self.mds = manifold.MDS(n_components=max(self.components) + 1,
+ dissimilarity="euclidean", n_jobs=-1, **kwds)
+ else:
+ self.mds = manifold.MDS(n_components=max(self.components) + 1,
+ dissimilarity="precomputed", n_jobs=-1, **kwds)
+
+ def fit_transform(self, data):
+ data = self._check_data(data)
+ if self.metric.name != "euclidean" and self.metric.name != "precomputed":
+ data = squareform(pdist(data,metric=self.metric.name))
+ data = self.metric.fit_transform(data)
+ else:
+ data = self.metric.fit_transform(data)
+ projected_data = self.mds.fit_transform(data)
+ return projected_data[:, self.components]
+
+class PCOA(Filters):
+ """
+ PCoA filters.
+    Implemented with skbio.stats.ordination.pcoa
+ Params:
+ components: The axis of projection. If you use components 0 and 1, this is [0, 1].
+ """
+ def __init__(self, metric=Metric(metric="euclidean"), **kwds):
+ super(PCOA, self).__init__()
+ self.metric = metric
+
+ def fit_transform(self, data):
+ data = self._check_data(data)
+ if self.metric.name != "precomputed":
+ data = squareform(pdist(data,metric=self.metric.name))
+ data = self.metric.fit_transform(data)
+ else:
+ data = self.metric.fit_transform(data)
+
+ projected_data = ordination.pcoa(data)
+ self.pcoa = projected_data
+ return self.pcoa.samples.values[:, self.components]
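+
+# Example usage (sketch; `dm` is assumed to be a precomputed distance matrix, e.g. Bray-Curtis,
+# and the projected components default to [0, 1] from the Filters base class):
+#
+# lens = PCOA(metric=Metric(metric="precomputed"))
+# projected_X = lens.fit_transform(dm)  # first two principal coordinates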
+
+class UMAP(Filters):
+ """
+    UMAP filters.
+ Params:
+ components: The axis of projection. If you use components 0 and 1, this is [0, 1].
+ """
+ def __init__(self, components=[0, 1], metric=Metric(metric="euclidean"), **kwds):
+ super(UMAP, self).__init__(components=components, metric=metric)
+
if self.metric.name in _METRIC_BUILT_IN:
- self.mds = manifold.MDS(n_components=max(self.components) + 1,random_state=random_state,
- dissimilarity=self.metric.name, n_jobs=-1)
+ self.umap = umap.UMAP(n_components=max(self.components) + 1,
+ metric=self.metric.name, **kwds)
else:
- self.mds = manifold.MDS(n_components=max(self.components) + 1,random_state=random_state,
- dissimilarity="precomputed", n_jobs=-1)
+ self.umap = umap.UMAP(n_components=max(self.components) + 1,
+ metric="precomputed", **kwds)
def fit_transform(self, data):
data = self._check_data(data)
if self.metric.name not in _METRIC_BUILT_IN:
data = self.metric.fit_transform(data)
- projected_data = self.mds.fit_transform(data)
+ projected_data = self.umap.fit_transform(data)
return projected_data[:, self.components]
diff --git a/tmap/tda/mapper.py b/tmap/tda/mapper.py
index 6dfc98d..ce3bbbc 100644
--- a/tmap/tda/mapper.py
+++ b/tmap/tda/mapper.py
@@ -3,7 +3,7 @@
import pandas as pd
import itertools
from sklearn import cluster
-
+from tqdm import tqdm
class Mapper(object):
"""
@@ -78,7 +78,6 @@ def map(self, data, cover, clusterer=cluster.DBSCAN(eps=0.5, min_samples=1)):
The resulting graph is a dictionary containing multiple keys and corresponding values. For better understanding the meaning of all keys and values. Here is the descriptions of each key.
-
1. nodes: Another dictionary for storge the mapping relationships between *nodes* and *samples*. Key is the name of nodes. Values is a list of corresponding index of samples.
2. edges: A list of 2-tuples for indicating edges between nodes.
3. adj_matrix: A square ``DataFrame`` constructed by nodes ID. The elements of the matrix indicate whether pairs of vertices are adjacent or not in the graph. (Unweighted)
@@ -93,6 +92,7 @@ def map(self, data, cover, clusterer=cluster.DBSCAN(eps=0.5, min_samples=1)):
# nodes, edges and graph of the TDA graph
graph = {}
nodes = {}
+ raw_nodes = {}
# projection data & raw data should have a same number of points
assert data.shape[0] == cover.n_points
@@ -104,29 +104,44 @@ def map(self, data, cover, clusterer=cluster.DBSCAN(eps=0.5, min_samples=1)):
min_cluster_samples = cluster_params.get("min_samples", 1)
if self.verbose >= 1:
print("...minimal number of points in hypercube to do clustering: %d" % (min_cluster_samples,))
-
+ # print("...iterating ")
# generate hypercubes from the cover and perform clustering analysis
cubes = cover.hypercubes
data_idx = np.arange(data.shape[0])
+ data_vals = np.array(data)
node_id = 0
- for cube in cubes:
- cube_data = data[cube]
+ if self.verbose >= 1:
+ _iteration = tqdm(cubes)
+ else:
+ _iteration = cubes
+ if clusterer.metric == "precomputed":
+ assert data_vals.shape[0] == data_vals.shape[1]
+
+ for cube in _iteration:
+ if clusterer.metric != "precomputed":
+ cube_data = data_vals[cube]
+ else:
+ cube_data = data_vals[cube][:, cube]
cube_data_idx = data_idx[cube]
if cube_data.shape[0] >= min_cluster_samples:
+                # store the raw node-to-samples relationship
+ raw_point_mask = np.zeros(data_vals.shape[0], dtype=bool)
+ raw_point_mask[cube_data_idx] = True
if (clusterer is not None) and ("fit" in dir(clusterer)):
clusterer.fit(cube_data)
for label in np.unique(clusterer.labels_):
# the "-1" label is used for "un-clustered" points!!!
if label != -1:
- point_mask = np.zeros(data.shape[0], dtype=bool)
+ point_mask = np.zeros(data_vals.shape[0], dtype=bool)
point_mask[cube_data_idx[clusterer.labels_ == label]] = True
nodes[node_id] = point_mask
+ raw_nodes[node_id] = raw_point_mask
node_id += 1
+
else:
# assumed to have a whole cluster of cubes!!!
- point_mask = np.zeros(data.shape[0], dtype=bool)
- point_mask[cube_data_idx] = True
- nodes[node_id] = point_mask
+                    # it equals the raw node-to-samples mask
+ nodes[node_id] = raw_point_mask
node_id += 1
if self.verbose >= 1:
@@ -153,15 +168,21 @@ def map(self, data, cover, clusterer=cluster.DBSCAN(eps=0.5, min_samples=1)):
print("...construct a TDA graph.")
node_ids = nodes.keys()
+
+ edges = [edge for edge in itertools.combinations(node_ids, 2) if np.any(nodes[edge[0]] & nodes[edge[1]])]
+ edges_df = pd.DataFrame(columns=['Source','End'],data=np.array(edges))
+ adj_matrix = pd.crosstab(edges_df.Source,edges_df.End)
+ adj_matrix = adj_matrix.reindex(index=node_ids, columns=node_ids).replace(0, np.nan)
# set the NaN value for filtering edges with pandas stack function
- adj_matrix = pd.DataFrame(data=np.nan, index=node_ids, columns=node_ids)
- # todo: this edges making step to be improved? using some native numpy?
- for k1, k2 in itertools.combinations(node_ids, 2):
- if np.any(nodes[k1] & nodes[k2]):
- adj_matrix.loc[k1, k2] = 1
-
- edges = adj_matrix.stack(dropna=True)
- edges = edges.index.tolist()
+ # adj_matrix = pd.DataFrame(data=np.nan, index=node_ids, columns=node_ids)
+ # # todo: this edges making step to be improved? using some native numpy?
+ # for k1, k2 in itertools.combinations(node_ids, 2):
+ # if np.any(nodes[k1] & nodes[k2]):
+ # adj_matrix.loc[k1, k2] = 1
+ # adj_matrix.loc[k2, k1] = 1
+ #
+ # edges = adj_matrix.stack(dropna=True)
+ # edges = edges.index.tolist()
if self.verbose >= 1:
print("...create %s edges." % (len(edges)))
print("Finish TDA mapping")
@@ -179,6 +200,9 @@ def map(self, data, cover, clusterer=cluster.DBSCAN(eps=0.5, min_samples=1)):
graph["node_keys"] = node_keys
graph["node_positions"] = node_positions
graph["node_sizes"] = node_sizes
- graph['params'] = {'cluster':clusterer.get_params(),'cover':{'resolution':cover.resolution,'overlap':cover.overlap}}
+ graph['params'] = {'cluster':clusterer.get_params(),
+ 'cover':{'resolution':cover.resolution,
+ 'overlap':cover.overlap},
+ '_raw_nodes':raw_nodes}
return graph
diff --git a/tmap/tda/plot.py b/tmap/tda/plot.py
index 4a03946..582df8f 100644
--- a/tmap/tda/plot.py
+++ b/tmap/tda/plot.py
@@ -1,13 +1,21 @@
# -*- coding: utf-8 -*-
-import numpy as np
-from scipy import stats
-from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import colorsys
-import matplotlib.pyplot as plt
+
import matplotlib.cm as cm
import matplotlib.colors as mcolors
+import matplotlib.pyplot as plt
import networkx as nx
+import numpy as np
+import pandas as pd
+import plotly
+import plotly.graph_objs as go
+from plotly import tools
+from scipy import stats
from sklearn import decomposition
+from sklearn.preprocessing import LabelEncoder, MinMaxScaler, maxabs_scale
+
+from tmap.netx.SAFE import construct_node_data
+from tmap.netx.SAFE import get_enriched_nodes
class Color(object):
@@ -21,12 +29,12 @@ class Color(object):
The color bar normally has 4 distinct parts, but some parts can easily go missing when skewness misleads the percentile values.
-
:param list/np.ndarray/pd.Series/dict target: target values for samples or nodes
:param str dtype: type of target values, "numerical" or "categorical"
:param str target_by: target type of "sample" or "node"
"""
+
def __init__(self, target, dtype="numerical", target_by="sample"):
"""
:param list/np.ndarray/pd.Series target: target values for samples or nodes
@@ -59,19 +67,19 @@ def __init__(self, target, dtype="numerical", target_by="sample"):
and (not isinstance(target[0][0], np.number))
):
self.label_encoder = LabelEncoder()
- self.target = self.label_encoder.fit_transform(target)
+ self.target = self.label_encoder.fit_transform(target.ravel())
elif dtype == "categorical":
self.label_encoder = LabelEncoder()
- self.target = self.label_encoder.fit_transform(target.astype(str))
+ self.target = self.label_encoder.fit_transform(target.astype(str).ravel())
else:
self.label_encoder = None
self.target = target
self.dtype = dtype
- self.labels = target
+ self.labels = target.astype(str)
self.target_by = target_by
- def _get_hex_color(self, i):
+ def _get_hex_color(self, i, cmap=None):
"""
map a normalize i value to HSV colors
:param i: input for the hue value, normalized to [0, 1.0]
@@ -98,22 +106,30 @@ def _rescale_target(self, target):
q1, median, q3 = np.percentile(target, 25), np.percentile(target, 50), np.percentile(target, 75)
- # if len(set([q1,median,q3])) != 3:
- # same_bounds = []
index_min_q1 = np.where(target <= q1)[0]
index_q1_median = np.where(((target >= q1) & (target <= median)))[0]
index_median_q3 = np.where(((target >= median) & (target <= q3)))[0]
index_q3_max = np.where(target >= q3)[0]
- target_min_q1 = scaler_min_q1.fit_transform(target[index_min_q1])
- target_q1_median = scaler_q1_median.fit_transform(target[index_q1_median])
- target_median_q3 = scaler_median_q3.fit_transform(target[index_median_q3])
- target_q3_max = scaler_q3_max.fit_transform(target[index_q3_max])
+ target_min_q1 = scaler_min_q1.fit_transform(target[index_min_q1]) if any(index_min_q1) else np.zeros(target[index_min_q1].shape)
+ target_q1_median = scaler_q1_median.fit_transform(target[index_q1_median]) if any(index_q1_median) else np.zeros(target[index_q1_median].shape)
+ target_median_q3 = scaler_median_q3.fit_transform(target[index_median_q3]) if any(index_median_q3) else np.zeros(target[index_median_q3].shape)
+ target_q3_max = scaler_q3_max.fit_transform(target[index_q3_max]) if any(index_q3_max) else np.zeros(target[index_q3_max].shape)
+ # guard against the ValueError raised when a sliced index is all False.
+ # cutting and assigning colors by percentile can hit special cases with a degenerate distribution.
+ # the `if` statements below detect and handle these situations.
if all(target_q3_max == 0.75):
+ # all transformed values in the top quartile equal the upper bound 0.75.
+ # if left unhandled, the red color representing the largest values would disappear.
+ # solution: set them all to 1.
target_q3_max = np.ones(target_q3_max.shape)
if q1 == median == q3:
- target_q3_max = np.array([_ if _!= 0.75 else 0 for _ in target_q3_max[:,0]]).reshape(target_q3_max.shape)
+ # if q1, median and q3 are all identical, the distribution is extremely positively skewed.
+ # The blue color representing the smallest values would disappear.
+ # Solution: rescale the transformed values of the final quartile into the range 0-1.
+ target_q3_max = np.array([_ if _ != 0.75 else 0 for _ in target_q3_max[:, 0]]).reshape(target_q3_max.shape)
+
rescaled_target[index_median_q3] = target_median_q3
rescaled_target[index_q1_median] = target_q1_median
rescaled_target[index_min_q1] = target_min_q1
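
The comments above describe the quartile-wise rescaling that feeds the node color map. A condensed sketch of the underlying idea (assuming, as in the surrounding code, that each quartile is min-max scaled into its own quarter of [0, 1]; the degenerate-distribution fixes are omitted):

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    target = np.random.rand(30, 1)
    q1, median, q3 = np.percentile(target, [25, 50, 75])
    rescaled = np.zeros_like(target)

    quartiles = [(-np.inf, q1, (0.0, 0.25)), (q1, median, (0.25, 0.5)),
                 (median, q3, (0.5, 0.75)), (q3, np.inf, (0.75, 1.0))]
    for low, high, feature_range in quartiles:
        idx = np.where((target >= low) & (target <= high))[0]
        if idx.size:
            rescaled[idx] = MinMaxScaler(feature_range=feature_range).fit_transform(target[idx])
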
@@ -132,7 +148,7 @@ def get_colors(self, nodes, cmap=None):
node_keys = nodes.keys()
# map a color for each node
- node_color_idx = np.zeros((len(nodes), 1))
+ node_color_target = np.zeros((len(nodes), 1))
for i, node_id in enumerate(node_keys):
if self.target_by == 'node':
target_in_node = self.target[node_id]
@@ -141,15 +157,45 @@ def get_colors(self, nodes, cmap=None):
# summarize target values from samples/nodes for each node
if self.dtype == "categorical":
- # most common value (if more than one, the smallest is return)
- node_color_idx[i] = stats.mode(target_in_node)[0][0]
+ # Take the modal (most common) value of the target in this node (if there is more than one mode, the smallest is returned).
+ node_color_target[i] = stats.mode(target_in_node)[0][0]
elif self.dtype == "numerical":
- node_color_idx[i] = np.mean(target_in_node)
-
- _node_color_idx = self._rescale_target(node_color_idx)
+ node_color_target[i] = np.nanmean(target_in_node)
+ if np.any(np.isnan(node_color_target)):
+ print("Nan was found in the given target, Please check the input data.")
+ _node_color_idx = self._rescale_target(node_color_target)
node_colors = [self._get_hex_color(idx) for idx in _node_color_idx]
- return dict(zip(node_keys, node_colors)), (node_color_idx, node_colors)
+ return dict(zip(node_keys, node_colors)), (node_color_target, node_colors)
+
+ def get_sample_colors(self, cmap=None):
+ """
+ :param cmap: optional dict mapping category labels to colors (full cmap support is not implemented yet)
+ :return: a list of per-sample colors, and a dict mapping each category label to its color (empty for numerical targets)
+ :rtype: tuple (list of sample colors, dict category:color)
+ """
+ cat2colors = {}
+
+ if self.target_by == "node":
+ sample_colors = ['red' for _ in self.target]
+
+ else:
+ if self.dtype == "numerical":
+ _sample_color_idx = self._rescale_target(self.target)
+ sample_colors = [self._get_hex_color(idx) for idx in _sample_color_idx]
+ else:
+ labels = self.label_encoder.inverse_transform(self.target)
+ _sample_color_idx = np.arange(0.0, 1.1, 1.0 / (len(set(labels)) - 1)) # use 1.1 as the stop value (slightly larger than 1) so that 1.0 itself is included.
+ target2idx = dict(zip(sorted(set(labels)), _sample_color_idx))
+ sample_colors = [self._get_hex_color(target2idx[label]) for label in labels]
+ cat2colors = dict(zip(labels, sample_colors))
+
+ if type(cmap) == dict and self.dtype == 'categorical':
+ sample_colors = [cmap.get(_, 'blue') for _ in labels]
+ # todo: implement for custom cmap for categorical values.
+
+ return sample_colors, cat2colors
def show(data, graph, color=None, fig_size=(10, 10), node_size=10, edge_width=2, mode=None, strength=None):
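
The categorical branch of get_sample_colors above spreads the distinct labels evenly over the hue range and reuses the same hue-to-hex mapping as the nodes. A rough stand-alone sketch (hue_to_hex is a stand-in for the class's _get_hex_color):

    import colorsys
    import numpy as np

    labels = np.array(['A', 'B', 'B', 'C', 'A'])
    cats = sorted(set(labels))
    hue_idx = np.linspace(0.0, 1.0, len(cats))     # one hue position per category
    cat2idx = dict(zip(cats, hue_idx))

    def hue_to_hex(i):
        r, g, b = colorsys.hsv_to_rgb(i, 1.0, 1.0)
        return '#%02x%02x%02x' % (int(r * 255), int(g * 255), int(b * 255))

    sample_colors = [hue_to_hex(cat2idx[label]) for label in labels]
    cat2colors = {cat: hue_to_hex(idx) for cat, idx in cat2idx.items()}
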
@@ -182,52 +228,71 @@ def show(data, graph, color=None, fig_size=(10, 10), node_size=10, edge_width=2,
if color is None:
color = 'red'
color_map = {node_id: color for node_id in node_keys}
- target2colors = (np.zeros((len(node_keys), 1)),[color] * len(node_keys))
+ target2colors = (np.zeros((len(node_keys), 1)), [color] * len(node_keys))
else:
color_map, target2colors = color.get_colors(graph["nodes"])
colorlist = [color_map[it] for it in node_keys]
- fig = plt.figure(figsize=fig_size)
- ax = fig.add_subplot(1, 1, 1)
-
node_target_values, node_colors = target2colors
- legend_lookup = dict(zip(node_target_values.reshape(-1,), node_colors))
+ legend_lookup = dict(zip(node_target_values.reshape(-1, ), node_colors))
- # add categorical legend
+ # if a ``Color`` instance is given, a legend for the given colors needs to be added.
if isinstance(color, Color):
if color.dtype == "categorical":
- for label in set([it[0] for it in color.labels]):
- if color.label_encoder:
- label_color = legend_lookup.get(color.label_encoder.transform([label])[0], None)
- else:
- label_color = legend_lookup.get(label, None)
+ fig = plt.figure(figsize=fig_size)
+ ax = fig.add_subplot(1, 1, 1)
+ if color.label_encoder:
+ encoded_cat = color.label_encoder.transform(color.labels.ravel())
+ # if color.label_encoder exists, color.labels is a list of strings that needs to be encoded first.
+ else:
+ encoded_cat = color.labels.ravel()
+ # otherwise, the labels can be used directly as the node target values.
+ label_color = [legend_lookup.get(e_cat, None) for e_cat in encoded_cat]
+ get_label_color_dict = dict(zip(encoded_cat, label_color))
+
+ # add categorical legend
+ for label in sorted(set(encoded_cat)):
if label_color is not None:
- ax.plot([], [], 'o', color=label_color, label=label, markersize=10)
+ ax.plot([], [], 'o', color=get_label_color_dict[label], label=label, markersize=10)
legend = ax.legend(numpoints=1, loc="upper right")
legend.get_frame().set_facecolor('#bebebe')
# add numerical colorbar
elif color.dtype == "numerical":
- legend_values = sorted([_ for _ in legend_lookup])
- legned_colors = [legend_lookup[_] for _ in legend_values]
+ fig = plt.figure(figsize=(fig_size[0] * 10 / 9, fig_size[1]))
+ ax = fig.add_subplot(1, 1, 1)
+ legend_values = sorted([val for val in legend_lookup])
+ legned_colors = [legend_lookup.get(val, 'blue') for val in legend_values]
+ # add color bar
+ # TODO: Implement more details of color bar and make it more robust.
cmap = mcolors.LinearSegmentedColormap.from_list('my_cmap', legned_colors)
norm = mcolors.Normalize(min(legend_values), max(legend_values))
sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
- cb = fig.colorbar(sm, shrink=0.5)
+ cb = fig.colorbar(sm,
+ shrink=0.5,
+ pad=0.1)
cb.ax.yaxis.set_ticks_position('right')
if min(legend_values) != 0:
+ # if the minimum legend value is not 0, add a text label to indicate the minimum value.
if min(legend_values) * 100 >= 0.1:
+ # Determine whether the minimum value is too small to display nicely.
+ # .2f keeps two decimal places.
+ # .2e keeps two decimals in scientific notation.
cb.ax.text(0.5, -0.02, '{:.2f}'.format(min(legend_values)), ha='center', va='top', weight='bold')
else:
cb.ax.text(0.5, -0.02, '{:.2e}'.format(min(legend_values)), ha='center', va='top', weight='bold')
+
if max(legend_values) * 100 >= 0.1:
+ # same as the minimum values
cb.ax.text(0.5, 1.02, '{:.2f}'.format(max(legend_values)), ha='center', va='bottom', weight='bold')
else:
cb.ax.text(0.5, 1.02, '{:.2e}'.format(max(legend_values)), ha='center', va='bottom', weight='bold')
-
+ else:
+ fig = plt.figure(figsize=fig_size)
+ ax = fig.add_subplot(1, 1, 1)
if mode == 'spring':
pos = {}
# the projection space is one dimensional
@@ -248,11 +313,13 @@ def show(data, graph, color=None, fig_size=(10, 10), node_size=10, edge_width=2,
G.add_edges_from(graph["edges"])
pos = nx.spring_layout(G, pos=pos, k=strength)
# add legend
- nx.draw_networkx(G, pos=pos, node_size=sizes,
+ nx.draw_networkx(G, pos=pos,
+ node_size=sizes,
node_color=colorlist,
width=edge_width,
edge_color=[color_map[edge[0]] for edge in graph["edges"]],
- with_labels=False, label="0", ax=ax)
+ with_labels=False, ax=ax)
+
else:
fig = plt.figure(figsize=fig_size)
ax = fig.add_subplot(111)
@@ -266,3 +333,438 @@ def show(data, graph, color=None, fig_size=(10, 10), node_size=10, edge_width=2,
plt.axis("off")
plt.show()
+
+
+def get_arrows(graph, projected_X, safe_score, max_length=1, pvalue=0.05):
+ min_p_value = 1.0 / (5000 + 1.0)
+ threshold = np.log10(pvalue) / np.log10(min_p_value)
+
+ node_pos = construct_node_data(graph, projected_X)
+
+ safe_score_df = pd.DataFrame.from_dict(safe_score, orient='columns')
+ safe_score_df = safe_score_df.where(safe_score_df >= threshold, other=0)
+ norm_df = safe_score_df.apply(lambda x: maxabs_scale(x), axis=1, result_type='broadcast')
+
+ x_cor = norm_df.apply(lambda x: x * node_pos.values[:, 0], axis=0)
+ y_cor = norm_df.apply(lambda x: x * node_pos.values[:, 1], axis=0)
+
+ x_cor = x_cor.mean(0)
+ y_cor = y_cor.mean(0)
+ arrow_df = pd.DataFrame([x_cor, y_cor], index=['x coordinate', 'y coordinate'], columns=safe_score_df.columns)
+ all_fea_scale = maxabs_scale(safe_score_df.sum(0))
+ # scale each arrow by the sum of its SAFE scores; the maximum is 1 and the others are proportions no larger than 100%.
+ scaled_ratio = max_length * all_fea_scale / arrow_df.apply(lambda x: np.sqrt(np.sum(x ** 2)), axis=0)
+ # multiply the arrow coordinates by the scaled ratio (which already includes max_length) to get the final arrow lengths.
+ scaled_arrow_df = arrow_df * np.repeat(scaled_ratio.values.reshape(1, -1), axis=0, repeats=2).reshape(2, -1)
+
+ return scaled_arrow_df
+
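
get_arrows above converts the requested p-value into a SAFE-score threshold before zeroing out non-significant features. A short worked example of that conversion (5000 permutations, as hard-coded above):

    import numpy as np

    n_iter = 5000
    pvalue = 0.05
    min_p_value = 1.0 / (n_iter + 1.0)
    threshold = np.log10(pvalue) / np.log10(min_p_value)
    print(round(threshold, 3))   # ~0.352; SAFE scores below this are set to 0
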
+
+def tm_plot(graph, projected_X, filename, mode='file', include_plotlyjs='cdn', color=None, _color_SAFE=None, min_size=10, max_size=40, **kwargs):
+ vis_progressX(graph, projected_X, simple=True, filename=filename, include_plotlyjs=include_plotlyjs, color=color, _color_SAFE=_color_SAFE, min_size=min_size, max_size=max_size,
+ mode=mode, **kwargs)
+
+
+def vis_progressX(graph, projected_X, simple=False, mode='file', color=None, _color_SAFE=None, min_size=10, max_size=40, **kwargs):
+ """
+ For dynamically visualizing the tmap construction process, this function builds an interactive graph based on `plotly` with a slider that presents the transition from ordination to graph step by step. Currently, there is no API for overriding the number of steps from ordination to graph; it may be implemented in the future.
+
+ If you want to draw a simple graph with only edges and nodes instead of the whole process, use the ``simple`` parameter.
+
+ This visualization function is mainly based on plotly, an interactive Python graphing library. The ``mode`` parameter provides multiple kinds of return values for different purposes. There are three modes to choose from: "file", which writes an HTML file created by plotly; "obj", which returns a reusable plotly figure object; and "web", which is normally used in a notebook to enable inline visualization.
+
+ The color handling in this function is a little complex because of the multiple sub-figures. Currently, it uses the ``tmap.tda.plot.Color`` class to auto-generate colors from a given array. More details on how colors are auto-generated can be found in the documentation of ``tmap.tda.plot.Color``.
+
+ In this function, two kinds of color need to be handled.
+
+ * First, the colors and displayed text values of the sample points follow the given ``color`` parameter. The color could be based on **any array** that represents some measurement of nodes or samples. **It doesn't have to be a SAFE score**.
+
+ * Second, the ``_color_SAFE`` parameter should be a ``Color`` built from a nodes-length array, which is normally a SAFE score.
+
+ :param graph:
+ :param np.array projected_X:
+ :param str mode: [file|obj|web]
+ :param bool simple:
+ :param color:
+ :param _color_SAFE:
+ :param kwargs:
+ :return:
+ """
+ node_pos = graph["node_positions"]
+ ori_MDS = projected_X
+ nodes = graph["nodes"]
+ sizes = graph["node_sizes"][:, 0]
+ sample_names = np.array(graph.get("sample_names", [])).astype(str)
+ minmax_scaler = MinMaxScaler(feature_range=(min_size, max_size))
+ mms_color = MinMaxScaler(feature_range=[0, 1])
+
+ # init some empty values if color wasn't given
+ target_v_raw = ['' for _ in nodes]
+ target_v = [0 for _ in nodes]
+ samples_colors = ['red' for _ in sample_names]
+
+ if color is not None:
+ color_map, target2colors = color.get_colors(graph["nodes"])
+ target_v = mms_color.fit_transform(target2colors[0]).ravel()
+ target_v_raw = target2colors[0].ravel()
+
+ samples_colors, cat2colors = color.get_sample_colors()
+
+ if color.dtype == 'categorical':
+ legend_names = np.array(color.label_encoder.inverse_transform(target2colors[0].reshape(-1, ).astype(int)))
+ else:
+ color_map = {}
+
+ # For calculating the dynamic process, the data needs to be aligned first:
+ # reconstruct ori_MDS into samples_pos
+ # reconstruct node_pos into center_pos
+ point_tmp = []
+ center_tmp = []
+ text_tmp = []
+ samples_colors_dynamic = []
+ for n in nodes:
+ point_tmp.append(ori_MDS[nodes[n], :])
+ center_tmp.append(np.concatenate([node_pos[[n], :]] * len(nodes[n]), axis=0))
+ text_tmp.append(sample_names[nodes[n]])
+ if color is not None:
+ samples_colors_dynamic += list(np.repeat(color_map.get(n, 'blue'), len(nodes[n])))
+ else:
+ samples_colors_dynamic.append("blue")
+ samples_pos = np.concatenate(point_tmp, axis=0)
+ center_pos = np.concatenate(center_tmp, axis=0)
+ broadcast_samples_text = np.concatenate(text_tmp, axis=0)
+ # To visualize the movement of samples, each sample must be duplicated once per node it belongs to, so pos and text are reconstructed accordingly.
+
+ node_pos = graph["node_positions"]
+ # prepare edge data
+ xs = []
+ ys = []
+ for edge in graph["edges"]:
+ xs += [node_pos[edge[0], 0],
+ node_pos[edge[1], 0],
+ None]
+ ys += [node_pos[edge[0], 1],
+ node_pos[edge[1], 1],
+ None]
+
+ # if _color_SAFE is given, there are two kinds of colors:
+ # one based on the original data and one based on the transformed SAFE data. Use the second one.
+ if _color_SAFE is not None:
+ safe_color, safe_t2c = _color_SAFE.get_colors(graph["nodes"])
+ # the former is a dict mapping node id to node color
+ # the latter is a tuple (node values, node colors)
+ node_colors = [safe_color[_] for _ in range(len(nodes))]
+ target_v = mms_color.fit_transform(safe_t2c[0]).ravel() # minmaxscaled node values
+ target_v_raw = safe_t2c[0].ravel() # raw node values
+ else:
+ node_colors = [color_map.get(_, 'blue') for _ in range(len(nodes))]
+
+ # prepare node & samples text
+ node_vis_vals = target_v_raw # todo: test categorical color passed.
+ # values derived from color.target; a samples-length target has already been averaged per node in get_colors.
+ node_text = [str(n) +
+ # node id
+ "<br>vals:%s<br>" % str(v) +
+ # node values output from color.target.
+ '<br>'.join(sample_names[nodes[n]]) for n, v in
+ # sample names concatenated with line breaks.
+ zip(nodes,
+ node_vis_vals)]
+ ### samples text
+ samples_text = ['sample ID:%s' % _ for _ in sample_names]
+
+ nv2c = dict(zip(target_v, node_colors)) # A dict mapping node values to node colors
+ colorscale = []
+ for _ in sorted(set(target_v)):
+ colorscale.append([_, nv2c[_]])
+ colorscale[-1][0] = 1 # the last value must be 1
+ colorscale[0][0] = 0 # the first value must be 0
+
+ node_line = go.Scatter(
+ # ordination line
+ visible=False,
+ x=xs,
+ y=ys,
+ marker=dict(color="#8E9DA2",
+ opacity=0.7),
+ line=dict(width=1),
+ showlegend=False,
+ mode="lines")
+ node_marker = go.Scatter(
+ # node position
+ visible=False,
+ x=node_pos[:, 0],
+ y=node_pos[:, 1],
+ text=node_text,
+ hoverinfo="text",
+ marker=dict(color=node_colors,
+ size=minmax_scaler.fit_transform(np.array([sizes[_] for _ in range(len(nodes))]).reshape(-1, 1)),
+ opacity=1),
+ showlegend=False,
+ mode="markers")
+ sample_markers = go.Scatter(
+ visible=True,
+ x=ori_MDS[:, 0],
+ y=ori_MDS[:, 1],
+ marker=dict(color=samples_colors),
+ text=samples_text,
+ hoverinfo="text",
+ showlegend=False,
+ mode="markers")
+ # After all the preparation work is finished,
+ # append all trace instances into the figure.
+ if simple:
+ fig = plotly.tools.make_subplots(1, 1)
+ node_line['visible'] = True
+ node_marker['visible'] = True
+ fig.append_trace(node_line, 1, 1)
+
+ if color is not None:
+ if color.dtype == 'numerical':
+ node_marker['marker']['color'] = target_v_raw
+ node_marker['marker']['colorscale'] = colorscale
+ node_marker['marker']['showscale'] = True
+ fig.append_trace(node_marker, 1, 1)
+ else:
+ minmax_scaler.fit(np.array([sizes[_] for _ in range(len(nodes))]).reshape(-1, 1))
+ for cat in np.unique(legend_names):
+ node_marker = go.Scatter(
+ # node position
+ visible=True,
+ x=node_pos[legend_names == cat, 0],
+ y=node_pos[legend_names == cat, 1],
+ text=np.array(node_text)[legend_names == cat],
+ hoverinfo="text",
+ marker=dict(color=cat2colors[cat],
+ size=minmax_scaler.transform(np.array([sizes[_] for _ in np.arange(len(nodes))[legend_names == cat]]).reshape(-1, 1)),
+ opacity=1),
+ name=cat,
+ showlegend=True,
+ mode="markers")
+ fig.append_trace(node_marker, 1, 1)
+ else:
+ fig.append_trace(node_marker, 1, 1)
+ else:
+ fig = plotly.tools.make_subplots(rows=2, cols=2, specs=[[{'rowspan': 2}, {}], [None, {}]],
+ # subplot_titles=('Mapping process', 'Original projection', 'tmap graph')
+ )
+ # original place or ordination place
+ fig.append_trace(sample_markers, 1, 1)
+
+ # dynamic process: generate 5 intermediate positions between samples and their nodes
+ n_step = 5
+ for s in range(1, n_step + 1):
+ # s = 1: move 1/steps
+ # s = steps: move to node position.
+ fig.append_trace(go.Scatter(
+ visible=False,
+ x=samples_pos[:, 0] + ((center_pos - samples_pos) / n_step * s)[:, 0],
+ y=samples_pos[:, 1] + ((center_pos - samples_pos) / n_step * s)[:, 1],
+ marker=dict(color=samples_colors_dynamic),
+ hoverinfo="text",
+ text=broadcast_samples_text,
+ showlegend=False,
+ mode="markers"), 1, 1)
+
+ # Order is important, do not change the order!!!
+ # The last 5 traces should be visible at all times.
+ fig.append_trace(node_line, 1, 1)
+ fig.append_trace(node_marker, 1, 1)
+ node_line['visible'] = True
+ node_marker['visible'] = True
+ sample_markers['visible'] = True
+ fig.append_trace(node_line, 2, 2)
+ fig.append_trace(node_marker, 2, 2)
+ fig.append_trace(sample_markers, 1, 2)
+ ############################################################
+ steps = []
+ for i in range(n_step + 1):
+ step = dict(
+ method='restyle',
+ args=['visible', [False] * (n_step + 3) + [True, True, True]],
+ )
+ if i >= n_step:
+ step["args"][1][-5:] = [True] * 5 # The last 5 should be some traces must present at any time.
+ else:
+ step['args'][1][i] = True # Toggle i'th trace to "visible"
+ steps.append(step)
+
+ sliders = [dict(
+ active=0,
+ currentvalue={"prefix": "status: "},
+ pad={"t": 20},
+ steps=steps
+ )]
+ ############################################################
+ layout = dict(sliders=sliders,
+ width=2000,
+ height=1000,
+ xaxis1={ # "range": [0, 1],
+ "domain": [0, 0.5]},
+ yaxis1={ # "range": [0, 1],
+ "domain": [0, 1]},
+ xaxis2={ # "range": [0, 1],
+ "domain": [0.6, 0.9]},
+ yaxis2={ # "range": [0, 1],
+ "domain": [0.5, 1]},
+ xaxis3={ # "range": [0, 1],
+ "domain": [0.6, 0.9]},
+ yaxis3={ # "range": [0, 1],
+ "domain": [0, 0.5]},
+ hovermode="closest"
+ )
+ fig.layout.update(layout)
+
+ if mode == 'file':
+ plotly.offline.plot(fig, **kwargs)
+
+ elif mode == 'web':
+ plotly.offline.iplot(fig, **kwargs)
+ elif mode == 'obj':
+ return fig
+ else:
+ print("mode params must be one of 'file', 'web', 'obj'. \n 'file': output html file \n 'web': show in web browser. \n 'obj': return a dict object.")
+
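
A hypothetical usage sketch for the new plotly entry points above (it assumes `graph`, `projected_X` and the abundance table `X` come from the usual mapping steps, and that `Bacteroides` is a column of `X`, as in the test scripts later in this diff):

    from tmap.tda.plot import Color, tm_plot, vis_progressX

    color = Color(target=X.loc[:, 'Bacteroides'], dtype="numerical", target_by="sample")

    # write a standalone HTML file with the simple node/edge view
    tm_plot(graph, projected_X, filename="tmap_graph.html", mode='file', color=color)

    # or get a reusable plotly figure object for further tweaking
    fig = vis_progressX(graph, projected_X, simple=True, mode='obj', color=color)
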
+
+def draw_enriched_plot(graph, safe_score, fea, metainfo, _filter_size=0, **kwargs):
+ """
+ Draw a simple node network that only shows components larger than _filter_size, colorized by
+ their SAFE scores.
+
+ :param graph:
+ :param safe_score:
+ :param fea:
+ :param metainfo:
+ :param _filter_size:
+ :param kwargs:
+ :return:
+ """
+ enriched_nodes, comps_nodes = metainfo[fea]
+
+ node_pos = graph["node_positions"]
+ sizes = graph["node_sizes"][:, 0]
+ fig = plotly.tools.make_subplots(1, 1)
+ xs = []
+ ys = []
+
+ for edge in graph["edges"]:
+ xs += [node_pos[edge[0], 0],
+ node_pos[edge[1], 0],
+ None]
+ ys += [node_pos[edge[0], 1],
+ node_pos[edge[1], 1],
+ None]
+
+ node_line = go.Scatter(
+ # ordination line
+ visible=True,
+ x=xs,
+ y=ys,
+ hoverinfo='none',
+ marker=dict(color="#8E9DA2", ),
+ line=dict(width=1),
+ showlegend=False,
+ mode="lines")
+
+ fig.append_trace(node_line, 1, 1)
+
+ for idx, nodes in enumerate(comps_nodes):
+ if _filter_size:
+ if len(nodes) <= _filter_size:
+ continue
+ tmp1 = {k: v if k in nodes else np.nan for k, v in safe_score[fea].items()}
+ node_position = go.Scatter(
+ # node position
+ visible=True,
+ x=node_pos[[k for k, v in safe_score[fea].items() if not np.isnan(tmp1[k])], 0],
+ y=node_pos[[k for k, v in safe_score[fea].items() if not np.isnan(tmp1[k])], 1],
+ hoverinfo="text",
+ text=['node:%s,SAFE:%s' % (k, safe_score[fea][k]) for k, v in safe_score[fea].items() if not np.isnan(tmp1[k])],
+ marker=dict( # color=node_colors,
+ size=[7 + sizes[_] for _ in [k for k, v in safe_score[fea].items() if not np.isnan(tmp1[k])]],
+ opacity=0.8),
+ showlegend=True,
+ name='comps_%s' % idx,
+ mode="markers")
+ fig.append_trace(node_position, 1, 1)
+
+ fig.layout.font.size = 15
+ fig.layout.title = fea
+ fig.layout.height = 1500
+ fig.layout.width = 1500
+ fig.layout.hovermode = 'closest'
+ plotly.offline.plot(fig, **kwargs)
+
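
A hypothetical call of draw_enriched_plot, assuming `metainfo[fea]` holds the `(enriched_nodes, comps_nodes)` pair expected above (for example, connected components of the enriched sub-graph) and that the extra keyword arguments are forwarded to plotly.offline.plot:

    draw_enriched_plot(graph, safe_scores, 'Bacteroides', metainfo,
                       _filter_size=2, filename='Bacteroides_enriched.html', auto_open=False)
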
+
+def pair_draw(data, projected_X, fit_result, safe_scores, graph, fea1, fea2=None, n_iter=5000, p_value=0.05, ):
+ if fea2 is not None:
+ col = 2
+ subtitles = ['%s ordination' % fea1,
+ '%s Tmap' % fea1,
+ '%s ordination' % fea2,
+ '%s Tmap' % fea2]
+ feas = [fea1, fea2]
+ else:
+ col = 1
+ subtitles = ['%s ordination' % fea1,
+ '%s Tmap' % fea1]
+ feas = [fea1]
+
+ fig = tools.make_subplots(2, col, subplot_titles=subtitles,
+ horizontal_spacing=0,
+ vertical_spacing=0)
+
+ def draw_ord_and_tmap(fig, projected_X, fit_result, fea, row, col, data):
+ if fea in data.columns:
+ color = Color(data.loc[:, fea].astype(float), target_by='sample')
+ else:
+ print('Error occur, %s seem like a new feature for given data' % fea)
+
+ fig.append_trace(go.Scatter(x=projected_X[:, 0],
+ y=projected_X[:, 1],
+ # text=metadata.loc[:,fea],
+ hoverinfo='text',
+ mode='markers',
+ marker=dict(color=color.get_sample_colors())
+ , showlegend=False),
+ row, col)
+ if fea in fit_result.index:
+ fig.append_trace(go.Scatter(x=[0, fit_result.loc[fea, 'adj_Source']],
+ y=[0, fit_result.loc[fea, 'adj_End']],
+ mode='lines+text', showlegend=False,
+ text=['', round(fit_result.loc[fea, 'r2'], 4)]), row, col)
+
+ min_p_value = 1.0 / (n_iter + 1.0)
+ threshold = np.log10(p_value) / np.log10(min_p_value)
+ enriched_nodes = get_enriched_nodes(safe_scores, threshold, graph)
+
+ fig_container = []
+ for idx, fea in enumerate(feas):
+ draw_ord_and_tmap(fig, projected_X, fit_result, fea, idx + 1, 1, data)
+ cache = {node: safe_scores[fea][node] if node in enriched_nodes[fea] else 0 for node in safe_scores[fea].keys()}
+ f = vis_progressX(graph, projected_X, color=Color(cache, target_by='node'), mode='obj', simple=True)
+ fig_container.append(f)
+
+ for idx, f in enumerate(fig_container):
+ for _ in f.data:
+ fig.append_trace(_,
+ idx + 1,
+ 2)
+
+ fig.layout.width = 2000
+ fig.layout.height = 2000
+ fig.layout.xaxis1.zeroline = False
+ fig.layout.yaxis1.zeroline = False
+ fig.layout.xaxis3.zeroline = False
+ fig.layout.yaxis3.zeroline = False
+ fig.layout.hovermode = 'closest'
+ # showticklabels
+ for _ in dir(fig.layout):
+ if _.startswith('xaxis') or _.startswith('yaxis'):
+ fig.layout[_]['showticklabels'] = False
+ fig.layout.font.update(dict(size=20))
+ for _ in fig.layout.annotations:
+ _['font'].update(dict(size=25))
+ return fig
+ # plotly.offline.plot(fig, auto_open=False, **kwargs)
+ # plotly.offline.iplot(fig)
diff --git a/tmap/tda/utils.py b/tmap/tda/utils.py
index 1b5e1bc..a15f9d4 100644
--- a/tmap/tda/utils.py
+++ b/tmap/tda/utils.py
@@ -1,9 +1,25 @@
-from sklearn.neighbors import *
+from __future__ import print_function
+
+import csv
+import os
+import pickle
+
+import networkx as nx
import numpy as np
import pandas as pd
-import csv,os
+import scipy.stats as scs
+from sklearn.neighbors import *
+from sklearn.preprocessing import MinMaxScaler
-def optimize_dbscan_eps(data, threshold=90):
+from tmap.tda import mapper
+from tmap.tda.cover import Cover
+
+
+def optimize_dbscan_eps(data, threshold=90, dm=None):
+ if dm is not None:
+ tmp = dm.where(dm != 0, np.inf)
+ eps = np.percentile(np.min(tmp, axis=0), threshold)
+ return eps
# using metric='minkowski', p=2 (that is, a euclidean metric)
tree = KDTree(data, leaf_size=30, metric='minkowski', p=2)
# the first nearest neighbor is itself, set k=2 to get the second returned
@@ -12,36 +28,112 @@ def optimize_dbscan_eps(data, threshold=90):
eps = np.percentile(dist[:, 1], threshold)
return eps
-def construct_node_data(graph,data):
+
+def optimal_r(X, projected_X, clusterer, mid, overlap, step=1):
+ def get_y(r):
+ tm = mapper.Mapper(verbose=0)
+ cover = Cover(projected_data=MinMaxScaler().fit_transform(projected_X), resolution=r, overlap=overlap)
+ graph = tm.map(data=X, cover=cover, clusterer=clusterer)
+ if "adj_matrix" not in graph.keys():
+ return np.inf
+ return abs(scs.skew(graph["adj_matrix"].count()))
+
+ mid_y = get_y(mid)
+ mid_y_r = get_y(mid + 1)
+ mid_y_l = get_y(mid - 1)
+ while 1:
+ min_r = sorted(zip([mid_y_l, mid_y, mid_y_r], [mid - 1, mid, mid + 1]))[0][1]
+ if min_r == mid - step:
+ mid -= step
+ mid_y, mid_y_r = mid_y_l, mid_y
+ mid_y_l = get_y(mid)
+ elif min_r == mid + step:
+ mid += step
+ mid_y, mid_y_l = mid_y_r, mid_y
+ mid_y_r = get_y(mid)
+ else:
+ break
+ print("suitable resolution is ", mid)
+ return mid
+
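
optimal_r above performs a simple local descent over the cover resolution: it evaluates the absolute skewness of the per-node connection counts in the adjacency matrix at mid-1, mid and mid+1, and keeps stepping toward the smallest value until mid itself is the minimum. A hypothetical call (mid=30 and min_samples=3 are illustrative; X and projected_X come from earlier steps):

    from sklearn.cluster import DBSCAN
    from tmap.tda.utils import optimal_r, optimize_dbscan_eps

    eps = optimize_dbscan_eps(X, threshold=95)
    clusterer = DBSCAN(eps=eps, min_samples=3)
    best_r = optimal_r(X, projected_X, clusterer, mid=30, overlap=0.75)
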
+
+def construct_node_data(graph, data):
nodes = graph['nodes']
- node_data = {k: data.iloc[v, :].mean(axis=0) for k, v in nodes.items()}
- node_data = pd.DataFrame.from_dict(node_data, orient='index')
+ if "iloc" in dir(data):
+ node_data = {k: data.iloc[v, :].mean() for k, v in nodes.items()}
+ node_data = pd.DataFrame.from_dict(node_data, orient='index', columns=data.columns)
+ else:
+ node_data = {k: np.mean(data[v, :], axis=0) for k, v in nodes.items()}
+ node_data = pd.DataFrame.from_dict(node_data, orient='index')
return node_data
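
construct_node_data summarizes sample-level data into node-level data by averaging over each node's member samples; a minimal use, assuming X is the sample-by-feature table used for mapping:

    node_data = construct_node_data(graph, X)   # one averaged feature profile per node
    print(node_data.shape)                      # (n_nodes, n_features)
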
-def cover_ratio(graph,data):
+
+def node2samples(node2s, safe_dict):
+ """
+ get the corresponding samples (samples in enriched nodes) from safe_dict.
+ samples may overlap between nodes, so they are deduplicated.
+ :param dict node2s: relationship between nodes and samples, as graph["nodes"]
+ :param dict safe_dict: enrichment or decline SAFE score dict from SAFE_batch.
+ :return: the same structure as safe_dict, except that the innermost ids are sample ids instead of node ids.
+ """
+ return {feature: list(set([sample_id for node_id in node_ids
+ for sample_id in safe_dict[node_id]]))
+ for feature, node_ids in node2s.items()}
+
+
+def get_pos(graph, strength):
+ node_keys = graph["node_keys"]
+ node_positions = graph["node_positions"]
+ G = nx.Graph()
+ G.add_nodes_from(graph['nodes'].keys())
+ G.add_edges_from(graph['edges'])
+ pos = {}
+ for i, k in enumerate(node_keys):
+ pos.update({int(k): node_positions[i, :2]})
+ pos = nx.spring_layout(G, pos=pos, k=strength)
+ return pos
+
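
get_pos seeds a networkx spring layout with the Mapper node positions and then relaxes it; a minimal use:

    pos = get_pos(graph, strength=0.08)   # {node_id: (x, y)}, ready for networkx drawing
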
+
+## Access inner attribute
+
+def cover_ratio(graph, data):
nodes = graph['nodes']
all_samples_in_nodes = [_ for vals in nodes.values() for _ in vals]
n_all_sampels = data.shape[0]
n_in_nodes = len(set(all_samples_in_nodes))
- return n_in_nodes/float(n_all_sampels) * 100
+ return n_in_nodes / float(n_all_sampels) * 100
+
-def safe_scores_IO(arg,output_path=None,mode='w'):
+## Export data as file
+
+def safe_scores_IO(arg, output_path=None, mode='w'):
if mode == 'w':
- if not isinstance(arg,pd.DataFrame):
- safe_scores = pd.DataFrame.from_dict(arg,orient='index')
+ if not isinstance(arg, pd.DataFrame):
+ safe_scores = pd.DataFrame.from_dict(arg, orient='index')
safe_scores = safe_scores.T
else:
safe_scores = arg
- safe_scores.to_csv(output_path,index=True)
+ safe_scores.to_csv(output_path, index=True)
elif mode == 'rd':
- safe_scores = pd.read_csv(arg,index_col=0)
+ safe_scores = pd.read_csv(arg, index_col=0)
safe_scores = safe_scores.to_dict()
return safe_scores
elif mode == 'r':
- safe_scores = pd.read_csv(arg,index_col=0)
+ safe_scores = pd.read_csv(arg, index_col=0)
+ safe_scores = safe_scores.to_dict('index')
return safe_scores
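
safe_scores_IO above is a small convenience for persisting SAFE scores; a round-trip sketch (safe_scores is assumed to be the feature -> {node: score} dict returned by SAFE_batch):

    from tmap.tda.utils import safe_scores_IO

    safe_scores_IO(safe_scores, output_path="FGFP_safe_scores.csv", mode='w')
    reloaded = safe_scores_IO("FGFP_safe_scores.csv", mode='rd')   # back to {feature: {node: score}}
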
-def output_graph(graph,filepath,sep='\t'):
+
+def read_graph(path):
+ graph = pickle.load(open(path, 'rb'))
+ return graph
+
+
+def dump_graph(graph, path):
+ pickle.dump(graph, open(path, "wb"))
+
+
+def output_graph(graph, filepath, sep='\t'):
"""
Export graph as a file with sep. The output file should be used with `Cytoscape `_ .
@@ -52,73 +144,46 @@ def output_graph(graph,filepath,sep='\t'):
:param str sep:
"""
edges = graph['edges']
- with open(os.path.realpath(filepath),'w') as csvfile:
+ with open(os.path.realpath(filepath), 'w') as csvfile:
spamwriter = csv.writer(csvfile, delimiter=sep)
spamwriter.writerow(['Source', 'Target'])
- for source,target in edges:
- spamwriter.writerow([source,target])
-
-def output_Node_data(graph,filepath,data,features = None,sep='\t',target_by='sample'):
- """
- Export Node data with provided filepath. The output file should be used with `Cytoscape `_ .
-
- It should be noticed that it will overwrite the file you provided.
-
- :param dict graph:
- :param str filepath:
- :param np.ndarray/pandas.Dataframe data: with shape [n_samples,n_features] or [n_nodes,n_features]
- :param list features: It could be None and it will use count number as feature names.
- :param str sep:
- :param str target_by: target type of "sample" or "node"
- """
- if target_by not in ['sample','node']:
- exit("target_by should is one of ['sample','node']")
- nodes = graph['nodes']
- node_keys = graph['node_keys']
- if 'columns' in dir(data) and features is None:
- features = list(data.columns)
- elif 'columns' not in dir(data) and features is None:
- features = list(range(data.shape[1]))
- else:
- features = list(features)
-
- if type(data) != np.ndarray:
- data = np.array(data)
-
- if target_by == 'sample':
- data = np.array([np.mean(data[nodes[_]],axis=0) for _ in node_keys])
- else:
- pass
-
- with open(os.path.realpath(filepath),'w') as csvfile:
- spamwriter = csv.writer(csvfile, delimiter=sep)
- spamwriter.writerow(['NodeID'] + features)
- for idx,v in enumerate(node_keys):
- spamwriter.writerow([str(v)] + [str(_) for _ in data[idx,:]])
-
-def output_Edge_data(graph,filepath,sep='\t'):
- """
- Export edge data with sep [default=TAB]
-
- Mainly for the result of tmap.tda.netx.coenrich
-
- The output file should be used with `Cytoscape `_ .
-
- :param dict graph: graph output by netx.coenrich
- :param str filepath:
- :param str sep:
- """
- if isinstance(graph,dict):
- if "association_coeffient" in graph.keys() and "associated_pairs" in graph.keys():
- edges = graph["associated_pairs"]
- edge_weights = graph["association_coeffient"]
- with open(os.path.realpath(filepath), 'w') as csvfile:
- spamwriter = csv.writer(csvfile, delimiter=sep)
- spamwriter.writerow(["Edge name","coenrich_score"])
- for node1,node2 in edges:
- spamwriter.writerow(["%s (interacts with) %s" % (node1,node2),
- edge_weights[(node1,node2)]])
- else:
- print("Missing key 'association_coeffient' or 'associated_pairs' in graph")
- else:
- print("graph should be a dictionary")
\ No newline at end of file
+ for source, target in edges:
+ spamwriter.writerow([source, target])
+#
+# def output_Node_data(graph,filepath,data,features = None,sep='\t',target_by='sample'):
+# """
+# Export Node data with provided filepath. The output file should be used with `Cytoscape `_ .
+#
+# It should be noticed that it will overwrite the file you provided.
+#
+# :param dict graph:
+# :param str filepath:
+# :param np.ndarray/pandas.Dataframe data: with shape [n_samples,n_features] or [n_nodes,n_features]
+# :param list features: It could be None and it will use count number as feature names.
+# :param str sep:
+# :param str target_by: target type of "sample" or "node"
+# """
+# if target_by not in ['sample','node']:
+# exit("target_by should is one of ['sample','node']")
+# nodes = graph['nodes']
+# node_keys = graph['node_keys']
+# if 'columns' in dir(data) and features is None:
+# features = list(data.columns)
+# elif 'columns' not in dir(data) and features is None:
+# features = list(range(data.shape[1]))
+# else:
+# features = list(features)
+#
+# if type(data) != np.ndarray:
+# data = np.array(data)
+#
+# if target_by == 'sample':
+# data = np.array([np.mean(data[nodes[_]],axis=0) for _ in node_keys])
+# else:
+# pass
+#
+# with open(os.path.realpath(filepath),'w') as csvfile:
+# spamwriter = csv.writer(csvfile, delimiter=sep)
+# spamwriter.writerow(['NodeID'] + features)
+# for idx,v in enumerate(node_keys):
+# spamwriter.writerow([str(v)] + [str(_) for _ in data[idx,:]])
diff --git a/tmap/test/test_Daily_saliva.py b/tmap/test/test_Daily_saliva.py
index bca3c34..4fdf5bc 100644
--- a/tmap/test/test_Daily_saliva.py
+++ b/tmap/test/test_Daily_saliva.py
@@ -33,10 +33,14 @@
target_feature = 'COLLECTION_DAY'
+
color = Color(target=metadata.loc[:, target_feature], dtype="numerical", target_by="sample")
-show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.03)
+show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.03
+ )
target_feature = 'Bacteroides'
color = Color(target=X.loc[:, target_feature], dtype="numerical", target_by="sample")
-show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.03)
+show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.03
+ )
+
diff --git a/tmap/test/test_Daily_stool.py b/tmap/test/test_Daily_stool.py
index 1a4d95c..fdd54ba 100644
--- a/tmap/test/test_Daily_stool.py
+++ b/tmap/test/test_Daily_stool.py
@@ -6,8 +6,6 @@
from tmap.tda.plot import show, Color
from tmap.tda.metric import Metric
from tmap.tda.utils import optimize_dbscan_eps,cover_ratio
-from tmap.netx.SAFE import SAFE_batch
-from tmap.netx.coenrich import coenrich
from tmap.test import load_data
from matplotlib.pyplot import title
@@ -37,19 +35,19 @@
target_feature = 'COLLECTION_DAY'
color = Color(target=metadata.loc[:, target_feature], dtype="numerical", target_by="sample")
-show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.03)
+show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.12)
target_feature = 'HOST_SUBJECT_ID'
color = Color(target=metadata.loc[:, target_feature], dtype="categorical", target_by="sample")
-show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.03)
+show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.12)
color = Color(target=metadata.loc[:, target_feature], dtype="numerical", target_by="sample")
-show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.03)
+show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.12)
def time_range(sample,start,end):
target_vals = [1 if metadata.loc[_,"HOST_SUBJECT_ID"]=="2202:Donor%s" % sample and metadata.loc[_,"COLLECTION_DAY"] in list(range(start,end+1)) else 0 for _ in X.index]
color = Color(target=target_vals, dtype="numerical", target_by="sample")
- show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.03)
+ show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.12)
title("Subject %s at %s to %s" % (sample,start,end))
# Travel period
@@ -67,17 +65,17 @@ def time_range(sample,start,end):
time_range("A",123,153)
############################################################
-n_iter = 1000
-safe_scores = SAFE_batch(graph, meta_data=X, n_iter=n_iter, threshold=0.05)
-safe_scores_meta = SAFE_batch(graph, meta_data=metadata.iloc[:,4:], n_iter=n_iter, threshold=0.05)
-
-safe_scores.update(safe_scores_meta)
-co_vals = coenrich(graph,safe_scores)
-
-############################################################
-meta_cols = list(metadata.iloc[:,4:].columns)
-genera = list(X.columns)
-
-co_meta2genus = [_ for _ in co_vals["edges"] if (_[0] in meta_cols and _[1] in genera) or (_[1] in meta_cols and _[0] in genera)]
-
-co_meta2genus_s = sorted([(co_vals['edge_adj_weights(fdr)'][_],_) for _ in co_meta2genus])
\ No newline at end of file
+# n_iter = 1000
+# safe_scores = SAFE_batch(graph, meta_data=X, n_iter=n_iter, threshold=0.05)
+# safe_scores_meta = SAFE_batch(graph, meta_data=metadata.iloc[:,4:], n_iter=n_iter, threshold=0.05)
+#
+# safe_scores.update(safe_scores_meta)
+# co_vals = coenrich(graph,safe_scores)
+#
+# ############################################################
+# meta_cols = list(metadata.iloc[:,4:].columns)
+# genera = list(X.columns)
+#
+# co_meta2genus = [_ for _ in co_vals["edges"] if (_[0] in meta_cols and _[1] in genera) or (_[1] in meta_cols and _[0] in genera)]
+#
+# co_meta2genus_s = sorted([(co_vals['edge_adj_weights(fdr)'][_],_) for _ in co_meta2genus])
\ No newline at end of file
diff --git a/tmap/test/test_FGFP.py b/tmap/test/test_FGFP.py
index 1cd8a57..83f0a19 100644
--- a/tmap/test/test_FGFP.py
+++ b/tmap/test/test_FGFP.py
@@ -1,14 +1,17 @@
from __future__ import print_function
-from sklearn.preprocessing import MinMaxScaler
+
+import os
+
from sklearn.cluster import DBSCAN
+from sklearn.preprocessing import MinMaxScaler
+
+from tmap.netx.SAFE import SAFE_batch, get_SAFE_summary
from tmap.tda import mapper, filter
from tmap.tda.cover import Cover
-from tmap.tda.plot import show, Color
from tmap.tda.metric import Metric
-from tmap.tda.utils import optimize_dbscan_eps,cover_ratio
-from tmap.netx.SAFE import SAFE_batch, get_SAFE_summary
+from tmap.tda.plot import show, Color
+from tmap.tda.utils import optimize_dbscan_eps, cover_ratio
from tmap.test import load_data
-import os
# load taxa abundance data, sample metadata and precomputed distance matrix
X = load_data.FGFP_genus_profile()
@@ -36,7 +39,7 @@
# target_feature = 'Prevotella'
target_feature = 'Bacteroides'
n_iter = 1000
-safe_scores = SAFE_batch(graph, meta_data=X, n_iter=n_iter, threshold=0.05)
+safe_scores = SAFE_batch(graph, meta_data=X, n_iter=n_iter)
target_safe_score = safe_scores[target_feature]
# target_safe_score = SAFE_single(graph, X.loc[:, target_feature], n_iter=1000, threshold=0.05)
@@ -52,6 +55,4 @@
show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.08)
safe_summary = get_SAFE_summary(graph=graph, meta_data=X, safe_scores=safe_scores,
- n_iter_value=n_iter, p_value=0.01)
-safe_summary.to_csv('/home/liaoth/data2/project/TDA_paper/compare_/genus_SAFE_v2.csv', index=True)
-safe_summary.to_csv(os.path.join(os.path.dirname(__file__),'../example/FGFP_SAFE_ranking_genus_1000.csv'), index=True)
+ n_iter_value=n_iter, p_value=0.01)
\ No newline at end of file
diff --git a/tmap/test/test_circles.py b/tmap/test/test_circles.py
index de2e689..5d6960e 100644
--- a/tmap/test/test_circles.py
+++ b/tmap/test/test_circles.py
@@ -22,3 +22,4 @@
# Step4. Visualization
color = Color(target=y, dtype="categorical")
show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.04)
+
diff --git a/tmap/test/test_digits.py b/tmap/test/test_digits.py
index f067c05..153e3af 100644
--- a/tmap/test/test_digits.py
+++ b/tmap/test/test_digits.py
@@ -24,4 +24,5 @@
# Step4. Visualization
color = Color(target=y, dtype="categorical")
-show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=10, mode='spring', strength=0.03)
+show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=10, mode='spring', strength=0.2)
+
diff --git a/tmap/test/test_iris.py b/tmap/test/test_iris.py
index a3de05d..33f4db9 100644
--- a/tmap/test/test_iris.py
+++ b/tmap/test/test_iris.py
@@ -4,7 +4,7 @@
from tmap.tda import mapper, filter
from tmap.tda.cover import Cover
from tmap.tda.plot import show, Color
-
+from tmap.tda.utils import cover_ratio
iris = datasets.load_iris()
X = iris.data
@@ -21,7 +21,7 @@
clusterer = DBSCAN(eps=0.75, min_samples=1)
cover = Cover(projected_data=MinMaxScaler().fit_transform(projected_X), resolution=20, overlap=0.75)
graph = tm.map(data=StandardScaler().fit_transform(X), cover=cover, clusterer=clusterer)
-
+print(cover_ratio(graph, X))
# Step4. Visualization
color = Color(target=y, dtype="categorical")
-show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.04)
+show(data=X, graph=graph, color=color, fig_size=(10, 10), node_size=15, mode='spring', strength=0.1)