diff --git a/.github/workflows/run-tox.yml b/.github/workflows/run-tox.yml index c2e8434d..5f1fbc21 100644 --- a/.github/workflows/run-tox.yml +++ b/.github/workflows/run-tox.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v2 diff --git a/examples/cli/run_example.sh b/examples/cli/run_example.sh index bbde4f53..e2b14ee9 100755 --- a/examples/cli/run_example.sh +++ b/examples/cli/run_example.sh @@ -3,10 +3,10 @@ export OMP_NUM_THREADS=1 # set to one to prevent numpy to run in parallel echo 'Getting data' -hcga -v get_data $1 +#hcga -v get_data $1 echo 'Extracting features' -hcga -v extract_features datasets/$1.pkl -m fast -n 4 --timeout 10.0 +hcga -vvv extract_features datasets/$1.pkl -m fast -n 5 --timeout 10.0 echo 'Run classification' hcga -v feature_analysis $1 diff --git a/hcga/app.py b/hcga/app.py index 6b98f2d9..d293c4e8 100644 --- a/hcga/app.py +++ b/hcga/app.py @@ -26,7 +26,6 @@ import click L = logging.getLogger(__name__) -L.setLevel(logging.DEBUG) # pylint: disable=too-many-arguments,too-many-locals diff --git a/hcga/extraction.py b/hcga/extraction.py index 43181926..7f25de6d 100644 --- a/hcga/extraction.py +++ b/hcga/extraction.py @@ -188,11 +188,13 @@ def feature_extraction(graph, list_feature_classes, with_runtimes=False): Returns: (DataFrame): dataframe of calculated features for a given graph. 
""" + L.debug("computing %s", graph) column_indexes = pd.MultiIndex( levels=[[], []], codes=[[], []], names=["feature_class", "feature_name"] ) features_df = pd.DataFrame(columns=column_indexes) - for feature_class in list_feature_classes: + for i, feature_class in enumerate(list_feature_classes): + L.debug("computing: %s/ %s, %s", i, len(list_feature_classes), feature_class) if with_runtimes: start_time = time.time() @@ -201,6 +203,7 @@ def feature_extraction(graph, list_feature_classes, with_runtimes=False): columns = [(feat_class_inst.shortname, col) for col in features.columns] features_df[columns] = features del feat_class_inst + L.debug("done with: %s/ %s, %s", i, len(list_feature_classes), feature_class) if with_runtimes: features_df[("runtimes", feature_class.name)] = time.time() - start_time diff --git a/hcga/feature_class.py b/hcga/feature_class.py index fa6a72bd..9bae8df6 100644 --- a/hcga/feature_class.py +++ b/hcga/feature_class.py @@ -41,7 +41,7 @@ def _hmean(dist): def _mode(dist): """""" - return st.mode(dist)[0][0] + return st.mode(dist).mode def _get_index(args, i=0): @@ -56,12 +56,16 @@ def _trivial(graph): # pylint: disable=unused-argument def _feat_N(graph, features): """""" - return features / len(graph.nodes) + if features is not None: + return features / len(graph.nodes) + return None def _feat_E(graph, features): """""" - return features / len(graph.edges) + if features is not None: + return features / len(graph.edges) + return None class FeatureClass: @@ -413,13 +417,13 @@ def _clustering_statistics(self, community_partition, feat_name, feat_desc, feat self.add_feature( feat_name + "_coverage", - lambda: list(partial(quality.partition_quality, partition=community_partition))[0], + lambda graph: quality.partition_quality(graph, partition=community_partition)[0], "Coverage" + compl_desc, feat_interpret, ) self.add_feature( feat_name + "_performance", - lambda: list(partial(quality.partition_quality, partition=community_partition))[1], + 
lambda graph: quality.partition_quality(graph, partition=community_partition)[1], "Performance" + compl_desc, feat_interpret, ) diff --git a/hcga/features/basal_nodes.py b/hcga/features/basal_nodes.py index 772fc05b..65c60890 100755 --- a/hcga/features/basal_nodes.py +++ b/hcga/features/basal_nodes.py @@ -2,6 +2,8 @@ from functools import lru_cache +import networkx as nx + from hcga.feature_class import FeatureClass, InterpretabilityScore featureclass_name = "BasalNodes" @@ -21,24 +23,34 @@ def basal_nodes_func(graph): def n_basal_nodes(graph): """n_basal_nodes.""" - return len(basal_nodes_func(graph)) + if nx.is_directed(graph): + return len(basal_nodes_func(graph)) + return 0 def basal_degrees(graph): """basal_degrees""" - return [dict(graph.out_degree)[i] for i in basal_nodes_func(graph)] + if nx.is_directed(graph): + return [dict(graph.out_degree)[i] for i in basal_nodes_func(graph)] + return [0] def n_basal_edges(graph): """n_basal_edges""" - return sum(dict(graph.out_degree)[i] for i in basal_nodes_func(graph)) + if nx.is_directed(graph): + return sum(dict(graph.out_degree)[i] for i in basal_nodes_func(graph)) + return 0 def exp_basal_edge(graph): """exp_basal_edge""" - in_degs = list(dict(graph.in_degree).values()) - r = sum(dict(graph.out_degree)[i] for i in basal_nodes_func(graph)) / (graph.number_of_edges()) - return [i * r for i in in_degs] + if nx.is_directed(graph): + in_degs = list(dict(graph.in_degree).values()) + r = sum(dict(graph.out_degree)[i] for i in basal_nodes_func(graph)) / ( + graph.number_of_edges() + ) + return [i * r for i in in_degs] + return [0] @lru_cache(maxsize=None) @@ -50,26 +62,34 @@ def attracting_nodes_func(graph): def n_attracting_nodes(graph): """n_attracting_nodes""" - return len(attracting_nodes_func(graph)) + if nx.is_directed(graph): + return len(attracting_nodes_func(graph)) + return 0 def attracting_degrees(graph): """attracting_degrees""" - return [dict(graph.in_degree)[i] for i in attracting_nodes_func(graph)] + if 
nx.is_directed(graph): + return [dict(graph.in_degree)[i] for i in attracting_nodes_func(graph)] + return [0] def n_attracting_edges(graph): """n_attracting_edges""" - return sum(dict(graph.in_degree)[i] for i in attracting_nodes_func(graph)) + if nx.is_directed(graph): + return sum(dict(graph.in_degree)[i] for i in attracting_nodes_func(graph)) + return 0 def exp_attracting_edge(graph): """exp_attracting_edge""" - out_degs = list(dict(graph.out_degree).values()) - r = sum(dict(graph.in_degree)[i] for i in attracting_nodes_func(graph)) / ( - graph.number_of_edges() - ) - return [i * r for i in out_degs] + if nx.is_directed(graph): + out_degs = list(dict(graph.out_degree).values()) + r = sum(dict(graph.in_degree)[i] for i in attracting_nodes_func(graph)) / ( + graph.number_of_edges() + ) + return [i * r for i in out_degs] + return [0] class BasalNodes(FeatureClass): diff --git a/hcga/features/centralities_basic.py b/hcga/features/centralities_basic.py index 24a70937..7bef125f 100644 --- a/hcga/features/centralities_basic.py +++ b/hcga/features/centralities_basic.py @@ -78,12 +78,12 @@ def katz_centrality(graph): def pagerank(graph): """pagerank""" - return list(nx.pagerank_numpy(graph).values()) + return list(nx.pagerank(graph).values()) def weighted_pagerank(graph): """weighted_pagerank""" - return list(nx.pagerank_numpy(graph, weight="weight").values()) + return list(nx.pagerank(graph, weight="weight").values()) class CentralitiesBasic(FeatureClass): diff --git a/hcga/features/communities_asyn.py b/hcga/features/communities_asyn.py index 8b760298..25df9f72 100644 --- a/hcga/features/communities_asyn.py +++ b/hcga/features/communities_asyn.py @@ -22,7 +22,7 @@ def eval_asyn(graph, num_comms): def sum_density(graph, num_comms): """sum_density""" - return (sum(eval_asyn(graph, num_comms)[1]),) + return sum(eval_asyn(graph, num_comms)[1]) def ratio_density(graph, num_comms): diff --git a/hcga/features/components.py b/hcga/features/components.py index 
7da82dd0..0e0f3ec3 100755 --- a/hcga/features/components.py +++ b/hcga/features/components.py @@ -66,12 +66,16 @@ def attracting_component_sizes(graph): def number_basal_components(graph): """number_basal_components""" - return nx.number_attracting_components(nx.reverse(graph)) + if nx.is_directed(graph): + return nx.number_attracting_components(nx.reverse(graph)) + return 0 def basal_component_sizes(graph): """basal_component_sizes""" - return [len(i) for i in nx.attracting_components(nx.reverse(graph))] + if nx.is_directed(graph): + return [len(i) for i in nx.attracting_components(nx.reverse(graph))] + return [0] class Components(FeatureClass): diff --git a/hcga/features/distance_measures.py b/hcga/features/distance_measures.py index 182bab13..d2ced3d3 100755 --- a/hcga/features/distance_measures.py +++ b/hcga/features/distance_measures.py @@ -35,7 +35,7 @@ def eccentricity(graph): def extrema_bounding(graph): """extrema_bounding""" - return nx.extrema_bounding(ensure_connected(graph)) + return nx.diameter(ensure_connected(graph), usebounds=True) class DistanceMeasures(FeatureClass): diff --git a/hcga/features/flow_hierarchy.py b/hcga/features/flow_hierarchy.py index 51c8bff6..a148c631 100755 --- a/hcga/features/flow_hierarchy.py +++ b/hcga/features/flow_hierarchy.py @@ -1,6 +1,6 @@ """Flow hierarchy class.""" -from functools import partial +from functools import lru_cache, partial import networkx as nx @@ -9,6 +9,14 @@ featureclass_name = "FlowHierarchy" +@lru_cache(maxsize=None) +def flow_hierarchy(graph, weight=None): + """apply flow hierarchy only on digraph""" + if isinstance(graph, nx.DiGraph): + return nx.flow_hierarchy(graph, weight) + return 0.0 + + class FlowHierarchy(FeatureClass): """Flow hierarchy class. 
@@ -38,14 +46,14 @@ def compute_features(self): # graph clique number self.add_feature( "flow_hierarchy", - nx.flow_hierarchy, + flow_hierarchy, "fraction of edges not participating in cycles", InterpretabilityScore(3), ) self.add_feature( "flow_hierarchy_weighted", - partial(nx.flow_hierarchy, weight="weight"), + partial(flow_hierarchy, weight="weight"), "fraction of edges not participating in cycles", InterpretabilityScore(3), ) diff --git a/hcga/features/in_out_degrees.py b/hcga/features/in_out_degrees.py index c83ba40a..918c9374 100755 --- a/hcga/features/in_out_degrees.py +++ b/hcga/features/in_out_degrees.py @@ -9,39 +9,51 @@ def in_degree(graph): """in_degree""" - return list(dict(graph.in_degree).values()) + if nx.is_directed(graph): + return list(dict(graph.in_degree).values()) + return [0] def out_degree(graph): """out_degree""" - return list(dict(graph.out_degree).values()) + if nx.is_directed(graph): + return list(dict(graph.out_degree).values()) + return [0] def in_deg_n(graph): """in_deg_n""" - return [ - i / d - for i, d in zip(list(dict(graph.in_degree).values()), list(dict(graph.degree).values())) - ] + if nx.is_directed(graph): + return [ + i / d + for i, d in zip(list(dict(graph.in_degree).values()), list(dict(graph.degree).values())) + ] + return [0] def out_deg_n(graph): """out_deg_n""" - return [ - o / d - for o, d in zip(list(dict(graph.out_degree).values()), list(dict(graph.degree).values())) - ] + if nx.is_directed(graph): + return [ + o / d + for o, d in zip( + list(dict(graph.out_degree).values()), list(dict(graph.degree).values()) + ) + ] + return [0] def in_out_deg(graph): """in_out_deg""" - return [ - i / o - for i, o in zip( - list(dict(graph.in_degree).values()), - list(dict(graph.out_degree).values()), - ) - ] + if nx.is_directed(graph): + return [ + i / o + for i, o in zip( + list(dict(graph.in_degree).values()), + list(dict(graph.out_degree).values()), + ) + ] + return [0] def in_degree_centrality(graph): diff --git 
a/hcga/features/jaccard_similarity.py b/hcga/features/jaccard_similarity.py index 784829b2..b75b53ef 100755 --- a/hcga/features/jaccard_similarity.py +++ b/hcga/features/jaccard_similarity.py @@ -84,12 +84,12 @@ def degree_assortativity_coeff(graph): def graph_clique_number(graph): """graph_clique_number""" - return nx.graph_clique_number(jaccard_similarity(graph)) + return max(len(c) for c in nx.clique.find_cliques(jaccard_similarity(graph))) def num_max_cliques(graph): """num_max_cliques""" - return nx.graph_number_of_cliques(jaccard_similarity(graph)) + return sum(1 for _ in nx.clique.find_cliques(jaccard_similarity(graph))) def transitivity(graph): diff --git a/hcga/features/looplessness.py b/hcga/features/looplessness.py index d9c35258..51a7a2bf 100644 --- a/hcga/features/looplessness.py +++ b/hcga/features/looplessness.py @@ -47,8 +47,8 @@ def looplessness(graph): # pylint: disable=too-many-locals n = graph.number_of_nodes() # Bipartite graphs - if nx.is_bipartite(graph): - trophic = [1] * n + if nx.is_bipartite(graph) or not nx.is_directed(graph): + trophic = [1.0] * n return 0, trophic, 0, 0, 0, 0 # Non-bipartite graphs @@ -86,7 +86,7 @@ def looplessness(graph): # pylint: disable=too-many-locals trophic[j] = s[i] # Convert all weights to 1 in order to compute trophic levels - a = np.where(nx.adj_matrix(graph).toarray() > 0, 1, 0) + a = np.where(nx.adjacency_matrix(graph).toarray() > 0, 1, 0) LHS = [(tr - 1) * k for tr, k in zip(trophic, in_degrees)] RHS = list(np.dot(a, np.array(trophic))) diff --git a/hcga/features/rbc.py b/hcga/features/rbc.py index a2bd7bd3..9363d67d 100755 --- a/hcga/features/rbc.py +++ b/hcga/features/rbc.py @@ -32,7 +32,7 @@ def rbc(graph): """ - a = np.where(nx.adj_matrix(graph).toarray() > 0, 1, 0) + a = np.where(nx.adjacency_matrix(graph).toarray() > 0, 1, 0) g = nx.DiGraph(a) if nx.is_directed_acyclic_graph(g): @@ -92,16 +92,6 @@ def degree_assortativity_coeff(graph): return nx.degree_assortativity_coefficient(rbc(graph)) -def 
graph_clique_number(graph): - """""" - return nx.graph_clique_number(rbc(graph)) - - -def num_max_cliques(graph): - """""" - return nx.graph_number_of_cliques(rbc(graph)) - - def transitivity(graph): """""" return nx.transitivity(rbc(graph)) @@ -190,21 +180,6 @@ def compute_features(self): InterpretabilityScore(4), ) - # Cliques - self.add_feature( - "graph_clique_number", - graph_clique_number, - "The size of the largest clique in the Jaccard similarity graph", - InterpretabilityScore(3), - ) - - self.add_feature( - "num_max_cliques", - num_max_cliques, - "The number of maximal cliques in the Jaccard similarity graph", - InterpretabilityScore(3), - ) - # Clustering self.add_feature( "transitivity", diff --git a/hcga/features/simple_cycles.py b/hcga/features/simple_cycles.py index 913e9514..7ff091e0 100755 --- a/hcga/features/simple_cycles.py +++ b/hcga/features/simple_cycles.py @@ -11,7 +11,10 @@ @lru_cache(maxsize=None) def simple_cycles_func(graph): - """simple_cycles_func""" + """simple_cycles_func + + this is very slow for not so large graphs, as there can be many cycles + """ return list(nx.simple_cycles(graph)) @@ -52,7 +55,7 @@ class SimpleCycles(FeatureClass): """ - modes = ["fast", "medium", "slow"] + modes = ["slow"] shortname = "SC" name = "simple_cycles" encoding = "networkx" @@ -64,7 +67,6 @@ def compute_features(self): "A simple closed path with no repeated nodes (except the first)", InterpretabilityScore(3), ) - self.add_feature( "simple_cycles_sizes", simple_cycles_sizes, diff --git a/hcga/features/utils.py b/hcga/features/utils.py index 0d16346d..2f8f54a0 100644 --- a/hcga/features/utils.py +++ b/hcga/features/utils.py @@ -9,9 +9,10 @@ def ensure_connected(graph): if nx.is_directed(graph): if not nx.is_weakly_connected(graph): return graph.subgraph(max(nx.weakly_connected_components(graph), key=len)) - else: - if not nx.is_connected(graph): - return graph.subgraph(max(nx.connected_components(graph), key=len)) + return graph + if not 
nx.is_connected(graph): + return graph.subgraph(max(nx.connected_components(graph), key=len)) + return graph raise Exception("ensure_connected is not implemented for this graph type") @@ -22,4 +23,4 @@ def remove_selfloops(graph): selfloops = nx.selfloop_edges(graph) graph_noselfloop.remove_edges_from(selfloops) return graph_noselfloop - raise Exception("ensure_conneted is not implemented for this graph type") + raise Exception("remove_selfloops is not implemented for this graph type") diff --git a/setup.py b/setup.py index e8836842..05790ddf 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ reqs = [ "click>=7.1.1", "numpy>=1.18.2", - "scipy>=1.4.1", + "scipy>=1.11", "tqdm>=4.45.0", "networkx>=3.0", "scikit-learn>=0.23.1", diff --git a/tox.ini b/tox.ini index 776e5362..3f949ba3 100644 --- a/tox.ini +++ b/tox.ini @@ -8,7 +8,7 @@ testdeps = [tox] envlist = lint - py{38,39,310,311} + py{39,310,311} docs @@ -59,7 +59,6 @@ profile=black [gh-actions] python = - 3.8: py38, lint - 3.9: py39, docs - 3.10: py310 + 3.9: py39, lint + 3.10: py310, docs 3.11: py311