Spaces:
Paused
Paused
| """ | |
| Functions for hashing graphs to strings. | |
| Isomorphic graphs should be assigned identical hashes. | |
| For now, only Weisfeiler-Lehman hashing is implemented. | |
| """ | |
| from collections import Counter, defaultdict | |
| from hashlib import blake2b | |
| import networkx as nx | |
| __all__ = ["weisfeiler_lehman_graph_hash", "weisfeiler_lehman_subgraph_hashes"] | |
| def _hash_label(label, digest_size): | |
| return blake2b(label.encode("ascii"), digest_size=digest_size).hexdigest() | |
| def _init_node_labels(G, edge_attr, node_attr): | |
| if node_attr: | |
| return {u: str(dd[node_attr]) for u, dd in G.nodes(data=True)} | |
| elif edge_attr: | |
| return {u: "" for u in G} | |
| else: | |
| return {u: str(deg) for u, deg in G.degree()} | |
| def _neighborhood_aggregate(G, node, node_labels, edge_attr=None): | |
| """ | |
| Compute new labels for given node by aggregating | |
| the labels of each node's neighbors. | |
| """ | |
| label_list = [] | |
| for nbr in G.neighbors(node): | |
| prefix = "" if edge_attr is None else str(G[node][nbr][edge_attr]) | |
| label_list.append(prefix + node_labels[nbr]) | |
| return node_labels[node] + "".join(sorted(label_list)) | |
| def weisfeiler_lehman_graph_hash( | |
| G, edge_attr=None, node_attr=None, iterations=3, digest_size=16 | |
| ): | |
| """Return Weisfeiler Lehman (WL) graph hash. | |
| The function iteratively aggregates and hashes neighbourhoods of each node. | |
| After each node's neighbors are hashed to obtain updated node labels, | |
| a hashed histogram of resulting labels is returned as the final hash. | |
| Hashes are identical for isomorphic graphs and strong guarantees that | |
| non-isomorphic graphs will get different hashes. See [1]_ for details. | |
| If no node or edge attributes are provided, the degree of each node | |
| is used as its initial label. | |
| Otherwise, node and/or edge labels are used to compute the hash. | |
| Parameters | |
| ---------- | |
| G: graph | |
| The graph to be hashed. | |
| Can have node and/or edge attributes. Can also have no attributes. | |
| edge_attr: string, default=None | |
| The key in edge attribute dictionary to be used for hashing. | |
| If None, edge labels are ignored. | |
| node_attr: string, default=None | |
| The key in node attribute dictionary to be used for hashing. | |
| If None, and no edge_attr given, use the degrees of the nodes as labels. | |
| iterations: int, default=3 | |
| Number of neighbor aggregations to perform. | |
| Should be larger for larger graphs. | |
| digest_size: int, default=16 | |
| Size (in bits) of blake2b hash digest to use for hashing node labels. | |
| Returns | |
| ------- | |
| h : string | |
| Hexadecimal string corresponding to hash of the input graph. | |
| Examples | |
| -------- | |
| Two graphs with edge attributes that are isomorphic, except for | |
| differences in the edge labels. | |
| >>> G1 = nx.Graph() | |
| >>> G1.add_edges_from( | |
| ... [ | |
| ... (1, 2, {"label": "A"}), | |
| ... (2, 3, {"label": "A"}), | |
| ... (3, 1, {"label": "A"}), | |
| ... (1, 4, {"label": "B"}), | |
| ... ] | |
| ... ) | |
| >>> G2 = nx.Graph() | |
| >>> G2.add_edges_from( | |
| ... [ | |
| ... (5, 6, {"label": "B"}), | |
| ... (6, 7, {"label": "A"}), | |
| ... (7, 5, {"label": "A"}), | |
| ... (7, 8, {"label": "A"}), | |
| ... ] | |
| ... ) | |
| Omitting the `edge_attr` option, results in identical hashes. | |
| >>> nx.weisfeiler_lehman_graph_hash(G1) | |
| '7bc4dde9a09d0b94c5097b219891d81a' | |
| >>> nx.weisfeiler_lehman_graph_hash(G2) | |
| '7bc4dde9a09d0b94c5097b219891d81a' | |
| With edge labels, the graphs are no longer assigned | |
| the same hash digest. | |
| >>> nx.weisfeiler_lehman_graph_hash(G1, edge_attr="label") | |
| 'c653d85538bcf041d88c011f4f905f10' | |
| >>> nx.weisfeiler_lehman_graph_hash(G2, edge_attr="label") | |
| '3dcd84af1ca855d0eff3c978d88e7ec7' | |
| Notes | |
| ----- | |
| To return the WL hashes of each subgraph of a graph, use | |
| `weisfeiler_lehman_subgraph_hashes` | |
| Similarity between hashes does not imply similarity between graphs. | |
| References | |
| ---------- | |
| .. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen, | |
| Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman | |
| Graph Kernels. Journal of Machine Learning Research. 2011. | |
| http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf | |
| See also | |
| -------- | |
| weisfeiler_lehman_subgraph_hashes | |
| """ | |
| def weisfeiler_lehman_step(G, labels, edge_attr=None): | |
| """ | |
| Apply neighborhood aggregation to each node | |
| in the graph. | |
| Computes a dictionary with labels for each node. | |
| """ | |
| new_labels = {} | |
| for node in G.nodes(): | |
| label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr) | |
| new_labels[node] = _hash_label(label, digest_size) | |
| return new_labels | |
| # set initial node labels | |
| node_labels = _init_node_labels(G, edge_attr, node_attr) | |
| subgraph_hash_counts = [] | |
| for _ in range(iterations): | |
| node_labels = weisfeiler_lehman_step(G, node_labels, edge_attr=edge_attr) | |
| counter = Counter(node_labels.values()) | |
| # sort the counter, extend total counts | |
| subgraph_hash_counts.extend(sorted(counter.items(), key=lambda x: x[0])) | |
| # hash the final counter | |
| return _hash_label(str(tuple(subgraph_hash_counts)), digest_size) | |
| def weisfeiler_lehman_subgraph_hashes( | |
| G, edge_attr=None, node_attr=None, iterations=3, digest_size=16 | |
| ): | |
| """ | |
| Return a dictionary of subgraph hashes by node. | |
| Dictionary keys are nodes in `G`, and values are a list of hashes. | |
| Each hash corresponds to a subgraph rooted at a given node u in `G`. | |
| Lists of subgraph hashes are sorted in increasing order of depth from | |
| their root node, with the hash at index i corresponding to a subgraph | |
| of nodes at most i edges distance from u. Thus, each list will contain | |
| ``iterations + 1`` elements - a hash for a subgraph at each depth, and | |
| additionally a hash of the initial node label (or equivalently a | |
| subgraph of depth 0) | |
| The function iteratively aggregates and hashes neighbourhoods of each node. | |
| This is achieved for each step by replacing for each node its label from | |
| the previous iteration with its hashed 1-hop neighborhood aggregate. | |
| The new node label is then appended to a list of node labels for each | |
| node. | |
| To aggregate neighborhoods at each step for a node $n$, all labels of | |
| nodes adjacent to $n$ are concatenated. If the `edge_attr` parameter is set, | |
| labels for each neighboring node are prefixed with the value of this attribute | |
| along the connecting edge from this neighbor to node $n$. The resulting string | |
| is then hashed to compress this information into a fixed digest size. | |
| Thus, at the $i$-th iteration, nodes within $i$ hops influence any given | |
| hashed node label. We can therefore say that at depth $i$ for node $n$ | |
| we have a hash for a subgraph induced by the $2i$-hop neighborhood of $n$. | |
| The output can be used to to create general Weisfeiler-Lehman graph kernels, | |
| or generate features for graphs or nodes - for example to generate 'words' in | |
| a graph as seen in the 'graph2vec' algorithm. | |
| See [1]_ & [2]_ respectively for details. | |
| Hashes are identical for isomorphic subgraphs and there exist strong | |
| guarantees that non-isomorphic graphs will get different hashes. | |
| See [1]_ for details. | |
| If no node or edge attributes are provided, the degree of each node | |
| is used as its initial label. | |
| Otherwise, node and/or edge labels are used to compute the hash. | |
| Parameters | |
| ---------- | |
| G: graph | |
| The graph to be hashed. | |
| Can have node and/or edge attributes. Can also have no attributes. | |
| edge_attr: string, default=None | |
| The key in edge attribute dictionary to be used for hashing. | |
| If None, edge labels are ignored. | |
| node_attr: string, default=None | |
| The key in node attribute dictionary to be used for hashing. | |
| If None, and no edge_attr given, use the degrees of the nodes as labels. | |
| iterations: int, default=3 | |
| Number of neighbor aggregations to perform. | |
| Should be larger for larger graphs. | |
| digest_size: int, default=16 | |
| Size (in bits) of blake2b hash digest to use for hashing node labels. | |
| The default size is 16 bits | |
| Returns | |
| ------- | |
| node_subgraph_hashes : dict | |
| A dictionary with each key given by a node in G, and each value given | |
| by the subgraph hashes in order of depth from the key node. | |
| Examples | |
| -------- | |
| Finding similar nodes in different graphs: | |
| >>> G1 = nx.Graph() | |
| >>> G1.add_edges_from([ | |
| ... (1, 2), (2, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 7) | |
| ... ]) | |
| >>> G2 = nx.Graph() | |
| >>> G2.add_edges_from([ | |
| ... (1, 3), (2, 3), (1, 6), (1, 5), (4, 6) | |
| ... ]) | |
| >>> g1_hashes = nx.weisfeiler_lehman_subgraph_hashes(G1, iterations=3, digest_size=8) | |
| >>> g2_hashes = nx.weisfeiler_lehman_subgraph_hashes(G2, iterations=3, digest_size=8) | |
| Even though G1 and G2 are not isomorphic (they have different numbers of edges), | |
| the hash sequence of depth 3 for node 1 in G1 and node 5 in G2 are similar: | |
| >>> g1_hashes[1] | |
| ['a93b64973cfc8897', 'db1b43ae35a1878f', '57872a7d2059c1c0'] | |
| >>> g2_hashes[5] | |
| ['a93b64973cfc8897', 'db1b43ae35a1878f', '1716d2a4012fa4bc'] | |
| The first 2 WL subgraph hashes match. From this we can conclude that it's very | |
| likely the neighborhood of 4 hops around these nodes are isomorphic: each | |
| iteration aggregates 1-hop neighbourhoods meaning hashes at depth $n$ are influenced | |
| by every node within $2n$ hops. | |
| However the neighborhood of 6 hops is no longer isomorphic since their 3rd hash does | |
| not match. | |
| These nodes may be candidates to be classified together since their local topology | |
| is similar. | |
| Notes | |
| ----- | |
| To hash the full graph when subgraph hashes are not needed, use | |
| `weisfeiler_lehman_graph_hash` for efficiency. | |
| Similarity between hashes does not imply similarity between graphs. | |
| References | |
| ---------- | |
| .. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen, | |
| Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman | |
| Graph Kernels. Journal of Machine Learning Research. 2011. | |
| http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf | |
| .. [2] Annamalai Narayanan, Mahinthan Chandramohan, Rajasekar Venkatesan, | |
| Lihui Chen, Yang Liu and Shantanu Jaiswa. graph2vec: Learning | |
| Distributed Representations of Graphs. arXiv. 2017 | |
| https://arxiv.org/pdf/1707.05005.pdf | |
| See also | |
| -------- | |
| weisfeiler_lehman_graph_hash | |
| """ | |
| def weisfeiler_lehman_step(G, labels, node_subgraph_hashes, edge_attr=None): | |
| """ | |
| Apply neighborhood aggregation to each node | |
| in the graph. | |
| Computes a dictionary with labels for each node. | |
| Appends the new hashed label to the dictionary of subgraph hashes | |
| originating from and indexed by each node in G | |
| """ | |
| new_labels = {} | |
| for node in G.nodes(): | |
| label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr) | |
| hashed_label = _hash_label(label, digest_size) | |
| new_labels[node] = hashed_label | |
| node_subgraph_hashes[node].append(hashed_label) | |
| return new_labels | |
| node_labels = _init_node_labels(G, edge_attr, node_attr) | |
| node_subgraph_hashes = defaultdict(list) | |
| for _ in range(iterations): | |
| node_labels = weisfeiler_lehman_step( | |
| G, node_labels, node_subgraph_hashes, edge_attr | |
| ) | |
| return dict(node_subgraph_hashes) | |