|
| 1 | +""" |
| 2 | +FP-GraphMiner - A Fast Frequent Pattern Mining Algorithm for Network Graphs |
| 3 | +
|
| 4 | +A novel Frequent Pattern Graph Mining algorithm, FP-GraphMiner, that compactly |
| 5 | +represents a set of network graphs as a Frequent Pattern Graph (or FP-Graph). |
| 6 | +This graph can be used to efficiently mine frequent subgraphs including maximal |
| 7 | +frequent subgraphs and maximum common subgraphs. |
| 8 | +
|
| 9 | +URL: https://www.researchgate.net/publication/235255851 |
| 10 | +""" |
# fmt: off
# One inner list per network graph; every entry is "<edge name>-<edge label>".
edge_array = [
    # graph 1
    [
        'ab-e1', 'ac-e3', 'ad-e5', 'bc-e4', 'bd-e2', 'be-e6',
        'bh-e12', 'cd-e2', 'ce-e4', 'de-e1', 'df-e8', 'dg-e5',
        'dh-e10', 'ef-e3', 'eg-e2', 'fg-e6', 'gh-e6', 'hi-e3',
    ],
    # graph 2
    [
        'ab-e1', 'ac-e3', 'ad-e5', 'bc-e4', 'bd-e2', 'be-e6',
        'cd-e2', 'de-e1', 'df-e8', 'ef-e3', 'eg-e2', 'fg-e6',
    ],
    # graph 3
    [
        'ab-e1', 'ac-e3', 'bc-e4', 'bd-e2', 'de-e1', 'df-e8', 'dg-e5',
        'ef-e3', 'eg-e2', 'eh-e12', 'fg-e6', 'fh-e10', 'gh-e6',
    ],
    # graph 4
    [
        'ab-e1', 'ac-e3', 'bc-e4', 'bd-e2',
        'bh-e12', 'cd-e2', 'df-e8', 'dh-e10',
    ],
    # graph 5
    [
        'ab-e1', 'ac-e3', 'ad-e5', 'bc-e4', 'bd-e2', 'cd-e2', 'ce-e4',
        'de-e1', 'df-e8', 'dg-e5', 'ef-e3', 'eg-e2', 'fg-e6',
    ],
]
# fmt: on
| 24 | + |
| 25 | + |
def get_distinct_edge(edge_array):
    """
    Collect the first element of every entry of every graph, without duplicates.

    For raw string entries such as 'ab-e1' the first element is the first
    character; for entries already split by ``preprocess`` (['ab', 'e1']) it is
    the edge name. The order of the returned list is unspecified.

    >>> sorted(get_distinct_edge([['ab-e1', 'cd-e2'], ['ab-e1']]))
    ['a', 'c']
    >>> sorted(get_distinct_edge([[['ab', 'e1'], ['cd', 'e2']], [['ab', 'e1']]]))
    ['ab', 'cd']
    """
    return list({entry[0] for graph_edges in edge_array for entry in graph_edges})
| 37 | + |
| 38 | + |
def get_bitcode(edge_array, distinct_edge):
    """
    Return the bitcode of ``distinct_edge`` as a string with one character
    per graph in ``edge_array``.

    Position ``i`` is "1" when at least one entry of graph ``i`` satisfies
    ``distinct_edge in entry[0]`` (a containment test, not an equality test),
    and "0" otherwise.

    >>> graphs = [[['ab', 'e1']], [['cd', 'e2']], [['cd', 'e2'], ['ab', 'e1']]]
    >>> get_bitcode(graphs, 'ab')
    '101'
    """
    return "".join(
        "1" if any(distinct_edge in entry[0] for entry in graph_edges) else "0"
        for graph_edges in edge_array
    )
| 50 | + |
| 51 | + |
def get_frequency_table(edge_array):
    """
    Build the frequency table of all distinct edges in ``edge_array``.

    Each row has the form [distinct edge, WT(bitcode), bitcode], where
    WT(bitcode) is the number of "1" characters in the bitcode. Rows are
    ordered by descending weight; rows with equal weight keep the order in
    which ``get_distinct_edge`` produced them.
    """
    frequency_table = {}
    for edge in get_distinct_edge(edge_array):
        bitcode = get_bitcode(edge_array, edge)
        frequency_table[edge] = [bitcode.count("1"), bitcode]

    ranked = sorted(
        frequency_table.items(), key=lambda entry: entry[1][0], reverse=True
    )
    return [[edge, weight, bitcode] for edge, (weight, bitcode) in ranked]
| 71 | + |
| 72 | + |
def get_nodes(frequency_table):
    """
    Group the edges of a frequency table by their bitcode.

    Each row of ``frequency_table`` is read as [edge, weight, bitcode]; the
    result maps every bitcode to the list of edges that share it, in table
    order.

    >>> get_nodes([['ab', 5, '11111'], ['ac', 5, '11111'], ['df', 5, '11111'],
    ...            ['bd', 5, '11111'], ['bc', 5, '11111']])
    {'11111': ['ab', 'ac', 'df', 'bd', 'bc']}
    """
    nodes = {}
    for row in frequency_table:
        edge, bitcode = row[0], row[2]
        if bitcode not in nodes:
            nodes[bitcode] = []
        nodes[bitcode].append(edge)
    return nodes
| 85 | + |
| 86 | + |
def get_cluster(nodes):
    """
    Group nodes by the weight of their bitcode.

    The weight is the number of "1" characters in the bitcode. The result has
    the form {weight: {bitcode: edges}}.

    >>> get_cluster({'110': ['ab'], '011': ['cd'], '111': ['ef']})
    {2: {'110': ['ab'], '011': ['cd']}, 3: {'111': ['ef']}}
    """
    cluster = {}
    for bitcode, edges in nodes.items():
        weight = bitcode.count("1")
        if weight not in cluster:
            cluster[weight] = {}
        cluster[weight][bitcode] = edges
    return cluster
| 96 | + |
| 97 | + |
def get_support(cluster):
    """
    Return the support percentage of every weight in ``cluster``.

    The support of a weight is ``weight * 100 / number_of_graphs``. The number
    of graphs is taken from the length of a bitcode, since a bitcode has one
    character per graph. Dividing by ``len(cluster)`` is only correct when
    every weight from 1 to the number of graphs happens to be present, so it
    is used only as a fallback when no bitcode is available.

    Values are returned in the iteration order of ``cluster``; an empty
    cluster yields an empty list.

    >>> get_support({5: {'11111': ['ab', 'ac', 'df', 'bd', 'bc']},
    ...              4: {'11101': ['ef', 'eg', 'de', 'fg'], '11011': ['cd']},
    ...              3: {'11001': ['ad'], '10101': ['dg']},
    ...              2: {'10010': ['dh', 'bh'], '11000': ['be'], '10100': ['gh'],
    ...                  '10001': ['ce']},
    ...              1: {'00100': ['fh', 'eh'], '10000': ['hi']}})
    [100.0, 80.0, 60.0, 40.0, 20.0]
    >>> get_support({5: {'11111': ['ab']}, 2: {'11000': ['be']}})
    [100.0, 40.0]
    """
    graph_count = len(cluster)
    for group in cluster.values():
        # Skip non-dict entries such as the "Header" marker that
        # construct_graph stores in the cluster.
        if isinstance(group, dict) and group:
            bitcode = next(iter(group))
            if bitcode:
                graph_count = len(bitcode)
                break
    return [weight * 100 / graph_count for weight in cluster]
| 110 | + |
| 111 | + |
def print_all() -> None:
    """
    Print the mining results.

    Reads the module-level names ``nodes``, ``support``, ``cluster``,
    ``graph`` and ``freq_subgraph_edge_list``, which the ``__main__`` section
    of this file assigns before calling this function.
    """
    print("\nNodes\n")
    for bitcode in nodes:
        print(bitcode, nodes[bitcode])

    print("\nSupport\n")
    print(support)

    print("\n Cluster \n")
    # Highest weight first.
    for weight in sorted(cluster, reverse=True):
        print(weight, cluster[weight])

    print("\n Graph\n")
    for source in graph:
        print(source, graph[source])

    print("\n Edge List of Frequent subgraphs \n")
    for subgraph_edges in freq_subgraph_edge_list:
        print(subgraph_edges)
| 127 | + |
| 128 | + |
def create_edge(nodes, graph, cluster, c1):
    """
    Link every node of weight ``c1`` to its supersets in the nearest higher
    weight level that has any.

    For each bitcode of level ``c1`` the levels ``c1 + 1``, ``c1 + 2``, ... are
    scanned while the level is below ``max(cluster)``. A bitcode ``j`` is a
    match when ``i & j == i``, i.e. every bit set in ``i`` is also set in
    ``j``. The edge lists of all matches on the first level with a match are
    appended to ``graph[tuple(nodes[i])]``, and the scan for that bitcode
    stops there. ``graph`` is modified in place; nothing is returned.
    """
    for low_code in cluster[c1]:
        level = c1 + 1
        while level < max(cluster.keys()):
            matches = [
                high_code
                for high_code in cluster[level]
                if int(low_code, 2) & int(high_code, 2) == int(low_code, 2)
            ]
            for high_code in matches:
                graph.setdefault(tuple(nodes[low_code]), []).append(nodes[high_code])
            if matches:
                # Only the nearest level with a superset gets linked.
                break
            level += 1
| 151 | + |
| 152 | + |
def construct_graph(cluster, nodes):
    """
    Build the graph as a dict mapping a tuple of edges to a list of
    neighbouring edge lists.

    The nodes of the highest weight in ``cluster`` are linked to and from a
    ``("Header",)`` node. Then ``create_edge`` is called for every weight from
    1 up to, but not including, that highest weight.

    Side effect: ``cluster`` gains the entry ``{highest weight + 1: "Header"}``.
    """
    top_weight = max(cluster.keys())
    top_nodes = cluster[top_weight]
    cluster[top_weight + 1] = "Header"

    header = tuple(["Header"])
    graph = {}
    for edges in top_nodes.values():
        graph.setdefault(header, []).append(edges)
    for edges in top_nodes.values():
        graph[tuple(edges)] = [["Header"]]

    # max(cluster) now includes the Header entry, hence the "- 1".
    for weight in range(1, max(cluster) - 1):
        create_edge(nodes, graph, cluster, weight)
    return graph
| 169 | + |
| 170 | + |
def myDFS(graph, start, end, path=None):
    """
    Record every depth-first walk from ``start`` to ``end`` in ``graph``.

    ``graph`` maps a tuple of edges to a list of neighbouring edge lists. Each
    complete walk (a list of node tuples, ``start`` first and ``end`` last) is
    appended to the module-level list ``paths``, which must exist before this
    function is called. Nothing is returned.

    ``path`` holds the nodes visited so far and is used by the recursion;
    callers normally leave it out. It defaults to ``None`` rather than a
    shared mutable list.

    Raises ``KeyError`` if a visited node other than ``end`` has no entry in
    ``graph``.
    """
    path = [start] if path is None else path + [start]
    if start == end:
        paths.append(path)
        # ``end`` is now on the path and can never be entered again, so
        # exploring further cannot record another walk.
        return
    for node in graph[start]:
        if tuple(node) not in path:
            myDFS(graph, tuple(node), end, path)
| 181 | + |
| 182 | + |
def find_freq_subgraph_given_support(s, cluster, graph):
    """
    Search the paths of all frequent subgraphs for a support of ``s`` percent.

    The support is converted to a weight ``k = s% of (len(cluster) - 1)``,
    truncated to an integer; one entry is subtracted because
    ``construct_graph`` adds a "Header" entry to ``cluster``. ``myDFS`` is then
    run from every node of weight ``k`` to the ``("Header",)`` node, which
    records the walks in the module-level ``paths`` list.

    Raises ``KeyError`` if ``cluster`` has no entry for weight ``k``.
    """
    # Multiply before dividing: s / 100 is generally not exactly representable
    # as a float, so int(s / 100 * n) can land one below the intended weight
    # (58 % of 50 would give 28 instead of 29).
    k = int(s * (len(cluster) - 1) / 100)
    for bitcode in cluster[k]:
        myDFS(graph, tuple(cluster[k][bitcode]), tuple(["Header"]))
| 190 | + |
| 191 | + |
def freq_subgraphs_edge_list(paths):
    """
    Convert paths into edge lists, one list per path.

    The last node of every path is skipped. Every edge name in the remaining
    nodes becomes the pair (first character, second character).

    >>> freq_subgraphs_edge_list([[('ab', 'ac'), ('ef',), ('Header',)]])
    [[('a', 'b'), ('a', 'c'), ('e', 'f')]]
    """
    edge_lists = []
    for path in paths:
        pairs = []
        for node in path[:-1]:
            pairs.extend((name[0], name[1]) for name in node)
        edge_lists.append(pairs)
    return edge_lists
| 206 | + |
| 207 | + |
def preprocess(edge_array):
    """
    Split every "<edge name>-<edge label>" string of ``edge_array`` on "-".

    The nested lists are modified in place: each string entry is replaced by
    the list that ``str.split("-")`` returns. Nothing is returned.

    >>> data = [['ab-e1', 'ac-e3'], ['hi-e3']]
    >>> preprocess(data)
    >>> data
    [[['ab', 'e1'], ['ac', 'e3']], [['hi', 'e3']]]
    """
    for graph_edges in edge_array:
        for position, entry in enumerate(graph_edges):
            graph_edges[position] = entry.split("-")
| 220 | + |
| 221 | + |
if __name__ == "__main__":
    # myDFS appends every walk it finds to this module-level list, so it has
    # to exist before find_freq_subgraph_given_support runs and must not be
    # reset afterwards.
    paths = []
    preprocess(edge_array)
    frequency_table = get_frequency_table(edge_array)
    nodes = get_nodes(frequency_table)
    cluster = get_cluster(nodes)
    # Computed before construct_graph, which adds a "Header" entry to cluster.
    support = get_support(cluster)
    graph = construct_graph(cluster, nodes)
    find_freq_subgraph_given_support(60, cluster, graph)
    freq_subgraph_edge_list = freq_subgraphs_edge_list(paths)
    print_all()
0 commit comments