|
| 1 | +""" |
| 2 | +The Frequent Pattern Growth algorithm (FP-Growth) is a widely used data mining |
| 3 | +technique for discovering frequent itemsets in large transaction databases. |
| 4 | +
|
| 5 | +It overcomes some of the limitations of traditional methods such as Apriori by |
| 6 | +efficiently constructing the FP-Tree. |
| 7 | +
|
| 8 | +WIKI: https://athena.ecs.csus.edu/~mei/associationcw/FpGrowth.html |
| 9 | +
|
| 10 | +Examples: https://www.javatpoint.com/fp-growth-algorithm-in-data-mining |
| 11 | +""" |
| 12 | +from __future__ import annotations |
| 13 | + |
| 14 | +from dataclasses import dataclass, field |
| 15 | + |
| 16 | + |
@dataclass
class TreeNode:
    """
    A single node of a Frequent Pattern tree.

    Attributes:
        name: The label stored in this node.
        count: The occurrence count held by this node.
        parent: The node directly above this one, or None when there is none.
        children: Child nodes, keyed by name (a fresh dict per instance).
        node_link: The next node in this node's link chain, or None.

    Example:
    >>> parent = TreeNode("Parent", 1, None)
    >>> child = TreeNode("Child", 2, parent)
    >>> child.name
    'Child'
    >>> child.count
    2
    >>> child
    TreeNode('Child', 2, TreeNode('Parent', 1, None))
    """

    name: str
    count: int
    parent: TreeNode | None = None
    children: dict[str, TreeNode] = field(default_factory=dict)
    node_link: TreeNode | None = None

    def __repr__(self) -> str:
        # The parent is rendered with repr() as well, so the text shows the
        # whole chain of ancestors; children and node_link are left out.
        shown = (self.name, self.count, self.parent)
        return "TreeNode(" + ", ".join(repr(part) for part in shown) + ")"

    def inc(self, num_occur: int) -> None:
        """Add ``num_occur`` to this node's count."""
        self.count += num_occur

    def disp(self, ind: int = 1) -> None:
        """Print this node, then every descendant with ``ind`` increased by one."""
        padding = " " * ind
        print(f"{padding} {self.name} {self.count}")
        deeper = ind + 1
        for descendant in self.children.values():
            descendant.disp(deeper)
| 52 | + |
| 53 | + |
def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]:
    """
    Create a Frequent Pattern tree and its header table.

    Every transaction is inserted with its frequent items arranged in one
    global order (descending support, ties broken by first appearance in
    ``data_set``). Using the same order for all transactions guarantees that a
    given set of items is always stored along the same path, which the
    prefix-path based mining relies on.

    Args:
        data_set: A list of transactions, where each transaction is an iterable
            of hashable items.
        min_sup: The minimum support threshold.
            Items with support less than this will be pruned. Default is 1.

    Returns:
        A tuple ``(fp_tree, header_table)``. ``fp_tree`` is the root node named
        'Null Set'. ``header_table`` maps each frequent item to
        ``[[support, None], first_node]`` where ``first_node`` is the first tree
        node created for that item. When no item reaches ``min_sup`` the header
        table is an empty dict and the root has no children.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)
    >>> fp_tree
    TreeNode('Null Set', 1, None)
    >>> len(header_table)
    4
    >>> header_table["A"]
    [[4, None], TreeNode('A', 4, TreeNode('Null Set', 1, None))]
    >>> header_table["E"][1]  # doctest: +NORMALIZE_WHITESPACE
    TreeNode('E', 1, TreeNode('B', 3, TreeNode('A', 4, TreeNode('Null Set', 1,
    None))))
    >>> sorted(header_table)
    ['A', 'B', 'C', 'E']
    >>> fp_tree.name
    'Null Set'
    >>> sorted(fp_tree.children)
    ['A', 'B']
    >>> fp_tree.children['A'].name
    'A'
    >>> sorted(fp_tree.children['A'].children)
    ['B', 'C']

    Items with equal support are ordered identically in every transaction:
    >>> tree, _ = create_tree([['A', 'B'], ['B', 'A']])
    >>> sorted(tree.children)
    ['A']
    >>> tree.children['A'].children['B'].count
    2
    >>> create_tree([['A'], ['B']], min_sup=2)
    (TreeNode('Null Set', 1, None), {})
    """
    # First pass: count how often every item occurs.
    support: dict = {}
    for trans in data_set:
        for item in trans:
            support[item] = support.get(item, 0) + 1

    # Keep only frequent items, in first-seen order. The nested
    # [support, None] entry is the layout the rest of the module indexes into.
    header_table: dict = {
        item: [[occurrences, None], None]
        for item, occurrences in support.items()
        if occurrences >= min_sup
    }
    if not header_table:
        return TreeNode("Null Set", 1, None), {}

    # One global rank per item. sorted() is stable, so equally frequent items
    # keep their first-seen order, and that same order is then applied to every
    # transaction instead of being re-derived from each transaction's own
    # item order.
    by_support = sorted(
        header_table, key=lambda item: header_table[item][0][0], reverse=True
    )
    rank = {item: position for position, item in enumerate(by_support)}

    # Second pass: insert each transaction's frequent items in rank order.
    fp_tree = TreeNode("Null Set", 1, None)  # Parent is None for the root node
    for tran_set in data_set:
        frequent = {item for item in tran_set if item in rank}
        if frequent:
            ordered_items = sorted(frequent, key=rank.__getitem__)
            update_tree(ordered_items, fp_tree, header_table, 1)

    return fp_tree, header_table
| 125 | + |
| 126 | + |
def update_tree(items: list, in_tree: TreeNode, header_table: dict, count: int) -> None:
    """
    Insert one ordered transaction into the FP-Tree.

    Starting at ``in_tree``, the function walks down one level per item. An
    existing child has ``count`` added to it; a missing child is created and
    registered in the header table, either as the item's first node or at the
    end of the item's node-link chain. An empty ``items`` list leaves the tree
    untouched.

    Args:
        items: The transaction's items, already in insertion order. Every item
            must be a key of ``header_table``.
        in_tree: The node below which the items are inserted.
        header_table: The header table dictionary; ``header_table[item][1]``
            holds the first node created for ``item`` (or None).
        count: The amount added to every node on the path.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)
    >>> fp_tree
    TreeNode('Null Set', 1, None)
    >>> transaction = ['A', 'B', 'E']
    >>> update_tree(transaction, fp_tree, header_table, 1)
    >>> fp_tree
    TreeNode('Null Set', 1, None)
    >>> fp_tree.children['A'].children['B'].children['E'].children
    {}
    >>> fp_tree.children['A'].children['B'].children['E'].count
    2
    >>> header_table['E'][1].name
    'E'
    >>> update_tree([], fp_tree, header_table, 1)
    >>> fp_tree.children['A'].count
    5
    """
    # Iterating (rather than recursing on items[1:]) avoids one list copy and
    # one stack frame per item, and naturally does nothing for an empty list.
    current = in_tree
    for item in items:
        child = current.children.get(item)
        if child is not None:
            child.inc(count)
        else:
            child = TreeNode(item, count, current)
            current.children[item] = child
            first_node = header_table[item][1]
            if first_node is None:
                header_table[item][1] = child
            else:
                # Another node for this item already exists: chain the new one.
                update_header(first_node, child)
        current = child
| 170 | + |
| 171 | + |
def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode:
    """
    Append a node to the end of a node-link chain.

    The chain is followed from ``node_to_test`` through successive
    ``node_link`` references until a node without a successor is found; that
    node's ``node_link`` is then set to ``target_node``.

    Args:
        node_to_test: The node at which the walk along the chain starts.
        target_node: The node to attach at the end of the chain.

    Returns:
        The node that was last in the chain before the call, i.e. the node
        whose ``node_link`` now refers to ``target_node``.

    Example:
    >>> node1 = TreeNode("A", 3, None)
    >>> node2 = TreeNode("B", 4, None)
    >>> node1
    TreeNode('A', 3, None)
    >>> node1 = update_header(node1, node2)
    >>> node1
    TreeNode('A', 3, None)
    >>> node1.node_link
    TreeNode('B', 4, None)
    >>> node2.node_link is None
    True
    """
    tail = node_to_test
    while (successor := tail.node_link) is not None:
        tail = successor
    # The loop only stops on a node without a successor, so attach here.
    tail.node_link = target_node
    return tail
| 210 | + |
| 211 | + |
def ascend_tree(leaf_node: TreeNode, prefix_path: list[str]) -> None:
    """
    Collect node names while climbing from a node towards the root.

    Beginning with ``leaf_node`` itself, the name of every visited node that
    has a parent is appended to ``prefix_path``. The climb ends at the first
    node whose parent is None, and that node's name is not recorded.

    Args:
        leaf_node: The node to start climbing from.
        prefix_path: The list that receives the names, nearest node first. It
            is modified in place.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)

    >>> path = []
    >>> ascend_tree(fp_tree.children['A'], path)
    >>> path # ascending from the node 'A' directly below the root
    ['A']
    """
    current = leaf_node
    while current.parent is not None:
        prefix_path.append(current.name)
        current = current.parent
| 240 | + |
| 241 | + |
def find_prefix_path(base_pat: frozenset, tree_node: TreeNode | None) -> dict:
    """
    Find the conditional pattern base reachable from a node-link chain.

    Each node on the chain that starts at ``tree_node`` contributes the set of
    its ancestors' names (the node's own name and the root are excluded) with
    the node's count. Nodes without any such ancestor contribute nothing. If
    several nodes on the chain share the same ancestor set, their counts are
    added together rather than the later one replacing the earlier one.

    Args:
        base_pat: The base pattern the chain belongs to. It is accepted for
            interface compatibility; the body does not read it.
        tree_node: The first node of the chain, or None for an empty chain.

    Returns:
        A dict mapping ``frozenset`` of ancestor names to the summed count.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)
    >>> fp_tree
    TreeNode('Null Set', 1, None)
    >>> len(header_table)
    4
    >>> base_pattern = frozenset(['A'])
    >>> sorted(find_prefix_path(base_pattern, fp_tree.children['A']))
    []
    >>> e_base = find_prefix_path(frozenset(['E']), header_table['E'][1])
    >>> e_base == {frozenset('AB'): 1, frozenset('ABC'): 1, frozenset('B'): 1}
    True
    >>> find_prefix_path(base_pattern, None)
    {}
    """
    cond_pats: dict = {}
    while tree_node is not None:
        prefix_path: list = []
        ascend_tree(tree_node, prefix_path)
        # prefix_path[0] is the node itself; only its ancestors form a pattern.
        if len(prefix_path) > 1:
            pattern = frozenset(prefix_path[1:])
            cond_pats[pattern] = cond_pats.get(pattern, 0) + tree_node.count
        tree_node = tree_node.node_link
    return cond_pats
| 276 | + |
| 277 | + |
def mine_tree(
    in_tree: TreeNode,
    header_table: dict,
    min_sup: int,
    pre_fix: set,
    freq_item_list: list,
) -> None:
    """
    Mine the FP-Tree recursively to discover frequent itemsets.

    For every item of ``header_table`` (least supported first) the itemset
    ``pre_fix | {item}`` is appended to ``freq_item_list``. The item's
    conditional pattern base is then turned into a conditional FP-Tree, and
    that tree is mined with the extended prefix. Each conditional pattern is
    passed on as many times as its count says, so supports inside the
    conditional tree reflect the real number of supporting transactions.

    The header table and the node links of the tree being mined are only read;
    they are not modified.

    Args:
        in_tree: The FP-Tree that ``header_table`` belongs to. The body works
            through ``header_table`` only and does not read this argument.
        header_table: The header table dictionary with item information.
        min_sup: The minimum support threshold.
        pre_fix: A set of items as a prefix for the itemsets being mined. It is
            copied, never modified.
        freq_item_list: A list that receives the frequent itemsets (as sets).

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)
    >>> fp_tree
    TreeNode('Null Set', 1, None)
    >>> frequent_itemsets = []
    >>> mine_tree(fp_tree, header_table, min_sup, set([]), frequent_itemsets)
    >>> expe_itm = [{'C'}, {'C', 'A'}, {'E'}, {'A', 'E'}, {'E', 'B'}, {'A'}, {'B'}]
    >>> all(expected in frequent_itemsets for expected in expe_itm)
    True
    >>> more_itm = [{'A', 'B'}, {'B', 'C'}, {'A', 'B', 'C'}, {'A', 'B', 'E'}]
    >>> all(expected in frequent_itemsets for expected in more_itm)
    True
    >>> len(frequent_itemsets)
    11
    """
    sorted_items = sorted(header_table.items(), key=lambda item_info: item_info[1][0])
    for base_pat, _ in sorted_items:
        new_freq_set = pre_fix.copy()
        new_freq_set.add(base_pat)
        freq_item_list.append(new_freq_set)

        cond_patt_bases = find_prefix_path(base_pat, header_table[base_pat][1])
        # create_tree counts every transaction once, so repeat each pattern
        # according to its count; otherwise all patterns would weigh 1.
        weighted_patterns = [
            pattern
            for pattern, pattern_count in cond_patt_bases.items()
            for _ in range(pattern_count)
        ]
        my_cond_tree, my_head = create_tree(weighted_patterns, min_sup)
        # An empty header table means nothing in the conditional tree is
        # frequent, so there is nothing further to mine for this prefix.
        if my_head:
            mine_tree(my_cond_tree, my_head, min_sup, new_freq_set, freq_item_list)
| 327 | + |
| 328 | + |
if __name__ == "__main__":
    from doctest import testmod

    # Run the embedded examples first, then mine a small shopping-basket demo.
    testmod()
    data_set: list[frozenset] = [
        frozenset(basket)
        for basket in (
            ("bread", "milk", "cheese"),
            ("bread", "milk"),
            ("bread", "diapers"),
            ("bread", "milk", "diapers"),
            ("milk", "diapers"),
            ("milk", "cheese"),
            ("diapers", "cheese"),
            ("bread", "milk", "cheese", "diapers"),
        )
    ]
    print(f"{len(data_set) = }")
    # The same support threshold (3) is used for building and for mining.
    fp_tree, header_table = create_tree(data_set, 3)
    print(f"{fp_tree = }")
    print(f"{len(header_table) = }")
    freq_items: list = []
    mine_tree(fp_tree, header_table, 3, set(), freq_items)
    print(f"{freq_items = }")
0 commit comments