|
"""
The FP-Growth (Frequent Pattern Growth) algorithm is a widely used
data mining technique for discovering frequent itemsets in
large transaction databases.
It overcomes some of the limitations of traditional methods like
Apriori by compressing the transactions into an FP-Tree and mining
that tree directly, without generating candidate itemsets.

Reference: https://athena.ecs.csus.edu/~mei/associationcw/FpGrowth.html
Examples: https://www.javatpoint.com/fp-growth-algorithm-in-data-mining
"""
| 11 | + |
| 12 | +from typing import Optional |
| 13 | + |
| 14 | + |
class TreeNode:
    """
    A single node of an FP-Tree.

    Attributes:
        name: The item label stored in this node.
        count: The occurrence count accumulated on this node.
        node_link: The next node in a header-table chain, or None.
        parent: The parent node, or None for a root.
        children: Child nodes keyed by their item name.

    Args:
        name_value (str): The name of the node.
        num_occur (int): The number of occurrences of the node.
        parent_node (TreeNode): The parent node.

    Example:
    >>> parent = TreeNode("Parent", 1, None)
    >>> child = TreeNode("Child", 2, parent)
    >>> child.name
    'Child'
    >>> child.count
    2
    >>> child.parent is parent
    True
    >>> child.node_link is None
    True
    """

    def __init__(
        self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None
    ) -> None:
        self.name = name_value
        self.count = num_occur
        # Declared as Optional so that later assignments of a TreeNode type-check.
        self.node_link: Optional["TreeNode"] = None
        self.parent = parent_node
        self.children: dict[str, "TreeNode"] = {}

    def inc(self, num_occur: int) -> None:
        """
        Add ``num_occur`` to this node's count.

        >>> node = TreeNode("A", 1)
        >>> node.inc(2)
        >>> node.count
        3
        """
        self.count += num_occur

    def disp(self, ind: int = 1) -> None:
        """
        Print this node and, recursively, its children.

        Each level is indented by one more space than its parent.
        """
        print(" " * ind, self.name, " ", self.count)
        for child in self.children.values():
            child.disp(ind + 1)
| 49 | + |
| 50 | + |
def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]:
    """
    Create FP tree

    Args:
        data_set (list): A list of transactions, where each transaction
            is an iterable of hashable items.
        min_sup (int, optional): The minimum support threshold.
            Items with support less than this will be pruned. Default is 1.

    Returns:
        TreeNode: The root of the FP-Tree (named 'Null Set').
        dict: The header table. Each value is ``[[support, None], head]``
            where ``head`` is the first tree node of that item; the nested
            first element is kept for compatibility with existing callers.
            The table is empty when no item reaches ``min_sup``.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)

    >>> sorted(list(header_table.keys()))
    ['A', 'B', 'C', 'E']

    >>> fp_tree.name
    'Null Set'
    >>> sorted(fp_tree.children.keys())
    ['A', 'B']
    >>> fp_tree.children['A'].name
    'A'
    >>> sorted(fp_tree.children['A'].children.keys())
    ['B', 'C']

    Transactions holding the same items in a different order share one branch:
    >>> tree, table = create_tree([['A', 'B'], ['B', 'A']], 1)
    >>> sorted(tree.children.keys())
    ['A']
    >>> tree.children['A'].children['B'].count
    2

    >>> create_tree([], 1)[1]
    {}
    """
    # Support = number of transactions containing the item, so an item
    # repeated inside one transaction is counted once (dict.fromkeys keeps
    # the first-seen order while removing duplicates).
    support: dict = {}
    for trans in data_set:
        for item in dict.fromkeys(trans):
            support[item] = support.get(item, 0) + 1

    frequent = {item: seen for item, seen in support.items() if seen >= min_sup}

    fp_tree = TreeNode("Null Set", 1, None)  # Parent is None for the root node
    if not frequent:
        return fp_tree, {}

    header_table: dict = {item: [[seen, None], None] for item, seen in frequent.items()}

    # One global order for all transactions: descending support, ties broken
    # by first appearance (sorted() is stable). Ordering every transaction
    # with the same ranking guarantees equal itemsets follow the same path.
    rank = {
        item: position
        for position, item in enumerate(
            sorted(frequent, key=frequent.__getitem__, reverse=True)
        )
    }

    for tran_set in data_set:
        ordered_items = sorted(
            {item for item in tran_set if item in rank}, key=rank.__getitem__
        )
        if ordered_items:
            update_tree(ordered_items, fp_tree, header_table, 1)

    return fp_tree, header_table
| 121 | + |
| 122 | + |
def update_tree(items: list, in_tree: TreeNode, header_table: dict, count: int) -> None:
    """
    Update the FP-Tree with a transaction.

    Walks down from ``in_tree`` following ``items`` in order. Existing
    children have ``count`` added; missing children are created and
    registered in the header table (as the head when the item has none yet,
    otherwise appended through ``update_header``). An empty ``items`` list
    leaves the tree untouched.

    Args:
        items (list): List of items in the transaction.
        in_tree (TreeNode): The current node in the FP-Tree.
        header_table (dict): The header table with item information.
        count (int): The count of the transaction.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)

    >>> transaction = ['A', 'B', 'E']
    >>> update_tree(transaction, fp_tree, header_table, 1)

    >>> sorted(fp_tree.children['A'].children['B'].children['E'].children.keys())
    []
    >>> fp_tree.children['A'].children['B'].children['E'].count
    2
    >>> header_table['E'][1].name
    'E'
    >>> update_tree([], fp_tree, header_table, 1)
    >>> fp_tree.children['A'].children['B'].children['E'].count
    2
    """
    current = in_tree
    # Iterative descent: avoids re-slicing the list and the recursion limit.
    for item in items:
        if item in current.children:
            current.children[item].inc(count)
        else:
            new_node = TreeNode(item, count, current)
            current.children[item] = new_node
            entry = header_table[item]
            if entry[1] is None:
                # First node for this item becomes the head of its chain.
                entry[1] = new_node
            else:
                update_header(entry[1], new_node)
        current = current.children[item]
| 164 | + |
| 165 | + |
def update_header(node_to_test: "TreeNode", target_node: "TreeNode") -> "TreeNode":
    """
    Append ``target_node`` to the node-link chain starting at ``node_to_test``.

    The chain is followed through ``node_link`` until its last node, and
    ``target_node`` itself (not a copy) is attached there, so its parent,
    children and later count updates stay visible through the chain.

    Args:
        node_to_test (TreeNode): A node of the chain to extend
            (typically the head stored in the header table).
        target_node (TreeNode): The node to link to.

    Returns:
        TreeNode: The node that was the end of the chain before the append,
            i.e. the node whose ``node_link`` now is ``target_node``.

    Example:
    >>> node1 = TreeNode("A", 3, None)
    >>> node2 = TreeNode("B", 4, None)
    >>> node1 = update_header(node1, node2)
    >>> node1.node_link.name
    'B'
    >>> node1.node_link is node2
    True
    >>> node2.node_link is None
    True
    """
    tail = node_to_test
    while tail.node_link is not None:
        tail = tail.node_link
    tail.node_link = target_node
    # Return the updated node
    return tail
| 199 | + |
| 200 | + |
def ascend_tree(leaf_node: "TreeNode", prefix_path: list) -> None:
    """
    Ascend the FP-Tree from a node towards its root,
    adding item names to the prefix path.

    The name of every visited node that has a parent is appended, starting
    with ``leaf_node`` itself; the parentless root is not included.

    Args:
        leaf_node (TreeNode): The node to start ascending from.
        prefix_path (list): A list that receives the item names, in
            bottom-up order. It is modified in place.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)

    >>> path = []
    >>> ascend_tree(fp_tree.children['A'], path)
    >>> path # ascending from a leaf node 'A'
    ['A']
    >>> path = []
    >>> ascend_tree(fp_tree.children['A'].children['B'], path)
    >>> path
    ['B', 'A']
    """
    node = leaf_node
    # Loop instead of recursion so very deep branches cannot hit the
    # interpreter's recursion limit.
    while node.parent is not None:
        prefix_path.append(node.name)
        node = node.parent
| 229 | + |
| 230 | + |
def find_prefix_path(base_pat: frozenset, tree_node: TreeNode | None) -> dict:
    """
    Find the conditional pattern base for a given base pattern.

    Starting at ``tree_node``, every node reachable through ``node_link`` is
    visited. For each one the names on the way up to the root (excluding the
    node itself) are collected into a frozenset and mapped to the node's
    count. Counts of nodes that yield the same frozenset are added together.

    Args:
        base_pat (frozenset): The base pattern for which to find
            the conditional pattern base. Not used by the computation;
            kept for interface compatibility.
        tree_node (TreeNode): The first node of the chain to follow, or None.

    Returns:
        dict: Mapping of frozenset prefix -> accumulated count. Nodes that
            sit directly below the root contribute nothing.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)
    >>> base_pattern = frozenset(['A'])
    >>> cond_pat = find_prefix_path(base_pattern, fp_tree.children['A'])
    >>> sorted(cond_pat.keys())
    []

    >>> tree, table = create_tree([['A', 'B'], ['A', 'B']], 1)
    >>> find_prefix_path(frozenset(['B']), table['B'][1])
    {frozenset({'A'}): 2}
    """
    cond_pats: dict = {}
    node = tree_node
    while node is not None:
        prefix_path: list = []
        ascend_tree(node, prefix_path)
        if len(prefix_path) > 1:
            # prefix_path[0] is the node itself; only its ancestors matter.
            pattern = frozenset(prefix_path[1:])
            # Accumulate rather than overwrite when two nodes share a prefix set.
            cond_pats[pattern] = cond_pats.get(pattern, 0) + node.count
        node = node.node_link
    return cond_pats
| 263 | + |
| 264 | + |
def mine_tree(
    in_tree: TreeNode,
    header_table: dict,
    min_sup: int,
    pre_fix: set,
    freq_item_list: list,
) -> None:
    """
    Mine the FP-Tree recursively to discover frequent itemsets.

    For every item of the header table (in ascending order of support) the
    item is added to a copy of ``pre_fix`` and recorded. Its conditional
    pattern base is then turned into a conditional FP-Tree, which is mined
    recursively with the enlarged prefix. Each conditional pattern is
    repeated according to its count before the conditional tree is built so
    that supports are weighted correctly. The header table and the tree are
    only read, so mining the same tree again yields the same result.

    Args:
        in_tree (TreeNode): The FP-Tree to mine. Not read directly; the
            tree is reached through the node links in ``header_table``.
        header_table (dict): The header table with item information.
        min_sup (int): The minimum support threshold.
        pre_fix (set): A set of items as a prefix for the itemsets being mined.
        freq_item_list (list): A list to store the frequent itemsets;
            new itemsets (sets) are appended in place.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)

    >>> frequent_itemsets = []
    >>> mine_tree(fp_tree, header_table, min_sup, set([]), frequent_itemsets)
    >>> expe_itm = [{'C'}, {'C', 'A'}, {'E'}, {'A', 'E'}, {'E', 'B'}, {'A'}, {'B'}]
    >>> all(expected in frequent_itemsets for expected in expe_itm)
    True

    Mining does not modify the tree, so a second run gives the same answer:
    >>> second_run = []
    >>> mine_tree(fp_tree, header_table, min_sup, set([]), second_run)
    >>> second_run == frequent_itemsets
    True
    """
    # entry[1][0] holds the support information of the item.
    by_support = sorted(header_table.items(), key=lambda entry: entry[1][0])
    for base_pat, _ in by_support:
        new_freq_set = pre_fix.copy()
        new_freq_set.add(base_pat)
        freq_item_list.append(new_freq_set)

        cond_patt_bases = find_prefix_path(base_pat, header_table[base_pat][1])
        # create_tree counts every listed transaction once, so repeat each
        # conditional pattern as many times as it was observed.
        weighted_bases = [
            pattern
            for pattern, occurrences in cond_patt_bases.items()
            for _ in range(occurrences)
        ]
        my_cond_tree, my_head = create_tree(weighted_bases, min_sup)
        if my_head:
            mine_tree(my_cond_tree, my_head, min_sup, new_freq_set, freq_item_list)
| 313 | + |
| 314 | + |
if __name__ == "__main__":
    import doctest

    doctest.testmod()

    # Small market-basket demo: build the tree, mine it, print the itemsets.
    # The threshold is defined once so building and mining cannot disagree.
    min_support = 3
    data_set: list = [
        frozenset(["bread", "milk", "cheese"]),
        frozenset(["bread", "milk"]),
        frozenset(["bread", "diapers"]),
        frozenset(["bread", "milk", "diapers"]),
        frozenset(["milk", "diapers"]),
        frozenset(["milk", "cheese"]),
        frozenset(["diapers", "cheese"]),
        frozenset(["bread", "milk", "cheese", "diapers"]),
    ]
    fp_tree, header_table = create_tree(data_set, min_sup=min_support)
    freq_items: list = []
    mine_tree(fp_tree, header_table, min_support, set(), freq_items)
    print(freq_items)
0 commit comments