From 0ea8df7682158f82fb463292b08201ef6680993e Mon Sep 17 00:00:00 2001 From: Jeel Gajera Date: Sat, 21 Oct 2023 11:20:06 +0530 Subject: [PATCH 1/8] Add: FP Growth Algorithm --- DIRECTORY.md | 1 + machine_learning/fp_growth.py | 333 ++++++++++++++++++++++++++++++++++ 2 files changed, 334 insertions(+) create mode 100644 machine_learning/fp_growth.py diff --git a/DIRECTORY.md b/DIRECTORY.md index b92f8f877e97..df6692fdddae 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -541,6 +541,7 @@ * [Dimensionality Reduction](machine_learning/dimensionality_reduction.py) * Forecasting * [Run](machine_learning/forecasting/run.py) + * [FP Growth Algorithm](machine_learning/fp_growth.py) * [Gradient Descent](machine_learning/gradient_descent.py) * [K Means Clust](machine_learning/k_means_clust.py) * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py) diff --git a/machine_learning/fp_growth.py b/machine_learning/fp_growth.py new file mode 100644 index 000000000000..eaf59693f231 --- /dev/null +++ b/machine_learning/fp_growth.py @@ -0,0 +1,333 @@ +""" +The FP-Growth (Frequent Pattern Growth) algorithm is a widely used +data mining technique for discovering frequent itemsets in +large transaction databases. +It overcomes some of the limitations of traditional methods like +Apriori by efficiently constructing the FP-Tree + +WIKI: https://athena.ecs.csus.edu/~mei/associationcw/FpGrowth.html +Examples: https://www.javatpoint.com/fp-growth-algorithm-in-data-mining +""" + +from typing import Optional + + +class TreeNode: + """ + Initialize a TreeNode. + + Args: + name_value (str): The name of the node. + num_occur (int): The number of occurrences of the node. + parent_node (TreeNode): The parent node. + + Example: + >>> parent = TreeNode("Parent", 1, None) + >>> child = TreeNode("Child", 2, parent) + >>> child.name + 'Child' + >>> child.count + 2 + """ + + def __init__( + self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None + ) -> None: + self.name = name_value + self.count = num_occur + self.node_link = None # Initialize node_link to None + self.parent = parent_node + self.children: dict[str, TreeNode] = {} + + def inc(self, num_occur: int) -> None: + self.count += num_occur + + def disp(self, ind: int = 1) -> None: + print(" " * ind, self.name, " ", self.count) + for child in self.children.values(): + child.disp(ind + 1) + + +def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]: + """ + Create FP tree + + Args: + data_set (list): A list of transactions, where each transaction + is a list of items. + min_sup (int, optional): The minimum support threshold. + Items with support less than this will be pruned. Default is 1. + + Returns: + TreeNode: The root of the FP-Tree. + dict: The header table. + + Example: + >>> data_set = [ + ... ['A', 'B', 'C'], + ... ['A', 'C'], + ... ['A', 'B', 'E'], + ... ['A', 'B', 'C', 'E'], + ... ['B', 'E'] + ... ] + >>> min_sup = 2 + >>> fp_tree, header_table = create_tree(data_set, min_sup) + + >>> sorted(list(header_table.keys())) + ['A', 'B', 'C', 'E'] + + >>> fp_tree.name + 'Null Set' + >>> sorted(fp_tree.children.keys()) + ['A', 'B'] + >>> fp_tree.children['A'].name + 'A' + >>> sorted(fp_tree.children['A'].children.keys()) + ['B', 'C'] + + """ + header_table: dict = {} + for trans in data_set: + for item in trans: + header_table[item] = header_table.get(item, [0, None]) + header_table[item][0] += 1 + + for k in list(header_table.keys()): + if header_table[k][0] < min_sup: + del header_table[k] + + freq_item_set = set(header_table.keys()) + + if len(freq_item_set) == 0: + return TreeNode("Null Set", 1, None), {} + + for k in header_table: + header_table[k] = [header_table[k], None] + + fp_tree = TreeNode("Null Set", 1, None) # Parent is None for the root node + for tran_set in data_set: + local_d = {} + for item in tran_set: + if item in freq_item_set: + local_d[item] = header_table[item][0] + if len(local_d) > 0: + sorted_items = sorted( + local_d.items(), key=lambda item_info: item_info[1], reverse=True + ) + ordered_items = [item[0] for item in sorted_items] + update_tree(ordered_items, fp_tree, header_table, 1) + + return fp_tree, header_table + + +def update_tree(items: list, in_tree: TreeNode, header_table: dict, count: int) -> None: + """ + Update the FP-Tree with a transaction. + + Args: + items (list): List of items in the transaction. + in_tree (TreeNode): The current node in the FP-Tree. + header_table (dict): The header table with item information. + count (int): The count of the transaction. + + Example: + >>> data_set = [ + ... ['A', 'B', 'C'], + ... ['A', 'C'], + ... ['A', 'B', 'E'], + ... ['A', 'B', 'C', 'E'], + ... ['B', 'E'] + ... ] + >>> min_sup = 2 + >>> fp_tree, header_table = create_tree(data_set, min_sup) + + >>> transaction = ['A', 'B', 'E'] + >>> update_tree(transaction, fp_tree, header_table, 1) + + >>> sorted(fp_tree.children['A'].children['B'].children['E'].children.keys()) + [] + >>> fp_tree.children['A'].children['B'].children['E'].count + 2 + >>> header_table['E'][1].name + 'E' + """ + if items[0] in in_tree.children: + in_tree.children[items[0]].inc(count) + else: + in_tree.children[items[0]] = TreeNode(items[0], count, in_tree) + if header_table[items[0]][1] is None: + header_table[items[0]][1] = in_tree.children[items[0]] + else: + update_header(header_table[items[0]][1], in_tree.children[items[0]]) + if len(items) > 1: + update_tree(items[1:], in_tree.children[items[0]], header_table, count) + + +def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode: + """ + Update the header table with a node link. + + Args: + node_to_test (TreeNode): The node to be updated in the header table. + target_node (TreeNode): The node to link to. + + Example: + >>> data_set = [ + ... ['A', 'B', 'C'], + ... ['A', 'C'], + ... ['A', 'B', 'E'], + ... ['A', 'B', 'C', 'E'], + ... ['B', 'E'] + ... ] + >>> min_sup = 2 + >>> fp_tree, header_table = create_tree(data_set, min_sup) + + >>> node1 = TreeNode("A", 3, None) + >>> node2 = TreeNode("B", 4, None) + >>> node1 = update_header(node1, node2) + >>> node1.node_link.name + 'B' + >>> node2.node_link is None + True + """ + while node_to_test.node_link is not None: + node_to_test = node_to_test.node_link + if node_to_test.node_link is None: + node_to_test.node_link = TreeNode(target_node.name, target_node.count, node_to_test) + # Return the updated node + return node_to_test + + +def ascend_tree(leaf_node: TreeNode, prefix_path: list) -> None: + """ + Ascend the FP-Tree from a leaf node to its root, + adding item names to the prefix path. + + Args: + leaf_node (TreeNode): The leaf node to start ascending from. + prefix_path (list): A list to store the item as they are ascended. + + Example: + >>> data_set = [ + ... ['A', 'B', 'C'], + ... ['A', 'C'], + ... ['A', 'B', 'E'], + ... ['A', 'B', 'C', 'E'], + ... ['B', 'E'] + ... ] + >>> min_sup = 2 + >>> fp_tree, header_table = create_tree(data_set, min_sup) + + >>> path = [] + >>> ascend_tree(fp_tree.children['A'], path) + >>> path # ascending from a leaf node 'A' + ['A'] + """ + if leaf_node.parent is not None: + prefix_path.append(leaf_node.name) + ascend_tree(leaf_node.parent, prefix_path) + + +def find_prefix_path(base_pat: frozenset, tree_node: TreeNode | None) -> dict: + """ + Find the conditional pattern base for a given base pattern. + + Args: + base_pat (frozenset): The base pattern for which to find + the conditional pattern base. + tree_node (TreeNode): The node in the FP-Tree. + + Example: + >>> data_set = [ + ... ['A', 'B', 'C'], + ... ['A', 'C'], + ... ['A', 'B', 'E'], + ... ['A', 'B', 'C', 'E'], + ... ['B', 'E'] + ... ] + >>> min_sup = 2 + >>> fp_tree, header_table = create_tree(data_set, min_sup) + >>> base_pattern = frozenset(['A']) + >>> cond_pat = find_prefix_path(base_pattern, fp_tree.children['A']) + >>> sorted(cond_pat.keys()) + [] + """ + cond_pats: dict = {} + while tree_node is not None: + prefix_path: list = [] + ascend_tree(tree_node, prefix_path) + if len(prefix_path) > 1: + cond_pats[frozenset(prefix_path[1:])] = tree_node.count + tree_node = tree_node.node_link + return cond_pats + + +def mine_tree( + in_tree: TreeNode, + header_table: dict, + min_sup: int, + pre_fix: set, + freq_item_list: list, +) -> None: + """ + Mine the FP-Tree recursively to discover frequent itemsets. + + Args: + in_tree (TreeNode): The FP-Tree to mine. + header_table (dict): The header table with item information. + min_sup (int): The minimum support threshold. + pre_fix (set): A set of items as a prefix for the itemsets being mined. + freq_item_list (list): A list to store the frequent itemsets. + + Example: + >>> data_set = [ + ... ['A', 'B', 'C'], + ... ['A', 'C'], + ... ['A', 'B', 'E'], + ... ['A', 'B', 'C', 'E'], + ... ['B', 'E'] + ... ] + >>> min_sup = 2 + >>> fp_tree, header_table = create_tree(data_set, min_sup) + + >>> frequent_itemsets = [] + >>> mine_tree(fp_tree, header_table, min_sup, set([]), frequent_itemsets) + >>> expe_itm = [{'C'}, {'C', 'A'}, {'E'}, {'A', 'E'}, {'E', 'B'}, {'A'}, {'B'}] + >>> all(expected in frequent_itemsets for expected in expe_itm) + True + """ + sorted_items = sorted(header_table.items(), key=lambda item_info: item_info[1][0]) + big_l = [item[0] for item in sorted_items] + for base_pat in big_l: + new_freq_set = pre_fix.copy() + new_freq_set.add(base_pat) + freq_item_list.append(new_freq_set) + cond_patt_bases = find_prefix_path(base_pat, header_table[base_pat][1]) + my_cond_tree, my_head = create_tree(list(cond_patt_bases.keys()), min_sup) + if my_head is not None: + # Pass header_table[base_pat][1] as node_to_test to update_header + header_table[base_pat][1] = update_header( + header_table[base_pat][1], my_cond_tree + ) + mine_tree(my_cond_tree, my_head, min_sup, new_freq_set, freq_item_list) + + +if __name__ == "__main__": + import doctest + + doctest.testmod() + + data_set: list = [ + frozenset(["bread", "milk", "cheese"]), + frozenset(["bread", "milk"]), + frozenset(["bread", "diapers"]), + frozenset(["bread", "milk", "diapers"]), + frozenset(["milk", "diapers"]), + frozenset(["milk", "cheese"]), + frozenset(["diapers", "cheese"]), + frozenset(["bread", "milk", "cheese", "diapers"]), + ] + fp_tree, header_table = create_tree(data_set, min_sup=3) + freq_items: list = [] + mine_tree(fp_tree, header_table, 3, set(), freq_items) + print(freq_items) From 71776e78022d6a9699f2ced68bd28d6667e7335e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 21 Oct 2023 05:52:58 +0000 Subject: [PATCH 2/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/fp_growth.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/machine_learning/fp_growth.py b/machine_learning/fp_growth.py index eaf59693f231..356e95347079 100644 --- a/machine_learning/fp_growth.py +++ b/machine_learning/fp_growth.py @@ -193,7 +193,9 @@ def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode: while node_to_test.node_link is not None: node_to_test = node_to_test.node_link if node_to_test.node_link is None: - node_to_test.node_link = TreeNode(target_node.name, target_node.count, node_to_test) + node_to_test.node_link = TreeNode( + target_node.name, target_node.count, node_to_test + ) # Return the updated node return node_to_test From c0470094d01391294617df6a92734b78b470b127 Mon Sep 17 00:00:00 2001 From: Jeel Gajera Date: Sat, 21 Oct 2023 14:48:21 +0530 Subject: [PATCH 3/8] changes names --- DIRECTORY.md | 2 +- ...p_growth.py => frequent_pattern_growth.py} | 32 +++++++++++-------- 2 files changed, 20 insertions(+), 14 deletions(-) rename machine_learning/{fp_growth.py => frequent_pattern_growth.py} (92%) diff --git a/DIRECTORY.md b/DIRECTORY.md index df6692fdddae..916d993c563a 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -541,7 +541,7 @@ * [Dimensionality Reduction](machine_learning/dimensionality_reduction.py) * Forecasting * [Run](machine_learning/forecasting/run.py) - * [FP Growth Algorithm](machine_learning/fp_growth.py) + * [Frequent Pattern Growth Algorithm](machine_learning/frequent_pattern_growth.py) * [Gradient Descent](machine_learning/gradient_descent.py) * [K Means Clust](machine_learning/k_means_clust.py) * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py) diff --git a/machine_learning/fp_growth.py b/machine_learning/frequent_pattern_growth.py similarity index 92% rename from machine_learning/fp_growth.py rename to machine_learning/frequent_pattern_growth.py index 356e95347079..fa37313be9da 100644 --- a/machine_learning/fp_growth.py +++ b/machine_learning/frequent_pattern_growth.py @@ -10,8 +10,10 @@ """ from typing import Optional +from dataclasses import dataclass, field +@dataclass class TreeNode: """ Initialize a TreeNode. @@ -30,14 +32,19 @@ class TreeNode: 2 """ - def __init__( - self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None - ) -> None: - self.name = name_value - self.count = num_occur - self.node_link = None # Initialize node_link to None - self.parent = parent_node - self.children: dict[str, TreeNode] = {} + # def __init__( + # self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None + # ) -> None: + # self.name = name_value + # self.count = num_occur + # self.node_link = TreeNode | None # Initialize node_link to None + # self.parent = parent_node + # self.children: dict[str, TreeNode] = {} + name: str + count: int + node_link: Optional['TreeNode'] = None # Initialize node_link to None + parent: Optional["TreeNode"] = None + children: dict[str, "TreeNode"] = field(default_factory=dict) def inc(self, num_occur: int) -> None: self.count += num_occur @@ -50,7 +57,7 @@ def disp(self, ind: int = 1) -> None: def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]: """ - Create FP tree + Create Frequent Pattern tree Args: data_set (list): A list of transactions, where each transaction @@ -193,10 +200,7 @@ def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode: while node_to_test.node_link is not None: node_to_test = node_to_test.node_link if node_to_test.node_link is None: - node_to_test.node_link = TreeNode( - target_node.name, target_node.count, node_to_test - ) - # Return the updated node + node_to_test.node_link = target_node return node_to_test @@ -298,6 +302,7 @@ def mine_tree( >>> all(expected in frequent_itemsets for expected in expe_itm) True """ + new_head: Optional['TreeNode'] = None sorted_items = sorted(header_table.items(), key=lambda item_info: item_info[1][0]) big_l = [item[0] for item in sorted_items] for base_pat in big_l: @@ -311,6 +316,7 @@ def mine_tree( header_table[base_pat][1] = update_header( header_table[base_pat][1], my_cond_tree ) + my_head = new_head mine_tree(my_cond_tree, my_head, min_sup, new_freq_set, freq_item_list) From c014b1c006e1a5a7e3ba41856ad736e843df8c34 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 21 Oct 2023 09:19:07 +0000 Subject: [PATCH 4/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/frequent_pattern_growth.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/machine_learning/frequent_pattern_growth.py b/machine_learning/frequent_pattern_growth.py index fa37313be9da..3d5d6609fc53 100644 --- a/machine_learning/frequent_pattern_growth.py +++ b/machine_learning/frequent_pattern_growth.py @@ -42,7 +42,7 @@ class TreeNode: # self.children: dict[str, TreeNode] = {} name: str count: int - node_link: Optional['TreeNode'] = None # Initialize node_link to None + node_link: Optional["TreeNode"] = None # Initialize node_link to None parent: Optional["TreeNode"] = None children: dict[str, "TreeNode"] = field(default_factory=dict) @@ -302,7 +302,7 @@ def mine_tree( >>> all(expected in frequent_itemsets for expected in expe_itm) True """ - new_head: Optional['TreeNode'] = None + new_head: Optional["TreeNode"] = None sorted_items = sorted(header_table.items(), key=lambda item_info: item_info[1][0]) big_l = [item[0] for item in sorted_items] for base_pat in big_l: From 3ae692d129e5b60df7baafbd09f2005a4dc305ef Mon Sep 17 00:00:00 2001 From: Jeel Gajera Date: Sat, 21 Oct 2023 14:53:49 +0530 Subject: [PATCH 5/8] Revert "changes names" This reverts commit c0470094d01391294617df6a92734b78b470b127. --- DIRECTORY.md | 2 +- ...requent_pattern_growth.py => fp_growth.py} | 32 ++++++++----------- 2 files changed, 14 insertions(+), 20 deletions(-) rename machine_learning/{frequent_pattern_growth.py => fp_growth.py} (92%) diff --git a/DIRECTORY.md b/DIRECTORY.md index 916d993c563a..df6692fdddae 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -541,7 +541,7 @@ * [Dimensionality Reduction](machine_learning/dimensionality_reduction.py) * Forecasting * [Run](machine_learning/forecasting/run.py) - * [Frequent Pattern Growth Algorithm](machine_learning/frequent_pattern_growth.py) + * [FP Growth Algorithm](machine_learning/fp_growth.py) * [Gradient Descent](machine_learning/gradient_descent.py) * [K Means Clust](machine_learning/k_means_clust.py) * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py) diff --git a/machine_learning/frequent_pattern_growth.py b/machine_learning/fp_growth.py similarity index 92% rename from machine_learning/frequent_pattern_growth.py rename to machine_learning/fp_growth.py index fa37313be9da..356e95347079 100644 --- a/machine_learning/frequent_pattern_growth.py +++ b/machine_learning/fp_growth.py @@ -10,10 +10,8 @@ """ from typing import Optional -from dataclasses import dataclass, field -@dataclass class TreeNode: """ Initialize a TreeNode. @@ -32,19 +30,14 @@ class TreeNode: 2 """ - # def __init__( - # self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None - # ) -> None: - # self.name = name_value - # self.count = num_occur - # self.node_link = TreeNode | None # Initialize node_link to None - # self.parent = parent_node - # self.children: dict[str, TreeNode] = {} - name: str - count: int - node_link: Optional['TreeNode'] = None # Initialize node_link to None - parent: Optional["TreeNode"] = None - children: dict[str, "TreeNode"] = field(default_factory=dict) + def __init__( + self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None + ) -> None: + self.name = name_value + self.count = num_occur + self.node_link = None # Initialize node_link to None + self.parent = parent_node + self.children: dict[str, TreeNode] = {} def inc(self, num_occur: int) -> None: self.count += num_occur @@ -57,7 +50,7 @@ def disp(self, ind: int = 1) -> None: def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]: """ - Create Frequent Pattern tree + Create FP tree Args: data_set (list): A list of transactions, where each transaction @@ -200,7 +193,10 @@ def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode: while node_to_test.node_link is not None: node_to_test = node_to_test.node_link if node_to_test.node_link is None: - node_to_test.node_link = target_node + node_to_test.node_link = TreeNode( + target_node.name, target_node.count, node_to_test + ) + # Return the updated node return node_to_test @@ -302,7 +298,6 @@ def mine_tree( >>> all(expected in frequent_itemsets for expected in expe_itm) True """ - new_head: Optional['TreeNode'] = None sorted_items = sorted(header_table.items(), key=lambda item_info: item_info[1][0]) big_l = [item[0] for item in sorted_items] for base_pat in big_l: @@ -316,7 +311,6 @@ def mine_tree( header_table[base_pat][1] = update_header( header_table[base_pat][1], my_cond_tree ) - my_head = new_head mine_tree(my_cond_tree, my_head, min_sup, new_freq_set, freq_item_list) From 973ae0200870f88ea0b2b17e40abc0890582df57 Mon Sep 17 00:00:00 2001 From: Jeel Gajera Date: Sat, 21 Oct 2023 15:59:24 +0530 Subject: [PATCH 6/8] refactore code --- DIRECTORY.md | 2 +- ...p_growth.py => frequent_pattern_growth.py} | 23 +++++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) rename machine_learning/{fp_growth.py => frequent_pattern_growth.py} (95%) diff --git a/DIRECTORY.md b/DIRECTORY.md index df6692fdddae..916d993c563a 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -541,7 +541,7 @@ * [Dimensionality Reduction](machine_learning/dimensionality_reduction.py) * Forecasting * [Run](machine_learning/forecasting/run.py) - * [FP Growth Algorithm](machine_learning/fp_growth.py) + * [Frequent Pattern Growth Algorithm](machine_learning/frequent_pattern_growth.py) * [Gradient Descent](machine_learning/gradient_descent.py) * [K Means Clust](machine_learning/k_means_clust.py) * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py) diff --git a/machine_learning/fp_growth.py b/machine_learning/frequent_pattern_growth.py similarity index 95% rename from machine_learning/fp_growth.py rename to machine_learning/frequent_pattern_growth.py index 356e95347079..df37eeee2f08 100644 --- a/machine_learning/fp_growth.py +++ b/machine_learning/frequent_pattern_growth.py @@ -9,9 +9,10 @@ Examples: https://www.javatpoint.com/fp-growth-algorithm-in-data-mining """ +# from dataclasses import dataclass, field from typing import Optional - +# @dataclass class TreeNode: """ Initialize a TreeNode. @@ -31,7 +32,8 @@ class TreeNode: """ def __init__( - self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None + self, name_value: str, num_occur: int, + parent_node: Optional["TreeNode"] = None ) -> None: self.name = name_value self.count = num_occur @@ -39,6 +41,13 @@ def __init__( self.parent = parent_node self.children: dict[str, TreeNode] = {} + # name: str + # count: int + # node_link: Optional["TreeNode"] = None + # parent: Optional["TreeNode"] = None + # children: dict[str, "TreeNode"] = field(default_factory=dict) + + def inc(self, num_occur: int) -> None: self.count += num_occur @@ -50,7 +59,7 @@ def disp(self, ind: int = 1) -> None: def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]: """ - Create FP tree + Create Frequent Pattern tree Args: data_set (list): A list of transactions, where each transaction @@ -96,9 +105,7 @@ def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]: if header_table[k][0] < min_sup: del header_table[k] - freq_item_set = set(header_table.keys()) - - if len(freq_item_set) == 0: + if not (freq_item_set := set(header_table)): return TreeNode("Null Set", 1, None), {} for k in header_table: @@ -193,9 +200,7 @@ def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode: while node_to_test.node_link is not None: node_to_test = node_to_test.node_link if node_to_test.node_link is None: - node_to_test.node_link = TreeNode( - target_node.name, target_node.count, node_to_test - ) + node_to_test.node_link = target_node # Return the updated node return node_to_test From 8a1f71b812b678097cb6e962c01aa04cd6c53ee4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 21 Oct 2023 10:30:26 +0000 Subject: [PATCH 7/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/frequent_pattern_growth.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/machine_learning/frequent_pattern_growth.py b/machine_learning/frequent_pattern_growth.py index df37eeee2f08..b58ef5d45746 100644 --- a/machine_learning/frequent_pattern_growth.py +++ b/machine_learning/frequent_pattern_growth.py @@ -12,6 +12,7 @@ # from dataclasses import dataclass, field from typing import Optional + # @dataclass class TreeNode: """ @@ -32,8 +33,7 @@ class TreeNode: """ def __init__( - self, name_value: str, num_occur: int, - parent_node: Optional["TreeNode"] = None + self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None ) -> None: self.name = name_value self.count = num_occur @@ -47,7 +47,6 @@ def __init__( # parent: Optional["TreeNode"] = None # children: dict[str, "TreeNode"] = field(default_factory=dict) - def inc(self, num_occur: int) -> None: self.count += num_occur From d4ded62a0d217749b430ebac926ba1a4dc1a81ec Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sat, 21 Oct 2023 16:47:43 +0200 Subject: [PATCH 8/8] Update frequent_pattern_growth.py --- machine_learning/frequent_pattern_growth.py | 168 +++++++++++--------- 1 file changed, 89 insertions(+), 79 deletions(-) diff --git a/machine_learning/frequent_pattern_growth.py b/machine_learning/frequent_pattern_growth.py index b58ef5d45746..205d598464a1 100644 --- a/machine_learning/frequent_pattern_growth.py +++ b/machine_learning/frequent_pattern_growth.py @@ -1,27 +1,28 @@ """ -The FP-Growth (Frequent Pattern Growth) algorithm is a widely used -data mining technique for discovering frequent itemsets in -large transaction databases. -It overcomes some of the limitations of traditional methods like -Apriori by efficiently constructing the FP-Tree +The Frequent Pattern Growth algorithm (FP-Growth) is a widely used data mining +technique for discovering frequent itemsets in large transaction databases. + +It overcomes some of the limitations of traditional methods such as Apriori by +efficiently constructing the FP-Tree WIKI: https://athena.ecs.csus.edu/~mei/associationcw/FpGrowth.html + Examples: https://www.javatpoint.com/fp-growth-algorithm-in-data-mining """ +from __future__ import annotations -# from dataclasses import dataclass, field -from typing import Optional +from dataclasses import dataclass, field -# @dataclass +@dataclass class TreeNode: """ - Initialize a TreeNode. + A node in a Frequent Pattern tree. Args: - name_value (str): The name of the node. - num_occur (int): The number of occurrences of the node. - parent_node (TreeNode): The parent node. + name: The name of this node. + num_occur: The number of occurrences of the node. + parent_node: The parent node. Example: >>> parent = TreeNode("Parent", 1, None) @@ -32,26 +33,20 @@ class TreeNode: 2 """ - def __init__( - self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None - ) -> None: - self.name = name_value - self.count = num_occur - self.node_link = None # Initialize node_link to None - self.parent = parent_node - self.children: dict[str, TreeNode] = {} - - # name: str - # count: int - # node_link: Optional["TreeNode"] = None - # parent: Optional["TreeNode"] = None - # children: dict[str, "TreeNode"] = field(default_factory=dict) + name: str + count: int + parent: TreeNode | None = None + children: dict[str, TreeNode] = field(default_factory=dict) + node_link: TreeNode | None = None + + def __repr__(self) -> str: + return f"TreeNode({self.name!r}, {self.count!r}, {self.parent!r})" def inc(self, num_occur: int) -> None: self.count += num_occur def disp(self, ind: int = 1) -> None: - print(" " * ind, self.name, " ", self.count) + print(f"{' ' * ind} {self.name} {self.count}") for child in self.children.values(): child.disp(ind + 1) @@ -61,14 +56,13 @@ def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]: Create Frequent Pattern tree Args: - data_set (list): A list of transactions, where each transaction - is a list of items. - min_sup (int, optional): The minimum support threshold. + data_set: A list of transactions, where each transaction is a list of items. + min_sup: The minimum support threshold. Items with support less than this will be pruned. Default is 1. Returns: - TreeNode: The root of the FP-Tree. - dict: The header table. + The root of the FP-Tree. + header_table: The header table dictionary with item information. Example: >>> data_set = [ @@ -80,19 +74,24 @@ def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]: ... ] >>> min_sup = 2 >>> fp_tree, header_table = create_tree(data_set, min_sup) - - >>> sorted(list(header_table.keys())) + >>> fp_tree + TreeNode('Null Set', 1, None) + >>> len(header_table) + 4 + >>> header_table["A"] + [[4, None], TreeNode('A', 4, TreeNode('Null Set', 1, None))] + >>> header_table["E"][1] # doctest: +NORMALIZE_WHITESPACE + TreeNode('E', 1, TreeNode('B', 3, TreeNode('A', 4, TreeNode('Null Set', 1, None)))) + >>> sorted(header_table) ['A', 'B', 'C', 'E'] - >>> fp_tree.name 'Null Set' - >>> sorted(fp_tree.children.keys()) + >>> sorted(fp_tree.children) ['A', 'B'] >>> fp_tree.children['A'].name 'A' - >>> sorted(fp_tree.children['A'].children.keys()) + >>> sorted(fp_tree.children['A'].children) ['B', 'C'] - """ header_table: dict = {} for trans in data_set: @@ -100,7 +99,7 @@ def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]: header_table[item] = header_table.get(item, [0, None]) header_table[item][0] += 1 - for k in list(header_table.keys()): + for k in list(header_table): if header_table[k][0] < min_sup: del header_table[k] @@ -112,11 +111,10 @@ def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]: fp_tree = TreeNode("Null Set", 1, None) # Parent is None for the root node for tran_set in data_set: - local_d = {} - for item in tran_set: - if item in freq_item_set: - local_d[item] = header_table[item][0] - if len(local_d) > 0: + local_d = { + item: header_table[item][0] for item in tran_set if item in freq_item_set + } + if local_d: sorted_items = sorted( local_d.items(), key=lambda item_info: item_info[1], reverse=True ) @@ -131,10 +129,10 @@ def update_tree(items: list, in_tree: TreeNode, header_table: dict, count: int) Update the FP-Tree with a transaction. Args: - items (list): List of items in the transaction. - in_tree (TreeNode): The current node in the FP-Tree. - header_table (dict): The header table with item information. - count (int): The count of the transaction. + items: List of items in the transaction. + in_tree: The current node in the FP-Tree. + header_table: The header table dictionary with item information. + count: The count of the transaction. Example: >>> data_set = [ @@ -146,12 +144,14 @@ def update_tree(items: list, in_tree: TreeNode, header_table: dict, count: int) ... ] >>> min_sup = 2 >>> fp_tree, header_table = create_tree(data_set, min_sup) - + >>> fp_tree + TreeNode('Null Set', 1, None) >>> transaction = ['A', 'B', 'E'] >>> update_tree(transaction, fp_tree, header_table, 1) - - >>> sorted(fp_tree.children['A'].children['B'].children['E'].children.keys()) - [] + >>> fp_tree + TreeNode('Null Set', 1, None) + >>> fp_tree.children['A'].children['B'].children['E'].children + {} >>> fp_tree.children['A'].children['B'].children['E'].count 2 >>> header_table['E'][1].name @@ -174,8 +174,8 @@ def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode: Update the header table with a node link. Args: - node_to_test (TreeNode): The node to be updated in the header table. - target_node (TreeNode): The node to link to. + node_to_test: The node to be updated in the header table. + target_node: The node to link to. Example: >>> data_set = [ @@ -187,12 +187,17 @@ def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode: ... ] >>> min_sup = 2 >>> fp_tree, header_table = create_tree(data_set, min_sup) - + >>> fp_tree + TreeNode('Null Set', 1, None) >>> node1 = TreeNode("A", 3, None) >>> node2 = TreeNode("B", 4, None) + >>> node1 + TreeNode('A', 3, None) >>> node1 = update_header(node1, node2) - >>> node1.node_link.name - 'B' + >>> node1 + TreeNode('A', 3, None) + >>> node1.node_link + TreeNode('B', 4, None) >>> node2.node_link is None True """ @@ -204,14 +209,14 @@ def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode: return node_to_test -def ascend_tree(leaf_node: TreeNode, prefix_path: list) -> None: +def ascend_tree(leaf_node: TreeNode, prefix_path: list[str]) -> None: """ - Ascend the FP-Tree from a leaf node to its root, - adding item names to the prefix path. + Ascend the FP-Tree from a leaf node to its root, adding item names to the prefix + path. Args: - leaf_node (TreeNode): The leaf node to start ascending from. - prefix_path (list): A list to store the item as they are ascended. + leaf_node: The leaf node to start ascending from. + prefix_path: A list to store the item as they are ascended. Example: >>> data_set = [ @@ -239,9 +244,8 @@ def find_prefix_path(base_pat: frozenset, tree_node: TreeNode | None) -> dict: Find the conditional pattern base for a given base pattern. Args: - base_pat (frozenset): The base pattern for which to find - the conditional pattern base. - tree_node (TreeNode): The node in the FP-Tree. + base_pat: The base pattern for which to find the conditional pattern base. + tree_node: The node in the FP-Tree. Example: >>> data_set = [ @@ -253,9 +257,12 @@ def find_prefix_path(base_pat: frozenset, tree_node: TreeNode | None) -> dict: ... ] >>> min_sup = 2 >>> fp_tree, header_table = create_tree(data_set, min_sup) + >>> fp_tree + TreeNode('Null Set', 1, None) + >>> len(header_table) + 4 >>> base_pattern = frozenset(['A']) - >>> cond_pat = find_prefix_path(base_pattern, fp_tree.children['A']) - >>> sorted(cond_pat.keys()) + >>> sorted(find_prefix_path(base_pattern, fp_tree.children['A'])) [] """ cond_pats: dict = {} @@ -279,11 +286,11 @@ def mine_tree( Mine the FP-Tree recursively to discover frequent itemsets. Args: - in_tree (TreeNode): The FP-Tree to mine. - header_table (dict): The header table with item information. - min_sup (int): The minimum support threshold. - pre_fix (set): A set of items as a prefix for the itemsets being mined. - freq_item_list (list): A list to store the frequent itemsets. + in_tree: The FP-Tree to mine. + header_table: The header table dictionary with item information. + min_sup: The minimum support threshold. + pre_fix: A set of items as a prefix for the itemsets being mined. + freq_item_list: A list to store the frequent itemsets. Example: >>> data_set = [ @@ -295,7 +302,8 @@ def mine_tree( ... ] >>> min_sup = 2 >>> fp_tree, header_table = create_tree(data_set, min_sup) - + >>> fp_tree + TreeNode('Null Set', 1, None) >>> frequent_itemsets = [] >>> mine_tree(fp_tree, header_table, min_sup, set([]), frequent_itemsets) >>> expe_itm = [{'C'}, {'C', 'A'}, {'E'}, {'A', 'E'}, {'E', 'B'}, {'A'}, {'B'}] @@ -309,7 +317,7 @@ def mine_tree( new_freq_set.add(base_pat) freq_item_list.append(new_freq_set) cond_patt_bases = find_prefix_path(base_pat, header_table[base_pat][1]) - my_cond_tree, my_head = create_tree(list(cond_patt_bases.keys()), min_sup) + my_cond_tree, my_head = create_tree(list(cond_patt_bases), min_sup) if my_head is not None: # Pass header_table[base_pat][1] as node_to_test to update_header header_table[base_pat][1] = update_header( @@ -319,11 +327,10 @@ def mine_tree( if __name__ == "__main__": - import doctest - - doctest.testmod() + from doctest import testmod - data_set: list = [ + testmod() + data_set: list[frozenset] = [ frozenset(["bread", "milk", "cheese"]), frozenset(["bread", "milk"]), frozenset(["bread", "diapers"]), @@ -333,7 +340,10 @@ def mine_tree( frozenset(["diapers", "cheese"]), frozenset(["bread", "milk", "cheese", "diapers"]), ] + print(f"{len(data_set) = }") fp_tree, header_table = create_tree(data_set, min_sup=3) + print(f"{fp_tree = }") + print(f"{len(header_table) = }") freq_items: list = [] mine_tree(fp_tree, header_table, 3, set(), freq_items) - print(freq_items) + print(f"{freq_items = }")