Skip to content

Commit 0ea8df7

Browse files
author
Jeel Gajera
committed
Add: FP Growth Algorithm
1 parent 5645084 commit 0ea8df7

File tree

2 files changed

+334
-0
lines changed

2 files changed

+334
-0
lines changed

Diff for: DIRECTORY.md

+1
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,7 @@
541541
* [Dimensionality Reduction](machine_learning/dimensionality_reduction.py)
542542
* Forecasting
543543
* [Run](machine_learning/forecasting/run.py)
544+
* [FP Growth Algorithm](machine_learning/fp_growth.py)
544545
* [Gradient Descent](machine_learning/gradient_descent.py)
545546
* [K Means Clust](machine_learning/k_means_clust.py)
546547
* [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py)

Diff for: machine_learning/fp_growth.py

+333
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,333 @@
1+
"""
2+
The FP-Growth (Frequent Pattern Growth) algorithm is a widely used
3+
data mining technique for discovering frequent itemsets in
4+
large transaction databases.
5+
It overcomes some of the limitations of traditional methods like
6+
Apriori by efficiently constructing the FP-Tree.
7+
8+
WIKI: https://athena.ecs.csus.edu/~mei/associationcw/FpGrowth.html
9+
Examples: https://www.javatpoint.com/fp-growth-algorithm-in-data-mining
10+
"""
11+
12+
from typing import Optional
13+
14+
15+
class TreeNode:
    """
    One node of an FP-Tree.

    Args:
        name_value (str): The item label stored in the node.
        num_occur (int): The starting occurrence count of the node.
        parent_node (TreeNode): The node directly above this one
            (None for a root).

    Example:
    >>> parent = TreeNode("Parent", 1, None)
    >>> child = TreeNode("Child", 2, parent)
    >>> child.name
    'Child'
    >>> child.count
    2
    """

    def __init__(
        self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None
    ) -> None:
        self.name = name_value
        self.count = num_occur
        # Pointer to the next node carrying the same item; unset on creation.
        self.node_link: Optional["TreeNode"] = None
        self.parent = parent_node
        # Child nodes keyed by their item name.
        self.children: dict[str, TreeNode] = {}

    def inc(self, num_occur: int) -> None:
        """Add ``num_occur`` to this node's occurrence count."""
        self.count = self.count + num_occur

    def disp(self, ind: int = 1) -> None:
        """Print this node and, one indent level deeper, all its descendants."""
        indent = " " * ind
        print(indent, self.name, " ", self.count)
        deeper = ind + 1
        for name in self.children:
            self.children[name].disp(deeper)
49+
50+
51+
def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]:
    """
    Create FP tree

    Two passes are made over ``data_set``: the first counts how often each
    item occurs, the second inserts every transaction (restricted to its
    frequent items) into the tree.

    Args:
        data_set (list): A list of transactions, where each transaction
            is an iterable of hashable items.
        min_sup (int, optional): The minimum support threshold.
            Items with support less than this will be pruned. Default is 1.

    Returns:
        TreeNode: The root of the FP-Tree (named 'Null Set').
        dict: The header table. Each frequent item maps to
            ``[[support, None], first_node]`` where ``first_node`` is the
            head of that item's node-link chain. The table is empty when
            no item reaches ``min_sup``.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)

    >>> sorted(list(header_table.keys()))
    ['A', 'B', 'C', 'E']

    >>> fp_tree.name
    'Null Set'
    >>> sorted(fp_tree.children.keys())
    ['A', 'B']
    >>> fp_tree.children['A'].name
    'A'
    >>> sorted(fp_tree.children['A'].children.keys())
    ['B', 'C']

    """
    # Pass 1: support of every item (an item repeated inside one transaction
    # is counted once per occurrence, as before).
    support: dict = {}
    for trans in data_set:
        for item in trans:
            support[item] = support.get(item, 0) + 1

    # Keep only frequent items. The nested entry shape is what the rest of
    # the module (and existing callers) index into, so it is preserved.
    header_table: dict = {
        item: [[count, None], None]
        for item, count in support.items()
        if count >= min_sup
    }
    if not header_table:
        return TreeNode("Null Set", 1, None), {}

    # One global insertion order for the whole tree: descending support, ties
    # broken by first appearance in the data set (sorted() is stable).
    # Ordering each transaction independently would let equally frequent
    # items swap places between transactions (notably for set/frozenset
    # transactions), splitting the support of one itemset across branches.
    by_support = sorted(
        header_table, key=lambda item: header_table[item][0][0], reverse=True
    )
    rank = {item: position for position, item in enumerate(by_support)}

    fp_tree = TreeNode("Null Set", 1, None)  # Parent is None for the root node
    for tran_set in data_set:
        # The set drops infrequent items and duplicates; ranks are unique, so
        # the sorted result is deterministic.
        ordered_items = sorted(
            {item for item in tran_set if item in rank},
            key=lambda item: rank[item],
        )
        if ordered_items:
            update_tree(ordered_items, fp_tree, header_table, 1)

    return fp_tree, header_table
121+
122+
123+
def update_tree(items: list, in_tree: TreeNode, header_table: dict, count: int) -> None:
    """
    Update the FP-Tree with a transaction.

    Walks down from ``in_tree`` along ``items``, incrementing nodes that
    already exist and creating the missing ones. Every newly created node is
    registered in ``header_table``: as the head of the item's chain when the
    item has no node yet, otherwise appended through ``update_header``.
    An empty ``items`` list leaves the tree untouched.

    Args:
        items (list): List of items in the transaction, already in the order
            in which they must appear on the path.
        in_tree (TreeNode): The node of the FP-Tree to insert below.
        header_table (dict): The header table with item information; every
            item in ``items`` must have an entry.
        count (int): The count of the transaction.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)

    >>> transaction = ['A', 'B', 'E']
    >>> update_tree(transaction, fp_tree, header_table, 1)

    >>> sorted(fp_tree.children['A'].children['B'].children['E'].children.keys())
    []
    >>> fp_tree.children['A'].children['B'].children['E'].count
    2
    >>> header_table['E'][1].name
    'E'
    """
    # Iterative descent: no IndexError on an empty transaction, no list
    # re-slicing per level and no recursion-depth limit on long transactions.
    current = in_tree
    for item in items:
        child = current.children.get(item)
        if child is not None:
            child.inc(count)
        else:
            child = TreeNode(item, count, current)
            current.children[item] = child
            if header_table[item][1] is None:
                # First node for this item: it becomes the chain head.
                header_table[item][1] = child
            else:
                update_header(header_table[item][1], child)
        current = child
164+
165+
166+
def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode:
    """
    Append a node to the end of a node-link chain.

    Follows ``node_link`` from ``node_to_test`` to the last node of the chain
    and makes that node point at ``target_node`` itself. Linking the real
    node (rather than a copy) matters: code that follows the chain then
    reaches the node that actually sits in the tree, with its true parent
    and its live count.

    Args:
        node_to_test (TreeNode): Any node of the chain, normally its head as
            stored in the header table.
        target_node (TreeNode): The node to link to.

    Returns:
        TreeNode: The node that was last in the chain before the call, i.e.
            the one whose ``node_link`` now refers to ``target_node``.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)

    >>> node1 = TreeNode("A", 3, None)
    >>> node2 = TreeNode("B", 4, None)
    >>> node1 = update_header(node1, node2)
    >>> node1.node_link.name
    'B'
    >>> node2.node_link is None
    True
    """
    # Walk to the tail of the chain.
    while node_to_test.node_link is not None:
        node_to_test = node_to_test.node_link
    node_to_test.node_link = target_node
    # Return the updated node
    return node_to_test
199+
200+
201+
def ascend_tree(leaf_node: TreeNode, prefix_path: list) -> None:
    """
    Climb from a node towards the root, recording the item names met.

    The name of ``leaf_node`` is recorded first, then each ancestor in turn.
    The parentless root is never recorded.

    Args:
        leaf_node (TreeNode): The node to start ascending from.
        prefix_path (list): Output list; names are appended to it in place.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)

    >>> path = []
    >>> ascend_tree(fp_tree.children['A'], path)
    >>> path # ascending from a leaf node 'A'
    ['A']
    """
    current = leaf_node
    # A node without a parent is the root, which terminates the climb.
    while current.parent is not None:
        prefix_path.append(current.name)
        current = current.parent
229+
230+
231+
def find_prefix_path(base_pat: frozenset, tree_node: TreeNode | None) -> dict:
    """
    Find the conditional pattern base reachable from a node-link chain.

    Starting at ``tree_node`` and following ``node_link``, the path from each
    node up to the root is collected. The items strictly above the node form
    one entry of the result, weighted by the node's count.

    Args:
        base_pat (frozenset): The base pattern the chain belongs to. It is
            not read by this function and is kept only so existing callers
            continue to work (``mine_tree`` passes a single item name).
        tree_node (TreeNode): The first node of the chain, or None for an
            empty chain.

    Returns:
        dict: Maps each non-empty prefix path (as a frozenset of item names)
            to its accumulated count.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)
    >>> base_pattern = frozenset(['A'])
    >>> cond_pat = find_prefix_path(base_pattern, fp_tree.children['A'])
    >>> sorted(cond_pat.keys())
    []
    """
    cond_pats: dict = {}
    while tree_node is not None:
        prefix_path: list = []
        ascend_tree(tree_node, prefix_path)
        # prefix_path[0] is the node's own item; only its ancestors matter.
        if len(prefix_path) > 1:
            path_key = frozenset(prefix_path[1:])
            # Accumulate instead of overwrite: two nodes can reduce to the
            # same item set, and both counts contribute to its support.
            cond_pats[path_key] = cond_pats.get(path_key, 0) + tree_node.count
        tree_node = tree_node.node_link
    return cond_pats
263+
264+
265+
def mine_tree(
    in_tree: TreeNode,
    header_table: dict,
    min_sup: int,
    pre_fix: set,
    freq_item_list: list,
) -> None:
    """
    Mine the FP-Tree recursively to discover frequent itemsets.

    For every item of ``header_table`` (least frequent first) the itemset
    ``pre_fix`` + item is recorded. The item's conditional pattern base is
    then turned into a conditional FP-Tree, which is mined in turn with the
    extended prefix.

    Args:
        in_tree (TreeNode): The FP-Tree to mine. The tree is reached through
            the node links stored in ``header_table``; this argument itself
            is not read.
        header_table (dict): The header table with item information.
        min_sup (int): The minimum support threshold.
        pre_fix (set): A set of items as a prefix for the itemsets being mined.
        freq_item_list (list): A list to store the frequent itemsets;
            results are appended in place.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)

    >>> frequent_itemsets = []
    >>> mine_tree(fp_tree, header_table, min_sup, set([]), frequent_itemsets)
    >>> expe_itm = [{'C'}, {'C', 'A'}, {'E'}, {'A', 'E'}, {'E', 'B'}, {'A'}, {'B'}]
    >>> all(expected in frequent_itemsets for expected in expe_itm)
    True
    """
    sorted_items = sorted(header_table.items(), key=lambda item_info: item_info[1][0])
    for base_pat in [item for item, _ in sorted_items]:
        new_freq_set = pre_fix.copy()
        new_freq_set.add(base_pat)
        freq_item_list.append(new_freq_set)

        cond_patt_bases = find_prefix_path(base_pat, header_table[base_pat][1])
        # create_tree counts every transaction once, so each prefix path is
        # repeated as many times as it was observed. Passing only the keys
        # would drop the counts and compare min_sup against the number of
        # distinct paths instead of the real support.
        weighted_paths = [
            path for path, count in cond_patt_bases.items() for _ in range(count)
        ]
        my_cond_tree, my_head = create_tree(weighted_paths, min_sup)

        # An empty header table means nothing in the conditional base is
        # frequent, so there is nothing further to mine for this prefix.
        # The caller's header table and node links are left untouched.
        if my_head:
            mine_tree(my_cond_tree, my_head, min_sup, new_freq_set, freq_item_list)
313+
314+
315+
if __name__ == "__main__":
    import doctest

    doctest.testmod()

    # Small market-basket demo: one tuple per transaction.
    baskets = (
        ("bread", "milk", "cheese"),
        ("bread", "milk"),
        ("bread", "diapers"),
        ("bread", "milk", "diapers"),
        ("milk", "diapers"),
        ("milk", "cheese"),
        ("diapers", "cheese"),
        ("bread", "milk", "cheese", "diapers"),
    )
    data_set: list = [frozenset(basket) for basket in baskets]

    # The same threshold is used to build the tree and to mine it.
    min_support = 3
    fp_tree, header_table = create_tree(data_set, min_sup=min_support)
    freq_items: list = []
    mine_tree(fp_tree, header_table, min_support, set(), freq_items)
    print(freq_items)

0 commit comments

Comments
 (0)