Skip to content

Commit e582f61

Browse files
authored
Merge pull request #1 from TheAlgorithms/master
Add: FP Growth Algorithm (#10746)
2 parents 5645084 + 47c19d9 commit e582f61

File tree

2 files changed

+350
-0
lines changed

2 files changed

+350
-0
lines changed

Diff for: DIRECTORY.md

+1
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,7 @@
541541
* [Dimensionality Reduction](machine_learning/dimensionality_reduction.py)
542542
* Forecasting
543543
* [Run](machine_learning/forecasting/run.py)
544+
* [Frequent Pattern Growth Algorithm](machine_learning/frequent_pattern_growth.py)
544545
* [Gradient Descent](machine_learning/gradient_descent.py)
545546
* [K Means Clust](machine_learning/k_means_clust.py)
546547
* [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py)

Diff for: machine_learning/frequent_pattern_growth.py

+349
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,349 @@
1+
"""
2+
The Frequent Pattern Growth algorithm (FP-Growth) is a widely used data mining
3+
technique for discovering frequent itemsets in large transaction databases.
4+
5+
It overcomes some of the limitations of traditional methods such as Apriori by
6+
efficiently constructing the FP-Tree, a compact prefix tree of the transactions
that is mined without generating candidate itemsets.
7+
8+
WIKI: https://athena.ecs.csus.edu/~mei/associationcw/FpGrowth.html
9+
10+
Examples: https://www.javatpoint.com/fp-growth-algorithm-in-data-mining
11+
"""
12+
from __future__ import annotations
13+
14+
from dataclasses import dataclass, field
15+
16+
17+
@dataclass
class TreeNode:
    """
    A node in a Frequent Pattern tree.

    Args:
        name: The item label stored in this node.
        count: The number of occurrences recorded for this node.
        parent: The parent node, or None for the root.
        children: Child nodes keyed by their item name.
        node_link: Link to another node; update_header uses it to chain nodes.

    Equality compares ``name``, ``count`` and ``children`` (the subtree owned by
    the node). ``parent`` and ``node_link`` are references to nodes owned
    elsewhere and are left out of the comparison; including ``parent`` would make
    ``==`` recurse forever through the parent <-> child cycle.

    Example:
    >>> parent = TreeNode("Parent", 1, None)
    >>> child = TreeNode("Child", 2, parent)
    >>> child.name
    'Child'
    >>> child.count
    2
    >>> left = TreeNode("Null Set", 1)
    >>> left.children["A"] = TreeNode("A", 2, left)
    >>> right = TreeNode("Null Set", 1)
    >>> right.children["A"] = TreeNode("A", 2, right)
    >>> left == right
    True
    """

    name: str
    count: int
    # Back reference: excluded from ``==`` to keep comparison free of cycles.
    parent: TreeNode | None = field(default=None, compare=False)
    children: dict[str, TreeNode] = field(default_factory=dict)
    # Cross reference into another branch: not part of this node's own subtree.
    node_link: TreeNode | None = field(default=None, compare=False)

    def __repr__(self) -> str:
        # The parent is rendered recursively, so the repr shows the path to the root.
        return f"TreeNode({self.name!r}, {self.count!r}, {self.parent!r})"

    def inc(self, num_occur: int) -> None:
        """Add ``num_occur`` to this node's count."""
        self.count += num_occur

    def disp(self, ind: int = 1) -> None:
        """Print this node and, one indent level deeper, each of its children."""
        print(f"{' ' * ind} {self.name} {self.count}")
        for child in self.children.values():
            child.disp(ind + 1)
52+
53+
54+
def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]:
    """
    Create Frequent Pattern tree

    Args:
        data_set: A list of transactions, where each transaction is an iterable
            of hashable items.
        min_sup: The minimum support threshold.
            Items with support less than this will be pruned. Default is 1.

    Returns:
        A tuple ``(fp_tree, header_table)``.
        fp_tree: The root of the FP-Tree (a node named 'Null Set').
        header_table: Maps each frequent item to ``[[support, None], first_node]``
            where ``first_node`` starts that item's node-link chain. It is an
            empty dict when no item reaches ``min_sup``.

    Every transaction is inserted using one global item order (descending
    support, ties broken by first appearance in ``data_set``), so transactions
    that share items always share the same tree path.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)
    >>> fp_tree
    TreeNode('Null Set', 1, None)
    >>> len(header_table)
    4
    >>> header_table["A"]
    [[4, None], TreeNode('A', 4, TreeNode('Null Set', 1, None))]
    >>> header_table["E"][1]  # doctest: +NORMALIZE_WHITESPACE
    TreeNode('E', 1, TreeNode('B', 3, TreeNode('A', 4, TreeNode('Null Set', 1, None))))
    >>> sorted(header_table)
    ['A', 'B', 'C', 'E']
    >>> fp_tree.name
    'Null Set'
    >>> sorted(fp_tree.children)
    ['A', 'B']
    >>> fp_tree.children['A'].name
    'A'
    >>> sorted(fp_tree.children['A'].children)
    ['B', 'C']

    Items with equal support keep one consistent order across transactions:
    >>> tied_tree, _ = create_tree([['A', 'B'], ['B', 'A']])
    >>> sorted(tied_tree.children)
    ['A']
    >>> tied_tree.children['A'].children['B'].count
    2
    """
    # First pass: count how often every item occurs.
    header_table: dict = {}
    for trans in data_set:
        for item in trans:
            header_table[item] = header_table.get(item, [0, None])
            header_table[item][0] += 1

    # Drop items that do not reach the support threshold.
    for k in list(header_table):
        if header_table[k][0] < min_sup:
            del header_table[k]

    if not header_table:
        return TreeNode("Null Set", 1, None), {}

    # One global ranking for all transactions. ``sorted`` is stable, so items with
    # equal support keep their first-appearance order instead of depending on the
    # iteration order of each individual transaction.
    items_by_support = sorted(
        header_table, key=lambda item: header_table[item][0], reverse=True
    )
    item_rank = {item: position for position, item in enumerate(items_by_support)}

    # Reshape every entry to [[support, None], first_node]; callers rely on it.
    for k in header_table:
        header_table[k] = [header_table[k], None]

    # Second pass: insert each transaction's frequent items in ranked order.
    fp_tree = TreeNode("Null Set", 1, None)  # Parent is None for the root node
    for tran_set in data_set:
        # dict.fromkeys removes repeated items while keeping their order.
        frequent_items = dict.fromkeys(
            item for item in tran_set if item in item_rank
        )
        if frequent_items:
            ordered_items = sorted(frequent_items, key=item_rank.__getitem__)
            update_tree(ordered_items, fp_tree, header_table, 1)

    return fp_tree, header_table
125+
126+
127+
def update_tree(items: list, in_tree: TreeNode, header_table: dict, count: int) -> None:
    """
    Update the FP-Tree with a transaction.

    Walks down from ``in_tree`` one item at a time. An existing child has its
    count increased by ``count``; a missing child is created and appended to
    that item's node-link chain in ``header_table``. An empty ``items`` list
    leaves the tree untouched.

    Args:
        items: List of items in the transaction, already in insertion order.
        in_tree: The node in the FP-Tree to start inserting below.
        header_table: The header table dictionary with item information.
            ``header_table[item][1]`` is read and, if None, set to the new node.
        count: The count of the transaction.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)
    >>> fp_tree
    TreeNode('Null Set', 1, None)
    >>> transaction = ['A', 'B', 'E']
    >>> update_tree(transaction, fp_tree, header_table, 1)
    >>> fp_tree
    TreeNode('Null Set', 1, None)
    >>> fp_tree.children['A'].children['B'].children['E'].children
    {}
    >>> fp_tree.children['A'].children['B'].children['E'].count
    2
    >>> header_table['E'][1].name
    'E'
    >>> update_tree([], fp_tree, header_table, 1)
    >>> fp_tree.children['A'].count
    5
    """
    # Iterating avoids both the IndexError on an empty list and the repeated
    # ``items[1:]`` copies of a recursive descent.
    current = in_tree
    for item in items:
        child = current.children.get(item)
        if child is not None:
            child.inc(count)
        else:
            child = TreeNode(item, count, current)
            current.children[item] = child
            # Register the new node: either as the head of the item's chain or
            # at the end of the existing chain.
            if header_table[item][1] is None:
                header_table[item][1] = child
            else:
                update_header(header_table[item][1], child)
        current = child
170+
171+
172+
def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode:
    """
    Append a node to the end of a node-link chain.

    Follows ``node_link`` from ``node_to_test`` until the last node of the chain
    is reached, then points that last node at ``target_node``.

    Args:
        node_to_test: Any node of the chain; the walk starts here.
        target_node: The node to attach at the end of the chain.

    Returns:
        The node that was last in the chain before ``target_node`` was attached.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)
    >>> fp_tree
    TreeNode('Null Set', 1, None)
    >>> node1 = TreeNode("A", 3, None)
    >>> node2 = TreeNode("B", 4, None)
    >>> node1
    TreeNode('A', 3, None)
    >>> node1 = update_header(node1, node2)
    >>> node1
    TreeNode('A', 3, None)
    >>> node1.node_link
    TreeNode('B', 4, None)
    >>> node2.node_link is None
    True
    """
    tail = node_to_test
    # Advance until ``tail`` has no successor.
    while (successor := tail.node_link) is not None:
        tail = successor
    tail.node_link = target_node
    return tail
210+
211+
212+
def ascend_tree(leaf_node: TreeNode, prefix_path: list[str]) -> None:
    """
    Collect item names while climbing from a node towards the root.

    Starting at ``leaf_node``, the name of every node that has a parent is
    appended to ``prefix_path``; the climb stops at the node whose parent is
    None, and that node's name is not recorded.

    Args:
        leaf_node: The node to start ascending from.
        prefix_path: A list that receives the item names, start node first.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)

    >>> path = []
    >>> ascend_tree(fp_tree.children['A'], path)
    >>> path # ascending from a leaf node 'A'
    ['A']
    """
    node = leaf_node
    while node.parent is not None:
        prefix_path.append(node.name)
        node = node.parent
240+
241+
242+
def find_prefix_path(base_pat: frozenset, tree_node: TreeNode | None) -> dict:
    """
    Find the conditional pattern base for a given base pattern.

    Follows the node-link chain that starts at ``tree_node``. For every node in
    the chain, the items on the path between that node and the root (the node
    itself excluded) form one prefix path, weighted by the node's count.

    Args:
        base_pat: The base pattern for which to find the conditional pattern base.
            It is not read by this function; the chain is selected by
            ``tree_node`` alone.
        tree_node: The first node of the node-link chain, or None.

    Returns:
        A dict mapping each prefix path (as a frozenset of item names) to its
        total count. Nodes with no ancestors below the root contribute nothing.
        Counts of nodes that share the same ancestor set are added together.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)
    >>> fp_tree
    TreeNode('Null Set', 1, None)
    >>> len(header_table)
    4
    >>> base_pattern = frozenset(['A'])
    >>> sorted(find_prefix_path(base_pattern, fp_tree.children['A']))
    []
    >>> paths = find_prefix_path('C', header_table['C'][1])
    >>> sorted((sorted(path), count) for path, count in paths.items())
    [(['A'], 1), (['A', 'B'], 2)]
    """
    cond_pats: dict = {}
    while tree_node is not None:
        prefix_path: list = []
        ascend_tree(tree_node, prefix_path)
        # prefix_path[0] is the node itself; only its ancestors form the prefix.
        if len(prefix_path) > 1:
            ancestors = frozenset(prefix_path[1:])
            # Accumulate rather than assign: two chained nodes can have the same
            # ancestor set, and overwriting would lose one node's count.
            cond_pats[ancestors] = cond_pats.get(ancestors, 0) + tree_node.count
        tree_node = tree_node.node_link
    return cond_pats
276+
277+
278+
def mine_tree(
    in_tree: TreeNode,
    header_table: dict,
    min_sup: int,
    pre_fix: set,
    freq_item_list: list,
) -> None:
    """
    Mine the FP-Tree recursively to discover frequent itemsets.

    For every item in ``header_table`` (lowest support first) the itemset
    ``pre_fix | {item}`` is appended to ``freq_item_list``. The item's
    conditional pattern base is then turned into a conditional FP-Tree, which is
    mined in turn with the enlarged prefix.

    Args:
        in_tree: The FP-Tree to mine. It is not read directly; all access goes
            through ``header_table``.
        header_table: The header table dictionary with item information.
        min_sup: The minimum support threshold.
        pre_fix: A set of items as a prefix for the itemsets being mined.
        freq_item_list: A list to store the frequent itemsets; extended in place.

    Example:
    >>> data_set = [
    ...    ['A', 'B', 'C'],
    ...    ['A', 'C'],
    ...    ['A', 'B', 'E'],
    ...    ['A', 'B', 'C', 'E'],
    ...    ['B', 'E']
    ... ]
    >>> min_sup = 2
    >>> fp_tree, header_table = create_tree(data_set, min_sup)
    >>> fp_tree
    TreeNode('Null Set', 1, None)
    >>> frequent_itemsets = []
    >>> mine_tree(fp_tree, header_table, min_sup, set([]), frequent_itemsets)
    >>> expe_itm = [{'C'}, {'C', 'A'}, {'E'}, {'A', 'E'}, {'E', 'B'}, {'A'}, {'B'}]
    >>> all(expected in frequent_itemsets for expected in expe_itm)
    True
    >>> {'A', 'B', 'C'} in frequent_itemsets
    True
    >>> len(frequent_itemsets)
    11
    """
    sorted_items = sorted(header_table.items(), key=lambda item_info: item_info[1][0])
    big_l = [item[0] for item in sorted_items]
    for base_pat in big_l:
        new_freq_set = pre_fix.copy()
        new_freq_set.add(base_pat)
        freq_item_list.append(new_freq_set)
        cond_patt_bases = find_prefix_path(base_pat, header_table[base_pat][1])
        # create_tree counts every transaction once, so a prefix path that was
        # seen ``support`` times is passed ``support`` times. Passing only the
        # keys would treat every path as a single occurrence and under-count.
        weighted_paths = [
            path for path, support in cond_patt_bases.items() for _ in range(support)
        ]
        my_cond_tree, my_head = create_tree(weighted_paths, min_sup)
        # An empty header means no item of the conditional base is frequent.
        # The caller's header table is left unmodified.
        if my_head:
            mine_tree(my_cond_tree, my_head, min_sup, new_freq_set, freq_item_list)
327+
328+
329+
if __name__ == "__main__":
    from doctest import testmod

    testmod()

    # Demo: mine a small shopping-basket data set with a minimum support of 3.
    baskets = (
        ("bread", "milk", "cheese"),
        ("bread", "milk"),
        ("bread", "diapers"),
        ("bread", "milk", "diapers"),
        ("milk", "diapers"),
        ("milk", "cheese"),
        ("diapers", "cheese"),
        ("bread", "milk", "cheese", "diapers"),
    )
    min_support = 3
    data_set: list[frozenset] = [frozenset(basket) for basket in baskets]
    print(f"{len(data_set) = }")

    fp_tree, header_table = create_tree(data_set, min_sup=min_support)
    print(f"{fp_tree = }")
    print(f"{len(header_table) = }")

    # mine_tree fills this list in place.
    freq_items: list = []
    mine_tree(fp_tree, header_table, min_support, set(), freq_items)
    print(f"{freq_items = }")

0 commit comments

Comments
 (0)