From 0d48203ecfe036c0e5d424a3080b6e1943ad6f8d Mon Sep 17 00:00:00 2001
From: Jeel Gajera
Date: Sun, 15 Oct 2023 12:03:17 +0530
Subject: [PATCH 01/21] feat: adding Apriori Algorithm

---
 DIRECTORY.md                          |   1 +
 machine_learning/apriori_algorithm.py | 114 ++++++++++++++++++++++++++
 2 files changed, 115 insertions(+)
 create mode 100644 machine_learning/apriori_algorithm.py

diff --git a/DIRECTORY.md b/DIRECTORY.md
index 2c6000c94ed4..ef526df7aa8c 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -546,6 +546,7 @@
   * [Word Frequency Functions](machine_learning/word_frequency_functions.py)
   * [Xgboost Classifier](machine_learning/xgboost_classifier.py)
   * [Xgboost Regressor](machine_learning/xgboost_regressor.py)
+  * [Apriori Algorithm](machine_learning/apriori_algorithm.py)

 ## Maths
   * [Abs](maths/abs.py)
diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
new file mode 100644
index 000000000000..504415a229fd
--- /dev/null
+++ b/machine_learning/apriori_algorithm.py
@@ -0,0 +1,114 @@
+"""
+Apriori is an association rule mining technique. Association rule mining, also known as market basket analysis, is a data mining technique that aims to discover interesting relationships or associations among a set of items in a transactional or relational database. It focuses on finding patterns and dependencies between items based on their co-occurrence in transactions.
+
+Association rule mining is commonly used in retail and e-commerce industries to uncover relationships between products that are frequently purchased together. The technique helps businesses understand customer behavior, improve marketing strategies, optimize product placement, and support decision-making processes.
+
+The output of association rule mining is typically represented in the form of "if-then" rules, known as association rules. These rules consist of an antecedent (a set of items) and a consequent (another item or set of items). The rules indicate the likelihood or support of the consequent item(s) appearing in transactions that contain the antecedent item(s). The strength of the association is measured by various metrics such as support, confidence, and lift.
+
+For example, the Apriori algorithm might state: "If a customer buys item A and item B, then they are likely to buy item C." This rule suggests a relationship between items A, B, and C, indicating that customers who purchased A and B are more likely to purchase item C as well.
+
+WIKI: https://en.wikipedia.org/wiki/Apriori_algorithm
+Examples: https://www.kaggle.com/code/earthian/apriori-association-rules-mining
+"""
+
+from typing import List, Tuple
+
+def load_data() -> List[List[str]]:
+    # Sample transaction dataset
+    data = [
+        ["milk", "bread"],
+        ["milk", "butter"],
+        ["milk", "bread", "nuts"],
+        ["milk", "bread", "chips"],
+        ["milk", "butter", "chips"],
+        ["milk", "bread", "butter", "cola"],
+        ["nuts", "bread", "butter", "cola"],
+        ["bread", "butter", "cola", "ice"],
+        ["bread", "butter", "cola", "ice", "bun"],
+    ]
+    return data
+
+def generate_candidates(itemset: List[str], length: int):
+    candidates = []
+    for i in range(len(itemset)):
+        for j in range(i + 1, len(itemset)):
+            # Create a candidate by taking the union of two lists
+            candidate = list(itemset[i]) + [item for item in itemset[j] if item not in itemset[i]]
+            if len(candidate) == length:
+                candidates.append(candidate)
+
+    return candidates
+
+
+def prune(itemset: List[str], candidates: List[List[str]], length: int) -> List[List[str]]:
+    # Prune candidate itemsets
+    """
+    The goal of pruning is to filter out candidate itemsets that are not frequent. This is done by checking if all the (k-1) subsets of a candidate itemset are present in the frequent itemsets of the previous iteration (valid subsequences of the frequent itemsets from the previous iteration).
+    """
+    pruned = []
+    for candidate in candidates:
+        is_subsequence = True
+        for item in candidate:
+            if item not in itemset or itemset.count(item) < length - 1:
+                is_subsequence = False
+                break
+        if is_subsequence:
+            pruned.append(candidate)
+    return pruned
+
+def apriori(data: List[List[str]], min_support: int) -> List[Tuple[List[str], int]]:
+    itemset = [set(transaction) for transaction in data]
+    frequent_itemsets = []
+    length = 1
+
+    while itemset:
+        # Count itemset support
+        counts = [0] * len(itemset)
+        for i, transaction in enumerate(data):
+            for j, item in enumerate(itemset):
+                if item.issubset(transaction): # using set for faster membership checking
+                    counts[j] += 1
+
+        # Prune infrequent itemsets
+        itemset = [item for i, item in enumerate(itemset) if counts[i] >= min_support]
+        
+        # Append frequent itemsets (as a list to maintain order)
+        for i, item in enumerate(itemset):
+            frequent_itemsets.append((list(item), counts[i]))
+
+        length += 1
+        candidates = generate_candidates(itemset, len(next(iter(itemset))) + 1)
+        itemset = prune(itemset, candidates, len(next(iter(itemset))) + 1)
+
+    return frequent_itemsets
+
+
+if __name__ == "__main__":
+    """
+    Apriori algorithm for finding frequent itemsets.
+
+    Args:
+        data (List[List[str]]): A list of transactions, where each transaction is a list of items.
+        min_support (int): The minimum support threshold for frequent itemsets.
+
+    Returns:
+        List[Tuple[List[str], int]]: A list of frequent itemsets along with their support counts.
+
+    Example:
+    >>> data = [["milk", "bread"], ["milk", "butter"], ["milk", "bread", "nuts"]]
+    >>> min_support = 2
+    >>> frequent_itemsets = apriori(data, min_support)
+    >>> frequent_itemsets
+    [(['milk'], 3), (['bread'], 3), (['butter'], 2), (['nuts'], 1), (['milk', 'bread'], 2)]
+
+    >>> data = [["apple", "banana", "cherry"], ["banana", "cherry"], ["apple", "banana"]]
+    >>> min_support = 2
+    >>> frequent_itemsets = apriori(data, min_support)
+    >>> frequent_itemsets
+    [(['apple'], 2), (['banana'], 3), (['cherry'], 2), (['apple', 'banana'], 2), (['banana', 'cherry'], 2)]
+    """
+    data = load_data()
+    min_support = 2 # user-defined threshold or minimum support level
+    frequent_itemsets = apriori(data, min_support)
+    for itemset, support in frequent_itemsets:
+        print(f"{itemset}: {support}")
\ No newline at end of file

From bc5d02bea83287ba5a9d0e85e0595b75bbf47713 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 15 Oct 2023 06:45:03 +0000
Subject: [PATCH 02/21] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/apriori_algorithm.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 504415a229fd..fb2a3b9ded50 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -13,6 +13,7 @@
 
 from typing import List, Tuple
 
+
 def load_data() -> List[List[str]]:
     # Sample transaction dataset
     data = [
@@ -28,19 +29,24 @@ def load_data() -> List[List[str]]:
     ]
     return data
 
+
 def generate_candidates(itemset: List[str], length: int):
     candidates = []
     for i in range(len(itemset)):
         for j in range(i + 1, len(itemset)):
             # Create a candidate by taking the union of two lists
-            candidate = list(itemset[i]) + [item for item in itemset[j] if item not in itemset[i]]
+            candidate = list(itemset[i]) + [
+                item for item in itemset[j] if item not in itemset[i]
+            ]
             if len(candidate) == length:
                 candidates.append(candidate)
 
     return candidates
 
 
-def prune(itemset: List[str], candidates: List[List[str]], length: int) -> List[List[str]]:
+def prune(
+    itemset: List[str], candidates: List[List[str]], length: int
+) -> List[List[str]]:
     # Prune candidate itemsets
     """
     The goal of pruning is to filter out candidate itemsets that are not frequent. This is done by checking if all the (k-1) subsets of a candidate itemset are present in the frequent itemsets of the previous iteration (valid subsequences of the frequent itemsets from the previous iteration).
@@ -56,6 +62,7 @@ def prune(itemset: List[str], candidates: List[List[str]], length: int) -> List[
             pruned.append(candidate)
     return pruned
 
+
 def apriori(data: List[List[str]], min_support: int) -> List[Tuple[List[str], int]]:
     itemset = [set(transaction) for transaction in data]
     frequent_itemsets = []
     length = 1
@@ -66,12 +73,14 @@ def apriori(data: List[List[str]], min_support: int) -> List[Tuple[List[str], in
         counts = [0] * len(itemset)
         for i, transaction in enumerate(data):
             for j, item in enumerate(itemset):
-                if item.issubset(transaction): # using set for faster membership checking
+                if item.issubset(
+                    transaction
+                ):  # using set for faster membership checking
                     counts[j] += 1
 
         # Prune infrequent itemsets
         itemset = [item for i, item in enumerate(itemset) if counts[i] >= min_support]
-        
+
         # Append frequent itemsets (as a list to maintain order)
         for i, item in enumerate(itemset):
             frequent_itemsets.append((list(item), counts[i]))
@@ -108,7 +117,7 @@ def apriori(data: List[List[str]], min_support: int) -> List[Tuple[List[str], in
     data = load_data()
-    min_support = 2 # user-defined threshold or minimum support level
+    min_support = 2  # user-defined threshold or minimum support level
     frequent_itemsets = apriori(data, min_support)
     for itemset, support in frequent_itemsets:
-        print(f"{itemset}: {support}")
\ No newline at end of file
+        print(f"{itemset}: {support}")

From 542a7206f62b43d6545f439045256c46eb763950 Mon Sep 17 00:00:00 2001
From: Jeel Gajera
Date: Sun, 15 Oct 2023 12:24:16 +0530
Subject: [PATCH 03/21] fix: doctest, typo

---
 machine_learning/apriori_algorithm.py | 42 ++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 504415a229fd..b67951c4cda4 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -14,6 +14,12 @@ from typing import List, Tuple
 
 def load_data() -> List[List[str]]:
+    """
+    Returns a sample transaction dataset.
+
+    >>> load_data()
+    [['milk', 'bread'], ['milk', 'butter'], ['milk', 'bread', 'nuts'], ['milk', 'bread', 'chips'], ['milk', 'butter', 'chips'], ['milk', 'bread', 'butter', 'cola'], ['nuts', 'bread', 'butter', 'cola'], ['bread', 'butter', 'cola', 'ice'], ['bread', 'butter', 'cola', 'ice', 'bun']]
+    """
     # Sample transaction dataset
     data = [
         ["milk", "bread"],
@@ -28,7 +34,18 @@ def load_data() -> List[List[str]]:
     ]
     return data
 
-def generate_candidates(itemset: List[str], length: int):
+def generate_candidates(itemset: List[str], length: int) -> List[List[str]]:
+    """
+    Generates candidate itemsets of size k from the given itemsets.
+
+    >>> itemsets = [['milk', 'bread'], ['milk', 'butter'], ['milk', 'bread', 'nuts']]
+    >>> generate_candidates(itemsets, 2)
+    [['milk', 'bread'], ['milk', 'butter'], ['bread', 'butter']]
+
+    >>> itemsets = [['milk', 'bread'], ['milk', 'butter'], ['bread', 'butter']]
+    >>> generate_candidates(itemsets, 3)
+    [['milk', 'bread', 'butter']]
+    """
     candidates = []
     for i in range(len(itemset)):
         for j in range(i + 1, len(itemset)):
@@ -44,6 +61,18 @@ def prune(itemset: List[str], candidates: List[List[str]], length: int) -> List[
     # Prune candidate itemsets
     """
     The goal of pruning is to filter out candidate itemsets that are not frequent. This is done by checking if all the (k-1) subsets of a candidate itemset are present in the frequent itemsets of the previous iteration (valid subsequences of the frequent itemsets from the previous iteration).
+
+    Prunes candidate itemsets that are not frequent.
+
+    >>> itemset = ['bread', 'butter', 'milk']
+    >>> candidates = [['bread', 'butter'], ['bread', 'milk'], ['butter', 'milk'], ['bread', 'butter', 'milk'], ['nuts', 'bread', 'butter']]
+    >>> prune(itemset, candidates, 3)
+    [['bread', 'butter', 'milk']]
+
+    >>> itemset = ['bread', 'butter', 'milk']
+    >>> candidates = [['bread', 'butter'], ['bread', 'milk'], ['butter', 'milk'], ['bread', 'butter', 'milk'], ['nuts', 'bread', 'butter']]
+    >>> prune(itemset, candidates, 2)
+    [['bread', 'butter'], ['bread', 'milk'], ['butter', 'milk'], ['nuts', 'bread', 'butter']]
     """
     pruned = []
     for candidate in candidates:
@@ -57,6 +86,17 @@ def prune(itemset: List[str], candidates: List[List[str]], length: int) -> List[
     return pruned
 
 def apriori(data: List[List[str]], min_support: int) -> List[Tuple[List[str], int]]:
+    """
+    Returns a list of frequent itemsets and their support counts.
+
+    >>> data = [['milk', 'bread'], ['milk', 'butter'], ['milk', 'bread', 'nuts'], ['milk', 'bread', 'chips'], ['milk', 'butter', 'chips'], ['milk', 'bread', 'butter', 'cola'], ['nuts', 'bread', 'butter', 'cola'], ['bread', 'butter', 'cola', 'ice'], ['bread', 'butter', 'cola', 'ice', 'bun']]
+    >>> apriori(data, 3)
+    [(['bread'], 7), (['butter'], 7), (['milk'], 8), (['cola', 'butter'], 3), (['bread', 'butter'], 4), (['bread', 'milk'], 4), (['butter', 'milk'], 4), (['bread', 'cola'], 3), (['milk', 'cola'], 3), (['bread', 'butter', 'milk'], 3), (['bread', 'milk', 'cola'], 3), (['butter', 'milk', 'cola'], 3), (['bread', 'butter', 'cola'], 3), (['bread', 'butter', 'milk', 'cola'], 3)]
+
+    >>> data = [['milk', 'bread'], ['milk', 'butter'], ['milk', 'bread', 'nuts'], ['milk', 'bread', 'chips'], ['milk', 'butter', 'chips'], ['milk', 'bread', 'butter', 'cola'], ['nuts', 'bread', 'butter', 'cola'], ['bread', 'butter', 'cola', 'ice'], ['bread', 'butter', 'cola', 'ice', 'bun']]
+    >>> apriori(data, 5)
+    [(['bread'], 7), (['butter'], 7), (['milk'], 8)]
+    """
     itemset = [set(transaction) for transaction in data]
     frequent_itemsets = []
     length = 1

From 6155bdf4f35d1d93e17870f1fb06a380c5c7dd6c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 15 Oct 2023 06:57:01 +0000
Subject: [PATCH 04/21] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/apriori_algorithm.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 4a04fd8abb64..5ca0743bf6a8 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -35,6 +35,7 @@ def load_data() -> List[List[str]]:
     ]
     return data
 
+
 def generate_candidates(itemset: List[str], length: int) -> List[List[str]]:
     """
     Generates candidate itemsets of size k from the given itemsets.
@@ -48,6 +49,7 @@ def generate_candidates(itemset: List[str], length: int) -> List[List[str]]:
     [['milk', 'bread', 'butter']]
     """
 
+
 def generate_candidates(itemset: List[str], length: int):
     candidates = []
     for i in range(len(itemset)):

From 12cb7a27913c2f8315000c1d65cee2e2eca61b6f Mon Sep 17 00:00:00 2001
From: Jeel Gajera
Date: Sun, 15 Oct 2023 12:42:06 +0530
Subject: [PATCH 05/21] fix: type error, code refactor

---
 machine_learning/apriori_algorithm.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 4a04fd8abb64..6c1b4e89f05b 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -11,10 +11,7 @@
 Examples: https://www.kaggle.com/code/earthian/apriori-association-rules-mining
 """
 
-from typing import List, Tuple
-
-
-def load_data() -> List[List[str]]:
+def load_data() -> list[list[str]]:
     """
     Returns a sample transaction dataset.
 
@@ -35,7 +32,7 @@ def load_data() -> List[List[str]]:
     ]
     return data
 
-def generate_candidates(itemset: List[str], length: int) -> List[List[str]]:
+def generate_candidates(itemset: list[str], length: int):
     """
     Generates candidate itemsets of size k from the given itemsets.
 
@@ -47,8 +44,6 @@ def generate_candidates(itemset: List[str], length: int) -> List[List[str]]:
     >>> generate_candidates(itemsets, 3)
     [['milk', 'bread', 'butter']]
     """
-
-def generate_candidates(itemset: List[str], length: int):
     candidates = []
     for i in range(len(itemset)):
         for j in range(i + 1, len(itemset)):
@@ -63,8 +58,8 @@ def generate_candidates(itemset: List[str], length: int):
 
 
 def prune(
-    itemset: List[str], candidates: List[List[str]], length: int
-) -> List[List[str]]:
+    itemset: list[str], candidates: list[list[str]], length: int
+) -> list[list[str]]:
     # Prune candidate itemsets
     """
     The goal of pruning is to filter out candidate itemsets that are not frequent. This is done by checking if all the (k-1) subsets of a candidate itemset are present in the frequent itemsets of the previous iteration (valid subsequences of the frequent itemsets from the previous iteration).
@@ -93,7 +88,7 @@ def prune(
     return pruned
 
-def apriori(data: List[List[str]], min_support: int) -> List[Tuple[List[str], int]]:
+def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], int]]:
     """
     Returns a list of frequent itemsets and their support counts.
 
@@ -138,11 +133,11 @@ def apriori(data: List[List[str]], min_support: int) -> List[Tuple[List[str], in
     Apriori algorithm for finding frequent itemsets.
 
     Args:
-        data (List[List[str]]): A list of transactions, where each transaction is a list of items.
+        data (list[list[str]]): A list of transactions, where each transaction is a list of items.
         min_support (int): The minimum support threshold for frequent itemsets.
 
     Returns:
-        List[Tuple[List[str], int]]: A list of frequent itemsets along with their support counts.
+        list[Tuple[list[str], int]]: A list of frequent itemsets along with their support counts.
 
     Example:
     >>> data = [["milk", "bread"], ["milk", "butter"], ["milk", "bread", "nuts"]]

From 2544fa8962df0575e39ef7f095ce6788b706b1c4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 15 Oct 2023 07:15:24 +0000
Subject: [PATCH 06/21] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/apriori_algorithm.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 168d722ed2c8..aa5d4dd0e20f 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -11,6 +11,7 @@
 Examples: https://www.kaggle.com/code/earthian/apriori-association-rules-mining
 """
 
+
 def load_data() -> list[list[str]]:
     """
     Returns a sample transaction dataset.
@@ -32,6 +33,7 @@ def load_data() -> list[list[str]]:
     ]
     return data
 
+
 def generate_candidates(itemset: list[str], length: int):
     """
     Generates candidate itemsets of size k from the given itemsets.
@@ -57,7 +59,9 @@ def generate_candidates(itemset: list[str], length: int):
     return candidates
 
 
-def prune(itemset: list[str], candidates: list[list[str]], length: int ) -> list[list[str]]:
+def prune(
+    itemset: list[str], candidates: list[list[str]], length: int
+) -> list[list[str]]:
     # Prune candidate itemsets
     """
     The goal of pruning is to filter out candidate itemsets that are not frequent. This is done by checking if all the (k-1) subsets of a candidate itemset are present in the frequent itemsets of the previous iteration (valid subsequences of the frequent itemsets from the previous iteration).
@@ -107,7 +111,9 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
         counts = [0] * len(itemset)
         for i, transaction in enumerate(data):
             for j, item in enumerate(itemset):
-                if item.issubset(transaction): # using set for faster membership checking
+                if item.issubset(
+                    transaction
+                ):  # using set for faster membership checking
                     counts[j] += 1
 
         # Prune infrequent itemsets
@@ -152,4 +158,4 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
     min_support = 2  # user-defined threshold or minimum support level
     frequent_itemsets = apriori(data, min_support)
     for itemset, support in frequent_itemsets:
-        print(f"{itemset}: {support}")
\ No newline at end of file
+        print(f"{itemset}: {support}")

From 6f2570e987b4c6cbeb9587d24e1d693a8698bb0f Mon Sep 17 00:00:00 2001
From: Jeel Gajera
Date: Sun, 15 Oct 2023 12:47:53 +0530
Subject: [PATCH 07/21] fix: refactor code

---
 machine_learning/apriori_algorithm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index aa5d4dd0e20f..26e82ef2950a 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -34,7 +34,7 @@ def load_data() -> list[list[str]]:
     return data
 
-def generate_candidates(itemset: list[str], length: int):
+def generate_candidates(itemset: list[str], length: int) -> list[list[str]]:
     """
     Generates candidate itemsets of size k from the given itemsets.
From 4481bed4477ab731f9a58019c52342885a818d23 Mon Sep 17 00:00:00 2001
From: Jeel Gajera
Date: Sun, 15 Oct 2023 14:41:16 +0530
Subject: [PATCH 08/21] fix: doctest

---
 machine_learning/apriori_algorithm.py | 69 +++++++++++----------------
 1 file changed, 27 insertions(+), 42 deletions(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 26e82ef2950a..0f135a0373c7 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -38,20 +38,16 @@ def generate_candidates(itemset: list[str], length: int) -> list[list[str]]:
     """
     Generates candidate itemsets of size k from the given itemsets.
 
-    >>> itemsets = [['milk', 'bread'], ['milk', 'butter'], ['milk', 'bread', 'nuts']]
+    >>> itemsets = ['A', 'B', 'C', 'D']
     >>> generate_candidates(itemsets, 2)
-    [['milk', 'bread'], ['milk', 'butter'], ['bread', 'butter']]
-
-    >>> itemsets = [['milk', 'bread'], ['milk', 'butter'], ['bread', 'butter']]
-    >>> generate_candidates(itemsets, 3)
-    [['milk', 'bread', 'butter']]
+    [['A', 'B'], ['A', 'C'], ['A', 'D'], ['B', 'C'], ['B', 'D'], ['C', 'D']]
     """
     candidates = []
     for i in range(len(itemset)):
         for j in range(i + 1, len(itemset)):
             # Create a candidate by taking the union of two lists
             candidate = list(itemset[i]) + [
-                item for item in itemset[j] if item not in itemset[i]
+                item for item in list(itemset[j]) if item not in list(itemset[i])
             ]
             if len(candidate) == length:
                 candidates.append(candidate)
@@ -68,15 +64,15 @@ def prune(
 
     Prunes candidate itemsets that are not frequent.
 
-    >>> itemset = ['bread', 'butter', 'milk']
-    >>> candidates = [['bread', 'butter'], ['bread', 'milk'], ['butter', 'milk'], ['bread', 'butter', 'milk'], ['nuts', 'bread', 'butter']]
-    >>> prune(itemset, candidates, 3)
-    [['bread', 'butter', 'milk']]
-
-    >>> itemset = ['bread', 'butter', 'milk']
-    >>> candidates = [['bread', 'butter'], ['bread', 'milk'], ['butter', 'milk'], ['bread', 'butter', 'milk'], ['nuts', 'bread', 'butter']]
+    >>> itemset = ['X', 'Y', 'Z']
+    >>> candidates = [['X', 'Y'], ['X', 'Z'], ['Y', 'Z']]
     >>> prune(itemset, candidates, 2)
-    [['bread', 'butter'], ['bread', 'milk'], ['butter', 'milk'], ['nuts', 'bread', 'butter']]
+    [['X', 'Y'], ['X', 'Z'], ['Y', 'Z']]
+
+    >>> itemset = ['1', '2', '3', '4']
+    >>> candidates = ['1', '2', '4']
+    >>> prune(itemset, candidates, 3)
+    []
     """
     pruned = []
     for candidate in candidates:
@@ -94,13 +90,13 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
     """
     Returns a list of frequent itemsets and their support counts.
 
-    >>> data = [['milk', 'bread'], ['milk', 'butter'], ['milk', 'bread', 'nuts'], ['milk', 'bread', 'chips'], ['milk', 'butter', 'chips'], ['milk', 'bread', 'butter', 'cola'], ['nuts', 'bread', 'butter', 'cola'], ['bread', 'butter', 'cola', 'ice'], ['bread', 'butter', 'cola', 'ice', 'bun']]
-    >>> apriori(data, 3)
-    [(['bread'], 7), (['butter'], 7), (['milk'], 8), (['cola', 'butter'], 3), (['bread', 'butter'], 4), (['bread', 'milk'], 4), (['butter', 'milk'], 4), (['bread', 'cola'], 3), (['milk', 'cola'], 3), (['bread', 'butter', 'milk'], 3), (['bread', 'milk', 'cola'], 3), (['butter', 'milk', 'cola'], 3), (['bread', 'butter', 'cola'], 3), (['bread', 'butter', 'milk', 'cola'], 3)]
+    >>> data = [['A', 'B', 'C'], ['A', 'B'], ['A', 'C'], ['A', 'D'], ['B', 'C']]
+    >>> apriori(data, 2)
+    [(['A', 'B'], 1), (['A', 'C'], 2), (['B', 'C'], 2)]
 
-    >>> data = [['milk', 'bread'], ['milk', 'butter'], ['milk', 'bread', 'nuts'], ['milk', 'bread', 'chips'], ['milk', 'butter', 'chips'], ['milk', 'bread', 'butter', 'cola'], ['nuts', 'bread', 'butter', 'cola'], ['bread', 'butter', 'cola', 'ice'], ['bread', 'butter', 'cola', 'ice', 'bun']]
-    >>> apriori(data, 5)
-    [(['bread'], 7), (['butter'], 7), (['milk'], 8)]
+    >>> data = [['1', '2', '3'], ['1', '2'], ['1', '3'], ['1', '4'], ['2', '3']]
+    >>> apriori(data, 3)
+    []
     """
     itemset = [set(transaction) for transaction in data]
     frequent_itemsets = []
@@ -110,10 +106,8 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
         counts = [0] * len(itemset)
         for i, transaction in enumerate(data):
-            for j, item in enumerate(itemset):
-                if item.issubset(
-                    transaction
-                ):  # using set for faster membership checking
+            for j, candidate in enumerate(itemset):
+                if all(item in transaction for item in candidate):
                     counts[j] += 1
 
         # Prune infrequent itemsets
@@ -121,11 +115,11 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
         for i, item in enumerate(itemset):
-            frequent_itemsets.append((list(item), counts[i]))
+            frequent_itemsets.append((sorted(item), counts[i]))
 
         length += 1
-        candidates = generate_candidates(itemset, len(next(iter(itemset))) + 1)
-        itemset = prune(itemset, candidates, len(next(iter(itemset))) + 1)
+        candidates = generate_candidates(itemset, length)
+        itemset = prune(itemset, candidates, length)
 
     return frequent_itemsets
@@ -140,22 +134,13 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
 
     Returns:
         list[Tuple[list[str], int]]: A list of frequent itemsets along with their support counts.
-
-    Example:
-    >>> data = [["milk", "bread"], ["milk", "butter"], ["milk", "bread", "nuts"]]
-    >>> min_support = 2
-    >>> frequent_itemsets = apriori(data, min_support)
-    >>> frequent_itemsets
-    [(['milk'], 3), (['bread'], 3), (['butter'], 2), (['nuts'], 1), (['milk', 'bread'], 2)]
-
-    >>> data = [["apple", "banana", "cherry"], ["banana", "cherry"], ["apple", "banana"]]
-    >>> min_support = 2
-    >>> frequent_itemsets = apriori(data, min_support)
-    >>> frequent_itemsets
-    [(['apple'], 2), (['banana'], 3), (['cherry'], 2), (['apple', 'banana'], 2), (['banana', 'cherry'], 2)]
     """
+    import doctest
+
+    doctest.testmod()
+
     data = load_data()
     min_support = 2  # user-defined threshold or minimum support level
     frequent_itemsets = apriori(data, min_support)
     for itemset, support in frequent_itemsets:
-        print(f"{itemset}: {support}")
+        print(f"{itemset}: {support}")
\ No newline at end of file

From 61d696e7c2922a1e49cefa96ee8e45ee36f8a513 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 15 Oct 2023 09:11:58 +0000
Subject: [PATCH 09/21] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/apriori_algorithm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 0f135a0373c7..0233da83d0a4 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -143,4 +143,4 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
     min_support = 2  # user-defined threshold or minimum support level
     frequent_itemsets = apriori(data, min_support)
     for itemset, support in frequent_itemsets:
-        print(f"{itemset}: {support}")
\ No newline at end of file
+        print(f"{itemset}: {support}")

From 66b3dc86d1f76ceddec7be448380a50b48289ec0 Mon Sep 17 00:00:00 2001
From: Jeel Gajera
Date: Sun, 15 Oct 2023 15:03:41 +0530
Subject: [PATCH 10/21] fix: E501, B007

---
 machine_learning/apriori_algorithm.py | 36 +++++++++++++++++----------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 0233da83d0a4..00269d13a13e 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -1,11 +1,15 @@
 """
-Apriori is an association rule mining technique. Association rule mining, also known as market basket analysis, is a data mining technique that aims to discover interesting relationships or associations among a set of items in a transactional or relational database. It focuses on finding patterns and dependencies between items based on their co-occurrence in transactions.
-
-Association rule mining is commonly used in retail and e-commerce industries to uncover relationships between products that are frequently purchased together. The technique helps businesses understand customer behavior, improve marketing strategies, optimize product placement, and support decision-making processes.
-
-The output of association rule mining is typically represented in the form of "if-then" rules, known as association rules. These rules consist of an antecedent (a set of items) and a consequent (another item or set of items). The rules indicate the likelihood or support of the consequent item(s) appearing in transactions that contain the antecedent item(s). The strength of the association is measured by various metrics such as support, confidence, and lift.
-For example, the Apriori algorithm might state: "If a customer buys item A and item B, then they are likely to buy item C." This rule suggests a relationship between items A, B, and C, indicating that customers who purchased A and B are more likely to purchase item C as well.
+The Apriori algorithm is an association rule mining technique,
+also known as market basket analysis,
+that aims to discover interesting relationships or associations
+among a set of items in a transactional or relational database.
+
+For example, the Apriori algorithm might state:
+"If a customer buys item A and item B,
+then they are likely to buy item C."
+This rule suggests a relationship between items A, B, and C,
+indicating that customers who purchased A and B are more
+likely to purchase item C as well.
 
 WIKI: https://en.wikipedia.org/wiki/Apriori_algorithm
 Examples: https://www.kaggle.com/code/earthian/apriori-association-rules-mining
 """
@@ -17,7 +21,10 @@ def load_data() -> list[list[str]]:
     Returns a sample transaction dataset.
 
     >>> load_data()
-    [['milk', 'bread'], ['milk', 'butter'], ['milk', 'bread', 'nuts'], ['milk', 'bread', 'chips'], ['milk', 'butter', 'chips'], ['milk', 'bread', 'butter', 'cola'], ['nuts', 'bread', 'butter', 'cola'], ['bread', 'butter', 'cola', 'ice'], ['bread', 'butter', 'cola', 'ice', 'bun']]
+    [['milk', 'bread'], ['milk', 'butter'], ['milk', 'bread', 'nuts'],
+    ['milk', 'bread', 'chips'], ['milk', 'butter', 'chips'],
+    ['milk', 'bread', 'butter', 'cola'], ['nuts', 'bread', 'butter', 'cola'],
+    ['bread', 'butter', 'cola', 'ice'], ['bread', 'butter', 'cola', 'ice', 'bun']]
     """
     # Sample transaction dataset
     data = [
@@ -60,7 +67,10 @@ def prune(
 ) -> list[list[str]]:
     # Prune candidate itemsets
     """
-    The goal of pruning is to filter out candidate itemsets that are not frequent. This is done by checking if all the (k-1) subsets of a candidate itemset are present in the frequent itemsets of the previous iteration (valid subsequences of the frequent itemsets from the previous iteration).
+    The goal of pruning is to filter out candidate itemsets that are not frequent.
+    This is done by checking if all the (k-1) subsets of a candidate itemset
+    are present in the frequent itemsets of the previous iteration
+    (valid subsequences of the frequent itemsets from the previous iteration).
 
     Prunes candidate itemsets that are not frequent.
 
@@ -105,7 +115,7 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
     while itemset:
         # Count itemset support
         counts = [0] * len(itemset)
-        for i, transaction in enumerate(data):
+        for transaction in data:
             for j, candidate in enumerate(itemset):
                 if all(item in transaction for item in candidate):
                     counts[j] += 1
@@ -129,11 +139,11 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
     Apriori algorithm for finding frequent itemsets.
 
     Args:
-        data (list[list[str]]): A list of transactions, where each transaction is a list of items.
-        min_support (int): The minimum support threshold for frequent itemsets.
+        data: A list of transactions, where each transaction is a list of items.
+        min_support: The minimum support threshold for frequent itemsets.
 
     Returns:
-        list[Tuple[list[str], int]]: A list of frequent itemsets along with their support counts.
+        A list of frequent itemsets along with their support counts.
""" import doctest From 95d4ca8da8680aaf79c72b736b989d19b7852d7f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 15 Oct 2023 09:34:22 +0000 Subject: [PATCH 11/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/apriori_algorithm.py | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py index 00269d13a13e..090a83086f87 100644 --- a/machine_learning/apriori_algorithm.py +++ b/machine_learning/apriori_algorithm.py @@ -1,14 +1,14 @@ """ -Apriori Algorithm is a Association rule mining technique, -also known as market basket analysis, -aims to discover interesting relationships or associations -among a set of items in a transactional or relational database. - -For example, Apriori Algorithm state: -"If a customer buys item A and item B, -then they are likely to buy item C." -This rule suggests a relationship between items A, B, and C, -indicating that customers who purchased A and B are more +Apriori Algorithm is a Association rule mining technique, +also known as market basket analysis, +aims to discover interesting relationships or associations +among a set of items in a transactional or relational database. + +For example, Apriori Algorithm state: +"If a customer buys item A and item B, +then they are likely to buy item C." +This rule suggests a relationship between items A, B, and C, +indicating that customers who purchased A and B are more likely to purchase item C as well. WIKI: https://en.wikipedia.org/wiki/Apriori_algorithm @@ -21,8 +21,8 @@ def load_data() -> list[list[str]]: Returns a sample transaction dataset. >>> load_data() - [['milk', 'bread'], ['milk', 'butter'], ['milk', 'bread', 'nuts'], - ['milk', 'bread', 'chips'], ['milk', 'butter', 'chips'], + [['milk', 'bread'], ['milk', 'butter'], ['milk', 'bread', 'nuts'], + ['milk', 'bread', 'chips'], ['milk', 'butter', 'chips'], ['milk', 'bread', 'butter', 'cola'], ['nuts', 'bread', 'butter', 'cola'], ['bread', 'butter', 'cola', 'ice'], ['bread', 'butter', 'cola', 'ice', 'bun']] """ @@ -68,8 +68,8 @@ def prune( # Prune candidate itemsets """ The goal of pruning is to filter out candidate itemsets that are not frequent. - This is done by checking if all the (k-1) subsets of a candidate itemset - are present in the frequent itemsets of the previous iteration + This is done by checking if all the (k-1) subsets of a candidate itemset + are present in the frequent itemsets of the previous iteration (valid subsequences of the frequent itemsets from the previous iteration). Prunes candidate itemsets that are not frequent. From 43226258dec441725c2bf6b4056427fc87c74c24 Mon Sep 17 00:00:00 2001 From: Jeel Gajera Date: Sun, 15 Oct 2023 15:18:25 +0530 Subject: [PATCH 12/21] fix: err --- machine_learning/apriori_algorithm.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py index 090a83086f87..3c8f1979c165 100644 --- a/machine_learning/apriori_algorithm.py +++ b/machine_learning/apriori_algorithm.py @@ -21,22 +21,14 @@ def load_data() -> list[list[str]]: Returns a sample transaction dataset. 
     >>> load_data()
-    [['milk', 'bread'], ['milk', 'butter'], ['milk', 'bread', 'nuts'],
-    ['milk', 'bread', 'chips'], ['milk', 'butter', 'chips'],
-    ['milk', 'bread', 'butter', 'cola'], ['nuts', 'bread', 'butter', 'cola'],
-    ['bread', 'butter', 'cola', 'ice'], ['bread', 'butter', 'cola', 'ice', 'bun']]
+    [['milk'], ['milk', 'butter'], ['milk', 'bread', 'nuts'], ['milk', 'bread', 'chips']]
     """
     # Sample transaction dataset
     data = [
-        ["milk", "bread"],
+        ["milk"],
         ["milk", "butter"],
         ["milk", "bread", "nuts"],
-        ["milk", "bread", "chips"],
-        ["milk", "butter", "chips"],
-        ["milk", "bread", "butter", "cola"],
-        ["nuts", "bread", "butter", "cola"],
-        ["bread", "butter", "cola", "ice"],
-        ["bread", "butter", "cola", "ice", "bun"],
+        ["milk", "bread", "chips"]
     ]
     return data
@@ -100,7 +100,7 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
     >>> apriori(data, 3)
     []
     """
-    itemset = [set(transaction) for transaction in data]
+    itemset = [list(transaction) for transaction in data]
     frequent_itemsets = []
     length = 1

From 739429c5291925ded2a4be1975ed4cf1070f261d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 15 Oct 2023 09:49:47 +0000
Subject: [PATCH 13/21] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/apriori_algorithm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 3c8f1979c165..17c7ff9e2947 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -28,7 +28,7 @@ def load_data() -> list[list[str]]:
         ["milk"],
         ["milk", "butter"],
         ["milk", "bread", "nuts"],
-        ["milk", "bread", "chips"]
+        ["milk", "bread", "chips"],
     ]
     return data

From d0dc6a10082bfc66534505ec4df56057441a7ed7 Mon Sep 17 00:00:00 2001
From: Jeel Gajera
Date: Sun, 15 Oct 2023 15:30:14 +0530
Subject: [PATCH 14/21] fix: arg type error

---
 machine_learning/apriori_algorithm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 3c8f1979c165..6f46f2d8f389 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -21,13 +21,13 @@ def load_data() -> list[list[str]]:
     Returns a sample transaction dataset.
     >>> load_data()
-    [['milk'], ['milk', 'butter'], ['milk', 'bread', 'nuts'], ['milk', 'bread', 'chips']]
+    [['milk'], ['milk', 'butter'], ['milk', 'bread'], ['milk', 'bread', 'chips']]
     """
     # Sample transaction dataset
     data = [
         ["milk"],
         ["milk", "butter"],
-        ["milk", "bread", "nuts"],
+        ["milk", "bread"],
         ["milk", "bread", "chips"]
     ]
     return data
@@ -100,7 +100,7 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
     >>> apriori(data, 3)
     []
     """
-    itemset = [list(transaction) for transaction in data]
+    itemset = [transaction for transaction in data]
     frequent_itemsets = []
     length = 1

From 6fdf780eccfb359cf033e9aa841f27f247bfba08 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 15 Oct 2023 10:02:20 +0000
Subject: [PATCH 15/21] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/apriori_algorithm.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 6f46f2d8f389..f3b637ed526e 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -24,12 +24,7 @@ def load_data() -> list[list[str]]:
     [['milk'], ['milk', 'butter'], ['milk', 'bread'], ['milk', 'bread', 'chips']]
     """
     # Sample transaction dataset
-    data = [
-        ["milk"],
-        ["milk", "butter"],
-        ["milk", "bread"],
-        ["milk", "bread", "chips"]
-    ]
+    data = [["milk"], ["milk", "butter"], ["milk", "bread"], ["milk", "bread", "chips"]]
     return data

From ae92f9062e9ce68a0653fcdf5f939d0d6033f04c Mon Sep 17 00:00:00 2001
From: Jeel Gajera
Date: Sun, 15 Oct 2023 15:44:31 +0530
Subject: [PATCH 16/21] fix: typo

---
 machine_learning/apriori_algorithm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index f3b637ed526e..635dc484f29b 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -41,7 +41,7 @@ def generate_candidates(itemset: list[str], length: int) -> list[list[str]]:
         for j in range(i + 1, len(itemset)):
             # Create a candidate by taking the union of two lists
             candidate = list(itemset[i]) + [
-                item for item in list(itemset[j]) if item not in list(itemset[i])
+                item for item in itemset[j] if item not in itemset[i]
             ]
             if len(candidate) == length:
                 candidates.append(candidate)
@@ -95,7 +95,7 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
     >>> apriori(data, 3)
     []
     """
-    itemset = [transaction for transaction in data]
+    itemset = [list(transaction) for transaction in data]
    frequent_itemsets = []
     length = 1

From 89364ee00eeb03050cc1c8a8b757160ea5930d91 Mon Sep 17 00:00:00 2001
From: Jeel Gajera
Date: Sun, 15 Oct 2023 15:54:32 +0530
Subject: [PATCH 17/21] fix: typo

---
 machine_learning/apriori_algorithm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 635dc484f29b..4e6ef30db942 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -28,7 +28,7 @@ def load_data() -> list[list[str]]:
     return data
 
-def generate_candidates(itemset: list[str], length: int) -> list[list[str]]:
+def generate_candidates(itemset: list, length: int) -> list:
     """
     Generates candidate itemsets of size k from the given itemsets.
@@ -50,8 +50,8 @@ def generate_candidates(itemset: list, length: int) -> list:
 
 def prune(
-    itemset: list[str], candidates: list[list[str]], length: int
-) -> list[list[str]]:
+    itemset: list, candidates: list, length: int
+) -> list:
     # Prune candidate itemsets
     """
     The goal of pruning is to filter out candidate itemsets that are not frequent.

From 3b0152a0e5977993cf3386adf6da6b06d3eb6f5a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 15 Oct 2023 10:25:13 +0000
Subject: [PATCH 18/21] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/apriori_algorithm.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 4e6ef30db942..21ce905520b6 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -49,9 +49,7 @@ def generate_candidates(itemset: list, length: int) -> list:
     return candidates
 
-def prune(
-    itemset: list, candidates: list, length: int
-) -> list:
+def prune(itemset: list, candidates: list, length: int) -> list:
     # Prune candidate itemsets
     """
     The goal of pruning is to filter out candidate itemsets that are not frequent.

From 9aad8da7c6a3f8fa0e214970ac2f53b3c74ab9bf Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Sun, 15 Oct 2023 23:24:05 +0200
Subject: [PATCH 19/21] Replace generate_candidates() with itertools.combinations()

---
 machine_learning/apriori_algorithm.py | 65 ++++++++-------------------
 1 file changed, 18 insertions(+), 47 deletions(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 21ce905520b6..44e2908d62ff 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -1,19 +1,16 @@
 """
-The Apriori algorithm is an association rule mining technique,
-also known as market basket analysis,
-that aims to discover interesting relationships or associations
-among a set of items in a transactional or relational database.
-
-For example, the Apriori algorithm might state:
-"If a customer buys item A and item B,
-then they are likely to buy item C."
-This rule suggests a relationship between items A, B, and C,
-indicating that customers who purchased A and B are more
-likely to purchase item C as well.
+The Apriori algorithm is an association rule mining technique, also known as market
+basket analysis, that aims to discover interesting relationships or associations
+among a set of items in a transactional or relational database.
+
+For example, the Apriori algorithm states: "If a customer buys item A and item B, then
+they are likely to buy item C." This rule suggests a relationship between items A, B,
+and C: customers who purchased A and B are more likely to also purchase item C.
 
 WIKI: https://en.wikipedia.org/wiki/Apriori_algorithm
 Examples: https://www.kaggle.com/code/earthian/apriori-association-rules-mining
 """
+from itertools import combinations
 
 def load_data() -> list[list[str]]:
@@ -23,39 +20,16 @@ def load_data() -> list[list[str]]:
 
     >>> load_data()
     [['milk'], ['milk', 'butter'], ['milk', 'bread'], ['milk', 'bread', 'chips']]
     """
-    # Sample transaction dataset
-    data = [["milk"], ["milk", "butter"], ["milk", "bread"], ["milk", "bread", "chips"]]
-    return data
-
-
-def generate_candidates(itemset: list, length: int) -> list:
-    """
-    Generates candidate itemsets of size k from the given itemsets.
-
-    >>> itemsets = ['A', 'B', 'C', 'D']
-    >>> generate_candidates(itemsets, 2)
-    [['A', 'B'], ['A', 'C'], ['A', 'D'], ['B', 'C'], ['B', 'D'], ['C', 'D']]
-    """
-    candidates = []
-    for i in range(len(itemset)):
-        for j in range(i + 1, len(itemset)):
-            # Create a candidate by taking the union of two lists
-            candidate = list(itemset[i]) + [
-                item for item in itemset[j] if item not in itemset[i]
-            ]
-            if len(candidate) == length:
-                candidates.append(candidate)
-
-    return candidates
+    return [["milk"], ["milk", "butter"], ["milk", "bread"], ["milk", "bread", "chips"]]
 
 def prune(itemset: list, candidates: list, length: int) -> list:
-    # Prune candidate itemsets
     """
-    The goal of pruning is to filter out candidate itemsets that are not frequent.
-    This is done by checking if all the (k-1) subsets of a candidate itemset
-    are present in the frequent itemsets of the previous iteration
-    (valid subsequences of the frequent itemsets from the previous iteration).
+    Prune candidate itemsets that are not frequent.
+    The goal of pruning is to filter out candidate itemsets that are not frequent. This
+    is done by checking if all the (k-1) subsets of a candidate itemset are present in
+    the frequent itemsets of the previous iteration (valid subsequences of the frequent
+    itemsets from the previous iteration).
 
     Prunes candidate itemsets that are not frequent.
@@ -113,8 +87,7 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
         frequent_itemsets.append((sorted(item), counts[i]))
 
         length += 1
-        candidates = generate_candidates(itemset, length)
-        itemset = prune(itemset, candidates, length)
+        itemset = prune(itemset, combinations(itemset, length), length)
 
     return frequent_itemsets
@@ -134,8 +107,6 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
     doctest.testmod()
 
-    data = load_data()
-    min_support = 2  # user-defined threshold or minimum support level
-    frequent_itemsets = apriori(data, min_support)
-    for itemset, support in frequent_itemsets:
-        print(f"{itemset}: {support}")
+    # user-defined threshold or minimum support level
+    frequent_itemsets = apriori(data=load_data(), min_support=2)
+    print("\n".join(f"{itemset}: {support}" for itemset, support in frequent_itemsets))

From 571c0230db15c294c13ad4c30afdffcbe0f9d959 Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Sun, 15 Oct 2023 23:36:08 +0200
Subject: [PATCH 20/21] mypy

---
 machine_learning/apriori_algorithm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index 44e2908d62ff..ff2fab34209c 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -23,7 +23,7 @@ def load_data() -> list[list[str]]:
     return [["milk"], ["milk", "butter"], ["milk", "bread"], ["milk", "bread", "chips"]]
 
-def prune(itemset: list, candidates: list, length: int) -> list:
+def prune(itemset: list, candidates: tuple[list[str]], length: int) -> list:
     """
     Prune candidate itemsets that are not frequent.
     The goal of pruning is to filter out candidate itemsets that are not frequent. This

From f916d8eddece35f890bd1abd06bc7ac230e380a9 Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Sun, 15 Oct 2023 23:40:57 +0200
Subject: [PATCH 21/21] Update apriori_algorithm.py

---
 machine_learning/apriori_algorithm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/machine_learning/apriori_algorithm.py b/machine_learning/apriori_algorithm.py
index ff2fab34209c..d9fd1f82ea3c 100644
--- a/machine_learning/apriori_algorithm.py
+++ b/machine_learning/apriori_algorithm.py
@@ -23,7 +23,7 @@ def load_data() -> list[list[str]]:
     return [["milk"], ["milk", "butter"], ["milk", "bread"], ["milk", "bread", "chips"]]
 
-def prune(itemset: list, candidates: tuple[list[str]], length: int) -> list:
+def prune(itemset: list, candidates: list, length: int) -> list:
     """
     Prune candidate itemsets that are not frequent.
     The goal of pruning is to filter out candidate itemsets that are not frequent. This
@@ -87,7 +87,7 @@ def apriori(data: list[list[str]], min_support: int) -> list[tuple[list[str], in
         frequent_itemsets.append((sorted(item), counts[i]))
 
         length += 1
-        itemset = prune(itemset, combinations(itemset, length), length)
+        itemset = prune(itemset, list(combinations(itemset, length)), length)