"""
Cosine similarity between two texts, based on spaCy word vectors.

Each text is tokenized, every token is mapped to its spaCy vector, the vectors
of a text are averaged into one "mean vector", and the cosine of the angle
between the two mean vectors is reported as a percentage.

The default spaCy model must be installed before the class can be
instantiated, e.g. ``python -m spacy download en_core_web_md``.
"""

import logging

import numpy as np

logger = logging.getLogger(__name__)


class CosineSimilarity:
    """
    Cosine Similarity Algorithm

    Use Case:
    - Cosine similarity measures the cosine of the angle between two non-zero
      vectors in a multi-dimensional space.
    - It is used here to estimate how similar two texts are, based on the mean
      of their token vectors.
    - The score lies in [-1, 1]: 1 means the vectors point in the same
      direction, 0 means they are orthogonal (no similarity) and -1 means they
      point in opposite directions.

    Dependencies:
    - spacy: used for tokenization and vectorization (only needed when the
      class is instantiated).
    - numpy: used for the numerical computations.

    The purely numerical helpers are static methods, so they can be used
    without loading a spaCy model:

    >>> CosineSimilarity.dot_product(np.array([1, 2, 3]), np.array([4, 5, 6]))
    32.0
    >>> CosineSimilarity.magnitude(np.array([3.0, 4.0]))
    5.0
    >>> CosineSimilarity.cosine_similarity(np.array([1, 0]), np.array([0, 1]))
    0.0
    """

    def __init__(self, model_name: str = "en_core_web_md") -> None:
        """
        Load the spaCy pipeline used for tokenization and vectorization.

        Parameters:
        - model_name (str): Name of the installed spaCy model to load.
          Defaults to "en_core_web_md", the model this module was written for.
        """
        # Imported lazily so that importing this module (and using the static
        # numerical helpers) does not require spaCy to be installed.
        import spacy

        self.nlp = spacy.load(model_name)

    def tokenize(self, text: str) -> list:
        """
        Tokenize the input text into a list of lowercased tokens.

        Punctuation tokens are dropped.

        Parameters:
        - text (str): The input text to be tokenized.

        Returns:
        - list: The lowercased token strings, in document order.
        """
        return [token.text.lower() for token in self.nlp(text) if not token.is_punct]

    def vectorize(self, tokens: list) -> list:
        """
        Convert tokens into their corresponding vector representations.

        Tokens whose vector is all zeros are skipped so that they do not drag
        the mean vector towards the origin.

        Parameters:
        - tokens (list): A list of token strings to be vectorized.

        Returns:
        - list: The non-zero vectors, in the order of their tokens.
        """
        # Run the pipeline exactly once per token; the result is used both for
        # the zero-vector filter and as the returned value.
        candidates = (self.nlp(token).vector for token in tokens)
        return [vector for vector in candidates if vector.any()]

    def mean_vector(self, vectors: list) -> np.ndarray:
        """
        Compute the element-wise mean of a collection of vectors.

        Parameters:
        - vectors (list): The vectors to be averaged (a list of 1-D arrays or
          a 2-D array with one vector per row).

        Returns:
        - np.ndarray: The mean vector, or a zero vector of the model's vector
          length when `vectors` is empty.
        """
        # len() instead of truthiness: `not vectors` raises for a 2-D ndarray.
        if len(vectors) == 0:
            return np.zeros(self.nlp.vocab.vectors_length)
        return np.mean(vectors, axis=0)

    @staticmethod
    def dot_product(vector1: np.ndarray, vector2: np.ndarray) -> float:
        """
        Compute the dot product of two vectors.

        Parameters:
        - vector1 (np.ndarray): The first vector.
        - vector2 (np.ndarray): The second vector.

        Returns:
        - float: The dot product of the two vectors.

        Raises:
        - ValueError: If the vectors do not have the same length.
        """
        return float(np.dot(vector1, vector2))

    @staticmethod
    def magnitude(vector: np.ndarray) -> float:
        """
        Compute the magnitude (Euclidean norm) of a vector.

        Parameters:
        - vector (np.ndarray): The vector whose magnitude is to be calculated.

        Returns:
        - float: The magnitude of the vector.
        """
        return float(np.linalg.norm(vector))

    @staticmethod
    def cosine_similarity(vector1: np.ndarray, vector2: np.ndarray) -> float:
        """
        Compute the cosine similarity between two vectors.

        Parameters:
        - vector1 (np.ndarray): The first vector.
        - vector2 (np.ndarray): The second vector.

        Returns:
        - float: The cosine similarity in [-1.0, 1.0]; 0.0 if either vector
          has zero magnitude (the angle is undefined in that case).
        """
        magnitude1 = CosineSimilarity.magnitude(vector1)
        magnitude2 = CosineSimilarity.magnitude(vector2)
        if magnitude1 == 0 or magnitude2 == 0:
            return 0.0
        similarity = CosineSimilarity.dot_product(vector1, vector2) / (
            magnitude1 * magnitude2
        )
        # Rounding errors can push the quotient marginally outside [-1, 1].
        return max(-1.0, min(1.0, similarity))

    def cosine_similarity_percentage(self, text1: str, text2: str) -> float:
        """
        Compute the cosine similarity percentage between two texts.

        Parameters:
        - text1 (str): The first text.
        - text2 (str): The second text.

        Returns:
        - float: The cosine similarity of the texts' mean vectors, multiplied
          by 100.

        Raises:
        - Exception: Any error raised by the pipeline is logged once here and
          re-raised unchanged.
        """
        try:
            mean_vec1 = self.mean_vector(self.vectorize(self.tokenize(text1)))
            mean_vec2 = self.mean_vector(self.vectorize(self.tokenize(text2)))
            return self.cosine_similarity(mean_vec1, mean_vec2) * 100
        except Exception:
            # Log at the pipeline entry point only, so a failure produces a
            # single traceback in the log instead of one per nested helper.
            logger.exception(
                "An error occurred while computing the Cosine Similarity Percentage"
            )
            raise


def main() -> None:
    """Print the cosine similarity between two sample texts."""
    text1 = "The biggest Infrastructure in the World is Burj Khalifa"
    text2 = "The name of the tallest Tower in the world is Burj Khalifa"

    similarity_percentage = CosineSimilarity().cosine_similarity_percentage(
        text1, text2
    )
    print(f"Cosine Similarity: {similarity_percentage:.2f}%")


if __name__ == "__main__":
    main()