Skip to content

Commit 0af293b

Browse files
committed
Cosine Similarity Algorithm | Machine Learning
1 parent bd8085c commit 0af293b

File tree

1 file changed

+167
-0
lines changed

1 file changed

+167
-0
lines changed

machine_learning/cosine_similarity.py

+167
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
import spacy
2+
import logging
3+
import numpy as np
4+
5+
class Cosine_Similarity:
    """
    Cosine Similarity Algorithm

    Use Case:
    - The Cosine Similarity Algorithm measures the cosine of the angle between
      two non-zero vectors in a multi-dimensional space.
    - It is used here to estimate how similar two texts are: each text is
      tokenized, every token is mapped to a vector, the vectors are averaged,
      and the two mean vectors are compared.
    - The score ranges from -1 (opposite direction) to 1 (same direction),
      with 0 meaning orthogonal. 0 is also returned when either vector has
      zero magnitude, because the angle is undefined in that case.

    Dependencies:
    - spacy: used for tokenization and vectorization.
    - numpy: used for the numerical computations.
    """

    def __init__(self, model_name: str = 'en_core_web_md') -> None:
        """
        Initializes the Cosine Similarity class by loading a SpaCy model.

        Parameters:
        - model_name (str): Name of the SpaCy pipeline to load. Defaults to
          'en_core_web_md', the model this class originally hard-coded.
        """
        self.nlp = spacy.load(model_name)

    def Tokenize(self, text: str) -> list:
        """
        Tokenizes the input text into a list of lowercased tokens.

        Tokens flagged as punctuation by the pipeline are dropped.

        Parameters:
        - text (str): The input text to be tokenized.

        Returns:
        - list: A list of lowercased token strings.

        Raises:
        - Exception: Any error raised by the pipeline is logged and re-raised.
        """
        try:
            doc = self.nlp(text)
            return [token.text.lower() for token in doc if not token.is_punct]
        except Exception:
            logging.exception("An error occurred during Tokenization: ")
            raise

    def Vectorize(self, tokens: list) -> list:
        """
        Converts tokens into their corresponding vector representations.

        Tokens whose vector is all zeros are skipped so they do not drag the
        mean vector towards the origin.

        Parameters:
        - tokens (list): A list of tokens to be vectorized.

        Returns:
        - list: A list of vectors, one per token that has a non-zero vector.

        Raises:
        - Exception: Any error raised by the pipeline is logged and re-raised.
        """
        try:
            # Run the pipeline exactly once per token; the vector is needed
            # both for the non-zero check and as the result.
            candidates = (self.nlp(token).vector for token in tokens)
            return [vector for vector in candidates if vector.any()]
        except Exception:
            logging.exception("An error occurred during Vectorization: ")
            raise

    def Mean_Vector(self, vectors: list) -> np.ndarray:
        """
        Computes the mean vector of a list of vectors.

        Parameters:
        - vectors (list): A list (or 2-D array) of vectors to be averaged.

        Returns:
        - np.ndarray: The element-wise mean vector. When `vectors` is empty, a
          zero vector of length `self.nlp.vocab.vectors_length` is returned.

        Raises:
        - Exception: Any error during the computation is logged and re-raised.
        """
        try:
            # len() works for lists and arrays alike; `not vectors` would
            # raise on a multi-element ndarray (ambiguous truth value).
            if len(vectors) == 0:
                return np.zeros(self.nlp.vocab.vectors_length)
            return np.mean(vectors, axis=0)
        except Exception:
            logging.exception("An error occurred while computing the Mean Vector: ")
            raise

    def Dot_Product(self, vector1: np.ndarray, vector2: np.ndarray) -> float:
        """
        Computes the dot product between two vectors.

        Parameters:
        - vector1 (np.ndarray): The first 1-D vector.
        - vector2 (np.ndarray): The second 1-D vector.

        Returns:
        - float: The dot product of the two vectors.

        Raises:
        - Exception: Any error (e.g. mismatched lengths) is logged and re-raised.
        """
        try:
            first = np.asarray(vector1, dtype=float)
            second = np.asarray(vector2, dtype=float)
            # float() turns the numpy scalar into the built-in float promised
            # by the return annotation.
            return float(np.dot(first, second))
        except Exception:
            logging.exception("An error occurred during the dot Product Calculation: ")
            raise

    def Magnitude(self, vector: np.ndarray) -> float:
        """
        Computes the magnitude (Euclidean norm) of a vector.

        Parameters:
        - vector (np.ndarray): The vector whose magnitude is to be calculated.
          Plain sequences of numbers are accepted as well.

        Returns:
        - float: The magnitude of the vector.

        Raises:
        - Exception: Any error during the computation is logged and re-raised.
        """
        try:
            return float(np.linalg.norm(np.asarray(vector, dtype=float)))
        except Exception:
            logging.exception("An error occurred while computing the Magnitude: ")
            raise

    def Cosine_Similarity(self, vector1: np.ndarray, vector2: np.ndarray) -> float:
        """
        Computes the cosine similarity between two vectors.

        Parameters:
        - vector1 (np.ndarray): The first vector.
        - vector2 (np.ndarray): The second vector.

        Returns:
        - float: The cosine similarity in the range [-1, 1], or 0.0 when
          either vector has zero magnitude.

        Raises:
        - Exception: Any error during the computation is logged and re-raised.
        """
        try:
            dot = self.Dot_Product(vector1, vector2)
            magnitude1 = self.Magnitude(vector1)
            magnitude2 = self.Magnitude(vector2)
            if magnitude1 == 0 or magnitude2 == 0:
                return 0.0
            cosine = dot / (magnitude1 * magnitude2)
            # Rounding can push the ratio marginally outside [-1, 1];
            # np.clip pins it back while still propagating NaN.
            return float(np.clip(cosine, -1.0, 1.0))
        except Exception:
            logging.exception("An error occurred during Cosine Similarity Calculation: ")
            raise

    def Cosine_Similarity_Percentage(self, text1: str, text2: str) -> float:
        """
        Computes the cosine similarity percentage between two texts.

        Each text is tokenized, vectorized and averaged into a single mean
        vector; the cosine similarity of the two mean vectors is then scaled
        by 100.

        Parameters:
        - text1 (str): The first text.
        - text2 (str): The second text.

        Returns:
        - float: The cosine similarity percentage between the two texts.

        Raises:
        - Exception: Any error from the intermediate steps is logged and re-raised.
        """
        try:
            mean_vec1 = self.Mean_Vector(self.Vectorize(self.Tokenize(text1)))
            mean_vec2 = self.Mean_Vector(self.Vectorize(self.Tokenize(text2)))
            return self.Cosine_Similarity(mean_vec1, mean_vec2) * 100
        except Exception:
            logging.exception("An error occurred while computing the Cosine Similarity Percentage: ")
            raise
158+
159+
if __name__ == "__main__":
    # Interactive check: read two texts from the console and report their
    # cosine similarity as a percentage.
    first_text = input("Please enter text 1: ")
    second_text = input("Please enter text 2: ")

    scorer = Cosine_Similarity()
    similarity_percentage = scorer.Cosine_Similarity_Percentage(first_text, second_text)
    print(f"Cosine Similarity: {similarity_percentage:.2f}%")

0 commit comments

Comments
 (0)