1
+ import spacy
2
+ import logging
3
+ import numpy as np
4
+
5
class Cosine_Similarity:
    """
    Cosine Similarity Algorithm

    Use Case:
    - Measures the cosine of the angle between two non-zero vectors in a
      multi-dimensional space.
    - Two texts are compared by tokenizing them, looking up a vector for each
      token, averaging those vectors per text and taking the cosine of the
      two averages.
    - The score ranges from -1 (vectors point in opposite directions) to
      1 (vectors point in the same direction); 0 means they are orthogonal.

    Dependencies:
    - spacy: used for tokenization and for the token vectors.
    - numpy: used for the numerical computations.
    """

    def __init__(self) -> None:
        """
        Initializes the class by loading the 'en_core_web_md' spaCy model.
        """
        self.nlp = spacy.load('en_core_web_md')

    def Tokenize(self, text: str) -> list:
        """
        Tokenizes the input text into a list of lowercased tokens.

        Parameters:
        - text (str): The input text to be tokenized.

        Returns:
        - list: The lowercased text of every token that is not punctuation.

        Raises:
        - Exception: Any error raised by the pipeline is logged and re-raised.
        """
        try:
            doc = self.nlp(text)
            return [token.text.lower() for token in doc if not token.is_punct]
        except Exception:
            logging.exception("An error occurred during Tokenization: ")
            raise

    def Vectorize(self, tokens: list) -> list:
        """
        Converts tokens into their corresponding vector representations.

        Parameters:
        - tokens (list): A list of token strings to be vectorized.

        Returns:
        - list: One vector per token, in input order. Tokens whose vector is
          all zeros are skipped (presumably tokens the model has no vector
          for), so the result may be shorter than the input.

        Raises:
        - Exception: Any error raised by the pipeline is logged and re-raised.
        """
        try:
            vectors = []
            for token in tokens:
                # Run the pipeline once per token and reuse the result for
                # both the all-zero check and the returned value.
                vector = self.nlp(token).vector
                if vector.any():
                    vectors.append(vector)
            return vectors
        except Exception:
            logging.exception("An error occurred during Vectorization: ")
            raise

    def Mean_Vector(self, vectors: list) -> np.ndarray:
        """
        Computes the element-wise mean of a collection of vectors.

        Parameters:
        - vectors (list): The vectors to be averaged. A 2-D array with one
          vector per row is accepted as well.

        Returns:
        - np.ndarray: The mean vector. When no vectors are given, a zero
          vector of length `self.nlp.vocab.vectors_length` is returned.

        Raises:
        - Exception: Any error during the computation is logged and re-raised.
        """
        try:
            # An explicit length check is used instead of truthiness because
            # the truth value of a multi-element ndarray is ambiguous.
            if vectors is None or len(vectors) == 0:
                return np.zeros(self.nlp.vocab.vectors_length)
            return np.mean(vectors, axis=0)
        except Exception:
            logging.exception("An error occurred while computing the Mean Vector: ")
            raise

    def Dot_Product(self, vector1: np.ndarray, vector2: np.ndarray) -> float:
        """
        Computes the dot product between two vectors.

        Parameters:
        - vector1 (np.ndarray): The first vector.
        - vector2 (np.ndarray): The second vector.

        Returns:
        - float: The dot product of the two vectors.

        Raises:
        - Exception: Any error (e.g. mismatched shapes) is logged and re-raised.
        """
        try:
            return np.dot(vector1, vector2)
        except Exception:
            logging.exception("An error occurred during the dot Product Calculation: ")
            raise

    def Magnitude(self, vector: np.ndarray) -> float:
        """
        Computes the magnitude (Euclidean norm) of a vector.

        Parameters:
        - vector (np.ndarray): The vector whose magnitude is to be calculated.
          Plain Python sequences of numbers are accepted as well.

        Returns:
        - float: The magnitude of the vector.

        Raises:
        - Exception: Any error during the computation is logged and re-raised.
        """
        try:
            return np.linalg.norm(vector)
        except Exception:
            logging.exception("An error occurred while computing the Magnitude: ")
            raise

    def Cosine_Similarity(self, vector1: np.ndarray, vector2: np.ndarray) -> float:
        """
        Computes the cosine similarity between two vectors.

        Parameters:
        - vector1 (np.ndarray): The first vector.
        - vector2 (np.ndarray): The second vector.

        Returns:
        - float: The cosine similarity, clamped to [-1, 1]. Returns 0.0 when
          either vector has zero magnitude.

        Raises:
        - Exception: Any error during the computation is logged and re-raised.
        """
        try:
            dot = self.Dot_Product(vector1, vector2)
            magnitude1 = self.Magnitude(vector1)
            magnitude2 = self.Magnitude(vector2)
            # The cosine is undefined for a zero vector; report no similarity.
            if magnitude1 == 0 or magnitude2 == 0:
                return 0.0
            # Floating-point rounding can push the ratio marginally outside
            # the mathematical range, so clamp it.
            return np.clip(dot / (magnitude1 * magnitude2), -1.0, 1.0)
        except Exception:
            logging.exception("An error occurred during Cosine Similarity Calculation: ")
            raise

    def Cosine_Similarity_Percentage(self, text1: str, text2: str) -> float:
        """
        Computes the cosine similarity percentage between two texts.

        Each text is tokenized, its tokens are vectorized, the vectors are
        averaged, and the cosine similarity of the two averages is scaled
        by 100.

        Parameters:
        - text1 (str): The first text.
        - text2 (str): The second text.

        Returns:
        - float: The cosine similarity between the two texts multiplied by 100.

        Raises:
        - Exception: Any error from the intermediate steps is logged and
          re-raised.
        """
        try:
            mean_vec1 = self.Mean_Vector(self.Vectorize(self.Tokenize(text1)))
            mean_vec2 = self.Mean_Vector(self.Vectorize(self.Tokenize(text2)))
            return self.Cosine_Similarity(mean_vec1, mean_vec2) * 100
        except Exception:
            logging.exception("An error occurred while computing the Cosine Similarity Percentage: ")
            raise
158
+
159
if __name__ == "__main__":
    # Manual check: prompt for two texts on stdin and print their cosine
    # similarity as a percentage.
    first_text = input("Please enter text 1: ")
    second_text = input("Please enter text 2: ")

    # The model is loaded only after both texts have been entered.
    scorer = Cosine_Similarity()
    percentage = scorer.Cosine_Similarity_Percentage(first_text, second_text)
    print(f"Cosine Similarity: {percentage:.2f} %")
0 commit comments