-
-
Notifications
You must be signed in to change notification settings - Fork 46.6k
Cosine Similarity Algorithm | Machine Learning #11536
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
0af293b
3a62339
e8ec6df
1458803
030ced3
768015c
d8deb03
d597f45
2479eef
fa91225
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
import spacy | ||
import logging | ||
import numpy as np | ||
|
||
|
||
class Cosine_Similarity:
    """
    Cosine Similarity Algorithm

    Use Case:
    - Measures the cosine of the angle between two non-zero vectors in a
      multi-dimensional space.
    - Used here to estimate how similar two texts are by comparing the mean
      of their token vectors.
    - The score ranges from -1 (vectors point in opposite directions) to 1
      (vectors point in the same direction); 0 means they are orthogonal.

    Dependencies:
    - spacy: tokenization and vectorization (the "en_core_web_md" model).
    - numpy: numerical operations.

    NOTE(review): review feedback on this change asked for a CamelCase class
    name, a different naming convention for the methods (presumably
    snake_case; the rendered comment is truncated) and for tests. The
    original public names are kept here so existing callers keep working.
    """

    def __init__(self) -> None:
        """
        Initializes the Cosine Similarity class by loading the SpaCy model.
        """
        self.nlp = spacy.load("en_core_web_md")

    def Tokenize(self, text: str) -> list:
        """
        Tokenizes the input text into a list of lowercased tokens.

        Punctuation tokens (as flagged by the pipeline) are dropped.

        Parameters:
        - text (str): The input text to be tokenized.

        Returns:
        - list: A list of lowercased token strings.

        Raises:
        - Exception: Any error raised by the pipeline is logged and re-raised.
        """
        try:
            doc = self.nlp(text)
            return [token.text.lower() for token in doc if not token.is_punct]
        except Exception:
            logging.exception("An error occurred during Tokenization: ")
            raise

    def Vectorize(self, tokens: list) -> list:
        """
        Converts tokens into their corresponding vector representations.

        Tokens whose vector is all zeros are skipped.

        Parameters:
        - tokens (list): A list of tokens to be vectorized.

        Returns:
        - list: A list of vectors, one per token that has a non-zero vector.

        Raises:
        - Exception: Any error raised by the pipeline is logged and re-raised.
        """
        try:
            # Run the pipeline exactly once per token; the resulting vector is
            # used both for the zero-vector filter and as the output value.
            candidates = (self.nlp(token).vector for token in tokens)
            return [vector for vector in candidates if vector.any()]
        except Exception:
            logging.exception("An error occurred during Vectorization: ")
            raise

    def Mean_Vector(self, vectors: list) -> np.ndarray:
        """
        Computes the mean vector of a list of vectors.

        Parameters:
        - vectors (list): A list (or 2-D array) of vectors to be averaged.

        Returns:
        - np.ndarray: The element-wise mean, or a zero vector of length
          ``self.nlp.vocab.vectors_length`` when there is nothing to average.

        Raises:
        - Exception: Any error raised while averaging is logged and re-raised.
        """
        try:
            # An explicit length check (instead of truthiness) also accepts
            # numpy arrays, whose truth value is ambiguous.
            if vectors is None or len(vectors) == 0:
                return np.zeros(self.nlp.vocab.vectors_length)
            return np.mean(vectors, axis=0)
        except Exception:
            logging.exception("An error occurred while computing the Mean Vector: ")
            raise

    def Dot_Product(self, vector1: np.ndarray, vector2: np.ndarray) -> float:
        """
        Computes the dot product between two vectors.

        Parameters:
        - vector1 (np.ndarray): The first vector.
        - vector2 (np.ndarray): The second vector.

        Returns:
        - float: The dot product of the two vectors.

        Raises:
        - Exception: Any error raised by numpy (for example mismatched
          lengths) is logged and re-raised.
        """
        try:
            return np.dot(vector1, vector2)
        except Exception:
            logging.exception("An error occurred during the dot Product Calculation: ")
            raise

    def Magnitude(self, vector: np.ndarray) -> float:
        """
        Computes the magnitude (Euclidean norm) of a vector.

        Parameters:
        - vector (np.ndarray): The vector whose magnitude is to be calculated.
          Plain sequences of numbers are accepted as well.

        Returns:
        - float: The magnitude of the vector.

        Raises:
        - Exception: Any error raised by numpy is logged and re-raised.
        """
        try:
            # np.square accepts plain sequences, unlike the ``**`` operator.
            return np.sqrt(np.sum(np.square(vector)))
        except Exception:
            logging.exception("An error occurred while computing the Magnitude: ")
            raise

    def Cosine_Similarity(self, vector1: np.ndarray, vector2: np.ndarray) -> float:
        """
        Computes the cosine similarity between two vectors.

        Parameters:
        - vector1 (np.ndarray): The first vector.
        - vector2 (np.ndarray): The second vector.

        Returns:
        - float: The cosine similarity in the range [-1, 1], as a built-in
          float. Returns 0.0 when either vector has zero magnitude.

        Raises:
        - Exception: Any error raised during the computation is logged and
          re-raised.
        """
        try:
            dot = self.Dot_Product(vector1, vector2)
            magnitude1 = self.Magnitude(vector1)
            magnitude2 = self.Magnitude(vector2)
            # The cosine is undefined for a zero vector; report no similarity.
            if magnitude1 == 0 or magnitude2 == 0:
                return 0.0
            cosine = dot / (magnitude1 * magnitude2)
            # Floating-point rounding can push the ratio marginally outside
            # [-1, 1]; clip it and always hand back a built-in float.
            return float(np.clip(cosine, -1.0, 1.0))
        except Exception:
            logging.exception(
                "An error occurred during Cosine Similarity Calculation: "
            )
            raise

    def Cosine_Similarity_Percentage(self, text1: str, text2: str) -> float:
        """
        Computes the cosine similarity percentage between two texts.

        Each text is tokenized, its tokens are vectorized, the vectors are
        averaged, and the cosine similarity of the two mean vectors is scaled
        by 100.

        Parameters:
        - text1 (str): The first text.
        - text2 (str): The second text.

        Returns:
        - float: The cosine similarity percentage between the two texts.

        Raises:
        - Exception: Any error raised by one of the steps is logged and
          re-raised.
        """
        try:
            mean_vec1 = self.Mean_Vector(self.Vectorize(self.Tokenize(text1)))
            mean_vec2 = self.Mean_Vector(self.Vectorize(self.Tokenize(text2)))
            return self.Cosine_Similarity(mean_vec1, mean_vec2) * 100
        except Exception:
            logging.exception(
                "An error occurred while computing the Cosine Similarity Percentage: "
            )
            raise
|
||
|
||
if __name__ == "__main__":
    # Demo entry point: compare two short sentences and print how similar
    # they are, as a percentage with two decimal places.
    first_text = "The biggest Infrastructure in the World is Burj Khalifa"
    second_text = "The name of the talllest Tower in the world is Burj Khalifa"

    comparer = Cosine_Similarity()
    score = comparer.Cosine_Similarity_Percentage(first_text, second_text)
    print(f"Cosine Similarity: {score:.2f}%")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Class names should follow the `CamelCase` naming convention. Please update the following name accordingly: `Cosine_Similarity`