From 1898766c235fb7a479a8566737e3e66c74f3f448 Mon Sep 17 00:00:00 2001 From: Omar Hussein Date: Tue, 27 Aug 2024 22:34:34 -0400 Subject: [PATCH 1/2] Enhance similarity search with flexible distance metrics and k-nearest neighbors - Add distance_func parameter to support custom distance metrics - Implement k-nearest neighbors search - Optimize euclidean distance calculation using NumPy - Add cosine distance function as an alternative metric - Improve performance with list comprehensions - Add type hints for better code readability - Include additional examples demonstrating new features --- machine_learning/similarity_search.py | 60 +++++++++++++++++++-------- 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py index 0bc3b17d7e5a..3d9bb97d99c1 100644 --- a/machine_learning/similarity_search.py +++ b/machine_learning/similarity_search.py @@ -6,15 +6,21 @@ returns a list containing two data for each vector: 1. the nearest vector 2. distance between the vector and the nearest vector (float) + +This implementation also includes cosine similarity as an alternative measure. """ from __future__ import annotations import math +from typing import Callable, List, Tuple, Union import numpy as np from numpy.linalg import norm +VectorType = Union[List[float], np.ndarray] +DistanceFunction = Callable[[np.ndarray, np.ndarray], float] + def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float: """ @@ -31,18 +37,23 @@ def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float: >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1])) 1.0 """ - return math.sqrt(sum(pow(a - b, 2) for a, b in zip(input_a, input_b))) + return np.sqrt(np.sum((input_a - input_b) ** 2)) def similarity_search( - dataset: np.ndarray, value_array: np.ndarray -) -> list[list[list[float] | float]]: + dataset: np.ndarray, + value_array: np.ndarray, + distance_func: DistanceFunction = euclidean, + k: int = 1 +) -> List[List[Union[List[float], float]]]: """ :param dataset: Set containing the vectors. Should be ndarray. :param value_array: vector/vectors we want to know the nearest vector from dataset. + :param distance_func: Distance function to use (default: euclidean). + :param k: Number of nearest neighbors to return (default: 1). :return: Result will be a list containing - 1. the nearest vector - 2. distance from the vector + 1. the nearest vector(s) + 2. distance(s) from the vector(s) >>> dataset = np.array([[0], [1], [2]]) >>> value_array = np.array([[0]]) @@ -64,6 +75,11 @@ def similarity_search( >>> similarity_search(dataset, value_array) [[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]] + >>> dataset = np.array([[0, 0], [1, 1], [2, 2]]) + >>> value_array = np.array([[0, 1]]) + >>> similarity_search(dataset, value_array, k=2) + [[[0, 0], 1.0], [[1, 1], 1.0]] + These are the errors that might occur: 1. If dimensions are different. @@ -125,19 +141,14 @@ def similarity_search( answer = [] for value in value_array: - dist = euclidean(value, dataset[0]) - vector = dataset[0].tolist() - - for dataset_value in dataset[1:]: - temp_dist = euclidean(value, dataset_value) - - if dist > temp_dist: - dist = temp_dist - vector = dataset_value.tolist() - - answer.append([vector, dist]) + distances = [distance_func(value, data_point) for data_point in dataset] + nearest_indices = np.argsort(distances)[:k] + answer.append([ + [dataset[i].tolist(), distances[i]] + for i in nearest_indices + ]) - return answer + return answer[0] if len(answer) == 1 else answer def cosine_similarity(input_a: np.ndarray, input_b: np.ndarray) -> float: @@ -156,6 +167,21 @@ def cosine_similarity(input_a: np.ndarray, input_b: np.ndarray) -> float: return np.dot(input_a, input_b) / (norm(input_a) * norm(input_b)) +def cosine_distance(input_a: np.ndarray, input_b: np.ndarray) -> float: + """ + Calculates cosine distance between two data. + :param input_a: ndarray of first vector. + :param input_b: ndarray of second vector. + :return: Cosine distance of input_a and input_b. + + >>> cosine_distance(np.array([1]), np.array([1])) + 0.0 + >>> round(cosine_distance(np.array([1, 2]), np.array([6, 32])), 7) + 0.0384761 + """ + return 1 - cosine_similarity(input_a, input_b) + + if __name__ == "__main__": import doctest From d7e56d11cfe55dad1bf9f4087fa1a94402811dfb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 Aug 2024 02:38:03 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/similarity_search.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py index 3d9bb97d99c1..f33f0bed7075 100644 --- a/machine_learning/similarity_search.py +++ b/machine_learning/similarity_search.py @@ -44,7 +44,7 @@ def similarity_search( dataset: np.ndarray, value_array: np.ndarray, distance_func: DistanceFunction = euclidean, - k: int = 1 + k: int = 1, ) -> List[List[Union[List[float], float]]]: """ :param dataset: Set containing the vectors. Should be ndarray. @@ -143,10 +143,7 @@ def similarity_search( for value in value_array: distances = [distance_func(value, data_point) for data_point in dataset] nearest_indices = np.argsort(distances)[:k] - answer.append([ - [dataset[i].tolist(), distances[i]] - for i in nearest_indices - ]) + answer.append([[dataset[i].tolist(), distances[i]] for i in nearest_indices]) return answer[0] if len(answer) == 1 else answer