From 1898766c235fb7a479a8566737e3e66c74f3f448 Mon Sep 17 00:00:00 2001
From: Omar Hussein <omarmoh.said@gmail.com>
Date: Tue, 27 Aug 2024 22:34:34 -0400
Subject: [PATCH 1/2] Enhance similarity search with flexible distance metrics
 and k-nearest neighbors

- Add distance_func parameter to support custom distance metrics
- Implement k-nearest neighbors search
- Optimize euclidean distance calculation using NumPy
- Add cosine distance function as an alternative metric
- Improve performance with list comprehensions
- Add type hints for better code readability
- Include additional examples demonstrating new features
---
 machine_learning/similarity_search.py | 60 +++++++++++++++++++--------
 1 file changed, 43 insertions(+), 17 deletions(-)

diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py
index 0bc3b17d7e5a..3d9bb97d99c1 100644
--- a/machine_learning/similarity_search.py
+++ b/machine_learning/similarity_search.py
@@ -6,15 +6,21 @@
 returns a list containing two data for each vector:
     1. the nearest vector
     2. distance between the vector and the nearest vector (float)
+
+Cosine similarity and cosine distance are also provided as alternative measures.
 """
 
 from __future__ import annotations
 
 import math
+from typing import Callable, List, Tuple, Union
 
 import numpy as np
 from numpy.linalg import norm
 
+VectorType = Union[List[float], np.ndarray]
+DistanceFunction = Callable[[np.ndarray, np.ndarray], float]
+
 
 def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
     """
@@ -31,18 +37,23 @@ def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
     >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1]))
     1.0
     """
-    return math.sqrt(sum(pow(a - b, 2) for a, b in zip(input_a, input_b)))
+    return np.sqrt(np.sum((input_a - input_b) ** 2))
 
 
 def similarity_search(
-    dataset: np.ndarray, value_array: np.ndarray
-) -> list[list[list[float] | float]]:
+    dataset: np.ndarray,
+    value_array: np.ndarray,
+    distance_func: DistanceFunction = euclidean,
+    k: int = 1
+) -> List[List[Union[List[float], float]]]:
     """
     :param dataset: Set containing the vectors. Should be ndarray.
     :param value_array: vector/vectors we want to know the nearest vector from dataset.
+    :param distance_func: Distance function to use (default: euclidean).
+    :param k: Number of nearest neighbors to return (default: 1).
     :return: Result will be a list containing
-            1. the nearest vector
-            2. distance from the vector
+            1. the nearest vector(s)
+            2. distance(s) from the vector(s)
 
     >>> dataset = np.array([[0], [1], [2]])
     >>> value_array = np.array([[0]])
@@ -64,6 +75,11 @@ def similarity_search(
     >>> similarity_search(dataset, value_array)
     [[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]]
 
+    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
+    >>> value_array = np.array([[0, 1]])
+    >>> similarity_search(dataset, value_array, k=2)
+    [[[0, 0], 1.0], [[1, 1], 1.0]]
+
     These are the errors that might occur:
 
     1. If dimensions are different.
@@ -125,19 +141,14 @@ def similarity_search(
     answer = []
 
     for value in value_array:
-        dist = euclidean(value, dataset[0])
-        vector = dataset[0].tolist()
-
-        for dataset_value in dataset[1:]:
-            temp_dist = euclidean(value, dataset_value)
-
-            if dist > temp_dist:
-                dist = temp_dist
-                vector = dataset_value.tolist()
-
-        answer.append([vector, dist])
+        distances = [distance_func(value, data_point) for data_point in dataset]
+        nearest_indices = np.argsort(distances)[:k]
+        answer.append([
+            [dataset[i].tolist(), distances[i]]
+            for i in nearest_indices
+        ])
 
-    return answer
+    return answer[0] if len(answer) == 1 else answer
 
 
 def cosine_similarity(input_a: np.ndarray, input_b: np.ndarray) -> float:
@@ -156,6 +167,21 @@ def cosine_similarity(input_a: np.ndarray, input_b: np.ndarray) -> float:
     return np.dot(input_a, input_b) / (norm(input_a) * norm(input_b))
 
 
+def cosine_distance(input_a: np.ndarray, input_b: np.ndarray) -> float:
+    """
+    Calculates the cosine distance (1 - cosine similarity) between two vectors.
+    :param input_a: ndarray of first vector.
+    :param input_b: ndarray of second vector.
+    :return: Cosine distance of input_a and input_b.
+
+    >>> cosine_distance(np.array([1]), np.array([1]))
+    0.0
+    >>> round(cosine_distance(np.array([1, 2]), np.array([6, 32])), 7)
+    0.0384761
+    """
+    return 1 - cosine_similarity(input_a, input_b)
+
+
 if __name__ == "__main__":
     import doctest
 

From d7e56d11cfe55dad1bf9f4087fa1a94402811dfb Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 28 Aug 2024 02:38:03 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/similarity_search.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/machine_learning/similarity_search.py b/machine_learning/similarity_search.py
index 3d9bb97d99c1..f33f0bed7075 100644
--- a/machine_learning/similarity_search.py
+++ b/machine_learning/similarity_search.py
@@ -44,7 +44,7 @@ def similarity_search(
     dataset: np.ndarray,
     value_array: np.ndarray,
     distance_func: DistanceFunction = euclidean,
-    k: int = 1
+    k: int = 1,
 ) -> List[List[Union[List[float], float]]]:
     """
     :param dataset: Set containing the vectors. Should be ndarray.
@@ -143,10 +143,7 @@ def similarity_search(
     for value in value_array:
         distances = [distance_func(value, data_point) for data_point in dataset]
         nearest_indices = np.argsort(distances)[:k]
-        answer.append([
-            [dataset[i].tolist(), distances[i]]
-            for i in nearest_indices
-        ])
+        answer.append([[dataset[i].tolist(), distances[i]] for i in nearest_indices])
 
     return answer[0] if len(answer) == 1 else answer