Skip to content

Commit 1898766

Browse files
authored
Enhance similarity search with flexible distance metrics and k-nearest neighbors
- Add distance_func parameter to support custom distance metrics
- Implement k-nearest neighbors search
- Optimize euclidean distance calculation using NumPy
- Add cosine distance function as an alternative metric
- Improve performance with list comprehensions
- Add type hints for better code readability
- Include additional examples demonstrating new features
1 parent c8e131b commit 1898766

File tree

1 file changed

+43
-17
lines changed

1 file changed

+43
-17
lines changed

machine_learning/similarity_search.py

Lines changed: 43 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -6,15 +6,21 @@
66
returns a list containing two data for each vector:
77
1. the nearest vector
88
2. distance between the vector and the nearest vector (float)
9+
10+
This implementation also includes cosine similarity as an alternative measure.
911
"""
1012

1113
from __future__ import annotations
1214

1315
import math
16+
from typing import Callable, List, Tuple, Union
1417

1518
import numpy as np
1619
from numpy.linalg import norm
1720

21+
VectorType = Union[List[float], np.ndarray]
22+
DistanceFunction = Callable[[np.ndarray, np.ndarray], float]
23+
1824

1925
def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
2026
"""
@@ -31,18 +37,23 @@ def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
3137
>>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1]))
3238
1.0
3339
"""
34-
return math.sqrt(sum(pow(a - b, 2) for a, b in zip(input_a, input_b)))
40+
return np.sqrt(np.sum((input_a - input_b) ** 2))
3541

3642

3743
def similarity_search(
38-
dataset: np.ndarray, value_array: np.ndarray
39-
) -> list[list[list[float] | float]]:
44+
dataset: np.ndarray,
45+
value_array: np.ndarray,
46+
distance_func: DistanceFunction = euclidean,
47+
k: int = 1
48+
) -> List[List[Union[List[float], float]]]:
4049
"""
4150
:param dataset: Set containing the vectors. Should be ndarray.
4251
:param value_array: vector/vectors we want to know the nearest vector from dataset.
52+
:param distance_func: Distance function to use (default: euclidean).
53+
:param k: Number of nearest neighbors to return (default: 1).
4354
:return: Result will be a list containing
44-
1. the nearest vector
45-
2. distance from the vector
55+
1. the nearest vector(s)
56+
2. distance(s) from the vector(s)
4657
4758
>>> dataset = np.array([[0], [1], [2]])
4859
>>> value_array = np.array([[0]])
@@ -64,6 +75,11 @@ def similarity_search(
6475
>>> similarity_search(dataset, value_array)
6576
[[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]]
6677
78+
>>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
79+
>>> value_array = np.array([[0, 1]])
80+
>>> similarity_search(dataset, value_array, k=2)
81+
[[[0, 0], 1.0], [[1, 1], 1.0]]
82+
6783
These are the errors that might occur:
6884
6985
1. If dimensions are different.
@@ -125,19 +141,14 @@ def similarity_search(
125141
answer = []
126142

127143
for value in value_array:
128-
dist = euclidean(value, dataset[0])
129-
vector = dataset[0].tolist()
130-
131-
for dataset_value in dataset[1:]:
132-
temp_dist = euclidean(value, dataset_value)
133-
134-
if dist > temp_dist:
135-
dist = temp_dist
136-
vector = dataset_value.tolist()
137-
138-
answer.append([vector, dist])
144+
distances = [distance_func(value, data_point) for data_point in dataset]
145+
nearest_indices = np.argsort(distances)[:k]
146+
answer.append([
147+
[dataset[i].tolist(), distances[i]]
148+
for i in nearest_indices
149+
])
139150

140-
return answer
151+
return answer[0] if len(answer) == 1 else answer
141152

142153

143154
def cosine_similarity(input_a: np.ndarray, input_b: np.ndarray) -> float:
@@ -156,6 +167,21 @@ def cosine_similarity(input_a: np.ndarray, input_b: np.ndarray) -> float:
156167
return np.dot(input_a, input_b) / (norm(input_a) * norm(input_b))
157168

158169

170+
def cosine_distance(input_a: np.ndarray, input_b: np.ndarray) -> float:
    """
    Calculates cosine distance between two data.

    Cosine distance is defined here as ``1 - cosine_similarity``, so vectors
    pointing in the same direction have distance 0 and the value grows as the
    angle between them widens.

    :param input_a: ndarray of first vector.
    :param input_b: ndarray of second vector.
    :return: Cosine distance of input_a and input_b, as a built-in float.

    NOTE(review): a zero-length vector makes the underlying similarity divide
    by zero; this function does not guard against that - confirm callers never
    pass all-zero vectors.

    >>> cosine_distance(np.array([1]), np.array([1]))
    0.0
    >>> round(cosine_distance(np.array([1, 2]), np.array([6, 32])), 7)
    0.0384761
    """
    # Cast to a plain float so the result matches the annotation and the
    # doctests print ``0.0`` rather than a NumPy scalar repr.
    return float(1 - cosine_similarity(input_a, input_b))
183+
184+
159185
if __name__ == "__main__":
160186
import doctest
161187

0 commit comments

Comments
 (0)