6
6
returns a list containing two items for each vector:
7
7
1. the nearest vector
8
8
2. distance between the vector and the nearest vector (float)
9
+
10
+ This implementation also includes cosine similarity as an alternative measure.
9
11
"""
10
12
11
13
from __future__ import annotations
12
14
13
15
import math
16
+ from typing import Callable , List , Tuple , Union
14
17
15
18
import numpy as np
16
19
from numpy .linalg import norm
17
20
21
+ VectorType = Union [List [float ], np .ndarray ]
22
+ DistanceFunction = Callable [[np .ndarray , np .ndarray ], float ]
23
+
18
24
19
25
def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
    """
    Calculates euclidean distance between two data.

    :param input_a: ndarray (or array-like sequence) of first vector.
    :param input_b: ndarray (or array-like sequence) of second vector.
    :return: Euclidean distance of input_a and input_b as a built-in float.

    >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1]))
    1.0
    >>> euclidean([0, 1], [1, 1])
    1.0
    >>> euclidean(np.array([0], dtype=np.uint8), np.array([20], dtype=np.uint8))
    20.0
    """
    # Promote to float first: this accepts plain sequences and keeps unsigned
    # integer inputs from wrapping around when subtracted and squared.
    difference = np.asarray(input_a, dtype=float) - np.asarray(input_b, dtype=float)
    # np.sqrt returns a NumPy scalar; convert it so the result matches the
    # annotated return type and prints as a plain float in doctests.
    return float(np.sqrt(np.sum(difference**2)))
35
41
36
42
37
43
def similarity_search (
38
- dataset : np .ndarray , value_array : np .ndarray
39
- ) -> list [list [list [float ] | float ]]:
44
+ dataset : np .ndarray ,
45
+ value_array : np .ndarray ,
46
+ distance_func : DistanceFunction = euclidean ,
47
+ k : int = 1
48
+ ) -> List [List [Union [List [float ], float ]]]:
40
49
"""
41
50
:param dataset: Set containing the vectors. Should be ndarray.
42
51
:param value_array: vector/vectors we want to know the nearest vector from dataset.
52
+ :param distance_func: Distance function to use (default: euclidean).
53
+ :param k: Number of nearest neighbors to return (default: 1).
43
54
:return: Result will be a list containing
44
- 1. the nearest vector
45
- 2. distance from the vector
55
+ 1. the nearest vector(s)
56
+ 2. distance(s) from the vector(s)
46
57
47
58
>>> dataset = np.array([[0], [1], [2]])
48
59
>>> value_array = np.array([[0]])
@@ -64,6 +75,11 @@ def similarity_search(
64
75
>>> similarity_search(dataset, value_array)
65
76
[[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]]
66
77
78
+ >>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
79
+ >>> value_array = np.array([[0, 1]])
80
+ >>> similarity_search(dataset, value_array, k=2)
81
+ [[[0, 0], 1.0], [[1, 1], 1.0]]
82
+
67
83
These are the errors that might occur:
68
84
69
85
1. If dimensions are different.
@@ -125,19 +141,14 @@ def similarity_search(
125
141
answer = []
126
142
127
143
for value in value_array :
128
- dist = euclidean (value , dataset [0 ])
129
- vector = dataset [0 ].tolist ()
130
-
131
- for dataset_value in dataset [1 :]:
132
- temp_dist = euclidean (value , dataset_value )
133
-
134
- if dist > temp_dist :
135
- dist = temp_dist
136
- vector = dataset_value .tolist ()
137
-
138
- answer .append ([vector , dist ])
144
+ distances = [distance_func (value , data_point ) for data_point in dataset ]
145
+ nearest_indices = np .argsort (distances )[:k ]
146
+ answer .append ([
147
+ [dataset [i ].tolist (), distances [i ]]
148
+ for i in nearest_indices
149
+ ])
139
150
140
- return answer
151
+ return answer [ 0 ] if len ( answer ) == 1 else answer
141
152
142
153
143
154
def cosine_similarity (input_a : np .ndarray , input_b : np .ndarray ) -> float :
@@ -156,6 +167,21 @@ def cosine_similarity(input_a: np.ndarray, input_b: np.ndarray) -> float:
156
167
return np .dot (input_a , input_b ) / (norm (input_a ) * norm (input_b ))
157
168
158
169
170
def cosine_distance(input_a: np.ndarray, input_b: np.ndarray) -> float:
    """
    Calculates cosine distance between two data.

    The distance is defined here as ``1 - cosine_similarity(input_a, input_b)``.

    :param input_a: ndarray of first vector.
    :param input_b: ndarray of second vector.
    :return: Cosine distance of input_a and input_b as a built-in float.

    >>> cosine_distance(np.array([1]), np.array([1]))
    0.0
    >>> round(cosine_distance(np.array([1, 2]), np.array([6, 32])), 7)
    0.0384761
    """
    # For ndarray inputs cosine_similarity yields a NumPy scalar; convert the
    # difference so the result matches the annotated return type and the
    # doctest output stays a plain float regardless of NumPy's scalar repr.
    return float(1 - cosine_similarity(input_a, input_b))
183
+
184
+
159
185
if __name__ == "__main__" :
160
186
import doctest
161
187
0 commit comments