From 45533f3b1c14680194ab39735d95c1b06506d08f Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 30 May 2023 04:24:04 -0700 Subject: [PATCH 1/8] Add type hints to k_nearest_neighbours.py --- machine_learning/k_nearest_neighbours.py | 57 ++++++++++++++---------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py index 2a90cfe5987a..a4a8dce616ec 100644 --- a/machine_learning/k_nearest_neighbours.py +++ b/machine_learning/k_nearest_neighbours.py @@ -4,55 +4,66 @@ from sklearn import datasets from sklearn.model_selection import train_test_split -data = datasets.load_iris() -X = np.array(data["data"]) -y = np.array(data["target"]) -classes = data["target_names"] - -X_train, X_test, y_train, y_test = train_test_split(X, y) - - -def euclidean_distance(a, b): +def euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float: """ - Gives the euclidean distance between two points - >>> euclidean_distance([0, 0], [3, 4]) + Calculate the Euclidean distance between two points + >>> euclidean_distance(np.array([0, 0]), np.array([3, 4])) 5.0 - >>> euclidean_distance([1, 2, 3], [1, 8, 11]) + >>> euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11])) 10.0 """ - return np.linalg.norm(np.array(a) - np.array(b)) + return np.linalg.norm(a - b) -def classifier(train_data, train_target, classes, point, k=5): +def classifier( + train_data: np.ndarray[float], + train_target: np.ndarray[int], + class_labels: list[str], + pred_point: np.ndarray[float], + k: int = 5, +) -> str: """ - Classifies the point using the KNN algorithm - k closest points are found (ranked in ascending order of euclidean distance) + Classifies a given point using the KNN algorithm + k closest points are found (ranked in ascending order of Euclidean distance) Params: :train_data: Set of points that are classified into two or more classes :train_target: List of classes in the order of train_data points :classes: Labels of the classes :point: The data point that needs to be classified - >>> X_train = [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]] - >>> y_train = [0, 0, 0, 0, 1, 1, 1] - >>> classes = ['A','B']; point = [1.2,1.2] - >>> classifier(X_train, y_train, classes,point) + >>> train_X = np.array([[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]) + >>> train_y = np.array([0, 0, 0, 0, 1, 1, 1]) + >>> classes = ['A', 'B'] + >>> point = np.array([1.2, 1.2]) + >>> classifier(train_X, train_y, classes, point) 'A' """ data = zip(train_data, train_target) # List of distances of all points from the point to be classified distances = [] for data_point in data: - distance = euclidean_distance(data_point[0], point) + distance = euclidean_distance(data_point[0], pred_point) distances.append((distance, data_point[1])) # Choosing 'k' points with the least distances. votes = [i[1] for i in sorted(distances)[:k]] # Most commonly occurring class among them # is the class into which the point is classified result = Counter(votes).most_common(1)[0][0] - return classes[result] + return class_labels[result] if __name__ == "__main__": - print(classifier(X_train, y_train, classes, [4.4, 3.1, 1.3, 1.4])) + import doctest + + doctest.testmod() + + iris = datasets.load_iris() + + X = np.array(iris["data"]) + y = np.array(iris["target"]) + iris_classes = iris["target_names"] + + X_train, X_test, y_train, y_test = train_test_split(X, y) + iris_point = np.array([4.4, 3.1, 1.3, 1.4]) + print(classifier(X_train, y_train, iris_classes, iris_point)) From bfcaf640f1cf405797212de3641409fa9cc3d100 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 30 May 2023 04:38:32 -0700 Subject: [PATCH 2/8] Refactor k_nearest_neighbours.py into class --- machine_learning/k_nearest_neighbours.py | 96 +++++++++++++----------- 1 file changed, 52 insertions(+), 44 deletions(-) diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py index a4a8dce616ec..3f8879a633fa 100644 --- a/machine_learning/k_nearest_neighbours.py +++ b/machine_learning/k_nearest_neighbours.py @@ -5,52 +5,59 @@ from sklearn.model_selection import train_test_split -def euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float: - """ - Calculate the Euclidean distance between two points - >>> euclidean_distance(np.array([0, 0]), np.array([3, 4])) - 5.0 - >>> euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11])) - 10.0 - """ - return np.linalg.norm(a - b) +class KNN: + def __init__( + self, + train_data: np.ndarray[float], + train_target: np.ndarray[int], + class_labels: list[str], + ) -> None: + self.data = zip(train_data, train_target) + self.labels = class_labels + @staticmethod + def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float: + """ + Calculate the Euclidean distance between two points + >>> KNN._euclidean_distance(np.array([0, 0]), np.array([3, 4])) + 5.0 + >>> KNN._euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11])) + 10.0 + """ + return np.linalg.norm(a - b) -def classifier( - train_data: np.ndarray[float], - train_target: np.ndarray[int], - class_labels: list[str], - pred_point: np.ndarray[float], - k: int = 5, -) -> str: - """ - Classifies a given point using the KNN algorithm - k closest points are found (ranked in ascending order of Euclidean distance) - Params: - :train_data: Set of points that are classified into two or more classes - :train_target: List of classes in the order of train_data points - :classes: Labels of the classes - :point: The data point that needs to be classified + def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: + """ + Classifies a given point using the KNN algorithm + k closest points are found (ranked in ascending order of Euclidean distance) + Params: + :train_data: Set of points that are classified into two or more classes + :train_target: List of classes in the order of train_data points + :classes: Labels of the classes + :point: The data point that needs to be classified - >>> train_X = np.array([[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]) - >>> train_y = np.array([0, 0, 0, 0, 1, 1, 1]) - >>> classes = ['A', 'B'] - >>> point = np.array([1.2, 1.2]) - >>> classifier(train_X, train_y, classes, point) - 'A' - """ - data = zip(train_data, train_target) - # List of distances of all points from the point to be classified - distances = [] - for data_point in data: - distance = euclidean_distance(data_point[0], pred_point) - distances.append((distance, data_point[1])) - # Choosing 'k' points with the least distances. - votes = [i[1] for i in sorted(distances)[:k]] - # Most commonly occurring class among them - # is the class into which the point is classified - result = Counter(votes).most_common(1)[0][0] - return class_labels[result] + >>> train_X = np.array( + ... [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]] + ... ) + >>> train_y = np.array([0, 0, 0, 0, 1, 1, 1]) + >>> classes = ['A', 'B'] + >>> knn = KNN(train_X, train_y, classes) + >>> point = np.array([1.2, 1.2]) + >>> knn.classify(point) + 'A' + """ + # List of distances of all points from the point to be classified + distances = ( + (self._euclidean_distance(data_point[0], pred_point), data_point[1]) + for data_point in self.data + ) + + # Choosing 'k' points with the least distances. + votes = (i[1] for i in sorted(distances)[:k]) + + # Most commonly occurring class is the one into which the point is classified + result = Counter(votes).most_common(1)[0][0] + return self.labels[result] if __name__ == "__main__": @@ -66,4 +73,5 @@ def classifier( X_train, X_test, y_train, y_test = train_test_split(X, y) iris_point = np.array([4.4, 3.1, 1.3, 1.4]) - print(classifier(X_train, y_train, iris_classes, iris_point)) + classifier = KNN(X_train, y_train, iris_classes) + print(classifier.classify(iris_point)) From f884c742cd508ff34a7fc80a79939905e7a84da5 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 30 May 2023 04:57:57 -0700 Subject: [PATCH 3/8] Add documentation to k_nearest_neighbours.py --- machine_learning/k_nearest_neighbours.py | 30 ++++++++++++++++-------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py index 3f8879a633fa..e1c96f9eb3de 100644 --- a/machine_learning/k_nearest_neighbours.py +++ b/machine_learning/k_nearest_neighbours.py @@ -1,3 +1,17 @@ +""" +k-Nearest Neighbours (kNN) is a simple non-parametric supervised learning +algorithm used for classification. Given some labelled training data, a given +point is classified using its k nearest neighbours according to some distance +metric. The most commonly occurring label among the neighbours becomes the label +of the given point. In effect, the label of the given point is decided by a +majority vote. + +This implementation uses the commonly used Euclidean distance metric, but other +distance metrics can also be used. + +Reference: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm +""" + from collections import Counter import numpy as np @@ -12,6 +26,9 @@ def __init__( train_target: np.ndarray[int], class_labels: list[str], ) -> None: + """ + Create a kNN classifier using the given training data and class labels + """ self.data = zip(train_data, train_target) self.labels = class_labels @@ -28,14 +45,7 @@ def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float: def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: """ - Classifies a given point using the KNN algorithm - k closest points are found (ranked in ascending order of Euclidean distance) - Params: - :train_data: Set of points that are classified into two or more classes - :train_target: List of classes in the order of train_data points - :classes: Labels of the classes - :point: The data point that needs to be classified - + Classify a given point using the kNN algorithm >>> train_X = np.array( ... [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]] ... ) @@ -46,13 +56,13 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: >>> knn.classify(point) 'A' """ - # List of distances of all points from the point to be classified + # Distances of all points from the point to be classified distances = ( (self._euclidean_distance(data_point[0], pred_point), data_point[1]) for data_point in self.data ) - # Choosing 'k' points with the least distances. + # Choosing k points with the shortest distances votes = (i[1] for i in sorted(distances)[:k]) # Most commonly occurring class is the one into which the point is classified From cec7610abf85f6b81d6bfedc5e16a22bfdec1464 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 30 May 2023 18:16:33 -0700 Subject: [PATCH 4/8] Use heap-based priority queue for k_nearest_neighbours.py --- machine_learning/k_nearest_neighbours.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py index e1c96f9eb3de..79f4a95cfc80 100644 --- a/machine_learning/k_nearest_neighbours.py +++ b/machine_learning/k_nearest_neighbours.py @@ -13,6 +13,7 @@ """ from collections import Counter +from heapq import nsmallest import numpy as np from sklearn import datasets @@ -63,7 +64,7 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: ) # Choosing k points with the shortest distances - votes = (i[1] for i in sorted(distances)[:k]) + votes = (i[1] for i in nsmallest(k, distances)) # Most commonly occurring class is the one into which the point is classified result = Counter(votes).most_common(1)[0][0] From 529dd356e666669db5b890e6233cd0334255db1a Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 30 May 2023 18:17:05 -0700 Subject: [PATCH 5/8] Delete knn_sklearn.py --- machine_learning/knn_sklearn.py | 31 ------------------------------- 1 file changed, 31 deletions(-) delete mode 100644 machine_learning/knn_sklearn.py diff --git a/machine_learning/knn_sklearn.py b/machine_learning/knn_sklearn.py deleted file mode 100644 index 4a621a4244b6..000000000000 --- a/machine_learning/knn_sklearn.py +++ /dev/null @@ -1,31 +0,0 @@ -from sklearn.datasets import load_iris -from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier - -# Load iris file -iris = load_iris() -iris.keys() - - -print(f"Target names: \n {iris.target_names} ") -print(f"\n Features: \n {iris.feature_names}") - -# Train set e Test set -X_train, X_test, y_train, y_test = train_test_split( - iris["data"], iris["target"], random_state=4 -) - -# KNN - -knn = KNeighborsClassifier(n_neighbors=1) -knn.fit(X_train, y_train) - -# new array to test -X_new = [[1, 2, 1, 4], [2, 3, 4, 5]] - -prediction = knn.predict(X_new) - -print( - f"\nNew array: \n {X_new}\n\nTarget Names Prediction: \n" - f" {iris['target_names'][prediction]}" -) From 9510d5c91de2d0e9e0ea23a53376057326625b94 Mon Sep 17 00:00:00 2001 From: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> Date: Wed, 31 May 2023 01:17:43 +0000 Subject: [PATCH 6/8] updating DIRECTORY.md --- DIRECTORY.md | 1 - 1 file changed, 1 deletion(-) diff --git a/DIRECTORY.md b/DIRECTORY.md index 231b0e2f1d2f..f9c156054cb5 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -494,7 +494,6 @@ * [Gradient Descent](machine_learning/gradient_descent.py) * [K Means Clust](machine_learning/k_means_clust.py) * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py) - * [Knn Sklearn](machine_learning/knn_sklearn.py) * [Linear Discriminant Analysis](machine_learning/linear_discriminant_analysis.py) * [Linear Regression](machine_learning/linear_regression.py) * Local Weighted Learning From ee20be762348ba8a3243f45c74c6c47f7b2172f1 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Fri, 28 Jul 2023 18:48:32 -0700 Subject: [PATCH 7/8] Use optional args in k_nearest_neighbours.py for demo purposes --- machine_learning/k_nearest_neighbours.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py index 79f4a95cfc80..d451dbaab927 100644 --- a/machine_learning/k_nearest_neighbours.py +++ b/machine_learning/k_nearest_neighbours.py @@ -82,7 +82,7 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: y = np.array(iris["target"]) iris_classes = iris["target_names"] - X_train, X_test, y_train, y_test = train_test_split(X, y) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) iris_point = np.array([4.4, 3.1, 1.3, 1.4]) - classifier = KNN(X_train, y_train, iris_classes) + classifier = KNN(X_train, y_train, iris_classes, k=3) print(classifier.classify(iris_point)) From 3140c9cb4ec7a031c6a25d641bd9aa20f1908d4b Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Fri, 28 Jul 2023 18:58:46 -0700 Subject: [PATCH 8/8] Fix wrong function arg in k_nearest_neighbours.py --- machine_learning/k_nearest_neighbours.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py index d451dbaab927..a43757c5c20e 100644 --- a/machine_learning/k_nearest_neighbours.py +++ b/machine_learning/k_nearest_neighbours.py @@ -84,5 +84,5 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) iris_point = np.array([4.4, 3.1, 1.3, 1.4]) - classifier = KNN(X_train, y_train, iris_classes, k=3) - print(classifier.classify(iris_point)) + classifier = KNN(X_train, y_train, iris_classes) + print(classifier.classify(iris_point, k=3))