From 45533f3b1c14680194ab39735d95c1b06506d08f Mon Sep 17 00:00:00 2001
From: Tianyi Zheng <tianyizheng02@gmail.com>
Date: Tue, 30 May 2023 04:24:04 -0700
Subject: [PATCH 1/8] Add type hints to k_nearest_neighbours.py

---
 machine_learning/k_nearest_neighbours.py | 57 ++++++++++++++----------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py
index 2a90cfe5987a..a4a8dce616ec 100644
--- a/machine_learning/k_nearest_neighbours.py
+++ b/machine_learning/k_nearest_neighbours.py
@@ -4,55 +4,66 @@
 from sklearn import datasets
 from sklearn.model_selection import train_test_split
 
-data = datasets.load_iris()
 
-X = np.array(data["data"])
-y = np.array(data["target"])
-classes = data["target_names"]
-
-X_train, X_test, y_train, y_test = train_test_split(X, y)
-
-
-def euclidean_distance(a, b):
+def euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float:
     """
-    Gives the euclidean distance between two points
-    >>> euclidean_distance([0, 0], [3, 4])
+    Calculate the Euclidean distance between two points
+    >>> euclidean_distance(np.array([0, 0]), np.array([3, 4]))
     5.0
-    >>> euclidean_distance([1, 2, 3], [1, 8, 11])
+    >>> euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11]))
     10.0
     """
-    return np.linalg.norm(np.array(a) - np.array(b))
+    return np.linalg.norm(a - b)
 
 
-def classifier(train_data, train_target, classes, point, k=5):
+def classifier(
+    train_data: np.ndarray[float],
+    train_target: np.ndarray[int],
+    class_labels: list[str],
+    pred_point: np.ndarray[float],
+    k: int = 5,
+) -> str:
     """
-    Classifies the point using the KNN algorithm
-    k closest points are found (ranked in ascending order of euclidean distance)
+    Classifies a given point using the KNN algorithm
+    k closest points are found (ranked in ascending order of Euclidean distance)
     Params:
     :train_data: Set of points that are classified into two or more classes
     :train_target: List of classes in the order of train_data points
     :classes: Labels of the classes
     :point: The data point that needs to be classified
 
-    >>> X_train = [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]
-    >>> y_train = [0, 0, 0, 0, 1, 1, 1]
-    >>> classes = ['A','B']; point = [1.2,1.2]
-    >>> classifier(X_train, y_train, classes,point)
+    >>> train_X = np.array([[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]])
+    >>> train_y = np.array([0, 0, 0, 0, 1, 1, 1])
+    >>> classes = ['A', 'B']
+    >>> point = np.array([1.2, 1.2])
+    >>> classifier(train_X, train_y, classes, point)
     'A'
     """
     data = zip(train_data, train_target)
     # List of distances of all points from the point to be classified
     distances = []
     for data_point in data:
-        distance = euclidean_distance(data_point[0], point)
+        distance = euclidean_distance(data_point[0], pred_point)
         distances.append((distance, data_point[1]))
     # Choosing 'k' points with the least distances.
     votes = [i[1] for i in sorted(distances)[:k]]
     # Most commonly occurring class among them
     # is the class into which the point is classified
     result = Counter(votes).most_common(1)[0][0]
-    return classes[result]
+    return class_labels[result]
 
 
 if __name__ == "__main__":
-    print(classifier(X_train, y_train, classes, [4.4, 3.1, 1.3, 1.4]))
+    import doctest
+
+    doctest.testmod()
+
+    iris = datasets.load_iris()
+
+    X = np.array(iris["data"])
+    y = np.array(iris["target"])
+    iris_classes = iris["target_names"]
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y)
+    iris_point = np.array([4.4, 3.1, 1.3, 1.4])
+    print(classifier(X_train, y_train, iris_classes, iris_point))

From bfcaf640f1cf405797212de3641409fa9cc3d100 Mon Sep 17 00:00:00 2001
From: Tianyi Zheng <tianyizheng02@gmail.com>
Date: Tue, 30 May 2023 04:38:32 -0700
Subject: [PATCH 2/8] Refactor k_nearest_neighbours.py into class

---
 machine_learning/k_nearest_neighbours.py | 96 +++++++++++++-----------
 1 file changed, 52 insertions(+), 44 deletions(-)

diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py
index a4a8dce616ec..3f8879a633fa 100644
--- a/machine_learning/k_nearest_neighbours.py
+++ b/machine_learning/k_nearest_neighbours.py
@@ -5,52 +5,59 @@
 from sklearn.model_selection import train_test_split
 
 
-def euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float:
-    """
-    Calculate the Euclidean distance between two points
-    >>> euclidean_distance(np.array([0, 0]), np.array([3, 4]))
-    5.0
-    >>> euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11]))
-    10.0
-    """
-    return np.linalg.norm(a - b)
+class KNN:
+    def __init__(
+        self,
+        train_data: np.ndarray[float],
+        train_target: np.ndarray[int],
+        class_labels: list[str],
+    ) -> None:
+        self.data = zip(train_data, train_target)
+        self.labels = class_labels
 
+    @staticmethod
+    def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float:
+        """
+        Return the Euclidean distance between points a and b as a built-in float
+        >>> KNN._euclidean_distance(np.array([0, 0]), np.array([3, 4]))
+        5.0
+        >>> KNN._euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11]))
+        10.0
+        """
+        return float(np.linalg.norm(a - b))
 
-def classifier(
-    train_data: np.ndarray[float],
-    train_target: np.ndarray[int],
-    class_labels: list[str],
-    pred_point: np.ndarray[float],
-    k: int = 5,
-) -> str:
-    """
-    Classifies a given point using the KNN algorithm
-    k closest points are found (ranked in ascending order of Euclidean distance)
-    Params:
-    :train_data: Set of points that are classified into two or more classes
-    :train_target: List of classes in the order of train_data points
-    :classes: Labels of the classes
-    :point: The data point that needs to be classified
+    def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
+        """
+        Classifies a given point using the KNN algorithm
+        k closest points are found (ranked in ascending order of Euclidean distance)
+        Params:
+        :train_data: Set of points that are classified into two or more classes
+        :train_target: List of classes in the order of train_data points
+        :classes: Labels of the classes
+        :point: The data point that needs to be classified
 
-    >>> train_X = np.array([[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]])
-    >>> train_y = np.array([0, 0, 0, 0, 1, 1, 1])
-    >>> classes = ['A', 'B']
-    >>> point = np.array([1.2, 1.2])
-    >>> classifier(train_X, train_y, classes, point)
-    'A'
-    """
-    data = zip(train_data, train_target)
-    # List of distances of all points from the point to be classified
-    distances = []
-    for data_point in data:
-        distance = euclidean_distance(data_point[0], pred_point)
-        distances.append((distance, data_point[1]))
-    # Choosing 'k' points with the least distances.
-    votes = [i[1] for i in sorted(distances)[:k]]
-    # Most commonly occurring class among them
-    # is the class into which the point is classified
-    result = Counter(votes).most_common(1)[0][0]
-    return class_labels[result]
+        >>> train_X = np.array(
+        ...     [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]
+        ... )
+        >>> train_y = np.array([0, 0, 0, 0, 1, 1, 1])
+        >>> classes = ['A', 'B']
+        >>> knn = KNN(train_X, train_y, classes)
+        >>> point = np.array([1.2, 1.2])
+        >>> knn.classify(point)
+        'A'
+        """
+        # List of distances of all points from the point to be classified
+        distances = (
+            (self._euclidean_distance(data_point[0], pred_point), data_point[1])
+            for data_point in self.data
+        )
+
+        # Choosing 'k' points with the least distances.
+        votes = (i[1] for i in sorted(distances)[:k])
+
+        # Most commonly occurring class is the one into which the point is classified
+        result = Counter(votes).most_common(1)[0][0]
+        return self.labels[result]
 
 
 if __name__ == "__main__":
@@ -66,4 +73,5 @@ def classifier(
 
     X_train, X_test, y_train, y_test = train_test_split(X, y)
     iris_point = np.array([4.4, 3.1, 1.3, 1.4])
-    print(classifier(X_train, y_train, iris_classes, iris_point))
+    classifier = KNN(X_train, y_train, iris_classes)
+    print(classifier.classify(iris_point))

From f884c742cd508ff34a7fc80a79939905e7a84da5 Mon Sep 17 00:00:00 2001
From: Tianyi Zheng <tianyizheng02@gmail.com>
Date: Tue, 30 May 2023 04:57:57 -0700
Subject: [PATCH 3/8] Add documentation to k_nearest_neighbours.py

---
 machine_learning/k_nearest_neighbours.py | 30 ++++++++++++++++--------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py
index 3f8879a633fa..e1c96f9eb3de 100644
--- a/machine_learning/k_nearest_neighbours.py
+++ b/machine_learning/k_nearest_neighbours.py
@@ -1,3 +1,17 @@
+"""
+k-Nearest Neighbours (kNN) is a simple non-parametric supervised learning
+algorithm used for classification. Given some labelled training data, a given
+point is classified using its k nearest neighbours according to some distance
+metric. The most commonly occurring label among the neighbours becomes the label
+of the given point. In effect, the label of the given point is decided by a
+majority vote.
+
+This implementation uses the commonly used Euclidean distance metric, but other
+distance metrics can also be used.
+
+Reference: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
+"""
+
 from collections import Counter
 
 import numpy as np
@@ -12,6 +26,9 @@ def __init__(
         train_target: np.ndarray[int],
         class_labels: list[str],
     ) -> None:
+        """
+        Create a kNN classifier using the given training data and class labels
+        """
-        self.data = zip(train_data, train_target)
+        self.data = list(zip(train_data, train_target))  # zip alone is one-shot
         self.labels = class_labels
 
@@ -28,14 +45,7 @@ def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float:
 
     def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
         """
-        Classifies a given point using the KNN algorithm
-        k closest points are found (ranked in ascending order of Euclidean distance)
-        Params:
-        :train_data: Set of points that are classified into two or more classes
-        :train_target: List of classes in the order of train_data points
-        :classes: Labels of the classes
-        :point: The data point that needs to be classified
-
+        Classify pred_point by majority vote among its k nearest training points
         >>> train_X = np.array(
         ...     [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]]
         ... )
@@ -46,13 +56,13 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
         >>> knn.classify(point)
         'A'
         """
-        # List of distances of all points from the point to be classified
+        # Distances of all points from the point to be classified
         distances = (
             (self._euclidean_distance(data_point[0], pred_point), data_point[1])
             for data_point in self.data
         )
 
-        # Choosing 'k' points with the least distances.
+        # Choosing k points with the shortest distances
         votes = (i[1] for i in sorted(distances)[:k])
 
         # Most commonly occurring class is the one into which the point is classified

From cec7610abf85f6b81d6bfedc5e16a22bfdec1464 Mon Sep 17 00:00:00 2001
From: Tianyi Zheng <tianyizheng02@gmail.com>
Date: Tue, 30 May 2023 18:16:33 -0700
Subject: [PATCH 4/8] Use heap-based priority queue for k_nearest_neighbours.py

---
 machine_learning/k_nearest_neighbours.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py
index e1c96f9eb3de..79f4a95cfc80 100644
--- a/machine_learning/k_nearest_neighbours.py
+++ b/machine_learning/k_nearest_neighbours.py
@@ -13,6 +13,7 @@
 """
 
 from collections import Counter
+from heapq import nsmallest
 
 import numpy as np
 from sklearn import datasets
@@ -63,7 +64,7 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
         )
 
         # Choosing k points with the shortest distances
-        votes = (i[1] for i in sorted(distances)[:k])
+        votes = (i[1] for i in nsmallest(k, distances))
 
         # Most commonly occurring class is the one into which the point is classified
         result = Counter(votes).most_common(1)[0][0]

From 529dd356e666669db5b890e6233cd0334255db1a Mon Sep 17 00:00:00 2001
From: Tianyi Zheng <tianyizheng02@gmail.com>
Date: Tue, 30 May 2023 18:17:05 -0700
Subject: [PATCH 5/8] Delete knn_sklearn.py

---
 machine_learning/knn_sklearn.py | 31 -------------------------------
 1 file changed, 31 deletions(-)
 delete mode 100644 machine_learning/knn_sklearn.py

diff --git a/machine_learning/knn_sklearn.py b/machine_learning/knn_sklearn.py
deleted file mode 100644
index 4a621a4244b6..000000000000
--- a/machine_learning/knn_sklearn.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from sklearn.datasets import load_iris
-from sklearn.model_selection import train_test_split
-from sklearn.neighbors import KNeighborsClassifier
-
-# Load iris file
-iris = load_iris()
-iris.keys()
-
-
-print(f"Target names: \n {iris.target_names} ")
-print(f"\n Features: \n {iris.feature_names}")
-
-# Train set e Test set
-X_train, X_test, y_train, y_test = train_test_split(
-    iris["data"], iris["target"], random_state=4
-)
-
-# KNN
-
-knn = KNeighborsClassifier(n_neighbors=1)
-knn.fit(X_train, y_train)
-
-# new array to test
-X_new = [[1, 2, 1, 4], [2, 3, 4, 5]]
-
-prediction = knn.predict(X_new)
-
-print(
-    f"\nNew array: \n {X_new}\n\nTarget Names Prediction: \n"
-    f" {iris['target_names'][prediction]}"
-)

From 9510d5c91de2d0e9e0ea23a53376057326625b94 Mon Sep 17 00:00:00 2001
From: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
Date: Wed, 31 May 2023 01:17:43 +0000
Subject: [PATCH 6/8] updating DIRECTORY.md

---
 DIRECTORY.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/DIRECTORY.md b/DIRECTORY.md
index 231b0e2f1d2f..f9c156054cb5 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -494,7 +494,6 @@
   * [Gradient Descent](machine_learning/gradient_descent.py)
   * [K Means Clust](machine_learning/k_means_clust.py)
   * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py)
-  * [Knn Sklearn](machine_learning/knn_sklearn.py)
   * [Linear Discriminant Analysis](machine_learning/linear_discriminant_analysis.py)
   * [Linear Regression](machine_learning/linear_regression.py)
   * Local Weighted Learning

From ee20be762348ba8a3243f45c74c6c47f7b2172f1 Mon Sep 17 00:00:00 2001
From: Tianyi Zheng <tianyizheng02@gmail.com>
Date: Fri, 28 Jul 2023 18:48:32 -0700
Subject: [PATCH 7/8] Use optional args in k_nearest_neighbours.py for demo
 purposes

---
 machine_learning/k_nearest_neighbours.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py
index 79f4a95cfc80..d451dbaab927 100644
--- a/machine_learning/k_nearest_neighbours.py
+++ b/machine_learning/k_nearest_neighbours.py
@@ -82,7 +82,7 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
     y = np.array(iris["target"])
     iris_classes = iris["target_names"]
 
-    X_train, X_test, y_train, y_test = train_test_split(X, y)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
     iris_point = np.array([4.4, 3.1, 1.3, 1.4])
-    classifier = KNN(X_train, y_train, iris_classes)
+    classifier = KNN(X_train, y_train, iris_classes, k=3)
     print(classifier.classify(iris_point))

From 3140c9cb4ec7a031c6a25d641bd9aa20f1908d4b Mon Sep 17 00:00:00 2001
From: Tianyi Zheng <tianyizheng02@gmail.com>
Date: Fri, 28 Jul 2023 18:58:46 -0700
Subject: [PATCH 8/8] Fix wrong function arg in k_nearest_neighbours.py

---
 machine_learning/k_nearest_neighbours.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py
index d451dbaab927..a43757c5c20e 100644
--- a/machine_learning/k_nearest_neighbours.py
+++ b/machine_learning/k_nearest_neighbours.py
@@ -84,5 +84,5 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
 
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
     iris_point = np.array([4.4, 3.1, 1.3, 1.4])
-    classifier = KNN(X_train, y_train, iris_classes, k=3)
-    print(classifier.classify(iris_point))
+    classifier = KNN(X_train, y_train, iris_classes)
+    print(classifier.classify(iris_point, k=3))