From f7ba9da92ae49f8c191877c1c17318c24c74600c Mon Sep 17 00:00:00 2001 From: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> Date: Mon, 2 Jan 2023 01:57:11 +0000 Subject: [PATCH 01/36] updating DIRECTORY.md --- DIRECTORY.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DIRECTORY.md b/DIRECTORY.md index 3437df12cbf5..5ce9dca74c06 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -123,6 +123,7 @@ * [Huffman](compression/huffman.py) * [Lempel Ziv](compression/lempel_ziv.py) * [Lempel Ziv Decompress](compression/lempel_ziv_decompress.py) + * [Lz77](compression/lz77.py) * [Peak Signal To Noise Ratio](compression/peak_signal_to_noise_ratio.py) * [Run Length Encoding](compression/run_length_encoding.py) @@ -1162,7 +1163,7 @@ * [Get Amazon Product Data](web_programming/get_amazon_product_data.py) * [Get Imdb Top 250 Movies Csv](web_programming/get_imdb_top_250_movies_csv.py) * [Get Imdbtop](web_programming/get_imdbtop.py) - * [Get Top Billioners](web_programming/get_top_billioners.py) + * [Get Top Billionaires](web_programming/get_top_billionaires.py) * [Get Top Hn Posts](web_programming/get_top_hn_posts.py) * [Get User Tweets](web_programming/get_user_tweets.py) * [Giphy](web_programming/giphy.py) From 9d1971b11f736898b1ff2112aa0de470977224c0 Mon Sep 17 00:00:00 2001 From: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:25:40 +0000 Subject: [PATCH 02/36] updating DIRECTORY.md --- DIRECTORY.md | 1 + 1 file changed, 1 insertion(+) diff --git a/DIRECTORY.md b/DIRECTORY.md index 5ce9dca74c06..31e86ea59b79 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -557,6 +557,7 @@ * [Gamma Recursive](maths/gamma_recursive.py) * [Gaussian](maths/gaussian.py) * [Gaussian Error Linear Unit](maths/gaussian_error_linear_unit.py) + * [Gcd Of N Numbers](maths/gcd_of_n_numbers.py) * [Greatest Common Divisor](maths/greatest_common_divisor.py) * [Greedy Coin Change](maths/greedy_coin_change.py) * [Hamming Numbers](maths/hamming_numbers.py) From 5f404b482ffe5f6ac26e87ea18ed74be042be7c0 Mon Sep 17 00:00:00 2001 From: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> Date: Thu, 26 Jan 2023 07:26:59 +0000 Subject: [PATCH 03/36] updating DIRECTORY.md --- DIRECTORY.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/DIRECTORY.md b/DIRECTORY.md index 31e86ea59b79..a8786cc2591f 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -543,8 +543,7 @@ * [Euler Modified](maths/euler_modified.py) * [Eulers Totient](maths/eulers_totient.py) * [Extended Euclidean Algorithm](maths/extended_euclidean_algorithm.py) - * [Factorial Iterative](maths/factorial_iterative.py) - * [Factorial Recursive](maths/factorial_recursive.py) + * [Factorial](maths/factorial.py) * [Factors](maths/factors.py) * [Fermat Little Theorem](maths/fermat_little_theorem.py) * [Fibonacci](maths/fibonacci.py) From d9aff2f6bf10071711c3e2859d523123b3a74749 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Mon, 2 Jan 2023 23:49:21 -0800 Subject: [PATCH 04/36] Fix mypy errors in local_weighted_learning.py --- .../local_weighted_learning.py | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 6260e9ac6bfe..3c766ca4b60c 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -3,8 +3,8 @@ def weighted_matrix( - point: np.array, training_data_x: np.array, bandwidth: float -) -> np.array: + point: np.ndarray, training_data_x: np.ndarray, bandwidth: float +) -> np.ndarray: """ Calculate the weight for every point in the data set. point --> the x value at which we want to make predictions @@ -28,11 +28,11 @@ def weighted_matrix( def local_weight( - point: np.array, - training_data_x: np.array, - training_data_y: np.array, + point: np.ndarray, + training_data_x: np.ndarray, + training_data_y: np.ndarray, bandwidth: float, -) -> np.array: +) -> np.ndarray: """ Calculate the local weights using the weight_matrix function on training data. Return the weighted matrix. @@ -54,8 +54,8 @@ def local_weight( def local_weight_regression( - training_data_x: np.array, training_data_y: np.array, bandwidth: float -) -> np.array: + training_data_x: np.ndarray, training_data_y: np.ndarray, bandwidth: float +) -> np.ndarray: """ Calculate predictions for each data point on axis >>> local_weight_regression( @@ -78,7 +78,7 @@ def local_weight_regression( def load_data( dataset_name: str, cola_name: str, colb_name: str -) -> tuple[np.array, np.array, np.array, np.array]: +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points """ @@ -99,7 +99,9 @@ def load_data( return training_data_x, mcol_b, col_a, col_b -def get_preds(training_data_x: np.array, mcol_b: np.array, tau: float) -> np.array: +def get_preds( + training_data_x: np.ndarray, mcol_b: np.ndarray, tau: float +) -> np.ndarray: """ Get predictions with minimum error for each training data >>> get_preds( @@ -114,10 +116,10 @@ def get_preds(training_data_x: np.array, mcol_b: np.array, tau: float) -> np.arr def plot_preds( - training_data_x: np.array, - predictions: np.array, - col_x: np.array, - col_y: np.array, + training_data_x: np.ndarray, + predictions: np.ndarray, + col_x: np.ndarray, + col_y: np.ndarray, cola_name: str, colb_name: str, ) -> plt.plot: From f519f82b0903ab827e1852091f0563b55a5259dc Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 00:55:22 -0800 Subject: [PATCH 05/36] Rename vars for clarity --- .../local_weighted_learning.py | 88 ++++++++----------- 1 file changed, 39 insertions(+), 49 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 3c766ca4b60c..b15670aca3ef 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -2,13 +2,11 @@ import numpy as np -def weighted_matrix( - point: np.ndarray, training_data_x: np.ndarray, bandwidth: float -) -> np.ndarray: +def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray: """ Calculate the weight for every point in the data set. point --> the x value at which we want to make predictions - >>> weighted_matrix( + >>> weight_matrix( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), ... 0.6 @@ -17,21 +15,18 @@ def weighted_matrix( [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - m, _ = np.shape(training_data_x) # m is the number of training samples + m, _ = np.shape(x_train) # m is the number of training samples weights = np.eye(m) # Initializing weights as identity matrix # calculating weights for all training examples [x(i)'s] for j in range(m): - diff = point - training_data_x[j] - weights[j, j] = np.exp(diff @ diff.T / (-2.0 * bandwidth**2)) + diff = point - x_train[j] + weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) return weights def local_weight( - point: np.ndarray, - training_data_x: np.ndarray, - training_data_y: np.ndarray, - bandwidth: float, + point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ Calculate the local weights using the weight_matrix function on training data. @@ -45,16 +40,16 @@ def local_weight( array([[0.00873174], [0.08272556]]) """ - weight = weighted_matrix(point, training_data_x, bandwidth) - w = np.linalg.inv(training_data_x.T @ (weight @ training_data_x)) @ ( - training_data_x.T @ weight @ training_data_y.T + weight_mat = weight_matrix(point, x_train, tau) + weight = np.linalg.inv(x_train.T @ weight_mat @ x_train) @ ( + x_train.T @ weight_mat @ y_train.T ) - return w + return weight def local_weight_regression( - training_data_x: np.ndarray, training_data_y: np.ndarray, bandwidth: float + x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ Calculate predictions for each data point on axis @@ -65,19 +60,17 @@ def local_weight_regression( ... ) array([1.07173261, 1.65970737, 3.50160179]) """ - m, _ = np.shape(training_data_x) - ypred = np.zeros(m) + m, _ = np.shape(x_train) + y_pred = np.zeros(m) - for i, item in enumerate(training_data_x): - ypred[i] = item @ local_weight( - item, training_data_x, training_data_y, bandwidth - ) + for i, item in enumerate(x_train): + y_pred[i] = item @ local_weight(item, x_train, y_train, tau) - return ypred + return y_pred def load_data( - dataset_name: str, cola_name: str, colb_name: str + dataset_name: str, x_name: str, y_name: str ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points @@ -85,23 +78,21 @@ def load_data( import seaborn as sns data = sns.load_dataset(dataset_name) - col_a = np.array(data[cola_name]) # total_bill - col_b = np.array(data[colb_name]) # tip + x_data = np.array(data[x_name]) # total_bill + y_data = np.array(data[y_name]) # tip - mcol_a = col_a.copy() - mcol_b = col_b.copy() + mcol_a = x_data.copy() + mcol_b = y_data.copy() one = np.ones(np.shape(mcol_b)[0], dtype=int) # pairing elements of one and mcol_a - training_data_x = np.column_stack((one, mcol_a)) + x_train = np.column_stack((one, mcol_a)) - return training_data_x, mcol_b, col_a, col_b + return x_train, mcol_b, x_data, y_data -def get_preds( - training_data_x: np.ndarray, mcol_b: np.ndarray, tau: float -) -> np.ndarray: +def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarray: """ Get predictions with minimum error for each training data >>> get_preds( @@ -111,33 +102,32 @@ def get_preds( ... ) array([1.07173261, 1.65970737, 3.50160179]) """ - ypred = local_weight_regression(training_data_x, mcol_b, tau) - return ypred + y_pred = local_weight_regression(x_train, y_train, tau) + return y_pred def plot_preds( - training_data_x: np.ndarray, + x_train: np.ndarray, predictions: np.ndarray, - col_x: np.ndarray, - col_y: np.ndarray, - cola_name: str, - colb_name: str, + x_data: np.ndarray, + y_data: np.ndarray, + x_name: str, + y_name: str, ) -> plt.plot: """ Plot predictions and display the graph """ - xsort = training_data_x.copy() - xsort.sort(axis=0) - plt.scatter(col_x, col_y, color="blue") + x_train_sorted = np.sort(x_train, axis=0) + plt.scatter(x_data, y_data, color="blue") plt.plot( - xsort[:, 1], - predictions[training_data_x[:, 1].argsort(0)], + x_train_sorted[:, 1], + predictions[x_train[:, 1].argsort(0)], color="yellow", linewidth=5, ) plt.title("Local Weighted Regression") - plt.xlabel(cola_name) - plt.ylabel(colb_name) + plt.xlabel(x_name) + plt.ylabel(y_name) plt.show() @@ -146,6 +136,6 @@ def plot_preds( doctest.testmod() - training_data_x, mcol_b, col_a, col_b = load_data("tips", "total_bill", "tip") + training_data_x, mcol_b, total_bill, tip = load_data("tips", "total_bill", "tip") predictions = get_preds(training_data_x, mcol_b, 0.5) - plot_preds(training_data_x, predictions, col_a, col_b, "total_bill", "tip") + plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 4ae72ab61e57286f64d725e3f7ab36f8dbecff4b Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 00:59:44 -0800 Subject: [PATCH 06/36] Refactor to remove duplicate var --- .../local_weighted_learning.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index b15670aca3ef..95de87584479 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -71,7 +71,7 @@ def local_weight_regression( def load_data( dataset_name: str, x_name: str, y_name: str -) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points """ @@ -81,15 +81,12 @@ def load_data( x_data = np.array(data[x_name]) # total_bill y_data = np.array(data[y_name]) # tip - mcol_a = x_data.copy() - mcol_b = y_data.copy() + one = np.ones(np.shape(y_data)[0], dtype=int) - one = np.ones(np.shape(mcol_b)[0], dtype=int) + # pairing elements of one and x_data + x_train = np.column_stack((one, x_data)) - # pairing elements of one and mcol_a - x_train = np.column_stack((one, mcol_a)) - - return x_train, mcol_b, x_data, y_data + return x_train, x_data, y_data def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarray: @@ -108,7 +105,7 @@ def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarra def plot_preds( x_train: np.ndarray, - predictions: np.ndarray, + preds: np.ndarray, x_data: np.ndarray, y_data: np.ndarray, x_name: str, @@ -121,7 +118,7 @@ def plot_preds( plt.scatter(x_data, y_data, color="blue") plt.plot( x_train_sorted[:, 1], - predictions[x_train[:, 1].argsort(0)], + preds[x_train[:, 1].argsort(0)], color="yellow", linewidth=5, ) @@ -136,6 +133,6 @@ def plot_preds( doctest.testmod() - training_data_x, mcol_b, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = get_preds(training_data_x, mcol_b, 0.5) + training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") + predictions = get_preds(training_data_x, tip, 0.5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 0ca1d067abaa5ba31540922cc91fb534826d1a74 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 01:01:11 -0800 Subject: [PATCH 07/36] Refactor to remove unneeded wrapper function --- .../local_weighted_learning.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 95de87584479..df7dd2861f70 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -89,20 +89,6 @@ def load_data( return x_train, x_data, y_data -def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarray: - """ - Get predictions with minimum error for each training data - >>> get_preds( - ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), - ... np.array([[1.01, 1.66, 3.5]]), - ... 0.6 - ... ) - array([1.07173261, 1.65970737, 3.50160179]) - """ - y_pred = local_weight_regression(x_train, y_train, tau) - return y_pred - - def plot_preds( x_train: np.ndarray, preds: np.ndarray, @@ -134,5 +120,5 @@ def plot_preds( doctest.testmod() training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = get_preds(training_data_x, tip, 0.5) + predictions = local_weight_regression(training_data_x, tip, 0.5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 2721c8e64101613642149061ac75c93abbfcc3b7 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 01:10:30 -0800 Subject: [PATCH 08/36] Increase value of tau in demo to make predictions less overfit --- .../local_weighted_learning/local_weighted_learning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index df7dd2861f70..88a834091717 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -120,5 +120,5 @@ def plot_preds( doctest.testmod() training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = local_weight_regression(training_data_x, tip, 0.5) + predictions = local_weight_regression(training_data_x, tip, 5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From f4194687724e42d5a7779201444c70a0f534acf8 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 07:29:45 -0800 Subject: [PATCH 09/36] Expand function documentation and add algorithm explanation --- .../local_weighted_learning.py | 93 +++++++++++++++---- 1 file changed, 77 insertions(+), 16 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 88a834091717..a3c877f3e932 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -1,11 +1,54 @@ +""" +Locally weighted linear regression, also called local regression, is a type of +non-parametric linear regression that prioritizes data closest to a given +prediction point. The algorithm estimates the vector of model coefficients β +using weighted least squares regression: + +β = (XᵀWX)⁻¹(XᵀWy), + +where X is the design matrix, y is the response vector, and W is the diagonal +weight matrix. + +This implementation calculates wᵢ, the weight of the ith training sample, using +the Gaussian weight: + +wᵢ = exp(-‖xᵢ - x‖²/(2τ²)), + +where xᵢ is the ith training sample, x is the prediction point, τ is the +"bandwidth", and ‖x‖ is the Euclidean norm (also called the 2-norm or the L² +norm). The bandwidth τ controls how quickly the weight of a training sample +decreases as its distance from the prediction point increases. One can think of +the Gaussian weight as a bell curve centered around the prediction point: a +training sample is weighted lower if it's farther from the center, and τ +controls the spread of the bell curve. + +Other types of locally weighted regression such as locally estimated scatterplot +smoothing (LOESS) typically use different weight functions. + +References: + - https://en.wikipedia.org/wiki/Local_regression + - https://en.wikipedia.org/wiki/Weighted_least_squares + - https://cs229.stanford.edu/notes2022fall/main_notes.pdf +""" + import matplotlib.pyplot as plt import numpy as np def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray: """ - Calculate the weight for every point in the data set. - point --> the x value at which we want to make predictions + Calculate the weight of every point in the training data around a given + prediction point + + Args: + point: x-value at which the prediction is being made + x_train: ndarray of x-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + + Returns: + n x n weight matrix around the prediction point, where n is the size of + the training set >>> weight_matrix( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), @@ -15,13 +58,12 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - m, _ = np.shape(x_train) # m is the number of training samples - weights = np.eye(m) # Initializing weights as identity matrix - - # calculating weights for all training examples [x(i)'s] - for j in range(m): + n = len(x_train) # Number of training samples + weights = np.eye(n) # Initialize weights as identity matrix + for j in range(n): diff = point - x_train[j] weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) + return weights @@ -29,8 +71,17 @@ def local_weight( point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ - Calculate the local weights using the weight_matrix function on training data. - Return the weighted matrix. + Calculate the local weights at a given prediction point using the weight + matrix for that point + + Args: + point: x-value at which the prediction is being made + x_train: ndarray of x-values for training + y_train: ndarray of y-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + Returns: + ndarray of local weights >>> local_weight( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), @@ -52,7 +103,16 @@ def local_weight_regression( x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ - Calculate predictions for each data point on axis + Calculate predictions for each point in the training data + + Args: + x_train: ndarray of x-values for training + y_train: ndarray of y-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + + Returns: + ndarray of predictions >>> local_weight_regression( ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), ... np.array([[1.01, 1.66, 3.5]]), @@ -60,9 +120,7 @@ def local_weight_regression( ... ) array([1.07173261, 1.65970737, 3.50160179]) """ - m, _ = np.shape(x_train) - y_pred = np.zeros(m) - + y_pred = np.zeros(len(x_train)) # Initialize array of predictions for i, item in enumerate(x_train): y_pred[i] = item @ local_weight(item, x_train, y_train, tau) @@ -74,14 +132,15 @@ def load_data( ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points + >>> pass # No doctests, function is for demo purposes only """ import seaborn as sns data = sns.load_dataset(dataset_name) - x_data = np.array(data[x_name]) # total_bill - y_data = np.array(data[y_name]) # tip + x_data = np.array(data[x_name]) + y_data = np.array(data[y_name]) - one = np.ones(np.shape(y_data)[0], dtype=int) + one = np.ones(len(y_data)) # pairing elements of one and x_data x_train = np.column_stack((one, x_data)) @@ -99,6 +158,7 @@ def plot_preds( ) -> plt.plot: """ Plot predictions and display the graph + >>> pass # No doctests, function is for demo purposes only """ x_train_sorted = np.sort(x_train, axis=0) plt.scatter(x_data, y_data, color="blue") @@ -119,6 +179,7 @@ def plot_preds( doctest.testmod() + # Demo with a dataset from the seaborn module training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") predictions = local_weight_regression(training_data_x, tip, 5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 425302a01f57925995b478e8737dbac7deb0da56 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 07:57:24 -0800 Subject: [PATCH 10/36] Rename var to avoid confusion Rename n to m, as n tends to be used for the number of parameters rather than the sample size --- .../local_weighted_learning/local_weighted_learning.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index a3c877f3e932..d183a1f20077 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -47,7 +47,7 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar decreases as the distance from the prediction point increases Returns: - n x n weight matrix around the prediction point, where n is the size of + m x m weight matrix around the prediction point, where m is the size of the training set >>> weight_matrix( ... np.array([1., 1.]), @@ -58,9 +58,9 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - n = len(x_train) # Number of training samples - weights = np.eye(n) # Initialize weights as identity matrix - for j in range(n): + m = len(x_train) # Number of training samples + weights = np.eye(m) # Initialize weights as identity matrix + for j in range(m): diff = point - x_train[j] weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) From 99c1125370c32e7d23d94b477c5992a0115bb120 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 08:15:01 -0800 Subject: [PATCH 11/36] Fix plot_preds return type The plot_preds function plots the data but doesn't actually return anything --- .../local_weighted_learning/local_weighted_learning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index d183a1f20077..8dd0e55d41df 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -155,7 +155,7 @@ def plot_preds( y_data: np.ndarray, x_name: str, y_name: str, -) -> plt.plot: +) -> None: """ Plot predictions and display the graph >>> pass # No doctests, function is for demo purposes only From 6c923d1589b0a5e38d1164032a4c47e982d54bd0 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 00:55:22 -0800 Subject: [PATCH 12/36] Rebase onto master --- .../local_weighted_learning.py | 126 ++++++------------ 1 file changed, 41 insertions(+), 85 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 8dd0e55d41df..b15670aca3ef 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -1,54 +1,11 @@ -""" -Locally weighted linear regression, also called local regression, is a type of -non-parametric linear regression that prioritizes data closest to a given -prediction point. The algorithm estimates the vector of model coefficients β -using weighted least squares regression: - -β = (XᵀWX)⁻¹(XᵀWy), - -where X is the design matrix, y is the response vector, and W is the diagonal -weight matrix. - -This implementation calculates wᵢ, the weight of the ith training sample, using -the Gaussian weight: - -wᵢ = exp(-‖xᵢ - x‖²/(2τ²)), - -where xᵢ is the ith training sample, x is the prediction point, τ is the -"bandwidth", and ‖x‖ is the Euclidean norm (also called the 2-norm or the L² -norm). The bandwidth τ controls how quickly the weight of a training sample -decreases as its distance from the prediction point increases. One can think of -the Gaussian weight as a bell curve centered around the prediction point: a -training sample is weighted lower if it's farther from the center, and τ -controls the spread of the bell curve. - -Other types of locally weighted regression such as locally estimated scatterplot -smoothing (LOESS) typically use different weight functions. - -References: - - https://en.wikipedia.org/wiki/Local_regression - - https://en.wikipedia.org/wiki/Weighted_least_squares - - https://cs229.stanford.edu/notes2022fall/main_notes.pdf -""" - import matplotlib.pyplot as plt import numpy as np def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray: """ - Calculate the weight of every point in the training data around a given - prediction point - - Args: - point: x-value at which the prediction is being made - x_train: ndarray of x-values for training - tau: bandwidth value, controls how quickly the weight of training values - decreases as the distance from the prediction point increases - - Returns: - m x m weight matrix around the prediction point, where m is the size of - the training set + Calculate the weight for every point in the data set. + point --> the x value at which we want to make predictions >>> weight_matrix( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), @@ -58,12 +15,13 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - m = len(x_train) # Number of training samples - weights = np.eye(m) # Initialize weights as identity matrix + m, _ = np.shape(x_train) # m is the number of training samples + weights = np.eye(m) # Initializing weights as identity matrix + + # calculating weights for all training examples [x(i)'s] for j in range(m): diff = point - x_train[j] weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) - return weights @@ -71,17 +29,8 @@ def local_weight( point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ - Calculate the local weights at a given prediction point using the weight - matrix for that point - - Args: - point: x-value at which the prediction is being made - x_train: ndarray of x-values for training - y_train: ndarray of y-values for training - tau: bandwidth value, controls how quickly the weight of training values - decreases as the distance from the prediction point increases - Returns: - ndarray of local weights + Calculate the local weights using the weight_matrix function on training data. + Return the weighted matrix. >>> local_weight( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), @@ -103,16 +52,7 @@ def local_weight_regression( x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ - Calculate predictions for each point in the training data - - Args: - x_train: ndarray of x-values for training - y_train: ndarray of y-values for training - tau: bandwidth value, controls how quickly the weight of training values - decreases as the distance from the prediction point increases - - Returns: - ndarray of predictions + Calculate predictions for each data point on axis >>> local_weight_regression( ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), ... np.array([[1.01, 1.66, 3.5]]), @@ -120,7 +60,9 @@ def local_weight_regression( ... ) array([1.07173261, 1.65970737, 3.50160179]) """ - y_pred = np.zeros(len(x_train)) # Initialize array of predictions + m, _ = np.shape(x_train) + y_pred = np.zeros(m) + for i, item in enumerate(x_train): y_pred[i] = item @ local_weight(item, x_train, y_train, tau) @@ -129,42 +71,57 @@ def local_weight_regression( def load_data( dataset_name: str, x_name: str, y_name: str -) -> tuple[np.ndarray, np.ndarray, np.ndarray]: +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points - >>> pass # No doctests, function is for demo purposes only """ import seaborn as sns data = sns.load_dataset(dataset_name) - x_data = np.array(data[x_name]) - y_data = np.array(data[y_name]) + x_data = np.array(data[x_name]) # total_bill + y_data = np.array(data[y_name]) # tip + + mcol_a = x_data.copy() + mcol_b = y_data.copy() + + one = np.ones(np.shape(mcol_b)[0], dtype=int) - one = np.ones(len(y_data)) + # pairing elements of one and mcol_a + x_train = np.column_stack((one, mcol_a)) - # pairing elements of one and x_data - x_train = np.column_stack((one, x_data)) + return x_train, mcol_b, x_data, y_data - return x_train, x_data, y_data + +def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarray: + """ + Get predictions with minimum error for each training data + >>> get_preds( + ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), + ... np.array([[1.01, 1.66, 3.5]]), + ... 0.6 + ... ) + array([1.07173261, 1.65970737, 3.50160179]) + """ + y_pred = local_weight_regression(x_train, y_train, tau) + return y_pred def plot_preds( x_train: np.ndarray, - preds: np.ndarray, + predictions: np.ndarray, x_data: np.ndarray, y_data: np.ndarray, x_name: str, y_name: str, -) -> None: +) -> plt.plot: """ Plot predictions and display the graph - >>> pass # No doctests, function is for demo purposes only """ x_train_sorted = np.sort(x_train, axis=0) plt.scatter(x_data, y_data, color="blue") plt.plot( x_train_sorted[:, 1], - preds[x_train[:, 1].argsort(0)], + predictions[x_train[:, 1].argsort(0)], color="yellow", linewidth=5, ) @@ -179,7 +136,6 @@ def plot_preds( doctest.testmod() - # Demo with a dataset from the seaborn module - training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = local_weight_regression(training_data_x, tip, 5) + training_data_x, mcol_b, total_bill, tip = load_data("tips", "total_bill", "tip") + predictions = get_preds(training_data_x, mcol_b, 0.5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 834a15ab77e043c2104f339385f393320d38c0d7 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 00:59:44 -0800 Subject: [PATCH 13/36] Refactor to remove duplicate var --- .../local_weighted_learning.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index b15670aca3ef..95de87584479 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -71,7 +71,7 @@ def local_weight_regression( def load_data( dataset_name: str, x_name: str, y_name: str -) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points """ @@ -81,15 +81,12 @@ def load_data( x_data = np.array(data[x_name]) # total_bill y_data = np.array(data[y_name]) # tip - mcol_a = x_data.copy() - mcol_b = y_data.copy() + one = np.ones(np.shape(y_data)[0], dtype=int) - one = np.ones(np.shape(mcol_b)[0], dtype=int) + # pairing elements of one and x_data + x_train = np.column_stack((one, x_data)) - # pairing elements of one and mcol_a - x_train = np.column_stack((one, mcol_a)) - - return x_train, mcol_b, x_data, y_data + return x_train, x_data, y_data def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarray: @@ -108,7 +105,7 @@ def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarra def plot_preds( x_train: np.ndarray, - predictions: np.ndarray, + preds: np.ndarray, x_data: np.ndarray, y_data: np.ndarray, x_name: str, @@ -121,7 +118,7 @@ def plot_preds( plt.scatter(x_data, y_data, color="blue") plt.plot( x_train_sorted[:, 1], - predictions[x_train[:, 1].argsort(0)], + preds[x_train[:, 1].argsort(0)], color="yellow", linewidth=5, ) @@ -136,6 +133,6 @@ def plot_preds( doctest.testmod() - training_data_x, mcol_b, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = get_preds(training_data_x, mcol_b, 0.5) + training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") + predictions = get_preds(training_data_x, tip, 0.5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 5aaa02afe80573f6e80c1753752c2dea71af2222 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 01:01:11 -0800 Subject: [PATCH 14/36] Refactor to remove unneeded wrapper function --- .../local_weighted_learning.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 95de87584479..df7dd2861f70 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -89,20 +89,6 @@ def load_data( return x_train, x_data, y_data -def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarray: - """ - Get predictions with minimum error for each training data - >>> get_preds( - ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), - ... np.array([[1.01, 1.66, 3.5]]), - ... 0.6 - ... ) - array([1.07173261, 1.65970737, 3.50160179]) - """ - y_pred = local_weight_regression(x_train, y_train, tau) - return y_pred - - def plot_preds( x_train: np.ndarray, preds: np.ndarray, @@ -134,5 +120,5 @@ def plot_preds( doctest.testmod() training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = get_preds(training_data_x, tip, 0.5) + predictions = local_weight_regression(training_data_x, tip, 0.5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From bc658310f1e288356b2c1c707c86e0dc0ee3a4ba Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 01:10:30 -0800 Subject: [PATCH 15/36] Increase value of tau in demo to make predictions less overfit --- .../local_weighted_learning/local_weighted_learning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index df7dd2861f70..88a834091717 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -120,5 +120,5 @@ def plot_preds( doctest.testmod() training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = local_weight_regression(training_data_x, tip, 0.5) + predictions = local_weight_regression(training_data_x, tip, 5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 44070386b3adad6994a8ed1305e84bddd9a1990b Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 07:29:45 -0800 Subject: [PATCH 16/36] Expand function documentation and add algorithm explanation --- .../local_weighted_learning.py | 93 +++++++++++++++---- 1 file changed, 77 insertions(+), 16 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 88a834091717..a3c877f3e932 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -1,11 +1,54 @@ +""" +Locally weighted linear regression, also called local regression, is a type of +non-parametric linear regression that prioritizes data closest to a given +prediction point. The algorithm estimates the vector of model coefficients β +using weighted least squares regression: + +β = (XᵀWX)⁻¹(XᵀWy), + +where X is the design matrix, y is the response vector, and W is the diagonal +weight matrix. + +This implementation calculates wᵢ, the weight of the ith training sample, using +the Gaussian weight: + +wᵢ = exp(-‖xᵢ - x‖²/(2τ²)), + +where xᵢ is the ith training sample, x is the prediction point, τ is the +"bandwidth", and ‖x‖ is the Euclidean norm (also called the 2-norm or the L² +norm). The bandwidth τ controls how quickly the weight of a training sample +decreases as its distance from the prediction point increases. One can think of +the Gaussian weight as a bell curve centered around the prediction point: a +training sample is weighted lower if it's farther from the center, and τ +controls the spread of the bell curve. + +Other types of locally weighted regression such as locally estimated scatterplot +smoothing (LOESS) typically use different weight functions. + +References: + - https://en.wikipedia.org/wiki/Local_regression + - https://en.wikipedia.org/wiki/Weighted_least_squares + - https://cs229.stanford.edu/notes2022fall/main_notes.pdf +""" + import matplotlib.pyplot as plt import numpy as np def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray: """ - Calculate the weight for every point in the data set. - point --> the x value at which we want to make predictions + Calculate the weight of every point in the training data around a given + prediction point + + Args: + point: x-value at which the prediction is being made + x_train: ndarray of x-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + + Returns: + n x n weight matrix around the prediction point, where n is the size of + the training set >>> weight_matrix( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), @@ -15,13 +58,12 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - m, _ = np.shape(x_train) # m is the number of training samples - weights = np.eye(m) # Initializing weights as identity matrix - - # calculating weights for all training examples [x(i)'s] - for j in range(m): + n = len(x_train) # Number of training samples + weights = np.eye(n) # Initialize weights as identity matrix + for j in range(n): diff = point - x_train[j] weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) + return weights @@ -29,8 +71,17 @@ def local_weight( point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ - Calculate the local weights using the weight_matrix function on training data. - Return the weighted matrix. + Calculate the local weights at a given prediction point using the weight + matrix for that point + + Args: + point: x-value at which the prediction is being made + x_train: ndarray of x-values for training + y_train: ndarray of y-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + Returns: + ndarray of local weights >>> local_weight( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), @@ -52,7 +103,16 @@ def local_weight_regression( x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ - Calculate predictions for each data point on axis + Calculate predictions for each point in the training data + + Args: + x_train: ndarray of x-values for training + y_train: ndarray of y-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + + Returns: + ndarray of predictions >>> local_weight_regression( ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), ... np.array([[1.01, 1.66, 3.5]]), @@ -60,9 +120,7 @@ def local_weight_regression( ... ) array([1.07173261, 1.65970737, 3.50160179]) """ - m, _ = np.shape(x_train) - y_pred = np.zeros(m) - + y_pred = np.zeros(len(x_train)) # Initialize array of predictions for i, item in enumerate(x_train): y_pred[i] = item @ local_weight(item, x_train, y_train, tau) @@ -74,14 +132,15 @@ def load_data( ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points + >>> pass # No doctests, function is for demo purposes only """ import seaborn as sns data = sns.load_dataset(dataset_name) - x_data = np.array(data[x_name]) # total_bill - y_data = np.array(data[y_name]) # tip + x_data = np.array(data[x_name]) + y_data = np.array(data[y_name]) - one = np.ones(np.shape(y_data)[0], dtype=int) + one = np.ones(len(y_data)) # pairing elements of one and x_data x_train = np.column_stack((one, x_data)) @@ -99,6 +158,7 @@ def plot_preds( ) -> plt.plot: """ Plot predictions and display the graph + >>> pass # No doctests, function is for demo purposes only """ x_train_sorted = np.sort(x_train, axis=0) plt.scatter(x_data, y_data, color="blue") @@ -119,6 +179,7 @@ def plot_preds( doctest.testmod() + # Demo with a dataset from the seaborn module training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") predictions = local_weight_regression(training_data_x, tip, 5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 8b3b8544d91a283c562258848afcd27fb0a6e525 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 07:57:24 -0800 Subject: [PATCH 17/36] Rename var to avoid confusion Rename n to m, as n tends to be used for the number of parameters rather than the sample size --- .../local_weighted_learning/local_weighted_learning.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index a3c877f3e932..d183a1f20077 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -47,7 +47,7 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar decreases as the distance from the prediction point increases Returns: - n x n weight matrix around the prediction point, where n is the size of + m x m weight matrix around the prediction point, where m is the size of the training set >>> weight_matrix( ... np.array([1., 1.]), @@ -58,9 +58,9 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - n = len(x_train) # Number of training samples - weights = np.eye(n) # Initialize weights as identity matrix - for j in range(n): + m = len(x_train) # Number of training samples + weights = np.eye(m) # Initialize weights as identity matrix + for j in range(m): diff = point - x_train[j] weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) From b8334f2ab85dea08105b3d4ca0505036465dbd74 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 08:15:01 -0800 Subject: [PATCH 18/36] Fix plot_preds return type The plot_preds function plots the data but doesn't actually return anything --- .../local_weighted_learning/local_weighted_learning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index d183a1f20077..8dd0e55d41df 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -155,7 +155,7 @@ def plot_preds( y_data: np.ndarray, x_name: str, y_name: str, -) -> plt.plot: +) -> None: """ Plot predictions and display the graph >>> pass # No doctests, function is for demo purposes only From fab7be60624ad3310b45874c57bad9c9c01a7b69 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Mon, 2 Jan 2023 23:49:21 -0800 Subject: [PATCH 19/36] Fix mypy errors in local_weighted_learning.py --- .../local_weighted_learning.py | 186 +++++++----------- 1 file changed, 76 insertions(+), 110 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 8dd0e55d41df..3c766ca4b60c 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -1,55 +1,14 @@ -""" -Locally weighted linear regression, also called local regression, is a type of -non-parametric linear regression that prioritizes data closest to a given -prediction point. The algorithm estimates the vector of model coefficients β -using weighted least squares regression: - -β = (XᵀWX)⁻¹(XᵀWy), - -where X is the design matrix, y is the response vector, and W is the diagonal -weight matrix. - -This implementation calculates wᵢ, the weight of the ith training sample, using -the Gaussian weight: - -wᵢ = exp(-‖xᵢ - x‖²/(2τ²)), - -where xᵢ is the ith training sample, x is the prediction point, τ is the -"bandwidth", and ‖x‖ is the Euclidean norm (also called the 2-norm or the L² -norm). The bandwidth τ controls how quickly the weight of a training sample -decreases as its distance from the prediction point increases. One can think of -the Gaussian weight as a bell curve centered around the prediction point: a -training sample is weighted lower if it's farther from the center, and τ -controls the spread of the bell curve. - -Other types of locally weighted regression such as locally estimated scatterplot -smoothing (LOESS) typically use different weight functions. - -References: - - https://en.wikipedia.org/wiki/Local_regression - - https://en.wikipedia.org/wiki/Weighted_least_squares - - https://cs229.stanford.edu/notes2022fall/main_notes.pdf -""" - import matplotlib.pyplot as plt import numpy as np -def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray: +def weighted_matrix( + point: np.ndarray, training_data_x: np.ndarray, bandwidth: float +) -> np.ndarray: """ - Calculate the weight of every point in the training data around a given - prediction point - - Args: - point: x-value at which the prediction is being made - x_train: ndarray of x-values for training - tau: bandwidth value, controls how quickly the weight of training values - decreases as the distance from the prediction point increases - - Returns: - m x m weight matrix around the prediction point, where m is the size of - the training set - >>> weight_matrix( + Calculate the weight for every point in the data set. + point --> the x value at which we want to make predictions + >>> weighted_matrix( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), ... 0.6 @@ -58,30 +17,25 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - m = len(x_train) # Number of training samples - weights = np.eye(m) # Initialize weights as identity matrix - for j in range(m): - diff = point - x_train[j] - weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) + m, _ = np.shape(training_data_x) # m is the number of training samples + weights = np.eye(m) # Initializing weights as identity matrix + # calculating weights for all training examples [x(i)'s] + for j in range(m): + diff = point - training_data_x[j] + weights[j, j] = np.exp(diff @ diff.T / (-2.0 * bandwidth**2)) return weights def local_weight( - point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float + point: np.ndarray, + training_data_x: np.ndarray, + training_data_y: np.ndarray, + bandwidth: float, ) -> np.ndarray: """ - Calculate the local weights at a given prediction point using the weight - matrix for that point - - Args: - point: x-value at which the prediction is being made - x_train: ndarray of x-values for training - y_train: ndarray of y-values for training - tau: bandwidth value, controls how quickly the weight of training values - decreases as the distance from the prediction point increases - Returns: - ndarray of local weights + Calculate the local weights using the weight_matrix function on training data. + Return the weighted matrix. >>> local_weight( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), @@ -91,28 +45,19 @@ def local_weight( array([[0.00873174], [0.08272556]]) """ - weight_mat = weight_matrix(point, x_train, tau) - weight = np.linalg.inv(x_train.T @ weight_mat @ x_train) @ ( - x_train.T @ weight_mat @ y_train.T + weight = weighted_matrix(point, training_data_x, bandwidth) + w = np.linalg.inv(training_data_x.T @ (weight @ training_data_x)) @ ( + training_data_x.T @ weight @ training_data_y.T ) - return weight + return w def local_weight_regression( - x_train: np.ndarray, y_train: np.ndarray, tau: float + training_data_x: np.ndarray, training_data_y: np.ndarray, bandwidth: float ) -> np.ndarray: """ - Calculate predictions for each point in the training data - - Args: - x_train: ndarray of x-values for training - y_train: ndarray of y-values for training - tau: bandwidth value, controls how quickly the weight of training values - decreases as the distance from the prediction point increases - - Returns: - ndarray of predictions + Calculate predictions for each data point on axis >>> local_weight_regression( ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), ... np.array([[1.01, 1.66, 3.5]]), @@ -120,57 +65,79 @@ def local_weight_regression( ... ) array([1.07173261, 1.65970737, 3.50160179]) """ - y_pred = np.zeros(len(x_train)) # Initialize array of predictions - for i, item in enumerate(x_train): - y_pred[i] = item @ local_weight(item, x_train, y_train, tau) + m, _ = np.shape(training_data_x) + ypred = np.zeros(m) - return y_pred + for i, item in enumerate(training_data_x): + ypred[i] = item @ local_weight( + item, training_data_x, training_data_y, bandwidth + ) + + return ypred def load_data( - dataset_name: str, x_name: str, y_name: str -) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + dataset_name: str, cola_name: str, colb_name: str +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points - >>> pass # No doctests, function is for demo purposes only """ import seaborn as sns data = sns.load_dataset(dataset_name) - x_data = np.array(data[x_name]) - y_data = np.array(data[y_name]) + col_a = np.array(data[cola_name]) # total_bill + col_b = np.array(data[colb_name]) # tip + + mcol_a = col_a.copy() + mcol_b = col_b.copy() + + one = np.ones(np.shape(mcol_b)[0], dtype=int) - one = np.ones(len(y_data)) + # pairing elements of one and mcol_a + training_data_x = np.column_stack((one, mcol_a)) - # pairing elements of one and x_data - x_train = np.column_stack((one, x_data)) + return training_data_x, mcol_b, col_a, col_b - return x_train, x_data, y_data + +def get_preds( + training_data_x: np.ndarray, mcol_b: np.ndarray, tau: float +) -> np.ndarray: + """ + Get predictions with minimum error for each training data + >>> get_preds( + ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), + ... np.array([[1.01, 1.66, 3.5]]), + ... 0.6 + ... ) + array([1.07173261, 1.65970737, 3.50160179]) + """ + ypred = local_weight_regression(training_data_x, mcol_b, tau) + return ypred def plot_preds( - x_train: np.ndarray, - preds: np.ndarray, - x_data: np.ndarray, - y_data: np.ndarray, - x_name: str, - y_name: str, -) -> None: + training_data_x: np.ndarray, + predictions: np.ndarray, + col_x: np.ndarray, + col_y: np.ndarray, + cola_name: str, + colb_name: str, +) -> plt.plot: """ Plot predictions and display the graph - >>> pass # No doctests, function is for demo purposes only """ - x_train_sorted = np.sort(x_train, axis=0) - plt.scatter(x_data, y_data, color="blue") + xsort = training_data_x.copy() + xsort.sort(axis=0) + plt.scatter(col_x, col_y, color="blue") plt.plot( - x_train_sorted[:, 1], - preds[x_train[:, 1].argsort(0)], + xsort[:, 1], + predictions[training_data_x[:, 1].argsort(0)], color="yellow", linewidth=5, ) plt.title("Local Weighted Regression") - plt.xlabel(x_name) - plt.ylabel(y_name) + plt.xlabel(cola_name) + plt.ylabel(colb_name) plt.show() @@ -179,7 +146,6 @@ def plot_preds( doctest.testmod() - # Demo with a dataset from the seaborn module - training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = local_weight_regression(training_data_x, tip, 5) - plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") + training_data_x, mcol_b, col_a, col_b = load_data("tips", "total_bill", "tip") + predictions = get_preds(training_data_x, mcol_b, 0.5) + plot_preds(training_data_x, predictions, col_a, col_b, "total_bill", "tip") From 092f1afc95580b274b6c97718010ac53c9cf8680 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 00:55:22 -0800 Subject: [PATCH 20/36] Rename vars for clarity --- .../local_weighted_learning.py | 88 ++++++++----------- 1 file changed, 39 insertions(+), 49 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 3c766ca4b60c..b15670aca3ef 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -2,13 +2,11 @@ import numpy as np -def weighted_matrix( - point: np.ndarray, training_data_x: np.ndarray, bandwidth: float -) -> np.ndarray: +def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray: """ Calculate the weight for every point in the data set. point --> the x value at which we want to make predictions - >>> weighted_matrix( + >>> weight_matrix( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), ... 0.6 @@ -17,21 +15,18 @@ def weighted_matrix( [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - m, _ = np.shape(training_data_x) # m is the number of training samples + m, _ = np.shape(x_train) # m is the number of training samples weights = np.eye(m) # Initializing weights as identity matrix # calculating weights for all training examples [x(i)'s] for j in range(m): - diff = point - training_data_x[j] - weights[j, j] = np.exp(diff @ diff.T / (-2.0 * bandwidth**2)) + diff = point - x_train[j] + weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) return weights def local_weight( - point: np.ndarray, - training_data_x: np.ndarray, - training_data_y: np.ndarray, - bandwidth: float, + point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ Calculate the local weights using the weight_matrix function on training data. @@ -45,16 +40,16 @@ def local_weight( array([[0.00873174], [0.08272556]]) """ - weight = weighted_matrix(point, training_data_x, bandwidth) - w = np.linalg.inv(training_data_x.T @ (weight @ training_data_x)) @ ( - training_data_x.T @ weight @ training_data_y.T + weight_mat = weight_matrix(point, x_train, tau) + weight = np.linalg.inv(x_train.T @ weight_mat @ x_train) @ ( + x_train.T @ weight_mat @ y_train.T ) - return w + return weight def local_weight_regression( - training_data_x: np.ndarray, training_data_y: np.ndarray, bandwidth: float + x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ Calculate predictions for each data point on axis @@ -65,19 +60,17 @@ def local_weight_regression( ... ) array([1.07173261, 1.65970737, 3.50160179]) """ - m, _ = np.shape(training_data_x) - ypred = np.zeros(m) + m, _ = np.shape(x_train) + y_pred = np.zeros(m) - for i, item in enumerate(training_data_x): - ypred[i] = item @ local_weight( - item, training_data_x, training_data_y, bandwidth - ) + for i, item in enumerate(x_train): + y_pred[i] = item @ local_weight(item, x_train, y_train, tau) - return ypred + return y_pred def load_data( - dataset_name: str, cola_name: str, colb_name: str + dataset_name: str, x_name: str, y_name: str ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points @@ -85,23 +78,21 @@ def load_data( import seaborn as sns data = sns.load_dataset(dataset_name) - col_a = np.array(data[cola_name]) # total_bill - col_b = np.array(data[colb_name]) # tip + x_data = np.array(data[x_name]) # total_bill + y_data = np.array(data[y_name]) # tip - mcol_a = col_a.copy() - mcol_b = col_b.copy() + mcol_a = x_data.copy() + mcol_b = y_data.copy() one = np.ones(np.shape(mcol_b)[0], dtype=int) # pairing elements of one and mcol_a - training_data_x = np.column_stack((one, mcol_a)) + x_train = np.column_stack((one, mcol_a)) - return training_data_x, mcol_b, col_a, col_b + return x_train, mcol_b, x_data, y_data -def get_preds( - training_data_x: np.ndarray, mcol_b: np.ndarray, tau: float -) -> np.ndarray: +def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarray: """ Get predictions with minimum error for each training data >>> get_preds( @@ -111,33 +102,32 @@ def get_preds( ... ) array([1.07173261, 1.65970737, 3.50160179]) """ - ypred = local_weight_regression(training_data_x, mcol_b, tau) - return ypred + y_pred = local_weight_regression(x_train, y_train, tau) + return y_pred def plot_preds( - training_data_x: np.ndarray, + x_train: np.ndarray, predictions: np.ndarray, - col_x: np.ndarray, - col_y: np.ndarray, - cola_name: str, - colb_name: str, + x_data: np.ndarray, + y_data: np.ndarray, + x_name: str, + y_name: str, ) -> plt.plot: """ Plot predictions and display the graph """ - xsort = training_data_x.copy() - xsort.sort(axis=0) - plt.scatter(col_x, col_y, color="blue") + x_train_sorted = np.sort(x_train, axis=0) + plt.scatter(x_data, y_data, color="blue") plt.plot( - xsort[:, 1], - predictions[training_data_x[:, 1].argsort(0)], + x_train_sorted[:, 1], + predictions[x_train[:, 1].argsort(0)], color="yellow", linewidth=5, ) plt.title("Local Weighted Regression") - plt.xlabel(cola_name) - plt.ylabel(colb_name) + plt.xlabel(x_name) + plt.ylabel(y_name) plt.show() @@ -146,6 +136,6 @@ def plot_preds( doctest.testmod() - training_data_x, mcol_b, col_a, col_b = load_data("tips", "total_bill", "tip") + training_data_x, mcol_b, total_bill, tip = load_data("tips", "total_bill", "tip") predictions = get_preds(training_data_x, mcol_b, 0.5) - plot_preds(training_data_x, predictions, col_a, col_b, "total_bill", "tip") + plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From df8e96d88df73846aca6624cc39aa4fd948cbb1f Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 00:59:44 -0800 Subject: [PATCH 21/36] Refactor to remove duplicate var --- .../local_weighted_learning.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index b15670aca3ef..95de87584479 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -71,7 +71,7 @@ def local_weight_regression( def load_data( dataset_name: str, x_name: str, y_name: str -) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points """ @@ -81,15 +81,12 @@ def load_data( x_data = np.array(data[x_name]) # total_bill y_data = np.array(data[y_name]) # tip - mcol_a = x_data.copy() - mcol_b = y_data.copy() + one = np.ones(np.shape(y_data)[0], dtype=int) - one = np.ones(np.shape(mcol_b)[0], dtype=int) + # pairing elements of one and x_data + x_train = np.column_stack((one, x_data)) - # pairing elements of one and mcol_a - x_train = np.column_stack((one, mcol_a)) - - return x_train, mcol_b, x_data, y_data + return x_train, x_data, y_data def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarray: @@ -108,7 +105,7 @@ def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarra def plot_preds( x_train: np.ndarray, - predictions: np.ndarray, + preds: np.ndarray, x_data: np.ndarray, y_data: np.ndarray, x_name: str, @@ -121,7 +118,7 @@ def plot_preds( plt.scatter(x_data, y_data, color="blue") plt.plot( x_train_sorted[:, 1], - predictions[x_train[:, 1].argsort(0)], + preds[x_train[:, 1].argsort(0)], color="yellow", linewidth=5, ) @@ -136,6 +133,6 @@ def plot_preds( doctest.testmod() - training_data_x, mcol_b, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = get_preds(training_data_x, mcol_b, 0.5) + training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") + predictions = get_preds(training_data_x, tip, 0.5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 7bab184e09017724d7d1b76978e1d11e3f1ce47c Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 01:01:11 -0800 Subject: [PATCH 22/36] Refactor to remove unneeded wrapper function --- .../local_weighted_learning.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 95de87584479..df7dd2861f70 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -89,20 +89,6 @@ def load_data( return x_train, x_data, y_data -def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarray: - """ - Get predictions with minimum error for each training data - >>> get_preds( - ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), - ... np.array([[1.01, 1.66, 3.5]]), - ... 0.6 - ... ) - array([1.07173261, 1.65970737, 3.50160179]) - """ - y_pred = local_weight_regression(x_train, y_train, tau) - return y_pred - - def plot_preds( x_train: np.ndarray, preds: np.ndarray, @@ -134,5 +120,5 @@ def plot_preds( doctest.testmod() training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = get_preds(training_data_x, tip, 0.5) + predictions = local_weight_regression(training_data_x, tip, 0.5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 87e8ac3ae672b35ea91a6e1050ef22444fe15c7e Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 01:10:30 -0800 Subject: [PATCH 23/36] Increase value of tau in demo to make predictions less overfit --- .../local_weighted_learning/local_weighted_learning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index df7dd2861f70..88a834091717 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -120,5 +120,5 @@ def plot_preds( doctest.testmod() training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = local_weight_regression(training_data_x, tip, 0.5) + predictions = local_weight_regression(training_data_x, tip, 5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 5c36609b23a8553c74bc804597088180cecf983f Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 07:29:45 -0800 Subject: [PATCH 24/36] Expand function documentation and add algorithm explanation --- .../local_weighted_learning.py | 93 +++++++++++++++---- 1 file changed, 77 insertions(+), 16 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 88a834091717..a3c877f3e932 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -1,11 +1,54 @@ +""" +Locally weighted linear regression, also called local regression, is a type of +non-parametric linear regression that prioritizes data closest to a given +prediction point. The algorithm estimates the vector of model coefficients β +using weighted least squares regression: + +β = (XᵀWX)⁻¹(XᵀWy), + +where X is the design matrix, y is the response vector, and W is the diagonal +weight matrix. + +This implementation calculates wᵢ, the weight of the ith training sample, using +the Gaussian weight: + +wᵢ = exp(-‖xᵢ - x‖²/(2τ²)), + +where xᵢ is the ith training sample, x is the prediction point, τ is the +"bandwidth", and ‖x‖ is the Euclidean norm (also called the 2-norm or the L² +norm). The bandwidth τ controls how quickly the weight of a training sample +decreases as its distance from the prediction point increases. One can think of +the Gaussian weight as a bell curve centered around the prediction point: a +training sample is weighted lower if it's farther from the center, and τ +controls the spread of the bell curve. + +Other types of locally weighted regression such as locally estimated scatterplot +smoothing (LOESS) typically use different weight functions. + +References: + - https://en.wikipedia.org/wiki/Local_regression + - https://en.wikipedia.org/wiki/Weighted_least_squares + - https://cs229.stanford.edu/notes2022fall/main_notes.pdf +""" + import matplotlib.pyplot as plt import numpy as np def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray: """ - Calculate the weight for every point in the data set. - point --> the x value at which we want to make predictions + Calculate the weight of every point in the training data around a given + prediction point + + Args: + point: x-value at which the prediction is being made + x_train: ndarray of x-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + + Returns: + n x n weight matrix around the prediction point, where n is the size of + the training set >>> weight_matrix( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), @@ -15,13 +58,12 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - m, _ = np.shape(x_train) # m is the number of training samples - weights = np.eye(m) # Initializing weights as identity matrix - - # calculating weights for all training examples [x(i)'s] - for j in range(m): + n = len(x_train) # Number of training samples + weights = np.eye(n) # Initialize weights as identity matrix + for j in range(n): diff = point - x_train[j] weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) + return weights @@ -29,8 +71,17 @@ def local_weight( point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ - Calculate the local weights using the weight_matrix function on training data. - Return the weighted matrix. + Calculate the local weights at a given prediction point using the weight + matrix for that point + + Args: + point: x-value at which the prediction is being made + x_train: ndarray of x-values for training + y_train: ndarray of y-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + Returns: + ndarray of local weights >>> local_weight( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), @@ -52,7 +103,16 @@ def local_weight_regression( x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ - Calculate predictions for each data point on axis + Calculate predictions for each point in the training data + + Args: + x_train: ndarray of x-values for training + y_train: ndarray of y-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + + Returns: + ndarray of predictions >>> local_weight_regression( ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), ... np.array([[1.01, 1.66, 3.5]]), @@ -60,9 +120,7 @@ def local_weight_regression( ... ) array([1.07173261, 1.65970737, 3.50160179]) """ - m, _ = np.shape(x_train) - y_pred = np.zeros(m) - + y_pred = np.zeros(len(x_train)) # Initialize array of predictions for i, item in enumerate(x_train): y_pred[i] = item @ local_weight(item, x_train, y_train, tau) @@ -74,14 +132,15 @@ def load_data( ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points + >>> pass # No doctests, function is for demo purposes only """ import seaborn as sns data = sns.load_dataset(dataset_name) - x_data = np.array(data[x_name]) # total_bill - y_data = np.array(data[y_name]) # tip + x_data = np.array(data[x_name]) + y_data = np.array(data[y_name]) - one = np.ones(np.shape(y_data)[0], dtype=int) + one = np.ones(len(y_data)) # pairing elements of one and x_data x_train = np.column_stack((one, x_data)) @@ -99,6 +158,7 @@ def plot_preds( ) -> plt.plot: """ Plot predictions and display the graph + >>> pass # No doctests, function is for demo purposes only """ x_train_sorted = np.sort(x_train, axis=0) plt.scatter(x_data, y_data, color="blue") @@ -119,6 +179,7 @@ def plot_preds( doctest.testmod() + # Demo with a dataset from the seaborn module training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") predictions = local_weight_regression(training_data_x, tip, 5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 1ebb93d0cbacbf4f48f01498824f227ae8bf1521 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 07:57:24 -0800 Subject: [PATCH 25/36] Rename var to avoid confusion Rename n to m, as n tends to be used for the number of parameters rather than the sample size --- .../local_weighted_learning/local_weighted_learning.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index a3c877f3e932..d183a1f20077 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -47,7 +47,7 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar decreases as the distance from the prediction point increases Returns: - n x n weight matrix around the prediction point, where n is the size of + m x m weight matrix around the prediction point, where m is the size of the training set >>> weight_matrix( ... np.array([1., 1.]), @@ -58,9 +58,9 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - n = len(x_train) # Number of training samples - weights = np.eye(n) # Initialize weights as identity matrix - for j in range(n): + m = len(x_train) # Number of training samples + weights = np.eye(m) # Initialize weights as identity matrix + for j in range(m): diff = point - x_train[j] weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) From f63dbeafd4a0dd533a72d43414edc83ad3704b61 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 08:15:01 -0800 Subject: [PATCH 26/36] Fix plot_preds return type The plot_preds function plots the data but doesn't actually return anything --- .../local_weighted_learning/local_weighted_learning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index d183a1f20077..8dd0e55d41df 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -155,7 +155,7 @@ def plot_preds( y_data: np.ndarray, x_name: str, y_name: str, -) -> plt.plot: +) -> None: """ Plot predictions and display the graph >>> pass # No doctests, function is for demo purposes only From 892b5907f05f1476647f6aa666859156ca989d6d Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Mon, 2 Jan 2023 23:49:21 -0800 Subject: [PATCH 27/36] Fix mypy errors in local_weighted_learning.py --- .../local_weighted_learning.py | 186 +++++++----------- 1 file changed, 76 insertions(+), 110 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 8dd0e55d41df..3c766ca4b60c 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -1,55 +1,14 @@ -""" -Locally weighted linear regression, also called local regression, is a type of -non-parametric linear regression that prioritizes data closest to a given -prediction point. The algorithm estimates the vector of model coefficients β -using weighted least squares regression: - -β = (XᵀWX)⁻¹(XᵀWy), - -where X is the design matrix, y is the response vector, and W is the diagonal -weight matrix. - -This implementation calculates wᵢ, the weight of the ith training sample, using -the Gaussian weight: - -wᵢ = exp(-‖xᵢ - x‖²/(2τ²)), - -where xᵢ is the ith training sample, x is the prediction point, τ is the -"bandwidth", and ‖x‖ is the Euclidean norm (also called the 2-norm or the L² -norm). The bandwidth τ controls how quickly the weight of a training sample -decreases as its distance from the prediction point increases. One can think of -the Gaussian weight as a bell curve centered around the prediction point: a -training sample is weighted lower if it's farther from the center, and τ -controls the spread of the bell curve. - -Other types of locally weighted regression such as locally estimated scatterplot -smoothing (LOESS) typically use different weight functions. - -References: - - https://en.wikipedia.org/wiki/Local_regression - - https://en.wikipedia.org/wiki/Weighted_least_squares - - https://cs229.stanford.edu/notes2022fall/main_notes.pdf -""" - import matplotlib.pyplot as plt import numpy as np -def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray: +def weighted_matrix( + point: np.ndarray, training_data_x: np.ndarray, bandwidth: float +) -> np.ndarray: """ - Calculate the weight of every point in the training data around a given - prediction point - - Args: - point: x-value at which the prediction is being made - x_train: ndarray of x-values for training - tau: bandwidth value, controls how quickly the weight of training values - decreases as the distance from the prediction point increases - - Returns: - m x m weight matrix around the prediction point, where m is the size of - the training set - >>> weight_matrix( + Calculate the weight for every point in the data set. + point --> the x value at which we want to make predictions + >>> weighted_matrix( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), ... 0.6 @@ -58,30 +17,25 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - m = len(x_train) # Number of training samples - weights = np.eye(m) # Initialize weights as identity matrix - for j in range(m): - diff = point - x_train[j] - weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) + m, _ = np.shape(training_data_x) # m is the number of training samples + weights = np.eye(m) # Initializing weights as identity matrix + # calculating weights for all training examples [x(i)'s] + for j in range(m): + diff = point - training_data_x[j] + weights[j, j] = np.exp(diff @ diff.T / (-2.0 * bandwidth**2)) return weights def local_weight( - point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float + point: np.ndarray, + training_data_x: np.ndarray, + training_data_y: np.ndarray, + bandwidth: float, ) -> np.ndarray: """ - Calculate the local weights at a given prediction point using the weight - matrix for that point - - Args: - point: x-value at which the prediction is being made - x_train: ndarray of x-values for training - y_train: ndarray of y-values for training - tau: bandwidth value, controls how quickly the weight of training values - decreases as the distance from the prediction point increases - Returns: - ndarray of local weights + Calculate the local weights using the weight_matrix function on training data. + Return the weighted matrix. >>> local_weight( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), @@ -91,28 +45,19 @@ def local_weight( array([[0.00873174], [0.08272556]]) """ - weight_mat = weight_matrix(point, x_train, tau) - weight = np.linalg.inv(x_train.T @ weight_mat @ x_train) @ ( - x_train.T @ weight_mat @ y_train.T + weight = weighted_matrix(point, training_data_x, bandwidth) + w = np.linalg.inv(training_data_x.T @ (weight @ training_data_x)) @ ( + training_data_x.T @ weight @ training_data_y.T ) - return weight + return w def local_weight_regression( - x_train: np.ndarray, y_train: np.ndarray, tau: float + training_data_x: np.ndarray, training_data_y: np.ndarray, bandwidth: float ) -> np.ndarray: """ - Calculate predictions for each point in the training data - - Args: - x_train: ndarray of x-values for training - y_train: ndarray of y-values for training - tau: bandwidth value, controls how quickly the weight of training values - decreases as the distance from the prediction point increases - - Returns: - ndarray of predictions + Calculate predictions for each data point on axis >>> local_weight_regression( ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), ... np.array([[1.01, 1.66, 3.5]]), @@ -120,57 +65,79 @@ def local_weight_regression( ... ) array([1.07173261, 1.65970737, 3.50160179]) """ - y_pred = np.zeros(len(x_train)) # Initialize array of predictions - for i, item in enumerate(x_train): - y_pred[i] = item @ local_weight(item, x_train, y_train, tau) + m, _ = np.shape(training_data_x) + ypred = np.zeros(m) - return y_pred + for i, item in enumerate(training_data_x): + ypred[i] = item @ local_weight( + item, training_data_x, training_data_y, bandwidth + ) + + return ypred def load_data( - dataset_name: str, x_name: str, y_name: str -) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + dataset_name: str, cola_name: str, colb_name: str +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points - >>> pass # No doctests, function is for demo purposes only """ import seaborn as sns data = sns.load_dataset(dataset_name) - x_data = np.array(data[x_name]) - y_data = np.array(data[y_name]) + col_a = np.array(data[cola_name]) # total_bill + col_b = np.array(data[colb_name]) # tip + + mcol_a = col_a.copy() + mcol_b = col_b.copy() + + one = np.ones(np.shape(mcol_b)[0], dtype=int) - one = np.ones(len(y_data)) + # pairing elements of one and mcol_a + training_data_x = np.column_stack((one, mcol_a)) - # pairing elements of one and x_data - x_train = np.column_stack((one, x_data)) + return training_data_x, mcol_b, col_a, col_b - return x_train, x_data, y_data + +def get_preds( + training_data_x: np.ndarray, mcol_b: np.ndarray, tau: float +) -> np.ndarray: + """ + Get predictions with minimum error for each training data + >>> get_preds( + ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), + ... np.array([[1.01, 1.66, 3.5]]), + ... 0.6 + ... ) + array([1.07173261, 1.65970737, 3.50160179]) + """ + ypred = local_weight_regression(training_data_x, mcol_b, tau) + return ypred def plot_preds( - x_train: np.ndarray, - preds: np.ndarray, - x_data: np.ndarray, - y_data: np.ndarray, - x_name: str, - y_name: str, -) -> None: + training_data_x: np.ndarray, + predictions: np.ndarray, + col_x: np.ndarray, + col_y: np.ndarray, + cola_name: str, + colb_name: str, +) -> plt.plot: """ Plot predictions and display the graph - >>> pass # No doctests, function is for demo purposes only """ - x_train_sorted = np.sort(x_train, axis=0) - plt.scatter(x_data, y_data, color="blue") + xsort = training_data_x.copy() + xsort.sort(axis=0) + plt.scatter(col_x, col_y, color="blue") plt.plot( - x_train_sorted[:, 1], - preds[x_train[:, 1].argsort(0)], + xsort[:, 1], + predictions[training_data_x[:, 1].argsort(0)], color="yellow", linewidth=5, ) plt.title("Local Weighted Regression") - plt.xlabel(x_name) - plt.ylabel(y_name) + plt.xlabel(cola_name) + plt.ylabel(colb_name) plt.show() @@ -179,7 +146,6 @@ def plot_preds( doctest.testmod() - # Demo with a dataset from the seaborn module - training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = local_weight_regression(training_data_x, tip, 5) - plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") + training_data_x, mcol_b, col_a, col_b = load_data("tips", "total_bill", "tip") + predictions = get_preds(training_data_x, mcol_b, 0.5) + plot_preds(training_data_x, predictions, col_a, col_b, "total_bill", "tip") From f970285ddeb980b8e256ce777d4a6f478611fb4d Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 00:55:22 -0800 Subject: [PATCH 28/36] Rename vars for clarity --- .../local_weighted_learning.py | 88 ++++++++----------- 1 file changed, 39 insertions(+), 49 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 3c766ca4b60c..b15670aca3ef 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -2,13 +2,11 @@ import numpy as np -def weighted_matrix( - point: np.ndarray, training_data_x: np.ndarray, bandwidth: float -) -> np.ndarray: +def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray: """ Calculate the weight for every point in the data set. point --> the x value at which we want to make predictions - >>> weighted_matrix( + >>> weight_matrix( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), ... 0.6 @@ -17,21 +15,18 @@ def weighted_matrix( [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - m, _ = np.shape(training_data_x) # m is the number of training samples + m, _ = np.shape(x_train) # m is the number of training samples weights = np.eye(m) # Initializing weights as identity matrix # calculating weights for all training examples [x(i)'s] for j in range(m): - diff = point - training_data_x[j] - weights[j, j] = np.exp(diff @ diff.T / (-2.0 * bandwidth**2)) + diff = point - x_train[j] + weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) return weights def local_weight( - point: np.ndarray, - training_data_x: np.ndarray, - training_data_y: np.ndarray, - bandwidth: float, + point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ Calculate the local weights using the weight_matrix function on training data. @@ -45,16 +40,16 @@ def local_weight( array([[0.00873174], [0.08272556]]) """ - weight = weighted_matrix(point, training_data_x, bandwidth) - w = np.linalg.inv(training_data_x.T @ (weight @ training_data_x)) @ ( - training_data_x.T @ weight @ training_data_y.T + weight_mat = weight_matrix(point, x_train, tau) + weight = np.linalg.inv(x_train.T @ weight_mat @ x_train) @ ( + x_train.T @ weight_mat @ y_train.T ) - return w + return weight def local_weight_regression( - training_data_x: np.ndarray, training_data_y: np.ndarray, bandwidth: float + x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ Calculate predictions for each data point on axis @@ -65,19 +60,17 @@ def local_weight_regression( ... ) array([1.07173261, 1.65970737, 3.50160179]) """ - m, _ = np.shape(training_data_x) - ypred = np.zeros(m) + m, _ = np.shape(x_train) + y_pred = np.zeros(m) - for i, item in enumerate(training_data_x): - ypred[i] = item @ local_weight( - item, training_data_x, training_data_y, bandwidth - ) + for i, item in enumerate(x_train): + y_pred[i] = item @ local_weight(item, x_train, y_train, tau) - return ypred + return y_pred def load_data( - dataset_name: str, cola_name: str, colb_name: str + dataset_name: str, x_name: str, y_name: str ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points @@ -85,23 +78,21 @@ def load_data( import seaborn as sns data = sns.load_dataset(dataset_name) - col_a = np.array(data[cola_name]) # total_bill - col_b = np.array(data[colb_name]) # tip + x_data = np.array(data[x_name]) # total_bill + y_data = np.array(data[y_name]) # tip - mcol_a = col_a.copy() - mcol_b = col_b.copy() + mcol_a = x_data.copy() + mcol_b = y_data.copy() one = np.ones(np.shape(mcol_b)[0], dtype=int) # pairing elements of one and mcol_a - training_data_x = np.column_stack((one, mcol_a)) + x_train = np.column_stack((one, mcol_a)) - return training_data_x, mcol_b, col_a, col_b + return x_train, mcol_b, x_data, y_data -def get_preds( - training_data_x: np.ndarray, mcol_b: np.ndarray, tau: float -) -> np.ndarray: +def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarray: """ Get predictions with minimum error for each training data >>> get_preds( @@ -111,33 +102,32 @@ def get_preds( ... ) array([1.07173261, 1.65970737, 3.50160179]) """ - ypred = local_weight_regression(training_data_x, mcol_b, tau) - return ypred + y_pred = local_weight_regression(x_train, y_train, tau) + return y_pred def plot_preds( - training_data_x: np.ndarray, + x_train: np.ndarray, predictions: np.ndarray, - col_x: np.ndarray, - col_y: np.ndarray, - cola_name: str, - colb_name: str, + x_data: np.ndarray, + y_data: np.ndarray, + x_name: str, + y_name: str, ) -> plt.plot: """ Plot predictions and display the graph """ - xsort = training_data_x.copy() - xsort.sort(axis=0) - plt.scatter(col_x, col_y, color="blue") + x_train_sorted = np.sort(x_train, axis=0) + plt.scatter(x_data, y_data, color="blue") plt.plot( - xsort[:, 1], - predictions[training_data_x[:, 1].argsort(0)], + x_train_sorted[:, 1], + predictions[x_train[:, 1].argsort(0)], color="yellow", linewidth=5, ) plt.title("Local Weighted Regression") - plt.xlabel(cola_name) - plt.ylabel(colb_name) + plt.xlabel(x_name) + plt.ylabel(y_name) plt.show() @@ -146,6 +136,6 @@ def plot_preds( doctest.testmod() - training_data_x, mcol_b, col_a, col_b = load_data("tips", "total_bill", "tip") + training_data_x, mcol_b, total_bill, tip = load_data("tips", "total_bill", "tip") predictions = get_preds(training_data_x, mcol_b, 0.5) - plot_preds(training_data_x, predictions, col_a, col_b, "total_bill", "tip") + plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 9d50a7adc6ccdffdbe273fbd999a8b2636f4a12d Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 00:59:44 -0800 Subject: [PATCH 29/36] Refactor to remove duplicate var --- .../local_weighted_learning.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index b15670aca3ef..95de87584479 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -71,7 +71,7 @@ def local_weight_regression( def load_data( dataset_name: str, x_name: str, y_name: str -) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points """ @@ -81,15 +81,12 @@ def load_data( x_data = np.array(data[x_name]) # total_bill y_data = np.array(data[y_name]) # tip - mcol_a = x_data.copy() - mcol_b = y_data.copy() + one = np.ones(np.shape(y_data)[0], dtype=int) - one = np.ones(np.shape(mcol_b)[0], dtype=int) + # pairing elements of one and x_data + x_train = np.column_stack((one, x_data)) - # pairing elements of one and mcol_a - x_train = np.column_stack((one, mcol_a)) - - return x_train, mcol_b, x_data, y_data + return x_train, x_data, y_data def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarray: @@ -108,7 +105,7 @@ def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarra def plot_preds( x_train: np.ndarray, - predictions: np.ndarray, + preds: np.ndarray, x_data: np.ndarray, y_data: np.ndarray, x_name: str, @@ -121,7 +118,7 @@ def plot_preds( plt.scatter(x_data, y_data, color="blue") plt.plot( x_train_sorted[:, 1], - predictions[x_train[:, 1].argsort(0)], + preds[x_train[:, 1].argsort(0)], color="yellow", linewidth=5, ) @@ -136,6 +133,6 @@ def plot_preds( doctest.testmod() - training_data_x, mcol_b, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = get_preds(training_data_x, mcol_b, 0.5) + training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") + predictions = get_preds(training_data_x, tip, 0.5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From e2ee5dbb700d37884df13c27647a3bbaaba9f0c7 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 01:01:11 -0800 Subject: [PATCH 30/36] Refactor to remove unneeded wrapper function --- .../local_weighted_learning.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 95de87584479..df7dd2861f70 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -89,20 +89,6 @@ def load_data( return x_train, x_data, y_data -def get_preds(x_train: np.ndarray, y_train: np.ndarray, tau: float) -> np.ndarray: - """ - Get predictions with minimum error for each training data - >>> get_preds( - ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), - ... np.array([[1.01, 1.66, 3.5]]), - ... 0.6 - ... ) - array([1.07173261, 1.65970737, 3.50160179]) - """ - y_pred = local_weight_regression(x_train, y_train, tau) - return y_pred - - def plot_preds( x_train: np.ndarray, preds: np.ndarray, @@ -134,5 +120,5 @@ def plot_preds( doctest.testmod() training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = get_preds(training_data_x, tip, 0.5) + predictions = local_weight_regression(training_data_x, tip, 0.5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 36593a05627e1abb0ba53255001681a1f7974f55 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 01:10:30 -0800 Subject: [PATCH 31/36] Increase value of tau in demo to make predictions less overfit --- .../local_weighted_learning/local_weighted_learning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index df7dd2861f70..88a834091717 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -120,5 +120,5 @@ def plot_preds( doctest.testmod() training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") - predictions = local_weight_regression(training_data_x, tip, 0.5) + predictions = local_weight_regression(training_data_x, tip, 5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From 490c5652199e5064b494bd1d4a4d797159c28a0f Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 07:29:45 -0800 Subject: [PATCH 32/36] Expand function documentation and add algorithm explanation --- .../local_weighted_learning.py | 93 +++++++++++++++---- 1 file changed, 77 insertions(+), 16 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 88a834091717..a3c877f3e932 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -1,11 +1,54 @@ +""" +Locally weighted linear regression, also called local regression, is a type of +non-parametric linear regression that prioritizes data closest to a given +prediction point. The algorithm estimates the vector of model coefficients β +using weighted least squares regression: + +β = (XᵀWX)⁻¹(XᵀWy), + +where X is the design matrix, y is the response vector, and W is the diagonal +weight matrix. + +This implementation calculates wᵢ, the weight of the ith training sample, using +the Gaussian weight: + +wᵢ = exp(-‖xᵢ - x‖²/(2τ²)), + +where xᵢ is the ith training sample, x is the prediction point, τ is the +"bandwidth", and ‖x‖ is the Euclidean norm (also called the 2-norm or the L² +norm). The bandwidth τ controls how quickly the weight of a training sample +decreases as its distance from the prediction point increases. One can think of +the Gaussian weight as a bell curve centered around the prediction point: a +training sample is weighted lower if it's farther from the center, and τ +controls the spread of the bell curve. + +Other types of locally weighted regression such as locally estimated scatterplot +smoothing (LOESS) typically use different weight functions. + +References: + - https://en.wikipedia.org/wiki/Local_regression + - https://en.wikipedia.org/wiki/Weighted_least_squares + - https://cs229.stanford.edu/notes2022fall/main_notes.pdf +""" + import matplotlib.pyplot as plt import numpy as np def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray: """ - Calculate the weight for every point in the data set. - point --> the x value at which we want to make predictions + Calculate the weight of every point in the training data around a given + prediction point + + Args: + point: x-value at which the prediction is being made + x_train: ndarray of x-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + + Returns: + n x n weight matrix around the prediction point, where n is the size of + the training set >>> weight_matrix( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), @@ -15,13 +58,12 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - m, _ = np.shape(x_train) # m is the number of training samples - weights = np.eye(m) # Initializing weights as identity matrix - - # calculating weights for all training examples [x(i)'s] - for j in range(m): + n = len(x_train) # Number of training samples + weights = np.eye(n) # Initialize weights as identity matrix + for j in range(n): diff = point - x_train[j] weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) + return weights @@ -29,8 +71,17 @@ def local_weight( point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ - Calculate the local weights using the weight_matrix function on training data. - Return the weighted matrix. + Calculate the local weights at a given prediction point using the weight + matrix for that point + + Args: + point: x-value at which the prediction is being made + x_train: ndarray of x-values for training + y_train: ndarray of y-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + Returns: + ndarray of local weights >>> local_weight( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), @@ -52,7 +103,16 @@ def local_weight_regression( x_train: np.ndarray, y_train: np.ndarray, tau: float ) -> np.ndarray: """ - Calculate predictions for each data point on axis + Calculate predictions for each point in the training data + + Args: + x_train: ndarray of x-values for training + y_train: ndarray of y-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + + Returns: + ndarray of predictions >>> local_weight_regression( ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), ... np.array([[1.01, 1.66, 3.5]]), @@ -60,9 +120,7 @@ def local_weight_regression( ... ) array([1.07173261, 1.65970737, 3.50160179]) """ - m, _ = np.shape(x_train) - y_pred = np.zeros(m) - + y_pred = np.zeros(len(x_train)) # Initialize array of predictions for i, item in enumerate(x_train): y_pred[i] = item @ local_weight(item, x_train, y_train, tau) @@ -74,14 +132,15 @@ def load_data( ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points + >>> pass # No doctests, function is for demo purposes only """ import seaborn as sns data = sns.load_dataset(dataset_name) - x_data = np.array(data[x_name]) # total_bill - y_data = np.array(data[y_name]) # tip + x_data = np.array(data[x_name]) + y_data = np.array(data[y_name]) - one = np.ones(np.shape(y_data)[0], dtype=int) + one = np.ones(len(y_data)) # pairing elements of one and x_data x_train = np.column_stack((one, x_data)) @@ -99,6 +158,7 @@ def plot_preds( ) -> plt.plot: """ Plot predictions and display the graph + >>> pass # No doctests, function is for demo purposes only """ x_train_sorted = np.sort(x_train, axis=0) plt.scatter(x_data, y_data, color="blue") @@ -119,6 +179,7 @@ def plot_preds( doctest.testmod() + # Demo with a dataset from the seaborn module training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") predictions = local_weight_regression(training_data_x, tip, 5) plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip") From d0aa4b474ddcb4eb81e9d046d2c9e326aca3881b Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 07:57:24 -0800 Subject: [PATCH 33/36] Rename var to avoid confusion Rename n to m, as n tends to be used for the number of parameters rather than the sample size --- .../local_weighted_learning/local_weighted_learning.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index a3c877f3e932..d183a1f20077 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -47,7 +47,7 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar decreases as the distance from the prediction point increases Returns: - n x n weight matrix around the prediction point, where n is the size of + m x m weight matrix around the prediction point, where m is the size of the training set >>> weight_matrix( ... np.array([1., 1.]), @@ -58,9 +58,9 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - n = len(x_train) # Number of training samples - weights = np.eye(n) # Initialize weights as identity matrix - for j in range(n): + m = len(x_train) # Number of training samples + weights = np.eye(m) # Initialize weights as identity matrix + for j in range(m): diff = point - x_train[j] weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) From 3b24dfefc55e023a16123d11ec7cac26d7a33fb6 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Tue, 3 Jan 2023 08:15:01 -0800 Subject: [PATCH 34/36] Fix plot_preds return type The plot_preds function plots the data but doesn't actually return anything --- .../local_weighted_learning/local_weighted_learning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index d183a1f20077..8dd0e55d41df 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -155,7 +155,7 @@ def plot_preds( y_data: np.ndarray, x_name: str, y_name: str, -) -> plt.plot: +) -> None: """ Plot predictions and display the graph >>> pass # No doctests, function is for demo purposes only From 4bcae61b102992a650e9ec36a2747459e241fe9f Mon Sep 17 00:00:00 2001 From: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> Date: Sat, 1 Apr 2023 06:37:14 +0000 Subject: [PATCH 35/36] updating DIRECTORY.md --- DIRECTORY.md | 1 + 1 file changed, 1 insertion(+) diff --git a/DIRECTORY.md b/DIRECTORY.md index 1a641d8ecb59..33c816fc4add 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -317,6 +317,7 @@ * [Longest Sub Array](dynamic_programming/longest_sub_array.py) * [Matrix Chain Order](dynamic_programming/matrix_chain_order.py) * [Max Non Adjacent Sum](dynamic_programming/max_non_adjacent_sum.py) + * [Max Product Subarray](dynamic_programming/max_product_subarray.py) * [Max Sub Array](dynamic_programming/max_sub_array.py) * [Max Sum Contiguous Subsequence](dynamic_programming/max_sum_contiguous_subsequence.py) * [Min Distance Up Bottom](dynamic_programming/min_distance_up_bottom.py) From 8d8f28f3733ec9d4fb3e2a715628fb6fec41e9fd Mon Sep 17 00:00:00 2001 From: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> Date: Sat, 1 Apr 2023 16:49:21 +0000 Subject: [PATCH 36/36] updating DIRECTORY.md --- DIRECTORY.md | 1 + 1 file changed, 1 insertion(+) diff --git a/DIRECTORY.md b/DIRECTORY.md index c781b17bf05f..588d0b1e542e 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -715,6 +715,7 @@ * [Archimedes Principle](physics/archimedes_principle.py) * [Casimir Effect](physics/casimir_effect.py) * [Centripetal Force](physics/centripetal_force.py) + * [Grahams Law](physics/grahams_law.py) * [Horizontal Projectile Motion](physics/horizontal_projectile_motion.py) * [Hubble Parameter](physics/hubble_parameter.py) * [Ideal Gas Law](physics/ideal_gas_law.py)