diff --git a/machine_learning/linear_regression.py b/machine_learning/linear_regression.py
index 839a5366d1cc..eb34dffdf1b6 100644
--- a/machine_learning/linear_regression.py
+++ b/machine_learning/linear_regression.py
@@ -1,13 +1,3 @@
-"""
-Linear regression is the most basic type of regression commonly used for
-predictive analysis. The idea is pretty simple: we have a dataset and we have
-features associated with it. Features should be chosen very cautiously
-as they determine how much our model will be able to make future predictions.
-We try to set the weight of these features, over many iterations, so that they best
-fit our dataset. In this particular code, I had used a CSGO dataset (ADR vs
-Rating). We try to best fit a line through dataset and estimate the parameters.
-"""
-
 import numpy as np
 import requests
 
@@ -15,32 +5,32 @@
 def collect_dataset():
     """Collect dataset of CSGO
     The dataset contains ADR vs Rating of a Player
-    :return : dataset obtained from the link, as matrix
+    :return: dataset obtained from the link, as a matrix
     """
     response = requests.get(
-        "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/"
-        "master/Week1/ADRvsRating.csv",
+        "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/master/Week1/ADRvsRating.csv",
         timeout=10,
     )
-    lines = response.text.splitlines()
-    data = []
-    for item in lines:
-        item = item.split(",")
-        data.append(item)
-    data.pop(0)  # This is for removing the labels from the list
-    dataset = np.matrix(data)
-    return dataset
+    data = np.loadtxt(response.text.splitlines()[1:], delimiter=",")  # Skip the header
+    return data
+
+
+def normalize_features(data):
+    """Normalize feature values to have mean 0 and variance 1"""
+    means = np.mean(data[:, :-1], axis=0)
+    stds = np.std(data[:, :-1], axis=0)
+    data[:, :-1] = (data[:, :-1] - means) / stds
+    return data
 
 
 def run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta):
-    """Run steep gradient descent and updates the Feature vector accordingly_
-    :param data_x : contains the dataset
-    :param data_y : contains the output associated with each data-entry
-    :param len_data : length of the data_
-    :param alpha : Learning rate of the model
-    :param theta : Feature vector (weight's for our model)
-    ;param return : Updated Feature's, using
-    curr_features - alpha_ * gradient(w.r.t. feature)
+    """Run steep gradient descent and updates the Feature vector accordingly
+    :param data_x: contains the dataset
+    :param data_y: contains the output associated with each data-entry
+    :param len_data: length of the data
+    :param alpha: Learning rate of the model
+    :param theta: Feature vector (weights for our model)
+    :return: Updated Features, using curr_features - alpha * gradient(w.r.t. feature)
     """
     n = len_data
 
@@ -53,11 +43,11 @@ def run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta):
 
 def sum_of_square_error(data_x, data_y, len_data, theta):
     """Return sum of square error for error calculation
-    :param data_x : contains our dataset
-    :param data_y : contains the output (result vector)
-    :param len_data : len of the dataset
-    :param theta : contains the feature vector
-    :return : sum of square error computed from given feature's
+    :param data_x: contains our dataset
+    :param data_y: contains the output (result vector)
+    :param len_data: length of the dataset
+    :param theta: contains the feature vector
+    :return: sum of square error computed from given features
     """
     prod = np.dot(theta, data_x.transpose())
     prod -= data_y.transpose()
@@ -68,31 +58,36 @@ def sum_of_square_error(data_x, data_y, len_data, theta):
 
 def run_linear_regression(data_x, data_y):
     """Implement Linear regression over the dataset
-    :param data_x : contains our dataset
-    :param data_y : contains the output (result vector)
-    :return : feature for line of best fit (Feature vector)
+    :param data_x: contains our dataset
+    :param data_y: contains the output (result vector)
+    :return: feature for the line of best fit (Feature vector)
     """
     iterations = 100000
     alpha = 0.0001550
 
     no_features = data_x.shape[1]
-    len_data = data_x.shape[0] - 1
+    len_data = data_x.shape[0]
 
     theta = np.zeros((1, no_features))
+    rng = np.random.default_rng()  # Create a random generator instance
 
     for i in range(iterations):
-        theta = run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta)
-        error = sum_of_square_error(data_x, data_y, len_data, theta)
+        indices = rng.choice(len_data, size=32, replace=False)  # Randomly sample indices using the generator
+        x_batch = data_x[indices]
+        y_batch = data_y[indices]
+
+        theta = run_steep_gradient_descent(x_batch, y_batch, len(x_batch), alpha, theta)
+        error = sum_of_square_error(x_batch, y_batch, len(x_batch), theta)
         print(f"At Iteration {i + 1} - Error is {error:.5f}")
     return theta
 
 
 def mean_absolute_error(predicted_y, original_y):
-    """Return sum of square error for error calculation
-    :param predicted_y : contains the output of prediction (result vector)
-    :param original_y : contains values of expected outcome
-    :return : mean absolute error computed from given feature's
+    """Return mean absolute error for error calculation
+    :param predicted_y: contains the output of prediction (result vector)
+    :param original_y: contains values of expected outcome
+    :return: mean absolute error computed from given features
     """
     total = sum(abs(y - predicted_y[i]) for i, y in enumerate(original_y))
     return total / len(original_y)
@@ -101,9 +96,10 @@ def mean_absolute_error(predicted_y, original_y):
 def main():
     """Driver function"""
     data = collect_dataset()
+    data = normalize_features(data)  # Normalize the features
 
     len_data = data.shape[0]
-    data_x = np.c_[np.ones(len_data), data[:, :-1]].astype(float)
+    data_x = np.c_[np.ones(len_data), data[:, :-1]].astype(float)  # Add bias term
     data_y = data[:, -1].astype(float)
 
     theta = run_linear_regression(data_x, data_y)