From 89f8de564f8405e81d2e91caa5bd870463280474 Mon Sep 17 00:00:00 2001
From: Mahima Agarwal
Date: Thu, 28 Sep 2023 11:30:14 +0530
Subject: [PATCH] Fixes #8847

---
 machine_learning/linear_regression.py | 121 +++++++++++---------
 1 file changed, 53 insertions(+), 68 deletions(-)

diff --git a/machine_learning/linear_regression.py b/machine_learning/linear_regression.py
index 0847112ad538..e55a67f817f6 100644
--- a/machine_learning/linear_regression.py
+++ b/machine_learning/linear_regression.py
@@ -3,14 +3,16 @@
 predictive analysis. The idea is pretty simple: we have a dataset and we have
 features associated with it. Features should be chosen very cautiously
 as they determine how much our model will be able to make future predictions.
-We try to set the weight of these features, over many iterations, so that they best
-fit our dataset. In this particular code, I had used a CSGO dataset (ADR vs
-Rating). We try to best fit a line through dataset and estimate the parameters.
+We set the weights of these features using the "sum of rectangular area
+over sum of square area" method, which is a direct (closed-form) method.
+This particular code uses a CSGO dataset (ADR vs Rating).
+We fit a line of best fit through the dataset and estimate its parameters.
 """
 import numpy as np
 import requests
 
 
+# Function to collect the CSGO dataset
 def collect_dataset():
     """Collect dataset of CSGO
     The dataset contains ADR vs Rating of a Player
@@ -22,93 +24,76 @@ def collect_dataset():
     )
     lines = response.text.splitlines()
     data = []
+
     for item in lines:
         item = item.split(",")
         data.append(item)
-    data.pop(0)  # This is for removing the labels from the list
+
+    # Remove the labels (headers) from the list
+    data.pop(0)
+
+    # Convert data to a NumPy matrix
     dataset = np.matrix(data)
     return dataset
 
 
-def run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta):
-    """Run steep gradient descent and updates the Feature vector accordingly_
-    :param data_x   : contains the dataset
-    :param data_y   : contains the output associated with each data-entry
-    :param len_data : length of the data_
-    :param alpha    : Learning rate of the model
-    :param theta    : Feature vector (weight's for our model)
-    ;param return    : Updated Feature's, using
-    curr_features - alpha_ * gradient(w.r.t. feature)
+# Function to calculate Mean Absolute Error (MAE)
+def calculate_mae(predicted_y, original_y):
+    """Calculate Mean Absolute Error (MAE)
+    :param predicted_y: Contains the output of prediction (result vector)
+    :param original_y: Contains values of expected outcome
+    :return: MAE computed from the given vectors
     """
-    n = len_data
-
-    prod = np.dot(theta, data_x.transpose())
-    prod -= data_y.transpose()
-    sum_grad = np.dot(prod, data_x)
-    theta = theta - (alpha / n) * sum_grad
-    return theta
-
-
-def sum_of_square_error(data_x, data_y, len_data, theta):
-    """Return sum of square error for error calculation
-    :param data_x    : contains our dataset
-    :param data_y    : contains the output (result vector)
-    :param len_data  : len of the dataset
-    :param theta     : contains the feature vector
-    :return          : sum of square error computed from given feature's
+    return sum(abs(y - predicted_y[i]) for i, y in enumerate(original_y)) / len(
+        original_y
+    )
+
+
+# Function to perform simple linear regression
+def simple_solve(data_x, data_y):
     """
-    prod = np.dot(theta, data_x.transpose())
-    prod -= data_y.transpose()
-    sum_elem = np.sum(np.square(prod))
-    error = sum_elem / (2 * len_data)
-    return error
-
-
-def run_linear_regression(data_x, data_y):
-    """Implement Linear regression over the dataset
-    :param data_x  : contains our dataset
-    :param data_y  : contains the output (result vector)
-    :return        : feature for line of best fit (Feature vector)
+    Solve univariate linear regression with the direct (closed-form) method.
+    The gradient is the sum of rectangular area over the sum of square area,
+    both taken about the centroid; the intercept follows by solving c = y - mx.
     """
-    iterations = 100000
-    alpha = 0.0001550
+    rect_area = 0
+    square_area = 0
+    x_bar = np.mean(data_x)
+    y_bar = np.mean(data_y)
 
-    no_features = data_x.shape[1]
-    len_data = data_x.shape[0] - 1
+    for idx, val in enumerate(data_x):
+        rect_area += (val - x_bar) * (data_y[idx] - y_bar)
+        square_area += (val - x_bar) ** 2
 
-    theta = np.zeros((1, no_features))
+    beta_1 = float(rect_area / square_area)
+    beta_0 = y_bar - beta_1 * x_bar
 
-    for i in range(iterations):
-        theta = run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta)
-        error = sum_of_square_error(data_x, data_y, len_data, theta)
-        print(f"At Iteration {i + 1} - Error is {error:.5f}")
+    # Calculate SSE (sum of squared errors)
+    sse = sum(
+        (data_y[idx] - (beta_1 * val + beta_0)) ** 2 for idx, val in enumerate(data_x)
+    )
 
-    return theta
+    # Calculate MSE (mean squared error)
+    mse = sse / (
+        len(data_x) - 2
+    )  # Degrees of freedom is len(data_x) - 2 for simple linear regression
 
+    # Calculate half of MSE
+    half_mse = mse / 2
 
-def mean_absolute_error(predicted_y, original_y):
-    """Return sum of square error for error calculation
-    :param predicted_y   : contains the output of prediction (result vector)
-    :param original_y    : contains values of expected outcome
-    :return          : mean absolute error computed from given feature's
-    """
-    total = sum(abs(y - predicted_y[i]) for i, y in enumerate(original_y))
-    return total / len(original_y)
+    print(f"SSE is: {sse}")
+    print(f"Half MSE is: {half_mse}")
+    print(f"Coefficient is: {beta_1}")
+    print(f"Intercept is: {beta_0}")
 
 
+# Main driver function
 def main():
     """Driver function"""
     data = collect_dataset()
-
-    len_data = data.shape[0]
-    data_x = np.c_[np.ones(len_data), data[:, :-1]].astype(float)
     data_y = data[:, -1].astype(float)
-
-    theta = run_linear_regression(data_x, data_y)
-    len_result = theta.shape[1]
-    print("Resultant Feature vector : ")
-    for i in range(len_result):
-        print(f"{theta[0, i]:.5f}")
+    data_x = data[:, :-1].astype(float)
+    simple_solve(data_x, data_y)
 
 
 if __name__ == "__main__":
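
Reviewer note (not part of the patch): a minimal sketch of the closed-form
estimator that simple_solve() implements, on a tiny made-up dataset so the
arithmetic can be checked by hand. np.polyfit serves only as an independent
cross-check; the x/y values below are illustrative, not CSGO data.

    import numpy as np

    # Tiny hand-checkable dataset (illustrative values)
    x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    y = np.array([2.1, 4.0, 6.2, 8.1, 9.9])

    # Centroid of the data
    x_bar, y_bar = x.mean(), y.mean()

    # Slope: sum of rectangular area over sum of square area about the centroid
    beta_1 = ((x - x_bar) * (y - y_bar)).sum() / ((x - x_bar) ** 2).sum()

    # Intercept: solve c = y - m * x at the centroid
    beta_0 = y_bar - beta_1 * x_bar

    print(beta_1, beta_0)       # ~1.97 ~0.15
    print(np.polyfit(x, y, 1))  # same slope and intercept, fitted independently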
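
Likewise, a small sketch of what calculate_mae() computes, assuming plain 1-D
arrays rather than the np.matrix rows the patched file passes around; the
values are again made up for illustration.

    import numpy as np

    predicted_y = np.array([2.0, 4.1, 6.0])
    original_y = np.array([2.1, 4.0, 6.2])

    # Mean absolute error: average magnitude of the residuals
    mae = np.mean(np.abs(original_y - predicted_y))
    print(mae)  # 0.1333...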
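
Finally, the error bookkeeping in simple_solve(), continuing the x, y, beta_1
and beta_0 variables from the first sketch: SSE is the sum of squared
residuals, and MSE divides by n - 2 because two parameters (slope and
intercept) were estimated from the data.

    # Residuals of the fitted line from the first sketch
    residuals = y - (beta_1 * x + beta_0)

    sse = (residuals**2).sum()
    mse = sse / (len(x) - 2)  # n - 2 degrees of freedom
    print(sse, mse, mse / 2)  # simple_solve() also reports half of the MSE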