Commit 1486a54

modified linear regression to use OLS
1 parent 95398e0 commit 1486a54
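
This commit replaces the iterative gradient-descent fit (run_steep_gradient_descent plus a 100,000-iteration training loop) with the closed-form ordinary least squares (OLS) solution, i.e. the normal equation theta = (X^T X)^-1 X^T y evaluated in a single NumPy expression. As a quick sanity check — a sketch of mine, not part of the commit — the normal equation can be verified against NumPy's least-squares solver on a tiny synthetic dataset:

```python
# Sketch only, not from the commit: confirm that the normal equation the new
# code uses agrees with NumPy's least-squares solver on synthetic data.
import numpy as np

rng = np.random.default_rng(0)
x = rng.random((50, 2))                    # 50 samples, 2 features
y = 3.0 + 2.0 * x[:, 0] - 1.5 * x[:, 1]    # exact linear target: bias 3.0, weights 2.0, -1.5

X = np.c_[np.ones(x.shape[0]), x]          # prepend a ones column for the bias term
theta = np.linalg.inv(X.T @ X) @ X.T @ y   # normal equation: (X^T X)^-1 X^T y
theta_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)

print(theta)                               # approximately [3.0, 2.0, -1.5]
print(np.allclose(theta, theta_lstsq))     # True
```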

File tree

1 file changed: +14 −67 lines changed


Diff for: machine_learning/linear_regression.py

+14 −67
@@ -31,85 +31,32 @@ def collect_dataset():
     return dataset


-def run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta):
-    """Run steep gradient descent and update the feature vector accordingly
-    :param data_x   : contains the dataset
-    :param data_y   : contains the output associated with each data-entry
-    :param len_data : length of the data
-    :param alpha    : learning rate of the model
-    :param theta    : feature vector (weights for our model)
-    :return         : updated features, using
-                      curr_features - alpha * gradient (w.r.t. feature)
-    """
-    n = len_data
-
-    prod = np.dot(theta, data_x.transpose())
-    prod -= data_y.transpose()
-    sum_grad = np.dot(prod, data_x)
-    theta = theta - (alpha / n) * sum_grad
-    return theta
-
-
-def sum_of_square_error(data_x, data_y, len_data, theta):
-    """Return sum of square error for error calculation
-    :param data_x   : contains our dataset
-    :param data_y   : contains the output (result vector)
-    :param len_data : length of the dataset
-    :param theta    : contains the feature vector
-    :return         : sum of square error computed from given features
-    """
-    prod = np.dot(theta, data_x.transpose())
-    prod -= data_y.transpose()
-    sum_elem = np.sum(np.square(prod))
-    error = sum_elem / (2 * len_data)
-    return error
-
-
-def run_linear_regression(data_x, data_y):
-    """Implement linear regression over the dataset
-    :param data_x : contains our dataset
-    :param data_y : contains the output (result vector)
+def run_linear_regression_ols(data_x, data_y):
+    """Implement linear regression using OLS over the dataset
+    :param data_x : contains our dataset
+    :param data_y : contains the output (result vector)
     :return       : feature for line of best fit (feature vector)
     """
-    iterations = 100000
-    alpha = 0.0001550
-
-    no_features = data_x.shape[1]
-    len_data = data_x.shape[0] - 1
+    # Add a column of ones to data_x for the bias term
+    data_x = np.c_[np.ones(data_x.shape[0]), data_x].astype(float)

-    theta = np.zeros((1, no_features))
-
-    for i in range(iterations):
-        theta = run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta)
-        error = sum_of_square_error(data_x, data_y, len_data, theta)
-        print(f"At Iteration {i + 1} - Error is {error:.5f}")
+    # Solve the normal equation: theta = (X^T X)^-1 X^T y
+    theta = np.linalg.inv(data_x.T.dot(data_x)).dot(data_x.T).dot(data_y)

     return theta


-def mean_absolute_error(predicted_y, original_y):
-    """Return mean absolute error for error calculation
-    :param predicted_y : contains the output of prediction (result vector)
-    :param original_y  : contains values of expected outcome
-    :return            : mean absolute error computed from given values
-    """
-    total = sum(abs(y - predicted_y[i]) for i, y in enumerate(original_y))
-    return total / len(original_y)
-
-
 def main():
     """Driver function"""
     data = collect_dataset()
-
-    len_data = data.shape[0]
-    data_x = np.c_[np.ones(len_data), data[:, :-1]].astype(float)
+    data_x = data[:, :-1].astype(float)
     data_y = data[:, -1].astype(float)

-    theta = run_linear_regression(data_x, data_y)
-    len_result = theta.shape[1]
-    print("Resultant Feature vector : ")
-    for i in range(len_result):
-        print(f"{theta[0, i]:.5f}")
+    theta = run_linear_regression_ols(data_x, data_y)
+    print("Resultant feature vector (weights):")
+    theta_list = np.asarray(theta).ravel().tolist()  # flatten; works for np.matrix too
+    for value in theta_list:
+        print(f"{value:.5f}")


 if __name__ == "__main__":
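
One caveat the commit does not address: np.linalg.inv(X^T X) raises LinAlgError when the feature columns are exactly collinear (X^T X singular) and is numerically fragile when they are nearly so. A more robust variant, sketched below as an illustration rather than code from the repository (the function name is mine), delegates the solve to np.linalg.lstsq:

```python
import numpy as np


def run_linear_regression_ols_stable(data_x, data_y):
    """Hypothetical variant of run_linear_regression_ols: the same OLS fit,
    but solved with np.linalg.lstsq, which tolerates a singular or
    ill-conditioned X^T X that would break np.linalg.inv."""
    # Add a column of ones for the bias term, as in the committed version
    x = np.c_[np.ones(data_x.shape[0]), np.asarray(data_x, dtype=float)]
    y = np.asarray(data_y, dtype=float).ravel()
    theta, *_ = np.linalg.lstsq(x, y, rcond=None)
    return theta
```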

0 commit comments
