Fixed: machine_learning/linear_regression.py doesn't give optimal coefficients (#8847) · PR #11684

Closed · wants to merge 6 commits
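Context for the review, not part of the diff: the linked issue is that plain batch gradient descent only approximates the least-squares optimum. For a design matrix X (leading column of ones) and target vector y, the exact optimum is the normal-equation solution

    \theta^{*} = (X^{\top} X)^{-1} X^{\top} y

which this PR adds as calculate_ols_coefficients() and compares, by mean squared error, against gradient descent and sklearn's LinearRegression.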
152 changes: 75 additions & 77 deletions machine_learning/linear_regression.py
@@ -1,21 +1,13 @@
"""
Linear regression is the most basic type of regression commonly used for
predictive analysis. The idea is pretty simple: we have a dataset and we have
features associated with it. Features should be chosen very carefully,
as they determine how well our model will be able to make future predictions.
We try to set the weights of these features, over many iterations, so that they
best fit our dataset. This particular code uses a CSGO dataset (ADR vs
Rating) and fits a line through the data points to estimate the parameters.
"""

import numpy as np
import requests
+from sklearn.linear_model import LinearRegression
+import matplotlib.pyplot as plt

[ruff I001] machine_learning/linear_regression.py:1:1: Import block is un-sorted or un-formatted


def collect_dataset():
"""Collect dataset of CSGO
The dataset contains ADR vs Rating of a Player
-    :return : dataset obtained from the link, as matrix
+    :return : dataset obtained from the link, as numpy array
"""
response = requests.get(
"https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/"
@@ -24,93 +16,99 @@
)
lines = response.text.splitlines()
data = []
-    for item in lines:
+    for item in lines[1:]:  # Skip the header
        item = item.split(",")
        data.append(item)
-    data.pop(0)  # This is for removing the labels from the list
-    dataset = np.matrix(data)
+    dataset = np.array(data, dtype=float)
return dataset
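An aside for reviewers, not part of this PR: the manual split/append loop above could also be replaced with numpy's own CSV reader. A minimal sketch, assuming the payload keeps its one-line header and two numeric columns (the helper name is hypothetical):

    import io

    import numpy as np


    def parse_dataset(csv_text: str) -> np.ndarray:
        # Hypothetical helper: parse the downloaded CSV into an (n, 2) float
        # array, skipping the header row, like collect_dataset() returns.
        return np.genfromtxt(io.StringIO(csv_text), delimiter=",", skip_header=1)

Inside collect_dataset(), parse_dataset(response.text) would replace the loop.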


-def run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta):
-    """Run steep gradient descent and update the feature vector accordingly
-    :param data_x   : contains the dataset
-    :param data_y   : contains the output associated with each data-entry
-    :param len_data : length of the data
-    :param alpha    : learning rate of the model
-    :param theta    : feature vector (weights for our model)
-    :return         : updated features, using
-                      curr_features - alpha * gradient (w.r.t. feature)
-    """
-    n = len_data
-
-    prod = np.dot(theta, data_x.transpose())
-    prod -= data_y.transpose()
-    sum_grad = np.dot(prod, data_x)
-    theta = theta - (alpha / n) * sum_grad
-    return theta

+def run_gradient_descent(X, y, learning_rate=0.0001550, iterations=100000):
+    """Run gradient descent to find approximate coefficients
+    :param X: feature matrix
+    :param y: target vector
+    :param learning_rate: learning rate for gradient descent
+    :param iterations: number of iterations
+    :return: coefficients (intercept and slope)
+    """
+    m = X.shape[0]
+    theta = np.zeros(X.shape[1])
+
+    for i in range(iterations):
+        h = np.dot(X, theta)
+        gradient = np.dot(X.T, (h - y)) / m
+        theta -= learning_rate * gradient
+
+        if i % 10000 == 0:
+            mse = np.mean((h - y) ** 2)
+            print(f"Iteration {i}: MSE = {mse:.5f}")
+
+    return theta

[ruff N803] machine_learning/linear_regression.py:26:26: Argument name `X` should be lowercase
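A quick sanity check reviewers can run (an assumed test, not part of the PR): on synthetic data from a known line, run_gradient_descent should recover coefficients close to the true intercept and slope (it will also print its periodic MSE trace).

    import numpy as np

    rng = np.random.default_rng(0)
    x = rng.uniform(0, 10, size=200)
    y_noisy = 2.0 + 3.0 * x + rng.normal(0.0, 0.1, size=200)
    features = np.c_[np.ones(x.shape[0]), x]  # intercept column plus the feature

    theta = run_gradient_descent(features, y_noisy, learning_rate=0.001, iterations=50000)
    print(theta)  # expected to land close to [2.0, 3.0]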


-def sum_of_square_error(data_x, data_y, len_data, theta):
-    """Return the sum of square error for error calculation
-    :param data_x   : contains our dataset
-    :param data_y   : contains the output (result vector)
-    :param len_data : length of the dataset
-    :param theta    : contains the feature vector
-    :return         : sum of square error computed from the given features
-    """
-    prod = np.dot(theta, data_x.transpose())
-    prod -= data_y.transpose()
-    sum_elem = np.sum(np.square(prod))
-    error = sum_elem / (2 * len_data)
-    return error


-def run_linear_regression(data_x, data_y):
-    """Implement linear regression over the dataset
-    :param data_x : contains our dataset
-    :param data_y : contains the output (result vector)
-    :return       : feature vector for the line of best fit
-    """
-    iterations = 100000
-    alpha = 0.0001550
-
-    no_features = data_x.shape[1]
-    len_data = data_x.shape[0] - 1
-
-    theta = np.zeros((1, no_features))
-
-    for i in range(iterations):
-        theta = run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta)
-        error = sum_of_square_error(data_x, data_y, len_data, theta)
-        print(f"At Iteration {i + 1} - Error is {error:.5f}")
-
-    return theta

+def calculate_ols_coefficients(X, y):
+    """Calculate optimal coefficients using the normal equation
+    :param X: feature matrix
+    :param y: target vector
+    :return: coefficients (intercept and slope)
+    """
+    return np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)

[ruff N803] machine_learning/linear_regression.py:49:32: Argument name `X` should be lowercase
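One review note on the normal equation (a suggestion, not in the diff): np.linalg.inv raises or loses precision when X.T @ X is singular or ill-conditioned, e.g. with collinear features. Solving the least-squares problem directly is the usual safer route; a sketch under a hypothetical name:

    import numpy as np


    def calculate_lstsq_coefficients(x_matrix: np.ndarray, y: np.ndarray) -> np.ndarray:
        # Hypothetical alternative: minimize ||x_matrix @ theta - y||^2 directly,
        # avoiding the explicit inverse of x_matrix.T @ x_matrix.
        coefficients, _residuals, _rank, _singular = np.linalg.lstsq(
            x_matrix, y, rcond=None
        )
        return coefficients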

-def mean_absolute_error(predicted_y, original_y):
-    """Return the mean absolute error for error calculation
-    :param predicted_y : contains the output of prediction (result vector)
-    :param original_y  : contains values of the expected outcome
-    :return            : mean absolute error computed from the given features
-    """
-    total = sum(abs(y - predicted_y[i]) for i, y in enumerate(original_y))
-    return total / len(original_y)

-def main():
-    """Driver function"""
-    data = collect_dataset()
-
-    len_data = data.shape[0]
-    data_x = np.c_[np.ones(len_data), data[:, :-1]].astype(float)
-    data_y = data[:, -1].astype(float)
-
-    theta = run_linear_regression(data_x, data_y)
-    len_result = theta.shape[1]
-    print("Resultant Feature vector : ")
-    for i in range(len_result):
-        print(f"{theta[0, i]:.5f}")

+def main():
+    """Driver function"""
+    data = collect_dataset()
+
+    X = data[:, 0].reshape(-1, 1)
+    y = data[:, 1]

[ruff N806] machine_learning/linear_regression.py:62:5: Variable `X` in function should be lowercase
+    # Add intercept term to X
+    X_with_intercept = np.c_[np.ones(X.shape[0]), X]

[ruff N806] machine_learning/linear_regression.py:66:5: Variable `X_with_intercept` in function should be lowercase

+    # Gradient Descent
+    gd_theta = run_gradient_descent(X_with_intercept, y)
+    print(
+        f"Gradient Descent coefficients: intercept = {gd_theta[0]:.5f}, slope = {gd_theta[1]:.5f}"
+    )

[ruff E501] machine_learning/linear_regression.py:71:89: Line too long (98 > 88)

+    # Ordinary Least Squares (Normal Equation)
+    ols_theta = calculate_ols_coefficients(X_with_intercept, y)
+    print(
+        f"OLS coefficients: intercept = {ols_theta[0]:.5f}, slope = {ols_theta[1]:.5f}"
+    )
+
+    # Sklearn for comparison
+    reg = LinearRegression().fit(X, y)
+    print(
+        f"Sklearn coefficients: intercept = {reg.intercept_:.5f}, slope = {reg.coef_[0]:.5f}"
+    )

[ruff E501] machine_learning/linear_regression.py:83:89: Line too long (93 > 88)
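A cheap assertion that could be added here (assumed, not in the PR): the normal-equation and sklearn fits solve the same problem and should agree to numerical precision, while gradient descent is only approximately equal.

    import numpy as np

    assert np.allclose(ols_theta, [reg.intercept_, reg.coef_[0]], atol=1e-6)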

+    # Calculate and print MSE for each method
+    gd_mse = np.mean((np.dot(X_with_intercept, gd_theta) - y) ** 2)
+    ols_mse = np.mean((np.dot(X_with_intercept, ols_theta) - y) ** 2)
+    sklearn_mse = np.mean((reg.predict(X) - y) ** 2)
+
+    print(f"Gradient Descent MSE: {gd_mse:.5f}")
+    print(f"OLS MSE: {ols_mse:.5f}")
+    print(f"Sklearn MSE: {sklearn_mse:.5f}")
+    # Plotting
+    plt.scatter(X, y, color="lightgray", label="Data points")
+    plt.plot(
+        X, np.dot(X_with_intercept, gd_theta), color="red", label="Gradient Descent"
+    )
+    plt.plot(
+        X,
+        np.dot(X_with_intercept, ols_theta),
+        color="green",
+        label="OLS (Normal Equation)",
+    )
+    plt.plot(X, reg.predict(X), color="blue", label="Sklearn")
+    plt.legend()
+    plt.xlabel("ADR")
+    plt.ylabel("Rating")
+    plt.title("Linear Regression: ADR vs Rating")
+    plt.show()
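Minor portability note (a suggestion, not in the diff): plt.show() needs an interactive backend; in CI or other headless runs the figure can be written to disk instead.

    plt.savefig("adr_vs_rating_regression.png", dpi=150)  # hypothetical filename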


if __name__ == "__main__":
    main()
5 changes: 5 additions & 0 deletions maths/pi_generator.py
@@ -34,6 +34,11 @@ def calculate_pi(limit: int) -> str:
'3.14159265358979323846264338327950288419716939937510'
>>> calculate_pi(80)
'3.14159265358979323846264338327950288419716939937510582097494459230781640628620899'
+    >>> calculate_pi(100)
+    '3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679'
+    >>> calculate_pi(150)
+    '3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128'

"""
# Variables used for the iteration process
q = 1
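The rest of the function body is collapsed in this view. For context, calculate_pi is a spigot-style digit generator; a self-contained sketch of the same idea, assuming the Gibbons unbounded-spigot formulation (an illustration, not necessarily the exact collapsed code):

    def pi_digits(limit: int) -> str:
        # Gibbons' unbounded spigot: each loop either emits a stabilized digit
        # or consumes one more term of the underlying series.
        q, r, t, k, n, m = 1, 0, 1, 1, 3, 3
        digits: list[str] = []
        while len(digits) < limit + 1:
            if 4 * q + r - t < n * t:
                digits.append(str(n))  # digit n can no longer change; emit it
                q, r, n = 10 * q, 10 * (r - n * t), (10 * (3 * q + r)) // t - 10 * n
            else:
                q, r, t, k, n, m = (
                    q * k,
                    (2 * q + r) * m,
                    t * m,
                    k + 1,
                    (q * (7 * k + 2) + r * m) // (t * m),
                    m + 2,
                )
        return digits[0] + "." + "".join(digits[1:])


    print(pi_digits(10))  # 3.1415926535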