From 7e69e5f626101d4d571510e1522ffc70f0768708 Mon Sep 17 00:00:00 2001 From: Epik-Whale463 Date: Thu, 26 Sep 2024 12:58:40 +0530 Subject: [PATCH 1/5] Add tests to pi_generator #9947 --- maths/pi_generator.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/maths/pi_generator.py b/maths/pi_generator.py index 97f2c540c1ce..a754457572fc 100644 --- a/maths/pi_generator.py +++ b/maths/pi_generator.py @@ -34,6 +34,13 @@ def calculate_pi(limit: int) -> str: '3.14159265358979323846264338327950288419716939937510' >>> calculate_pi(80) '3.14159265358979323846264338327950288419716939937510582097494459230781640628620899' + >>> calculate_pi(100) + '3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679' + >>> calculate_pi(150) + '3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128481117450284102701938521105559644622948954930381964428810975665933446128475648233785' + >>> calculate_pi(200) + '3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128481117450284102701938521105559644622948954930381964428810975665933446128475648233785661758005171150968760911718138199318110487701902711116020288271298794483246506877457713886116427171479924442928230863465674818467669405132000568127145263560827785771342757789609173637178721468440901224953431338055097754720181942947403241217' + """ # Variables used for the iteration process q = 1 From 0e1b238efb84c049d229a05b509efab630531372 Mon Sep 17 00:00:00 2001 From: Epik-Whale463 Date: Thu, 26 Sep 2024 13:13:13 +0530 Subject: [PATCH 2/5] Changed tests in pi_generator --- maths/pi_generator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/maths/pi_generator.py b/maths/pi_generator.py index a754457572fc..6744a99fff2b 100644 --- a/maths/pi_generator.py +++ b/maths/pi_generator.py @@ -38,9 +38,7 @@ def calculate_pi(limit: int) -> str: '3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679' >>> calculate_pi(150) '3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128481117450284102701938521105559644622948954930381964428810975665933446128475648233785' - >>> calculate_pi(200) - '3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128481117450284102701938521105559644622948954930381964428810975665933446128475648233785661758005171150968760911718138199318110487701902711116020288271298794483246506877457713886116427171479924442928230863465674818467669405132000568127145263560827785771342757789609173637178721468440901224953431338055097754720181942947403241217' - + """ # Variables used for the iteration process q = 1 From 25a13fee51b101ce377778a1fe89b2aa6aa3c275 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 26 Sep 2024 07:44:18 +0000 Subject: [PATCH 3/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- maths/pi_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maths/pi_generator.py b/maths/pi_generator.py index 6744a99fff2b..b633d97d3983 100644 --- a/maths/pi_generator.py +++ b/maths/pi_generator.py @@ -38,7 +38,7 @@ def calculate_pi(limit: int) -> str: '3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679' >>> calculate_pi(150) '3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128481117450284102701938521105559644622948954930381964428810975665933446128475648233785' - + """ # Variables used for the iteration process q = 1 From ece859d400758ca16c21a2979426648672c90f5e Mon Sep 17 00:00:00 2001 From: Epik-Whale463 Date: Wed, 2 Oct 2024 23:13:22 +0530 Subject: [PATCH 4/5] Refactor linear_regression.py: Add OLS, fix gradient descent, compare methods Improve accuracy, add sklearn benchmark, and visualize results --- machine_learning/linear_regression.py | 156 +++++++++++--------------- 1 file changed, 68 insertions(+), 88 deletions(-) diff --git a/machine_learning/linear_regression.py b/machine_learning/linear_regression.py index 839a5366d1cc..cfcd17b48f72 100644 --- a/machine_learning/linear_regression.py +++ b/machine_learning/linear_regression.py @@ -1,21 +1,12 @@ -""" -Linear regression is the most basic type of regression commonly used for -predictive analysis. The idea is pretty simple: we have a dataset and we have -features associated with it. Features should be chosen very cautiously -as they determine how much our model will be able to make future predictions. -We try to set the weight of these features, over many iterations, so that they best -fit our dataset. In this particular code, I had used a CSGO dataset (ADR vs -Rating). We try to best fit a line through dataset and estimate the parameters. -""" - import numpy as np import requests - +from sklearn.linear_model import LinearRegression +import matplotlib.pyplot as plt def collect_dataset(): """Collect dataset of CSGO The dataset contains ADR vs Rating of a Player - :return : dataset obtained from the link, as matrix + :return : dataset obtained from the link, as numpy array """ response = requests.get( "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/" @@ -24,94 +15,83 @@ def collect_dataset(): ) lines = response.text.splitlines() data = [] - for item in lines: + for item in lines[1:]: # Skip the header item = item.split(",") data.append(item) - data.pop(0) # This is for removing the labels from the list - dataset = np.matrix(data) + dataset = np.array(data, dtype=float) return dataset - -def run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta): - """Run steep gradient descent and updates the Feature vector accordingly_ - :param data_x : contains the dataset - :param data_y : contains the output associated with each data-entry - :param len_data : length of the data_ - :param alpha : Learning rate of the model - :param theta : Feature vector (weight's for our model) - ;param return : Updated Feature's, using - curr_features - alpha_ * gradient(w.r.t. feature) - """ - n = len_data - - prod = np.dot(theta, data_x.transpose()) - prod -= data_y.transpose() - sum_grad = np.dot(prod, data_x) - theta = theta - (alpha / n) * sum_grad - return theta - - -def sum_of_square_error(data_x, data_y, len_data, theta): - """Return sum of square error for error calculation - :param data_x : contains our dataset - :param data_y : contains the output (result vector) - :param len_data : len of the dataset - :param theta : contains the feature vector - :return : sum of square error computed from given feature's - """ - prod = np.dot(theta, data_x.transpose()) - prod -= data_y.transpose() - sum_elem = np.sum(np.square(prod)) - error = sum_elem / (2 * len_data) - return error - - -def run_linear_regression(data_x, data_y): - """Implement Linear regression over the dataset - :param data_x : contains our dataset - :param data_y : contains the output (result vector) - :return : feature for line of best fit (Feature vector) +def run_gradient_descent(X, y, learning_rate=0.0001550, iterations=100000): + """Run gradient descent to find approximate coefficients + :param X: feature matrix + :param y: target vector + :param learning_rate: learning rate for gradient descent + :param iterations: number of iterations + :return: coefficients (intercept and slope) """ - iterations = 100000 - alpha = 0.0001550 - - no_features = data_x.shape[1] - len_data = data_x.shape[0] - 1 - - theta = np.zeros((1, no_features)) - + m = X.shape[0] + theta = np.zeros(X.shape[1]) + for i in range(iterations): - theta = run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta) - error = sum_of_square_error(data_x, data_y, len_data, theta) - print(f"At Iteration {i + 1} - Error is {error:.5f}") - + h = np.dot(X, theta) + gradient = np.dot(X.T, (h - y)) / m + theta -= learning_rate * gradient + + if i % 10000 == 0: + mse = np.mean((h - y) ** 2) + print(f"Iteration {i}: MSE = {mse:.5f}") + return theta - -def mean_absolute_error(predicted_y, original_y): - """Return sum of square error for error calculation - :param predicted_y : contains the output of prediction (result vector) - :param original_y : contains values of expected outcome - :return : mean absolute error computed from given feature's +def calculate_ols_coefficients(X, y): + """Calculate optimal coefficients using the normal equation + :param X: feature matrix + :param y: target vector + :return: coefficients (intercept and slope) """ - total = sum(abs(y - predicted_y[i]) for i, y in enumerate(original_y)) - return total / len(original_y) - + return np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y) def main(): """Driver function""" data = collect_dataset() - - len_data = data.shape[0] - data_x = np.c_[np.ones(len_data), data[:, :-1]].astype(float) - data_y = data[:, -1].astype(float) - - theta = run_linear_regression(data_x, data_y) - len_result = theta.shape[1] - print("Resultant Feature vector : ") - for i in range(len_result): - print(f"{theta[0, i]:.5f}") - + + X = data[:, 0].reshape(-1, 1) + y = data[:, 1] + + # Add intercept term to X + X_with_intercept = np.c_[np.ones(X.shape[0]), X] + + # Gradient Descent + gd_theta = run_gradient_descent(X_with_intercept, y) + print(f"Gradient Descent coefficients: intercept = {gd_theta[0]:.5f}, slope = {gd_theta[1]:.5f}") + + # Ordinary Least Squares (Normal Equation) + ols_theta = calculate_ols_coefficients(X_with_intercept, y) + print(f"OLS coefficients: intercept = {ols_theta[0]:.5f}, slope = {ols_theta[1]:.5f}") + + # Sklearn for comparison + reg = LinearRegression().fit(X, y) + print(f"Sklearn coefficients: intercept = {reg.intercept_:.5f}, slope = {reg.coef_[0]:.5f}") + + # Calculate and print MSE for each method + gd_mse = np.mean((np.dot(X_with_intercept, gd_theta) - y) ** 2) + ols_mse = np.mean((np.dot(X_with_intercept, ols_theta) - y) ** 2) + sklearn_mse = np.mean((reg.predict(X) - y) ** 2) + + print(f"Gradient Descent MSE: {gd_mse:.5f}") + print(f"OLS MSE: {ols_mse:.5f}") + print(f"Sklearn MSE: {sklearn_mse:.5f}") + + # Plotting + plt.scatter(X, y, color="lightgray", label="Data points") + plt.plot(X, np.dot(X_with_intercept, gd_theta), color="red", label="Gradient Descent") + plt.plot(X, np.dot(X_with_intercept, ols_theta), color="green", label="OLS (Normal Equation)") + plt.plot(X, reg.predict(X), color="blue", label="Sklearn") + plt.legend() + plt.xlabel("ADR") + plt.ylabel("Rating") + plt.title("Linear Regression: ADR vs Rating") + plt.show() if __name__ == "__main__": - main() + main() \ No newline at end of file From 3a07610ae6e4cdb2bcdb7d1cf08b8ae5d22933cd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 2 Oct 2024 17:46:58 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/linear_regression.py | 52 ++++++++++++++++++--------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/machine_learning/linear_regression.py b/machine_learning/linear_regression.py index cfcd17b48f72..b71dcc07ecb9 100644 --- a/machine_learning/linear_regression.py +++ b/machine_learning/linear_regression.py @@ -3,6 +3,7 @@ from sklearn.linear_model import LinearRegression import matplotlib.pyplot as plt + def collect_dataset(): """Collect dataset of CSGO The dataset contains ADR vs Rating of a Player @@ -21,6 +22,7 @@ def collect_dataset(): dataset = np.array(data, dtype=float) return dataset + def run_gradient_descent(X, y, learning_rate=0.0001550, iterations=100000): """Run gradient descent to find approximate coefficients :param X: feature matrix @@ -31,18 +33,19 @@ def run_gradient_descent(X, y, learning_rate=0.0001550, iterations=100000): """ m = X.shape[0] theta = np.zeros(X.shape[1]) - + for i in range(iterations): h = np.dot(X, theta) gradient = np.dot(X.T, (h - y)) / m theta -= learning_rate * gradient - + if i % 10000 == 0: mse = np.mean((h - y) ** 2) print(f"Iteration {i}: MSE = {mse:.5f}") - + return theta + def calculate_ols_coefficients(X, y): """Calculate optimal coefficients using the normal equation :param X: feature matrix @@ -51,41 +54,55 @@ def calculate_ols_coefficients(X, y): """ return np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y) + def main(): """Driver function""" data = collect_dataset() - + X = data[:, 0].reshape(-1, 1) y = data[:, 1] - + # Add intercept term to X X_with_intercept = np.c_[np.ones(X.shape[0]), X] - + # Gradient Descent gd_theta = run_gradient_descent(X_with_intercept, y) - print(f"Gradient Descent coefficients: intercept = {gd_theta[0]:.5f}, slope = {gd_theta[1]:.5f}") - + print( + f"Gradient Descent coefficients: intercept = {gd_theta[0]:.5f}, slope = {gd_theta[1]:.5f}" + ) + # Ordinary Least Squares (Normal Equation) ols_theta = calculate_ols_coefficients(X_with_intercept, y) - print(f"OLS coefficients: intercept = {ols_theta[0]:.5f}, slope = {ols_theta[1]:.5f}") - + print( + f"OLS coefficients: intercept = {ols_theta[0]:.5f}, slope = {ols_theta[1]:.5f}" + ) + # Sklearn for comparison reg = LinearRegression().fit(X, y) - print(f"Sklearn coefficients: intercept = {reg.intercept_:.5f}, slope = {reg.coef_[0]:.5f}") - + print( + f"Sklearn coefficients: intercept = {reg.intercept_:.5f}, slope = {reg.coef_[0]:.5f}" + ) + # Calculate and print MSE for each method gd_mse = np.mean((np.dot(X_with_intercept, gd_theta) - y) ** 2) ols_mse = np.mean((np.dot(X_with_intercept, ols_theta) - y) ** 2) sklearn_mse = np.mean((reg.predict(X) - y) ** 2) - + print(f"Gradient Descent MSE: {gd_mse:.5f}") print(f"OLS MSE: {ols_mse:.5f}") print(f"Sklearn MSE: {sklearn_mse:.5f}") - + # Plotting plt.scatter(X, y, color="lightgray", label="Data points") - plt.plot(X, np.dot(X_with_intercept, gd_theta), color="red", label="Gradient Descent") - plt.plot(X, np.dot(X_with_intercept, ols_theta), color="green", label="OLS (Normal Equation)") + plt.plot( + X, np.dot(X_with_intercept, gd_theta), color="red", label="Gradient Descent" + ) + plt.plot( + X, + np.dot(X_with_intercept, ols_theta), + color="green", + label="OLS (Normal Equation)", + ) plt.plot(X, reg.predict(X), color="blue", label="Sklearn") plt.legend() plt.xlabel("ADR") @@ -93,5 +110,6 @@ def main(): plt.title("Linear Regression: ADR vs Rating") plt.show() + if __name__ == "__main__": - main() \ No newline at end of file + main()