
Commit 6df5956

Update linear_regression.py
1 parent 40f65e8 commit 6df5956

File tree

machine_learning/linear_regression.py

1 file changed (+32, -93 lines)

Diff for: machine_learning/linear_regression.py

@@ -1,117 +1,56 @@
-"""
-Linear regression is the most basic type of regression commonly used for
-predictive analysis. The idea is pretty simple: we have a dataset and we have
-features associated with it. Features should be chosen very cautiously
-as they determine how much our model will be able to make future predictions.
-We try to set the weight of these features, over many iterations, so that they best
-fit our dataset. In this particular code, I had used a CSGO dataset (ADR vs
-Rating). We try to best fit a line through dataset and estimate the parameters.
-"""
-
 import numpy as np
 import requests
 
-
 def collect_dataset():
-    """Collect dataset of CSGO
-    The dataset contains ADR vs Rating of a Player
-    :return : dataset obtained from the link, as matrix
-    """
+    """Collect dataset of CSGO (ADR vs Rating of a Player)"""
     response = requests.get(
-        "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/"
-        "master/Week1/ADRvsRating.csv",
+        "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/master/Week1/ADRvsRating.csv",
         timeout=10,
     )
-    lines = response.text.splitlines()
-    data = []
-    for item in lines:
-        item = item.split(",")
-        data.append(item)
-    data.pop(0)  # This is for removing the labels from the list
-    dataset = np.matrix(data)
-    return dataset
-
-
-def run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta):
-    """Run steep gradient descent and updates the Feature vector accordingly_
-    :param data_x : contains the dataset
-    :param data_y : contains the output associated with each data-entry
-    :param len_data : length of the data_
-    :param alpha : Learning rate of the model
-    :param theta : Feature vector (weight's for our model)
-    ;param return : Updated Feature's, using
-    curr_features - alpha_ * gradient(w.r.t. feature)
-    """
-    n = len_data
-
-    prod = np.dot(theta, data_x.transpose())
-    prod -= data_y.transpose()
-    sum_grad = np.dot(prod, data_x)
-    theta = theta - (alpha / n) * sum_grad
-    return theta
-
+    data = np.loadtxt(response.text.splitlines()[1:], delimiter=",")  # Skip the header
+    return data
 
-def sum_of_square_error(data_x, data_y, len_data, theta):
-    """Return sum of square error for error calculation
-    :param data_x : contains our dataset
-    :param data_y : contains the output (result vector)
-    :param len_data : len of the dataset
-    :param theta : contains the feature vector
-    :return : sum of square error computed from given feature's
-    """
-    prod = np.dot(theta, data_x.transpose())
-    prod -= data_y.transpose()
-    sum_elem = np.sum(np.square(prod))
-    error = sum_elem / (2 * len_data)
-    return error
+def normalize_features(data):
+    """Normalize feature values to have mean 0 and variance 1"""
+    means = np.mean(data[:, :-1], axis=0)
+    stds = np.std(data[:, :-1], axis=0)
+    data[:, :-1] = (data[:, :-1] - means) / stds
+    return data
 
-
-def run_linear_regression(data_x, data_y):
-    """Implement Linear regression over the dataset
-    :param data_x : contains our dataset
-    :param data_y : contains the output (result vector)
-    :return : feature for line of best fit (Feature vector)
-    """
-    iterations = 100000
-    alpha = 0.0001550
-
-    no_features = data_x.shape[1]
-    len_data = data_x.shape[0] - 1
-
-    theta = np.zeros((1, no_features))
+def run_gradient_descent(data_x, data_y, alpha=0.01, iterations=1000, batch_size=32):
+    """Run gradient descent with mini-batch optimization"""
+    len_data, no_features = data_x.shape
+    theta = np.zeros(no_features)
 
     for i in range(iterations):
-        theta = run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta)
-        error = sum_of_square_error(data_x, data_y, len_data, theta)
-        print(f"At Iteration {i + 1} - Error is {error:.5f}")
+        indices = np.random.choice(len_data, batch_size, replace=False)  # Randomly sample indices
+        x_batch = data_x[indices]
+        y_batch = data_y[indices]
+
+        predictions = x_batch @ theta  # Vectorized predictions
+        errors = predictions - y_batch
+
+        gradient = (1 / batch_size) * (x_batch.T @ errors)  # Vectorized gradient
+        theta -= alpha * gradient  # Update theta
+
+        if i % 100 == 0:  # Print error every 100 iterations
+            error = np.mean(errors ** 2)  # Mean Squared Error
+            print(f"Iteration {i}: MSE = {error:.5f}")
 
     return theta
 
-
-def mean_absolute_error(predicted_y, original_y):
-    """Return sum of square error for error calculation
-    :param predicted_y : contains the output of prediction (result vector)
-    :param original_y : contains values of expected outcome
-    :return : mean absolute error computed from given feature's
-    """
-    total = sum(abs(y - predicted_y[i]) for i, y in enumerate(original_y))
-    return total / len(original_y)
-
-
 def main():
     """Driver function"""
     data = collect_dataset()
+    data = normalize_features(data)  # Normalize the features
 
     len_data = data.shape[0]
-    data_x = np.c_[np.ones(len_data), data[:, :-1]].astype(float)
-    data_y = data[:, -1].astype(float)
+    data_x = np.c_[np.ones(len_data), data[:, :-1]]  # Add bias term
+    data_y = data[:, -1]
 
-    theta = run_linear_regression(data_x, data_y)
-    len_result = theta.shape[1]
+    theta = run_gradient_descent(data_x, data_y)
     print("Resultant Feature vector : ")
-    for i in range(len_result):
-        print(f"{theta[0, i]:.5f}")
-
+    print(theta)
 
 if __name__ == "__main__":
     main()
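A quick way to sanity-check the new mini-batch loop (not part of this commit) is to compare its estimate against the closed-form least-squares solution. A minimal sketch, assuming the updated file is importable as machine_learning.linear_regression and the dataset download succeeds; the seed and print labels here are illustrative:

import numpy as np

from machine_learning.linear_regression import (
    collect_dataset,
    normalize_features,
    run_gradient_descent,
)

np.random.seed(0)  # Illustrative: fixes the random mini-batch sampling for a reproducible run

data = normalize_features(collect_dataset())
data_x = np.c_[np.ones(data.shape[0]), data[:, :-1]]  # Bias column, mirroring main()
data_y = data[:, -1]

theta_gd = run_gradient_descent(data_x, data_y)  # Mini-batch gradient descent estimate
theta_ls, *_ = np.linalg.lstsq(data_x, data_y, rcond=None)  # Closed-form solution

print("gradient descent:", theta_gd)
print("least squares   :", theta_ls)

Since the mean-squared-error objective is convex, the two estimates should roughly agree; a large gap usually points at the learning rate or iteration count rather than the data.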

0 commit comments
