Skip to content

Commit 87262e9

Browse files
committed
Issue TheAlgorithms#8067: Implemented XGBoostRegressor from scratch
1 parent 8b4342d commit 87262e9

File tree

1 file changed

+28
-23
lines changed

1 file changed

+28
-23
lines changed

machine_learning/xgboost_regressor.py

+28-23
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,13 @@
11
import numpy as np
22
import pandas as pd
3+
from collections import defaultdict
4+
import math
35

4-
class XGBoostRegressor:
5-
'''Custom implementation of XGBoost regressor.
6+
class XGBoostRegressor():
7+
'''Implementation of XGBoost regressor.
68
79
This implementation includes a simplified version of the XGBoost algorithm
8-
for regression tasks. It employs gradient boosting with decision trees as base learners.
10+
for regression tasks. It includes gradient boosting with decision trees as base learners.
911
'''
1012

1113
def __init__(self, params=None, random_seed=None):
@@ -16,11 +18,11 @@ def __init__(self, params=None, random_seed=None):
1618
random_seed (int): Seed for random number generation.
1719
'''
1820
# Set hyperparameters with defaults
19-
self.params = params or {}
20-
self.subsample = self.params.get('subsample', 1.0)
21-
self.learning_rate = self.params.get('learning_rate', 0.3)
22-
self.base_prediction = self.params.get('base_score', 0.5)
23-
self.max_depth = self.params.get('max_depth', 5)
21+
self.params = defaultdict(lambda: None, params)
22+
self.subsample = self.params['subsample'] or 1.0
23+
self.learning_rate = self.params['learning_rate'] or 0.3
24+
self.base_prediction = self.params['base_score'] or 0.5
25+
self.max_depth = self.params['max_depth'] or 5
2426
self.random_seed = random_seed
2527
self.boosters = []
2628

@@ -44,7 +46,7 @@ def fit(self, X, y, objective, num_boost_round, verbose=False):
4446
if self.subsample < 1.0:
4547
sample_idxs = np.random.choice(len(y), size=int(self.subsample * len(y)), replace=False)
4648
gradients, hessians = gradients[sample_idxs], hessians[sample_idxs]
47-
booster = TreeBooster(X, gradients, hessians, self.params, self.max_depth, self.random_seed)
49+
booster = DecisionTreeBooster(X, gradients, hessians, self.params, self.max_depth, self.random_seed)
4850
# Update predictions using learning rate and booster predictions
4951
current_predictions += self.learning_rate * booster.predict(X)
5052
self.boosters.append(booster)
@@ -65,14 +67,14 @@ def predict(self, X):
6567
np.sum([booster.predict(X) for booster in self.boosters], axis=0))
6668

6769

68-
class TreeBooster:
70+
class DecisionTreeBooster:
6971
'''Decision tree booster for XGBoost regressor.'''
7072

7173
def __init__(self, X, g, h, params, max_depth, random_seed=None):
7274
'''Initialize a decision tree booster.
7375
7476
Parameters:
75-
X (pd.DataFrame): Feature matrix.
77+
X (np.ndarray): Feature matrix.
7678
g (np.ndarray): Gradient values.
7779
h (np.ndarray): Hessian values.
7880
params (dict): Hyperparameters for the booster.
@@ -91,7 +93,7 @@ def __init__(self, X, g, h, params, max_depth, random_seed=None):
9193
np.random.seed(self.random_seed)
9294

9395
# Set data and indices
94-
self.X, self.g, self.h = X.values, g, h
96+
self.X, self.g, self.h = X, g, h
9597
self.n, self.c = X.shape[0], X.shape[1]
9698
self.idxs = np.arange(self.n)
9799

@@ -103,6 +105,7 @@ def __init__(self, X, g, h, params, max_depth, random_seed=None):
103105
if self.max_depth > 0:
104106
self._maybe_insert_child_nodes()
105107

108+
106109
@property
107110
def is_leaf(self):
108111
'''Check if the node is a leaf.'''
@@ -115,21 +118,22 @@ def _maybe_insert_child_nodes(self):
115118
if self.is_leaf:
116119
return
117120
# Split the data based on the best feature and threshold
118-
x = self.X[self.idxs, self.split_feature_idx]
121+
x = self.X.values[self.idxs, self.split_feature_idx]
119122
left_idx = np.nonzero(x <= self.threshold)[0]
120123
right_idx = np.nonzero(x > self.threshold)[0]
121124
# Recur for left and right subtrees
122-
self.left = TreeBooster(self.X[left_idx], self.g[left_idx], self.h[left_idx], self.params,
123-
self.max_depth - 1, self.random_seed)
124-
self.right = TreeBooster(self.X[right_idx], self.g[right_idx], self.h[right_idx], self.params,
125-
self.max_depth - 1, self.random_seed)
125+
self.left = DecisionTreeBooster(self.X, self.g, self.h, self.params,
126+
self.max_depth - 1, self.idxs[left_idx])
127+
self.right = DecisionTreeBooster(self.X, self.g, self.h, self.params,
128+
self.max_depth - 1, self.idxs[right_idx])
126129

127130
def _find_better_split(self, feature_idx):
128131
'''Find the best split for a feature.'''
129-
x = self.X[self.idxs, feature_idx]
132+
x = self.X.values[self.idxs, feature_idx]
133+
g, h = self.g[self.idxs], self.h[self.idxs]
130134
sort_idx = np.argsort(x)
131-
sort_g, sort_h, sort_x = self.g[self.idxs][sort_idx], self.h[self.idxs][sort_idx], x[sort_idx]
132-
sum_g, sum_h = np.sum(sort_g), np.sum(sort_h)
135+
sort_g, sort_h, sort_x = g[sort_idx], h[sort_idx], x[sort_idx]
136+
sum_g, sum_h = g.sum(), h.sum()
133137
sum_g_right, sum_h_right = sum_g, sum_h
134138
sum_g_left, sum_h_left = 0., 0.
135139

@@ -147,19 +151,20 @@ def _find_better_split(self, feature_idx):
147151
gain = 0.5 * ((sum_g_left**2 / (sum_h_left + self.reg_lambda))
148152
+ (sum_g_right**2 / (sum_h_right + self.reg_lambda))
149153
- (sum_g**2 / (sum_h + self.reg_lambda))
150-
) - self.gamma/2 # Eq(7) in the xgboost paper
154+
) - self.gamma/2 # Eq(7) in the xgboost paper
151155
if gain > self.best_score_so_far:
152156
self.split_feature_idx = feature_idx
153157
self.best_score_so_far = gain
154158
self.threshold = (x_i + x_i_next) / 2
155159

156160
def predict(self, X):
157161
'''Make predictions using the trained booster.'''
158-
return np.array([self._predict_row(row) for row in X])
162+
return np.array([self._predict_row(row) for _, row in X.iterrows()])
159163

160164
def _predict_row(self, row):
161165
'''Recursively predict a single data point.'''
162166
if self.is_leaf:
163167
return self.value
164-
child = self.left if row[self.split_feature_idx] <= self.threshold else self.right
168+
child = self.left if row[self.split_feature_idx] <= self.threshold \
169+
else self.right
165170
return child._predict_row(row)

0 commit comments

Comments
 (0)