import numpy as np
import pandas as pd
+ from collections import defaultdict
+ import math

- class XGBoostRegressor:
-     '''Custom implementation of XGBoost regressor.
+ class XGBoostRegressor():
+     '''Implementation of XGBoost regressor.

    This implementation includes a simplified version of the XGBoost algorithm
-     for regression tasks. It employs gradient boosting with decision trees as base learners.
+     for regression tasks. It uses gradient boosting with decision trees as base learners.
    '''

    def __init__(self, params=None, random_seed=None):
@@ -16,11 +18,11 @@ def __init__(self, params=None, random_seed=None):
            random_seed (int): Seed for random number generation.
        '''
        # Set hyperparameters with defaults
-       self.params = params or {}
-       self.subsample = self.params.get('subsample', 1.0)
-       self.learning_rate = self.params.get('learning_rate', 0.3)
-       self.base_prediction = self.params.get('base_score', 0.5)
-       self.max_depth = self.params.get('max_depth', 5)
+       self.params = defaultdict(lambda: None, params or {})
+       self.subsample = self.params['subsample'] or 1.0
+       self.learning_rate = self.params['learning_rate'] or 0.3
+       self.base_prediction = self.params['base_score'] or 0.5
+       self.max_depth = self.params['max_depth'] or 5
        self.random_seed = random_seed
        self.boosters = []
@@ -44,7 +46,7 @@ def fit(self, X, y, objective, num_boost_round, verbose=False):
+       sample_idxs = None
        if self.subsample < 1.0:
            sample_idxs = np.random.choice(len(y), size=int(self.subsample * len(y)), replace=False)
-           gradients, hessians = gradients[sample_idxs], hessians[sample_idxs]
-       booster = TreeBooster(X, gradients, hessians, self.params, self.max_depth, self.random_seed)
+       booster = DecisionTreeBooster(X, gradients, hessians, self.params, self.max_depth,
+                                     idxs=sample_idxs, random_seed=self.random_seed)
        # Update predictions using learning rate and booster predictions
        current_predictions += self.learning_rate * booster.predict(X)
        self.boosters.append(booster)
@@ -65,14 +67,14 @@ def predict(self, X):
            np.sum([booster.predict(X) for booster in self.boosters], axis=0))


- class TreeBooster:
+ class DecisionTreeBooster:
    '''Decision tree booster for XGBoost regressor.'''

-   def __init__(self, X, g, h, params, max_depth, random_seed=None):
+   def __init__(self, X, g, h, params, max_depth, idxs=None, random_seed=None):
        '''Initialize a decision tree booster.

        Parameters:
            X (pd.DataFrame): Feature matrix.
            g (np.ndarray): Gradient values.
            h (np.ndarray): Hessian values.
            params (dict): Hyperparameters for the booster.
+           idxs (np.ndarray): Indices of the rows this node is fit on (all rows if None).
@@ -91,7 +93,7 @@ def __init__(self, X, g, h, params, max_depth, random_seed=None):
        np.random.seed(self.random_seed)

        # Set data and indices
-       self.X, self.g, self.h = X.values, g, h
-       self.n, self.c = X.shape[0], X.shape[1]
-       self.idxs = np.arange(self.n)
+       self.X, self.g, self.h = X, g, h
+       self.idxs = idxs if idxs is not None else np.arange(X.shape[0])
+       self.n, self.c = len(self.idxs), X.shape[1]
@@ -103,6 +105,7 @@ def __init__(self, X, g, h, params, max_depth, random_seed=None):
        if self.max_depth > 0:
            self._maybe_insert_child_nodes()

+
    @property
    def is_leaf(self):
        '''Check if the node is a leaf.'''
@@ -115,21 +118,22 @@ def _maybe_insert_child_nodes(self):
        if self.is_leaf:
            return
        # Split the data based on the best feature and threshold
-       x = self.X[self.idxs, self.split_feature_idx]
+       x = self.X.values[self.idxs, self.split_feature_idx]
        left_idx = np.nonzero(x <= self.threshold)[0]
        right_idx = np.nonzero(x > self.threshold)[0]
        # Recur for left and right subtrees
-       self.left = TreeBooster(self.X[left_idx], self.g[left_idx], self.h[left_idx], self.params,
-                               self.max_depth - 1, self.random_seed)
-       self.right = TreeBooster(self.X[right_idx], self.g[right_idx], self.h[right_idx], self.params,
-                                self.max_depth - 1, self.random_seed)
+       self.left = DecisionTreeBooster(self.X, self.g, self.h, self.params,
+                                       self.max_depth - 1, self.idxs[left_idx])
+       self.right = DecisionTreeBooster(self.X, self.g, self.h, self.params,
+                                        self.max_depth - 1, self.idxs[right_idx])
    def _find_better_split(self, feature_idx):
        '''Find the best split for a feature.'''
-       x = self.X[self.idxs, feature_idx]
+       x = self.X.values[self.idxs, feature_idx]
+       g, h = self.g[self.idxs], self.h[self.idxs]
        sort_idx = np.argsort(x)
-       sort_g, sort_h, sort_x = self.g[self.idxs][sort_idx], self.h[self.idxs][sort_idx], x[sort_idx]
-       sum_g, sum_h = np.sum(sort_g), np.sum(sort_h)
+       sort_g, sort_h, sort_x = g[sort_idx], h[sort_idx], x[sort_idx]
+       sum_g, sum_h = g.sum(), h.sum()
        sum_g_right, sum_h_right = sum_g, sum_h
        sum_g_left, sum_h_left = 0., 0.
@@ -147,19 +151,20 @@ def _find_better_split(self, feature_idx):
            gain = 0.5 * ((sum_g_left**2 / (sum_h_left + self.reg_lambda))
                          + (sum_g_right**2 / (sum_h_right + self.reg_lambda))
                          - (sum_g**2 / (sum_h + self.reg_lambda))
                          ) - self.gamma / 2  # Eq(7) in the XGBoost paper
            if gain > self.best_score_so_far:
                self.split_feature_idx = feature_idx
                self.best_score_so_far = gain
                self.threshold = (x_i + x_i_next) / 2
    def predict(self, X):
        '''Make predictions using the trained booster.'''
-       return np.array([self._predict_row(row) for row in X])
+       return np.array([self._predict_row(row) for _, row in X.iterrows()])

    def _predict_row(self, row):
        '''Recursively predict a single data point.'''
        if self.is_leaf:
            return self.value
-       child = self.left if row[self.split_feature_idx] <= self.threshold else self.right
+       child = self.left if row[self.split_feature_idx] <= self.threshold \
+               else self.right
        return child._predict_row(row)
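For reference, the `Eq(7)` comment in `_find_better_split` points at the split-gain formula from the XGBoost paper (Chen & Guestrin, 2016), evaluated with `self.reg_lambda` as $\lambda$ and `self.gamma` as $\gamma$:

$$\mathcal{L}_{\text{split}} = \frac{1}{2}\left[\frac{G_L^2}{H_L+\lambda} + \frac{G_R^2}{H_R+\lambda} - \frac{(G_L+G_R)^2}{H_L+H_R+\lambda}\right] - \gamma$$

where $G_L, H_L$ and $G_R, H_R$ are the sums of gradients and hessians routed to the left and right child. The code subtracts `self.gamma / 2` rather than $\gamma$; that is a constant offset at each node, so it rescales the reported gain but does not change which split is selected.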
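A minimal usage sketch of the renamed classes, assuming the elided `fit` body follows the usual gradient-boosting loop and that the objective argument exposes `loss`, `gradient`, and `hessian` methods; `SquaredErrorObjective` below is illustrative, not part of this commit:

```python
import numpy as np
import pandas as pd

class SquaredErrorObjective:
    '''Hypothetical least-squares objective: gradient is the residual, hessian is 1.'''
    def loss(self, y, pred):
        return np.mean((y - pred) ** 2)
    def gradient(self, y, pred):
        return pred - y
    def hessian(self, y, pred):
        return np.ones(len(y))

rng = np.random.default_rng(42)
X = pd.DataFrame(rng.random((200, 4)))   # default integer column labels
y = 3 * X[0].values - 2 * X[1].values + rng.normal(scale=0.1, size=200)

params = {'learning_rate': 0.1, 'max_depth': 4, 'subsample': 0.8}
model = XGBoostRegressor(params, random_seed=42)
model.fit(X, y, SquaredErrorObjective(), num_boost_round=50)
preds = model.predict(X)                 # ndarray, one prediction per row
```

Sharing `X`, `g`, and `h` across the whole tree and passing only `idxs` to each child, rather than slicing copies at every split, keeps node construction cheap: every `DecisionTreeBooster` views the same arrays and owns just an index set.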