issues TheAlgorithms#8067 Implemented XGBoostRegressor from scratch

YugantGotmare · YugantGotmare · commit 87262e94f9c6 · 2023-10-05T11:55:12.000+05:30
diff --git a/machine_learning/xgboost_regressor.py b/machine_learning/xgboost_regressor.py
@@ -1,11 +1,13 @@
 import numpy as np
 import pandas as pd
+from collections import defaultdict
+import math
 
-class XGBoostRegressor:
-    '''Custom implementation of XGBoost regressor.
+class XGBoostRegressor():
+    '''Implementation of XGBoost regressor.
     
     This implementation includes a simplified version of the XGBoost algorithm
-    for regression tasks. It employs gradient boosting with decision trees as base learners.
+    for regression tasks. It includes gradient boosting with decision trees as base learners.
     '''
     
     def __init__(self, params=None, random_seed=None):
@@ -16,11 +18,11 @@ def __init__(self, params=None, random_seed=None):
             random_seed (int): Seed for random number generation.
         '''
         # Set hyperparameters with defaults
-        self.params = params or {}
-        self.subsample = self.params.get('subsample', 1.0)
-        self.learning_rate = self.params.get('learning_rate', 0.3)
-        self.base_prediction = self.params.get('base_score', 0.5)
-        self.max_depth = self.params.get('max_depth', 5)
+        self.params = defaultdict(lambda: None, params)
+        self.subsample = self.params['subsample'] or 1.0
+        self.learning_rate = self.params['learning_rate'] or 0.3
+        self.base_prediction = self.params['base_score'] or 0.5
+        self.max_depth = self.params['max_depth'] or 5
         self.random_seed = random_seed
         self.boosters = []
                 
@@ -44,7 +46,7 @@ def fit(self, X, y, objective, num_boost_round, verbose=False):
             if self.subsample < 1.0:
                 sample_idxs = np.random.choice(len(y), size=int(self.subsample * len(y)), replace=False)
                 gradients, hessians = gradients[sample_idxs], hessians[sample_idxs]
-            booster = TreeBooster(X, gradients, hessians, self.params, self.max_depth, self.random_seed)
+            booster = DecisionTreeBooster(X, gradients, hessians, self.params, self.max_depth, self.random_seed)
             # Update predictions using learning rate and booster predictions
             current_predictions += self.learning_rate * booster.predict(X)
             self.boosters.append(booster)
@@ -65,14 +67,14 @@ def predict(self, X):
                 np.sum([booster.predict(X) for booster in self.boosters], axis=0))
 
 
-class TreeBooster:
+class DecisionTreeBooster:
     '''Decision tree booster for XGBoost regressor.'''
     
     def __init__(self, X, g, h, params, max_depth, random_seed=None):
         '''Initialize a decision tree booster.
         
         Parameters:
-            X (pd.DataFrame): Feature matrix.
+            X (np.ndarray): Feature matrix.
             g (np.ndarray): Gradient values.
             h (np.ndarray): Hessian values.
             params (dict): Hyperparameters for the booster.
@@ -91,7 +93,7 @@ def __init__(self, X, g, h, params, max_depth, random_seed=None):
         np.random.seed(self.random_seed)
         
         # Set data and indices
-        self.X, self.g, self.h = X.values, g, h
+        self.X, self.g, self.h = X, g, h
         self.n, self.c = X.shape[0], X.shape[1]
         self.idxs = np.arange(self.n)
         
@@ -103,6 +105,7 @@ def __init__(self, X, g, h, params, max_depth, random_seed=None):
         if self.max_depth > 0:
             self._maybe_insert_child_nodes()
 
+
     @property
     def is_leaf(self):
         '''Check if the node is a leaf.'''
@@ -115,21 +118,22 @@ def _maybe_insert_child_nodes(self):
         if self.is_leaf:
             return
         # Split the data based on the best feature and threshold
-        x = self.X[self.idxs, self.split_feature_idx]
+        x = self.X.values[self.idxs, self.split_feature_idx]
         left_idx = np.nonzero(x <= self.threshold)[0]
         right_idx = np.nonzero(x > self.threshold)[0]
         # Recur for left and right subtrees
-        self.left = TreeBooster(self.X[left_idx], self.g[left_idx], self.h[left_idx], self.params, 
-                                self.max_depth - 1, self.random_seed)
-        self.right = TreeBooster(self.X[right_idx], self.g[right_idx], self.h[right_idx], self.params, 
-                                 self.max_depth - 1, self.random_seed)
+        self.left = DecisionTreeBooster(self.X, self.g, self.h, self.params, 
+                                self.max_depth - 1, self.idxs[left_idx])
+        self.right = DecisionTreeBooster(self.X, self.g, self.h, self.params, 
+                                 self.max_depth - 1, self.idxs[right_idx])
 
     def _find_better_split(self, feature_idx):
         '''Find the best split for a feature.'''
-        x = self.X[self.idxs, feature_idx]
+        x = self.X.values[self.idxs, feature_idx]
+        g, h = self.g[self.idxs], self.h[self.idxs]
         sort_idx = np.argsort(x)
-        sort_g, sort_h, sort_x = self.g[self.idxs][sort_idx], self.h[self.idxs][sort_idx], x[sort_idx]
-        sum_g, sum_h = np.sum(sort_g), np.sum(sort_h)
+        sort_g, sort_h, sort_x = g[sort_idx], h[sort_idx], x[sort_idx]
+        sum_g, sum_h = g.sum(), h.sum()
         sum_g_right, sum_h_right = sum_g, sum_h
         sum_g_left, sum_h_left = 0., 0.
 
@@ -147,19 +151,20 @@ def _find_better_split(self, feature_idx):
             gain = 0.5 * ((sum_g_left**2 / (sum_h_left + self.reg_lambda))
                           + (sum_g_right**2 / (sum_h_right + self.reg_lambda))
                           - (sum_g**2 / (sum_h + self.reg_lambda))
-                          ) - self.gamma/2  # Eq(7) in the xgboost paper
+                          ) - self.gamma/2 # Eq(7) in the xgboost paper
             if gain > self.best_score_so_far: 
                 self.split_feature_idx = feature_idx
                 self.best_score_so_far = gain
                 self.threshold = (x_i + x_i_next) / 2
                 
     def predict(self, X):
         '''Make predictions using the trained booster.'''
-        return np.array([self._predict_row(row) for row in X])
+        return np.array([self._predict_row(row) for _, row in X.iterrows()])
 
     def _predict_row(self, row):
         '''Recursively predict a single data point.'''
         if self.is_leaf: 
             return self.value
-        child = self.left if row[self.split_feature_idx] <= self.threshold else self.right
+        child = self.left if row[self.split_feature_idx] <= self.threshold \
+            else self.right
         return child._predict_row(row)