
Commit 3a7869b

Customized XGBoost Classifier
1 parent 616e284 commit 3a7869b

File tree

machine_learning/xgboost_classifier.py (1 file changed: +126, -1 lines)
@@ -1,12 +1,12 @@
 # XGBoost Classifier Example
 import numpy as np
+from decision_tree import DecisionTree
 from matplotlib import pyplot as plt
 from sklearn.datasets import load_iris
 from sklearn.metrics import ConfusionMatrixDisplay
 from sklearn.model_selection import train_test_split
 from xgboost import XGBClassifier

-
 def data_handling(data: dict) -> tuple:
     # Split dataset into features and target
     # data is features
@@ -20,6 +20,131 @@ def data_handling(data: dict) -> tuple:
     """
     return (data["data"], data["target"])

+class XGBClassifier:
+    """
+    An implementation of a gradient boosting classifier inspired by XGBoost.
+
+    This implementation uses multi-class boosting with a logistic (softmax) loss.
+    It trains one regression tree per class on the negative gradient (residual)
+    at each boosting iteration.
+
+    Parameters
+    ----------
+    n_estimators : int, default=100
+        The number of boosting rounds.
+    learning_rate : float, default=0.3
+        Step size shrinkage used in updates to prevent overfitting.
+    max_depth : int, default=3
+        Maximum depth of the regression trees.
+    random_state : int, default=0
+        Random seed.
+
+    **Important:**
+    Due to limitations of our custom DecisionTree (which only supports
+    one-dimensional input), only the first feature (column 0) of the dataset
+    is used when training each tree.
+    """
+
+    def __init__(self, n_estimators: int = 100, learning_rate: float = 0.3,
+                 max_depth: int = 3, random_state: int = 0):
+        self.n_estimators = n_estimators
+        self.learning_rate = learning_rate
+        self.max_depth = max_depth
+        self.random_state = random_state
+
+        # List of lists of trees; for each boosting round, we have one tree per class.
+        self.trees = []
+        self.num_class = None
+        self.initial_pred = None  # Initial log-odds per class
+
+    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
+        """
+        Fit the gradient boosting model.
+
+        Parameters
+        ----------
+        X : np.ndarray, shape = (n_samples, n_features)
+            Training data.
+        y : np.ndarray, shape = (n_samples,)
+            Class labels (assumed to be integers 0, 1, ..., K-1).
+        """
+        n_samples = X.shape[0]
+        self.num_class = np.unique(y).shape[0]
+
+        # One-hot encode the labels.
+        y_onehot = np.zeros((n_samples, self.num_class))
+        y_onehot[np.arange(n_samples), y] = 1
+
+        # Initialize predictions F with the log class probabilities (log-odds).
+        class_counts = np.bincount(y, minlength=self.num_class)
+        class_prob = class_counts / n_samples
+        initial_score = np.log(class_prob + 1e-10)  # add small constant to avoid log(0)
+        self.initial_pred = initial_score  # shape: (num_class,)
+        F = np.tile(initial_score, (n_samples, 1))  # shape: (n_samples, num_class)
+
+        # Boosting rounds.
+        for t in range(self.n_estimators):
+            # Compute probabilities using softmax.
+            exp_F = np.exp(F)
+            p = exp_F / np.sum(exp_F, axis=1, keepdims=True)  # shape: (n_samples, num_class)
+            trees_per_class = []
+
+            for k in range(self.num_class):
+                # The negative gradient for class k (logistic loss): (y_true - p)
+                gradient = y_onehot[:, k] - p[:, k]
+
+                # **Note:** Due to our custom DecisionTree limitations, we use only the first feature.
+                feature_for_tree = X[:, 0]
+
+                # Instantiate and train the decision tree on the (feature, gradient) pair.
+                tree = DecisionTree(depth=self.max_depth, min_leaf_size=5)
+                tree.train(feature_for_tree, gradient)
+                # Predict the update values using the tree.
+                update = np.array([tree.predict(x_val) for x_val in feature_for_tree])
+                # Update the scores for class k.
+                F[:, k] += self.learning_rate * update
+                trees_per_class.append(tree)
+            self.trees.append(trees_per_class)
+
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        """
+        Predict class probabilities for X.
+
+        Parameters
+        ----------
+        X : np.ndarray, shape = (n_samples, n_features)
+
+        Returns
+        -------
+        proba : np.ndarray, shape = (n_samples, num_class)
+            The class probabilities.
+        """
+        n_samples = X.shape[0]
+        F = np.tile(self.initial_pred, (n_samples, 1))
+        # Use the first feature for prediction, as done in training.
+        feature_for_tree = X[:, 0]
+        for trees_per_class in self.trees:
+            for k, tree in enumerate(trees_per_class):
+                update = np.array([tree.predict(x_val) for x_val in feature_for_tree])
+                F[:, k] += self.learning_rate * update
+        exp_F = np.exp(F)
+        proba = exp_F / np.sum(exp_F, axis=1, keepdims=True)
+        return proba
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        """
+        Predict class labels for X.
+
+        Parameters
+        ----------
+        X : np.ndarray, shape = (n_samples, n_features)
+
+        Returns
+        -------
+        labels : np.ndarray, shape = (n_samples,)
+            The predicted class labels.
+        """
+        proba = self.predict_proba(X)
+        return np.argmax(proba, axis=1)

 def xgboost(features: np.ndarray, target: np.ndarray) -> XGBClassifier:
     """
