Added Customized XGBoostClassifier #12518

Closed · wants to merge 8 commits
135 changes: 135 additions & 0 deletions machine_learning/xgboost_classifier.py
@@ -1,5 +1,6 @@
# XGBoost Classifier Example
import numpy as np
from decision_tree import DecisionTree
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
from sklearn.metrics import ConfusionMatrixDisplay
@@ -21,6 +22,140 @@
return (data["data"], data["target"])


class XGBClassifier:

Check failure (GitHub Actions / ruff): machine_learning/xgboost_classifier.py:25:7: F811 Redefinition of unused `XGBClassifier` from line 8
"""
An implementation of a gradient boosting classifier inspired by XGBoost.

This implementation uses multi-class boosting with a logistic (softmax) loss.
It trains one regression tree per class on the negative gradient (residual)
at each boosting iteration.

Parameters
----------
n_estimators : int, default=100
The number of boosting rounds.
learning_rate : float, default=0.3
Step size shrinkage used in updates to prevent overfitting.
max_depth : int, default=3
Maximum depth of the regression trees.
random_state : int, default=0
Random seed.

**Important:**
Due to limitations of our custom DecisionTree (which only supports one-dimensional input),

Check failure (GitHub Actions / ruff): machine_learning/xgboost_classifier.py:45:89: E501 Line too long (94 > 88)
only the first feature (column 0) of the dataset is used when training each tree.
"""

def __init__(
self,
n_estimators: int = 100,
learning_rate: float = 0.3,
max_depth: int = 3,
random_state: int = 0,
):
self.n_estimators = n_estimators
self.learning_rate = learning_rate
self.max_depth = max_depth
self.random_state = random_state

# List of lists of trees; for each boosting round, we have one tree per class.
self.trees = []
self.num_class = None
self.initial_pred = None # Initial log-odds per class

def fit(self, X: np.ndarray, y: np.ndarray) -> None:

Check failure (GitHub Actions / ruff): machine_learning/xgboost_classifier.py:66:19: N803 Argument name `X` should be lowercase
"""
Fit the gradient boosting model.

Parameters
----------
X : np.ndarray, shape = (n_samples, n_features)
Training data.
y : np.ndarray, shape = (n_samples,)
Class labels (assumed to be integers 0, 1, ..., K-1).
"""
n_samples = X.shape[0]
self.num_class = np.unique(y).shape[0]

# One-hot encode the labels.
y_onehot = np.zeros((n_samples, self.num_class))
y_onehot[np.arange(n_samples), y] = 1
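        # e.g. y = [0, 2, 1] with num_class = 3 becomes
        # [[1, 0, 0], [0, 0, 1], [0, 1, 0]].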

# Initialize predictions F with the log class probabilities (log-odds).
class_counts = np.bincount(y, minlength=self.num_class)
class_prob = class_counts / n_samples
initial_score = np.log(class_prob + 1e-10) # add small constant to avoid log(0)
self.initial_pred = initial_score # shape: (num_class,)
F = np.tile(initial_score, (n_samples, 1)) # shape: (n_samples, num_class)
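        # Starting from the log prior means round 0 already predicts the
        # empirical class distribution, since softmax(log(p)) == p.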

Check failure (GitHub Actions / ruff): machine_learning/xgboost_classifier.py:89:9: N806 Variable `F` in function should be lowercase

# Boosting rounds.
for t in range(self.n_estimators):

Check failure (GitHub Actions / ruff): machine_learning/xgboost_classifier.py:92:13: B007 Loop control variable `t` not used within loop body
# Compute probabilities using softmax.
exp_F = np.exp(F)

Check failure (GitHub Actions / ruff): machine_learning/xgboost_classifier.py:94:13: N806 Variable `exp_F` in function should be lowercase
p = exp_F / np.sum(
exp_F, axis=1, keepdims=True
) # shape: (n_samples, num_class)
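            # Note: np.exp can overflow for large scores; a common guard is to
            # subtract F.max(axis=1, keepdims=True) before exponentiating,
            # which leaves the softmax output unchanged.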
trees_per_class = []

for k in range(self.num_class):
# The negative gradient for class k (logistic loss): (y_true - p)
gradient = y_onehot[:, k] - p[:, k]
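                # For the softmax cross-entropy loss, dL/dF_k = p_k - y_k, so
                # fitting a tree to y_k - p_k is gradient descent in function space.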

# **Note:** Due to our custom DecisionTree limitations, we use only the first feature.

Check failure (GitHub Actions / ruff): machine_learning/xgboost_classifier.py:104:89: E501 Line too long (102 > 88)
feature_for_tree = X[:, 0]

# Instantiate and train the decision tree on (feature, gradient) pair.
tree = DecisionTree(depth=self.max_depth, min_leaf_size=5)
tree.train(feature_for_tree, gradient)
# Predict the update values using the tree.
update = np.array([tree.predict(x_val) for x_val in feature_for_tree])
# Update the scores for class k.
F[:, k] += self.learning_rate * update
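                # Shrinking each update by learning_rate (eta in XGBoost) slows
                # fitting so that later rounds can correct earlier trees.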
trees_per_class.append(tree)
self.trees.append(trees_per_class)

def predict_proba(self, X: np.ndarray) -> np.ndarray:

Check failure (GitHub Actions / ruff): machine_learning/xgboost_classifier.py:117:29: N803 Argument name `X` should be lowercase
"""
Predict class probabilities for X.

Parameters
----------
X : np.ndarray, shape = (n_samples, n_features)

Returns
-------
proba : np.ndarray, shape = (n_samples, num_class)
The class probabilities.
"""
n_samples = X.shape[0]
F = np.tile(self.initial_pred, (n_samples, 1))

Check failure (GitHub Actions / ruff): machine_learning/xgboost_classifier.py:131:9: N806 Variable `F` in function should be lowercase
# Use the first feature for prediction as done in training.
feature_for_tree = X[:, 0]
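        # Replay every boosting round: each stored tree adds
        # learning_rate * tree_output to its class's running score,
        # mirroring the accumulation performed in fit().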
for trees_per_class in self.trees:
for k, tree in enumerate(trees_per_class):
update = np.array([tree.predict(x_val) for x_val in feature_for_tree])
F[:, k] += self.learning_rate * update
exp_F = np.exp(F)

Check failure (GitHub Actions / ruff): machine_learning/xgboost_classifier.py:138:9: N806 Variable `exp_F` in function should be lowercase
proba = exp_F / np.sum(exp_F, axis=1, keepdims=True)
return proba

def predict(self, X: np.ndarray) -> np.ndarray:
"""
Predict class labels for X.

Parameters
----------
X : np.ndarray, shape = (n_samples, n_features)

Returns
-------
labels : np.ndarray, shape = (n_samples,)
The predicted class labels.
"""
proba = self.predict_proba(X)
return np.argmax(proba, axis=1)
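
For orientation, here is a minimal usage sketch of the class above. It is illustrative commentary, not part of this diff; it assumes only `load_iris`, which the file already imports, and that the repository's `decision_tree` module is importable (it is imported at the top of this file).

    # Illustrative usage sketch (not part of the PR's code).
    iris = load_iris()
    features, target = iris["data"], iris["target"]

    clf = XGBClassifier(n_estimators=50, learning_rate=0.3, max_depth=3)
    clf.fit(features, target)

    proba = clf.predict_proba(features)  # shape: (n_samples, 3) on iris
    labels = clf.predict(features)       # argmax over class probabilities
    print(f"training accuracy: {(labels == target).mean():.3f}")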


def xgboost(features: np.ndarray, target: np.ndarray) -> XGBClassifier:
"""
# THIS TEST IS BROKEN!! >>> xgboost(np.array([[5.1, 3.6, 1.4, 0.2]]), np.array([0]))