From cde8776a564bb5ce8081c307633cd9aaf43ac801 Mon Sep 17 00:00:00 2001
From: Siddhant Jain
Date: Sun, 5 Jan 2025 19:20:26 -0500
Subject: [PATCH 1/7] doctest in all_combinations.py

---
 backtracking/all_combinations.py | 54 ++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/backtracking/all_combinations.py b/backtracking/all_combinations.py
index 390decf3a05b..8bfba7c48fa7 100644
--- a/backtracking/all_combinations.py
+++ b/backtracking/all_combinations.py
@@ -12,14 +12,34 @@

 def combination_lists(n: int, k: int) -> list[list[int]]:
     """
+    Generates all possible combinations of k numbers out of 1 ... n using itertools.
+
     >>> combination_lists(n=4, k=2)
     [[1, 2], [1, 3], [1, 4], [2, 3], [2, 4], [3, 4]]
+    >>> combination_lists(n=5, k=3)
+    [[1, 2, 3], [1, 2, 4], [1, 2, 5], [1, 3, 4], [1, 3, 5], [1, 4, 5], [2, 3, 4], [2, 3, 5], [2, 4, 5], [3, 4, 5]]
+    >>> combination_lists(n=0, k=0)
+    [[]]
+    >>> combination_lists(n=1, k=1)
+    [[1]]
+    >>> combination_lists(n=3, k=0)
+    [[]]
+    >>> combination_lists(n=3, k=4)
+    []
+    >>> combination_lists(n=-1, k=2)
+    []
+    >>> combination_lists(n=4, k=-1)
+    Traceback (most recent call last):
+        ...
+    ValueError: r must be non-negative
     """
     return [list(x) for x in combinations(range(1, n + 1), k)]


 def generate_all_combinations(n: int, k: int) -> list[list[int]]:
     """
+    Generates all possible combinations of k numbers out of 1 ... n using backtracking.
+
     >>> generate_all_combinations(n=4, k=2)
     [[1, 2], [1, 3], [1, 4], [2, 3], [2, 4], [3, 4]]
     >>> generate_all_combinations(n=0, k=0)
@@ -34,7 +54,15 @@ def generate_all_combinations(n: int, k: int) -> list[list[int]]:
     ValueError: n must not be negative
     >>> generate_all_combinations(n=5, k=4)
     [[1, 2, 3, 4], [1, 2, 3, 5], [1, 2, 4, 5], [1, 3, 4, 5], [2, 3, 4, 5]]
+    >>> generate_all_combinations(n=3, k=3)
+    [[1, 2, 3]]
+    >>> generate_all_combinations(n=3, k=1)
+    [[1], [2], [3]]
+    >>> generate_all_combinations(n=1, k=0)
+    [[]]
+    >>> generate_all_combinations(n=1, k=1)
+    [[1]]
     >>> from itertools import combinations
     >>> all(generate_all_combinations(n, k) == combination_lists(n, k)
     ...     for n in range(1, 6) for k in range(1, 6))
     True
@@ -56,6 +84,32 @@ def create_all_state(
     current_list: list[int],
     total_list: list[list[int]],
 ) -> None:
+    """
+    Helper function to recursively build all combinations.
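+
+    Each call tries successive numbers starting at increment as the next entry
+    of current_list; when level reaches 0, a copy of current_list is appended
+    to total_list.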
+
+    >>> create_all_state(1, 4, 2, [], result := [])
+    >>> result
+    [[1, 2], [1, 3], [1, 4], [2, 3], [2, 4], [3, 4]]
+    >>> create_all_state(1, 3, 3, [], result := [])
+    >>> result
+    [[1, 2, 3]]
+    >>> create_all_state(2, 2, 1, [1], result := [])
+    >>> result
+    [[1, 2]]
+    >>> create_all_state(1, 0, 0, [], result := [])
+    >>> result
+    [[]]
+    >>> create_all_state(1, 4, 0, [1, 2], result := [])
+    >>> result
+    [[1, 2]]
+    >>> create_all_state(5, 4, 2, [1, 2], result := [])
+    >>> result
+    []
+    """
     if level == 0:
         total_list.append(current_list[:])
         return

From 3b593f1e8924bd3f29842d363758d9d9fd25689f Mon Sep 17 00:00:00 2001
From: Siddhant Jain
Date: Sun, 5 Jan 2025 19:41:10 -0500
Subject: [PATCH 2/7] added doctest in all_combinations.py

---
 backtracking/all_combinations.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/backtracking/all_combinations.py b/backtracking/all_combinations.py
index 8bfba7c48fa7..0e7a36109191 100644
--- a/backtracking/all_combinations.py
+++ b/backtracking/all_combinations.py
@@ -17,8 +17,11 @@ def combination_lists(n: int, k: int) -> list[list[int]]:
     >>> combination_lists(n=4, k=2)
     [[1, 2], [1, 3], [1, 4], [2, 3], [2, 4], [3, 4]]
     >>> combination_lists(n=5, k=3)
-    [[1, 2, 3], [1, 2, 4], [1, 2, 5], [1, 3, 4], [1, 3, 5], [1, 4, 5], [2, 3, 4], [2, 3, 5], [2, 4, 5], [3, 4, 5]]
+    [[1, 2, 3], [1, 2, 4], [1, 2, 5],
+    [1, 3, 4], [1, 3, 5], [1, 4, 5],
+    [2, 3, 4], [2, 3, 5], [2, 4, 5],
+    [3, 4, 5]]
     >>> combination_lists(n=0, k=0)
     [[]]
     >>> combination_lists(n=1, k=1)
     [[1]]

From 9a3687c1da14d79449c9ad812b72c3e2dedfcddf Mon Sep 17 00:00:00 2001
From: Siddhant Jain
Date: Sun, 5 Jan 2025 19:56:03 -0500
Subject: [PATCH 3/7] doctests in all_combinations.py

---
 backtracking/all_combinations.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/backtracking/all_combinations.py b/backtracking/all_combinations.py
index 0e7a36109191..259ff22b7f4c 100644
--- a/backtracking/all_combinations.py
+++ b/backtracking/all_combinations.py
@@ -17,10 +17,5 @@ def combination_lists(n: int, k: int) -> list[list[int]]:
     >>> combination_lists(n=4, k=2)
     [[1, 2], [1, 3], [1, 4], [2, 3], [2, 4], [3, 4]]
-    >>> combination_lists(n=5, k=3)
-    [[1, 2, 3], [1, 2, 4], [1, 2, 5],
-    [1, 3, 4], [1, 3, 5], [1, 4, 5],
-    [2, 3, 4], [2, 3, 5], [2, 4, 5],
-    [3, 4, 5]]
     >>> combination_lists(n=0, k=0)
     [[]]
     >>> combination_lists(n=1, k=1)

From ec1951dfdc3dd93b01323607be1c426c48ca78fe Mon Sep 17 00:00:00 2001
From: Siddhant Jain
Date: Sun, 5 Jan 2025 20:10:34 -0500
Subject: [PATCH 4/7] add doctest all_combinations.py

---
 backtracking/all_combinations.py | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/backtracking/all_combinations.py b/backtracking/all_combinations.py
index 259ff22b7f4c..0d0fd259bb5f 100644
--- a/backtracking/all_combinations.py
+++ b/backtracking/all_combinations.py
@@ -17,17 +17,9 @@ def combination_lists(n: int, k: int) -> list[list[int]]:
     >>> combination_lists(n=4, k=2)
     [[1, 2], [1, 3], [1, 4], [2, 3], [2, 4], [3, 4]]
-    >>> combination_lists(n=0, k=0)
-    [[]]
-    >>> combination_lists(n=1, k=1)
-    [[1]]
-    >>> combination_lists(n=3, k=0)
-    [[]]
-    >>> combination_lists(n=3, k=4)
-    []
-    >>> combination_lists(n=-1, k=2)
-    []
-    >>> combination_lists(n=4, k=-1)
-    Traceback (most recent call last):
-        ...
-    ValueError: r must be non-negative
+    >>> combination_lists(n=5, k=3)
+    [[1, 2, 3], [1, 2, 4], [1, 2, 5],
+    [1, 3, 4], [1, 3, 5], [1, 4, 5],
+    [2, 3, 4], [2, 3, 5], [2, 4, 5],
+    [3, 4, 5]]
     """
     return [list(x) for x in combinations(range(1, n + 1), k)]

From b901cc072acc38a8f36db1dd19daa82dce08a75f Mon Sep 17 00:00:00 2001
From: Siddhant Jain
Date: Sun, 5 Jan 2025 20:17:14 -0500
Subject: [PATCH 5/7] add

---
 backtracking/all_combinations.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/backtracking/all_combinations.py b/backtracking/all_combinations.py
index 0d0fd259bb5f..1d15c6263e14 100644
--- a/backtracking/all_combinations.py
+++ b/backtracking/all_combinations.py
@@ -17,9 +17,4 @@ def combination_lists(n: int, k: int) -> list[list[int]]:
     >>> combination_lists(n=4, k=2)
     [[1, 2], [1, 3], [1, 4], [2, 3], [2, 4], [3, 4]]
-    >>> combination_lists(n=5, k=3)
-    [[1, 2, 3], [1, 2, 4], [1, 2, 5],
-    [1, 3, 4], [1, 3, 5], [1, 4, 5],
-    [2, 3, 4], [2, 3, 5], [2, 4, 5],
-    [3, 4, 5]]
     """
     return [list(x) for x in combinations(range(1, n + 1), k)]

From 3a7869ba2f4b4df060301f4d3e5521f8c7b0e1a2 Mon Sep 17 00:00:00 2001
From: Siddhant Jain
Date: Sun, 12 Jan 2025 21:40:13 -0500
Subject: [PATCH 6/7] Customized XGBoost Classifier

---
 machine_learning/xgboost_classifier.py | 131 ++++++++++++++++++++++++-
 1 file changed, 130 insertions(+), 1 deletion(-)

diff --git a/machine_learning/xgboost_classifier.py b/machine_learning/xgboost_classifier.py
index 1da933cf690f..38217f93ef39 100644
--- a/machine_learning/xgboost_classifier.py
+++ b/machine_learning/xgboost_classifier.py
@@ -1,12 +1,12 @@
 # XGBoost Classifier Example
 import numpy as np
+from decision_tree import DecisionTree
 from matplotlib import pyplot as plt
 from sklearn.datasets import load_iris
 from sklearn.metrics import ConfusionMatrixDisplay
 from sklearn.model_selection import train_test_split
 from xgboost import XGBClassifier
-

 def data_handling(data: dict) -> tuple:
     # Split dataset into features and target
     # data is features
@@ -20,6 +20,135 @@ def data_handling(data: dict) -> tuple:
     """
     return (data["data"], data["target"])

+class XGBClassifier:
+    """
+    An implementation of a gradient boosting classifier inspired by XGBoost.
+
+    This implementation uses multi-class boosting with a logistic (softmax) loss.
+    It trains one regression tree per class on the negative gradient (residual)
+    at each boosting iteration.
+
+    Parameters
+    ----------
+    n_estimators : int, default=100
+        The number of boosting rounds.
+    learning_rate : float, default=0.3
+        Step size shrinkage used in updates to prevent overfitting.
+    max_depth : int, default=3
+        Maximum depth of the regression trees.
+    random_state : int, default=0
+        Random seed.
+
+    **Important:**
+    Due to limitations of our custom DecisionTree (which only supports one-dimensional input),
+    only the first feature (column 0) of the dataset is used when training each tree.
+    """
+
+    def __init__(self, n_estimators: int = 100, learning_rate: float = 0.3,
+                 max_depth: int = 3, random_state: int = 0):
+        self.n_estimators = n_estimators
+        self.learning_rate = learning_rate
+        self.max_depth = max_depth
+        self.random_state = random_state
+
+        # List of lists of trees; for each boosting round, we have one tree per class.
+        self.trees = []
+        self.num_class = None
+        self.initial_pred = None  # Initial log-odds per class
+
+    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
+        """
+        Fit the gradient boosting model.
+
+        Parameters
+        ----------
+        X : np.ndarray, shape = (n_samples, n_features)
+            Training data.
+        y : np.ndarray, shape = (n_samples,)
+            Class labels (assumed to be integers 0, 1, ..., K-1).
+        """
+        n_samples = X.shape[0]
+        self.num_class = np.unique(y).shape[0]
+
+        # One-hot encode the labels.
+        y_onehot = np.zeros((n_samples, self.num_class))
+        y_onehot[np.arange(n_samples), y] = 1
+
+        # Initialize predictions F with the log class probabilities (log-odds).
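+        # Worked example with illustrative numbers (not from any real dataset):
+        # 100 samples with class counts [50, 30, 20] give class probabilities
+        # [0.5, 0.3, 0.2], so the initial score per class is
+        # log([0.5, 0.3, 0.2]) ~= [-0.693, -1.204, -1.609].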
+        class_counts = np.bincount(y, minlength=self.num_class)
+        class_prob = class_counts / n_samples
+        initial_score = np.log(class_prob + 1e-10)  # add small constant to avoid log(0)
+        self.initial_pred = initial_score  # shape: (num_class,)
+        F = np.tile(initial_score, (n_samples, 1))  # shape: (n_samples, num_class)
+
+        # Boosting rounds.
+        for t in range(self.n_estimators):
+            # Compute probabilities using softmax.
+            exp_F = np.exp(F)
+            p = exp_F / np.sum(exp_F, axis=1, keepdims=True)  # shape: (n_samples, num_class)
+            trees_per_class = []
+
+            for k in range(self.num_class):
+                # The negative gradient for class k (logistic loss): (y_true - p)
+                gradient = y_onehot[:, k] - p[:, k]
+
+                # **Note:** Due to our custom DecisionTree limitations, we use only the first feature.
+                feature_for_tree = X[:, 0]
+
+                # Instantiate and train the decision tree on (feature, gradient) pair.
+                tree = DecisionTree(depth=self.max_depth, min_leaf_size=5)
+                tree.train(feature_for_tree, gradient)
+                # Predict the update values using the tree.
+                update = np.array([tree.predict(x_val) for x_val in feature_for_tree])
+                # Update the scores for class k.
+                F[:, k] += self.learning_rate * update
+                trees_per_class.append(tree)
+            self.trees.append(trees_per_class)
+
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        """
+        Predict class probabilities for X.
+
+        Parameters
+        ----------
+        X : np.ndarray, shape = (n_samples, n_features)
+
+        Returns
+        -------
+        proba : np.ndarray, shape = (n_samples, num_class)
+            The class probabilities.
+        """
+        n_samples = X.shape[0]
+        F = np.tile(self.initial_pred, (n_samples, 1))
+        # Use the first feature for prediction as done in training.
+        feature_for_tree = X[:, 0]
+        for trees_per_class in self.trees:
+            for k, tree in enumerate(trees_per_class):
+                update = np.array([tree.predict(x_val) for x_val in feature_for_tree])
+                F[:, k] += self.learning_rate * update
+        exp_F = np.exp(F)
+        proba = exp_F / np.sum(exp_F, axis=1, keepdims=True)
+        return proba
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        """
+        Predict class labels for X.
+
+        Parameters
+        ----------
+        X : np.ndarray, shape = (n_samples, n_features)
+
+        Returns
+        -------
+        labels : np.ndarray, shape = (n_samples,)
+            The predicted class labels.
+        """
+        proba = self.predict_proba(X)
+        return np.argmax(proba, axis=1)

 def xgboost(features: np.ndarray, target: np.ndarray) -> XGBClassifier:
     """

From 02b2859ab36b602bdaf85973c4f0dd1473da4e8e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 13 Jan 2025 02:44:45 +0000
Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/xgboost_classifier.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/machine_learning/xgboost_classifier.py b/machine_learning/xgboost_classifier.py
index 38217f93ef39..df31cb4631b9 100644
--- a/machine_learning/xgboost_classifier.py
+++ b/machine_learning/xgboost_classifier.py
@@ -7,6 +7,7 @@
 from sklearn.model_selection import train_test_split
 from xgboost import XGBClassifier

+
 def data_handling(data: dict) -> tuple:
     # Split dataset into features and target
     # data is features
@@ -20,6 +21,7 @@ def data_handling(data: dict) -> tuple:
     """
     return (data["data"], data["target"])

+
 class XGBClassifier:
     """
     An implementation of a gradient boosting classifier inspired by XGBoost.
@@ -38,14 +40,19 @@ class XGBClassifier:
         Maximum depth of the regression trees.
     random_state : int, default=0
         Random seed.
-
-    **Important:**
+
+    **Important:**
     Due to limitations of our custom DecisionTree (which only supports one-dimensional input),
     only the first feature (column 0) of the dataset is used when training each tree.
     """

-    def __init__(self, n_estimators: int = 100, learning_rate: float = 0.3,
-                 max_depth: int = 3, random_state: int = 0):
+    def __init__(
+        self,
+        n_estimators: int = 100,
+        learning_rate: float = 0.3,
+        max_depth: int = 3,
+        random_state: int = 0,
+    ):
         self.n_estimators = n_estimators
         self.learning_rate = learning_rate
         self.max_depth = max_depth
@@ -89,7 +96,9 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> None:
         for t in range(self.n_estimators):
             # Compute probabilities using softmax.
             exp_F = np.exp(F)
-            p = exp_F / np.sum(exp_F, axis=1, keepdims=True)  # shape: (n_samples, num_class)
+            p = exp_F / np.sum(
+                exp_F, axis=1, keepdims=True
+            )  # shape: (n_samples, num_class)
             trees_per_class = []

             for k in range(self.num_class):
@@ -150,6 +159,7 @@ def predict(self, X: np.ndarray) -> np.ndarray:
         proba = self.predict_proba(X)
         return np.argmax(proba, axis=1)

+
 def xgboost(features: np.ndarray, target: np.ndarray) -> XGBClassifier:
     """
     # THIS TEST IS BROKEN!! >>> xgboost(np.array([[5.1, 3.6, 1.4, 0.2]]), np.array([0]))
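
A minimal usage sketch of the custom classifier added in [PATCH 6/7], assuming
the fully patched module is run from machine_learning/ so that
decision_tree.DecisionTree is importable; the variable names below are
illustrative, not part of the patches:

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    iris = load_iris()
    features, target = iris["data"], iris["target"]
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=0
    )

    # The custom class shadows the imported xgboost.XGBClassifier, so this
    # resolves to the gradient-boosting implementation defined above.
    clf = XGBClassifier(n_estimators=50, learning_rate=0.3, max_depth=3)
    clf.fit(x_train, y_train)
    print(clf.predict(x_test))        # predicted labels in {0, 1, 2}
    print(clf.predict_proba(x_test))  # softmax class probabilities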