From 4f80ed608ca4c0a8c7e0f40508639d897e009cc0 Mon Sep 17 00:00:00 2001
From: Rajkanwar Singh
Date: Thu, 5 Oct 2023 08:35:38 +0530
Subject: [PATCH 01/24] made random forest classifier from the ground up; does not use sklearn

---
 machine_learning/random_forest_classifier.py | 117 +++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 machine_learning/random_forest_classifier.py

diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py
new file mode 100644
index 000000000000..6070ef0fc41b
--- /dev/null
+++ b/machine_learning/random_forest_classifier.py
@@ -0,0 +1,117 @@
+import numpy as np
+
+# Define a decision tree class
+class DecisionTree:
+    def __init__(self, max_depth=None):
+        self.max_depth = max_depth
+
+    def fit(self, X, y):
+        self.tree = self._build_tree(X, y, depth=0)
+
+    def _build_tree(self, X, y, depth):
+        # Check termination conditions
+        if depth == self.max_depth or len(np.unique(y)) == 1:
+            return (np.bincount(y).argmax(),)  # Return a tuple with the class label
+
+        # Find the best split
+        num_features = X.shape[1]
+        best_split_feature = None
+        best_split_value = None
+        best_split_score = np.inf
+
+        for feature in range(num_features):
+            unique_values = np.unique(X[:, feature])
+            for value in unique_values:
+                left_mask = X[:, feature] <= value
+                right_mask = X[:, feature] > value
+
+                if len(y[left_mask]) == 0 or len(y[right_mask]) == 0:
+                    continue
+
+                left_score = self._calculate_gini(y[left_mask])
+                right_score = self._calculate_gini(y[right_mask])
+                weighted_score = (
+                    len(y[left_mask]) * left_score + len(y[right_mask]) * right_score
+                ) / len(y)
+
+                if weighted_score < best_split_score:
+                    best_split_score = weighted_score
+                    best_split_feature = feature
+                    best_split_value = value
+
+        if best_split_feature is None:
+            return (np.bincount(y).argmax(),)  # Return a tuple with the class label
+
+        left_split = self._build_tree(
+            X[X[:, best_split_feature] <= best_split_value],
+            y[X[:, best_split_feature] <= best_split_value],
+            depth + 1,
+        )
+        right_split = self._build_tree(
+            X[X[:, best_split_feature] > best_split_value],
+            y[X[:, best_split_feature] > best_split_value],
+            depth + 1,
+        )
+
+        return (best_split_feature, best_split_value, left_split, right_split)
+
+    def _calculate_gini(self, y):
+        if len(y) == 0:
+            return 0
+        p_i = np.bincount(y) / len(y)
+        return 1 - np.sum(p_i**2)
+
+    def predict(self, X):
+        return np.array([self._predict_tree(x, self.tree) for x in X])
+
+    def _predict_tree(self, x, tree):
+        if len(tree) == 1:
+            return tree[0]  # Leaf node, return class label
+        feature, value, left, right = tree
+        if x[feature] <= value:
+            return self._predict_tree(x, left)
+        else:
+            return self._predict_tree(x, right)
+
+
+# Random Forest Classifier
+class RandomForestClassifier:
+    def __init__(self, n_estimators=100, max_depth=None):
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.trees = []
+
+    def fit(self, X, y):
+        for _ in range(self.n_estimators):
+            # Randomly sample data with replacement
+            indices = np.random.choice(len(X), len(X), replace=True)
+            X_subset = X[indices]
+            y_subset = y[indices]
+
+            tree = DecisionTree(max_depth=self.max_depth)
+            tree.fit(X_subset, y_subset)
+            self.trees.append(tree)
+
+    def predict(self, X):
+        predictions = np.array([tree.predict(X) for tree in self.trees])
+        # Use majority vote for classification
+        return np.apply_along_axis(
+            lambda x: np.bincount(x).argmax(), axis=0, arr=predictions
+        )
+
+
+# Example usage:
+if __name__ == "__main__":
+    # 
Generate some random data for demonstration + np.random.seed(42) + X = np.random.rand(100, 2) + y = (X[:, 0] + X[:, 1] > 1).astype(int) + + # Create and train a Random Forest classifier + rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=None) + rf_classifier.fit(X, y) + + # Make predictions + new_data = np.array([[0.7, 0.3], [0.2, 0.8]]) + predictions = rf_classifier.predict(new_data) + print(predictions) From 78fae70a2b53742f737270aae86a1bfd5c617b50 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 08:42:16 +0530 Subject: [PATCH 02/24] added doctests to my random forest classifier --- machine_learning/random_forest_classifier.py | 140 ++++++++++++------- 1 file changed, 86 insertions(+), 54 deletions(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 6070ef0fc41b..2f67c9b3536d 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -1,19 +1,55 @@ import numpy as np -# Define a decision tree class class DecisionTree: + """ + Decision Tree classifier. + + Parameters: + max_depth (int): Maximum depth of the tree. If None, the tree grows until pure nodes or min_samples_split is reached. + + Attributes: + tree (tuple): The decision tree structure. + + Examples: + >>> np.random.seed(42) + >>> X = np.random.rand(100, 2) + >>> y = (X[:, 0] + X[:, 1] > 1).astype(int) + >>> tree = DecisionTree(max_depth=3) + >>> tree.fit(X, y) + >>> predictions = tree.predict(np.array([[0.7, 0.3], [0.2, 0.8]])) + """ + def __init__(self, max_depth=None): self.max_depth = max_depth def fit(self, X, y): + """ + Fit the decision tree to the training data. + + Parameters: + X (numpy.ndarray): The input features. + y (numpy.ndarray): The target labels. + + Returns: + None + """ self.tree = self._build_tree(X, y, depth=0) def _build_tree(self, X, y, depth): - # Check termination conditions + """ + Recursively build the decision tree. + + Parameters: + X (numpy.ndarray): The input features. + y (numpy.ndarray): The target labels. + depth (int): The current depth of the tree. + + Returns: + tuple: The decision tree structure. 
+ """ if depth == self.max_depth or len(np.unique(y)) == 1: - return (np.bincount(y).argmax(),) # Return a tuple with the class label + return (np.bincount(y).argmax(),) - # Find the best split num_features = X.shape[1] best_split_feature = None best_split_value = None @@ -30,9 +66,7 @@ def _build_tree(self, X, y, depth): left_score = self._calculate_gini(y[left_mask]) right_score = self._calculate_gini(y[right_mask]) - weighted_score = ( - len(y[left_mask]) * left_score + len(y[right_mask]) * right_score - ) / len(y) + weighted_score = (len(y[left_mask]) * left_score + len(y[right_mask]) * right_score) / len(y) if weighted_score < best_split_score: best_split_score = weighted_score @@ -40,78 +74,76 @@ def _build_tree(self, X, y, depth): best_split_value = value if best_split_feature is None: - return (np.bincount(y).argmax(),) # Return a tuple with the class label - - left_split = self._build_tree( - X[X[:, best_split_feature] <= best_split_value], - y[X[:, best_split_feature] <= best_split_value], - depth + 1, - ) - right_split = self._build_tree( - X[X[:, best_split_feature] > best_split_value], - y[X[:, best_split_feature] > best_split_value], - depth + 1, - ) + return (np.bincount(y).argmax(),) + + left_split = self._build_tree(X[X[:, best_split_feature] <= best_split_value], y[X[:, best_split_feature] <= best_split_value], depth + 1) + right_split = self._build_tree(X[X[:, best_split_feature] > best_split_value], y[X[:, best_split_feature] > best_split_value], depth + 1) return (best_split_feature, best_split_value, left_split, right_split) def _calculate_gini(self, y): + """ + Calculate the Gini impurity for a given set of labels. + + Parameters: + y (numpy.ndarray): An array of labels. + + Returns: + float: The Gini impurity. + """ if len(y) == 0: return 0 p_i = np.bincount(y) / len(y) return 1 - np.sum(p_i**2) def predict(self, X): + """ + Make predictions for input features. + + Parameters: + X (numpy.ndarray): The input features. + + Returns: + numpy.ndarray: Predicted labels. + """ return np.array([self._predict_tree(x, self.tree) for x in X]) def _predict_tree(self, x, tree): + """ + Recursively traverse the decision tree to make predictions. + + Parameters: + x (numpy.ndarray): Input features for a single data point. + tree (tuple): The decision tree structure. + + Returns: + int: Predicted label. 
+ """ if len(tree) == 1: - return tree[0] # Leaf node, return class label + return tree[0] feature, value, left, right = tree if x[feature] <= value: return self._predict_tree(x, left) else: return self._predict_tree(x, right) - -# Random Forest Classifier -class RandomForestClassifier: - def __init__(self, n_estimators=100, max_depth=None): - self.n_estimators = n_estimators - self.max_depth = max_depth - self.trees = [] - - def fit(self, X, y): - for _ in range(self.n_estimators): - # Randomly sample data with replacement - indices = np.random.choice(len(X), len(X), replace=True) - X_subset = X[indices] - y_subset = y[indices] - - tree = DecisionTree(max_depth=self.max_depth) - tree.fit(X_subset, y_subset) - self.trees.append(tree) - - def predict(self, X): - predictions = np.array([tree.predict(X) for tree in self.trees]) - # Use majority vote for classification - return np.apply_along_axis( - lambda x: np.bincount(x).argmax(), axis=0, arr=predictions - ) +if __name__ == "__main__": + import doctest + doctest.testmod() # Example usage: -if __name__ == "__main__": +#if __name__ == "__main__": # Generate some random data for demonstration - np.random.seed(42) - X = np.random.rand(100, 2) - y = (X[:, 0] + X[:, 1] > 1).astype(int) + #np.random.seed(42) + #X = np.random.rand(100, 2) + #y = (X[:, 0] + X[:, 1] > 1).astype(int) # Create and train a Random Forest classifier - rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=None) - rf_classifier.fit(X, y) + #rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=None) + #rf_classifier.fit(X, y) # Make predictions - new_data = np.array([[0.7, 0.3], [0.2, 0.8]]) - predictions = rf_classifier.predict(new_data) - print(predictions) + #new_data = np.array([[0.7, 0.3], [0.2, 0.8]]) + #predictions = rf_classifier.predict(new_data) + #print(predictions) From 2297d98de4b942dd1c0b1a1a4b5b2dd283d99fa3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Oct 2023 03:15:16 +0000 Subject: [PATCH 03/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/random_forest_classifier.py | 47 +++++++++++++------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 2f67c9b3536d..524096179f51 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -1,5 +1,6 @@ import numpy as np + class DecisionTree: """ Decision Tree classifier. 
@@ -66,7 +67,9 @@ def _build_tree(self, X, y, depth): left_score = self._calculate_gini(y[left_mask]) right_score = self._calculate_gini(y[right_mask]) - weighted_score = (len(y[left_mask]) * left_score + len(y[right_mask]) * right_score) / len(y) + weighted_score = ( + len(y[left_mask]) * left_score + len(y[right_mask]) * right_score + ) / len(y) if weighted_score < best_split_score: best_split_score = weighted_score @@ -76,8 +79,16 @@ def _build_tree(self, X, y, depth): if best_split_feature is None: return (np.bincount(y).argmax(),) - left_split = self._build_tree(X[X[:, best_split_feature] <= best_split_value], y[X[:, best_split_feature] <= best_split_value], depth + 1) - right_split = self._build_tree(X[X[:, best_split_feature] > best_split_value], y[X[:, best_split_feature] > best_split_value], depth + 1) + left_split = self._build_tree( + X[X[:, best_split_feature] <= best_split_value], + y[X[:, best_split_feature] <= best_split_value], + depth + 1, + ) + right_split = self._build_tree( + X[X[:, best_split_feature] > best_split_value], + y[X[:, best_split_feature] > best_split_value], + depth + 1, + ) return (best_split_feature, best_split_value, left_split, right_split) @@ -127,23 +138,25 @@ def _predict_tree(self, x, tree): else: return self._predict_tree(x, right) + if __name__ == "__main__": import doctest + doctest.testmod() # Example usage: -#if __name__ == "__main__": - # Generate some random data for demonstration - #np.random.seed(42) - #X = np.random.rand(100, 2) - #y = (X[:, 0] + X[:, 1] > 1).astype(int) - - # Create and train a Random Forest classifier - #rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=None) - #rf_classifier.fit(X, y) - - # Make predictions - #new_data = np.array([[0.7, 0.3], [0.2, 0.8]]) - #predictions = rf_classifier.predict(new_data) - #print(predictions) +# if __name__ == "__main__": +# Generate some random data for demonstration +# np.random.seed(42) +# X = np.random.rand(100, 2) +# y = (X[:, 0] + X[:, 1] > 1).astype(int) + +# Create and train a Random Forest classifier +# rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=None) +# rf_classifier.fit(X, y) + +# Make predictions +# new_data = np.array([[0.7, 0.3], [0.2, 0.8]]) +# predictions = rf_classifier.predict(new_data) +# print(predictions) From c0358551510f291a55830f8f78776cc7978600e9 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 14:36:18 +0530 Subject: [PATCH 04/24] fixed errors --- machine_learning/random_forest_classifier.py | 92 ++++++-------------- 1 file changed, 29 insertions(+), 63 deletions(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 524096179f51..222506a3e090 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -1,48 +1,40 @@ import numpy as np - +from typing import Optional, List class DecisionTree: """ Decision Tree classifier. Parameters: - max_depth (int): Maximum depth of the tree. If None, the tree grows until pure nodes or min_samples_split is reached. + max_depth (Optional[int]): Maximum depth of the tree. If None, the tree grows until pure nodes or min_samples_split is reached. Attributes: tree (tuple): The decision tree structure. 
- - Examples: - >>> np.random.seed(42) - >>> X = np.random.rand(100, 2) - >>> y = (X[:, 0] + X[:, 1] > 1).astype(int) - >>> tree = DecisionTree(max_depth=3) - >>> tree.fit(X, y) - >>> predictions = tree.predict(np.array([[0.7, 0.3], [0.2, 0.8]])) """ - def __init__(self, max_depth=None): + def __init__(self, max_depth: Optional[int] = None) -> None: self.max_depth = max_depth - def fit(self, X, y): + def fit(self, X: List[np.ndarray], y: List[int]) -> None: """ Fit the decision tree to the training data. Parameters: - X (numpy.ndarray): The input features. - y (numpy.ndarray): The target labels. + X (List[numpy.ndarray]): The input features. + y (List[int]): The target labels. Returns: None """ self.tree = self._build_tree(X, y, depth=0) - def _build_tree(self, X, y, depth): + def _build_tree(self, X: List[np.ndarray], y: List[int], depth: int) -> tuple: """ Recursively build the decision tree. Parameters: - X (numpy.ndarray): The input features. - y (numpy.ndarray): The target labels. + X (List[numpy.ndarray]): The input features. + y (List[int]): The target labels. depth (int): The current depth of the tree. Returns: @@ -51,25 +43,24 @@ def _build_tree(self, X, y, depth): if depth == self.max_depth or len(np.unique(y)) == 1: return (np.bincount(y).argmax(),) - num_features = X.shape[1] + num_features = len(X[0]) best_split_feature = None best_split_value = None best_split_score = np.inf for feature in range(num_features): - unique_values = np.unique(X[:, feature]) + unique_values = np.unique(np.array(X)[:, feature]) for value in unique_values: - left_mask = X[:, feature] <= value - right_mask = X[:, feature] > value + left_mask = np.array(X)[:, feature] <= value + right_mask = np.array(X)[:, feature] > value - if len(y[left_mask]) == 0 or len(y[right_mask]) == 0: + if len(np.array(y)[left_mask]) == 0 or len(np.array(y)[right_mask]) == 0: continue - left_score = self._calculate_gini(y[left_mask]) - right_score = self._calculate_gini(y[right_mask]) - weighted_score = ( - len(y[left_mask]) * left_score + len(y[right_mask]) * right_score - ) / len(y) + left_score = self._calculate_gini(np.array(y)[left_mask]) + right_score = self._calculate_gini(np.array(y)[right_mask]) + weighted_score = (len(np.array(y)[left_mask]) * left_score + + len(np.array(y)[right_mask]) * right_score) / len(y) if weighted_score < best_split_score: best_split_score = weighted_score @@ -79,25 +70,19 @@ def _build_tree(self, X, y, depth): if best_split_feature is None: return (np.bincount(y).argmax(),) - left_split = self._build_tree( - X[X[:, best_split_feature] <= best_split_value], - y[X[:, best_split_feature] <= best_split_value], - depth + 1, - ) - right_split = self._build_tree( - X[X[:, best_split_feature] > best_split_value], - y[X[:, best_split_feature] > best_split_value], - depth + 1, - ) + left_split = self._build_tree([np.array(X)[np.array(X)[:, best_split_feature] <= best_split_value]], + [np.array(y)[np.array(X)[:, best_split_feature] <= best_split_value]], depth + 1) + right_split = self._build_tree([np.array(X)[np.array(X)[:, best_split_feature] > best_split_value]], + [np.array(y)[np.array(X)[:, best_split_feature] > best_split_value]], depth + 1) return (best_split_feature, best_split_value, left_split, right_split) - def _calculate_gini(self, y): + def _calculate_gini(self, y: List[int]) -> float: """ Calculate the Gini impurity for a given set of labels. Parameters: - y (numpy.ndarray): An array of labels. + y (List[int]): A list of labels. Returns: float: The Gini impurity. 
@@ -107,19 +92,19 @@ def _calculate_gini(self, y): p_i = np.bincount(y) / len(y) return 1 - np.sum(p_i**2) - def predict(self, X): + def predict(self, X: List[np.ndarray]) -> List[int]: """ Make predictions for input features. Parameters: - X (numpy.ndarray): The input features. + X (List[numpy.ndarray]): The input features. Returns: - numpy.ndarray: Predicted labels. + List[int]: Predicted labels. """ - return np.array([self._predict_tree(x, self.tree) for x in X]) + return [self._predict_tree(x, self.tree) for x in X] - def _predict_tree(self, x, tree): + def _predict_tree(self, x: np.ndarray, tree: tuple) -> int: """ Recursively traverse the decision tree to make predictions. @@ -138,25 +123,6 @@ def _predict_tree(self, x, tree): else: return self._predict_tree(x, right) - if __name__ == "__main__": import doctest - doctest.testmod() - - -# Example usage: -# if __name__ == "__main__": -# Generate some random data for demonstration -# np.random.seed(42) -# X = np.random.rand(100, 2) -# y = (X[:, 0] + X[:, 1] > 1).astype(int) - -# Create and train a Random Forest classifier -# rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=None) -# rf_classifier.fit(X, y) - -# Make predictions -# new_data = np.array([[0.7, 0.3], [0.2, 0.8]]) -# predictions = rf_classifier.predict(new_data) -# print(predictions) From 37ae599ee61e6a9f6938e8a32c0461ae00632418 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 14:37:05 +0530 Subject: [PATCH 05/24] fixed errors --- machine_learning/random_forest_classifier.py | 28 +++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 222506a3e090..a907ef211251 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -1,6 +1,7 @@ import numpy as np from typing import Optional, List + class DecisionTree: """ Decision Tree classifier. 
@@ -54,13 +55,18 @@ def _build_tree(self, X: List[np.ndarray], y: List[int], depth: int) -> tuple: left_mask = np.array(X)[:, feature] <= value right_mask = np.array(X)[:, feature] > value - if len(np.array(y)[left_mask]) == 0 or len(np.array(y)[right_mask]) == 0: + if ( + len(np.array(y)[left_mask]) == 0 + or len(np.array(y)[right_mask]) == 0 + ): continue left_score = self._calculate_gini(np.array(y)[left_mask]) right_score = self._calculate_gini(np.array(y)[right_mask]) - weighted_score = (len(np.array(y)[left_mask]) * left_score + - len(np.array(y)[right_mask]) * right_score) / len(y) + weighted_score = ( + len(np.array(y)[left_mask]) * left_score + + len(np.array(y)[right_mask]) * right_score + ) / len(y) if weighted_score < best_split_score: best_split_score = weighted_score @@ -70,10 +76,16 @@ def _build_tree(self, X: List[np.ndarray], y: List[int], depth: int) -> tuple: if best_split_feature is None: return (np.bincount(y).argmax(),) - left_split = self._build_tree([np.array(X)[np.array(X)[:, best_split_feature] <= best_split_value]], - [np.array(y)[np.array(X)[:, best_split_feature] <= best_split_value]], depth + 1) - right_split = self._build_tree([np.array(X)[np.array(X)[:, best_split_feature] > best_split_value]], - [np.array(y)[np.array(X)[:, best_split_feature] > best_split_value]], depth + 1) + left_split = self._build_tree( + [np.array(X)[np.array(X)[:, best_split_feature] <= best_split_value]], + [np.array(y)[np.array(X)[:, best_split_feature] <= best_split_value]], + depth + 1, + ) + right_split = self._build_tree( + [np.array(X)[np.array(X)[:, best_split_feature] > best_split_value]], + [np.array(y)[np.array(X)[:, best_split_feature] > best_split_value]], + depth + 1, + ) return (best_split_feature, best_split_value, left_split, right_split) @@ -123,6 +135,8 @@ def _predict_tree(self, x: np.ndarray, tree: tuple) -> int: else: return self._predict_tree(x, right) + if __name__ == "__main__": import doctest + doctest.testmod() From de5eedb73b6f12a1b19bed8c07af7ce9f4454329 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 14:50:36 +0530 Subject: [PATCH 06/24] fixed errors run ruff --- machine_learning/random_forest_classifier.py | 92 ++++++++++++-------- 1 file changed, 55 insertions(+), 37 deletions(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index a907ef211251..7bd26f140951 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -16,57 +16,59 @@ class DecisionTree: def __init__(self, max_depth: Optional[int] = None) -> None: self.max_depth = max_depth - def fit(self, X: List[np.ndarray], y: List[int]) -> None: + def fit(self, features: List[np.ndarray], labels: List[int]) -> None: """ Fit the decision tree to the training data. Parameters: - X (List[numpy.ndarray]): The input features. - y (List[int]): The target labels. + features (List[numpy.ndarray]): The input features. + labels (List[int]): The target labels. Returns: None """ - self.tree = self._build_tree(X, y, depth=0) + self.tree = self._build_tree(features, labels, depth=0) - def _build_tree(self, X: List[np.ndarray], y: List[int], depth: int) -> tuple: + def _build_tree( + self, features: List[np.ndarray], labels: List[int], depth: int + ) -> tuple: """ Recursively build the decision tree. Parameters: - X (List[numpy.ndarray]): The input features. - y (List[int]): The target labels. + features (List[numpy.ndarray]): The input features. + labels (List[int]): The target labels. 
depth (int): The current depth of the tree. Returns: tuple: The decision tree structure. """ - if depth == self.max_depth or len(np.unique(y)) == 1: - return (np.bincount(y).argmax(),) + if depth == self.max_depth or len(np.unique(labels)) == 1: + return (np.bincount(labels).argmax(),) - num_features = len(X[0]) + num_features = len(features[0]) best_split_feature = None best_split_value = None best_split_score = np.inf for feature in range(num_features): - unique_values = np.unique(np.array(X)[:, feature]) + unique_values = np.unique(np.array(features)[:, feature]) for value in unique_values: - left_mask = np.array(X)[:, feature] <= value - right_mask = np.array(X)[:, feature] > value + left_mask = np.array(features)[:, feature] <= value + right_mask = np.array(features)[:, feature] > value if ( - len(np.array(y)[left_mask]) == 0 - or len(np.array(y)[right_mask]) == 0 + len(np.array(labels)[left_mask]) == 0 + or len(np.array(labels)[right_mask]) == 0 ): continue - left_score = self._calculate_gini(np.array(y)[left_mask]) - right_score = self._calculate_gini(np.array(y)[right_mask]) + left_score = self._calculate_gini(np.array(labels)[left_mask]) + right_score = self._calculate_gini(np.array(labels)[right_mask]) weighted_score = ( - len(np.array(y)[left_mask]) * left_score - + len(np.array(y)[right_mask]) * right_score - ) / len(y) + len(np.array(labels)[left_mask]) * left_score + + len(np.array(labels)[right_mask]) * right_score + ) / len(labels) if weighted_score < best_split_score: best_split_score = weighted_score @@ -74,54 +76,70 @@ def _build_tree(self, X: List[np.ndarray], y: List[int], depth: int) -> tuple: best_split_value = value if best_split_feature is None: - return (np.bincount(y).argmax(),) + return (np.bincount(labels).argmax(),) left_split = self._build_tree( - [np.array(X)[np.array(X)[:, best_split_feature] <= best_split_value]], - [np.array(y)[np.array(X)[:, best_split_feature] <= best_split_value]], + [ + np.array(features)[ + np.array(features)[:, best_split_feature] <= best_split_value + ] + ], + [ + np.array(labels)[ + np.array(features)[:, best_split_feature] <= best_split_value + ] + ], depth + 1, ) right_split = self._build_tree( - [np.array(X)[np.array(X)[:, best_split_feature] > best_split_value]], - [np.array(y)[np.array(X)[:, best_split_feature] > best_split_value]], + [ + np.array(features)[ + np.array(features)[:, best_split_feature] > best_split_value + ] + ], + [ + np.array(labels)[ + np.array(features)[:, best_split_feature] > best_split_value + ] + ], depth + 1, ) return (best_split_feature, best_split_value, left_split, right_split) - def _calculate_gini(self, y: List[int]) -> float: + def _calculate_gini(self, labels: List[int]) -> float: """ Calculate the Gini impurity for a given set of labels. Parameters: - y (List[int]): A list of labels. + labels (List[int]): A list of labels. Returns: float: The Gini impurity. """ - if len(y) == 0: + if len(labels) == 0: return 0 - p_i = np.bincount(y) / len(y) + p_i = np.bincount(labels) / len(labels) return 1 - np.sum(p_i**2) - def predict(self, X: List[np.ndarray]) -> List[int]: + def predict(self, features: List[np.ndarray]) -> List[int]: """ Make predictions for input features. Parameters: - X (List[numpy.ndarray]): The input features. + features (List[numpy.ndarray]): The input features. Returns: List[int]: Predicted labels. 
""" - return [self._predict_tree(x, self.tree) for x in X] + return [self._predict_tree(data_point, self.tree) for data_point in features] - def _predict_tree(self, x: np.ndarray, tree: tuple) -> int: + def _predict_tree(self, data_point: np.ndarray, tree: tuple) -> int: """ Recursively traverse the decision tree to make predictions. Parameters: - x (numpy.ndarray): Input features for a single data point. + data_point (numpy.ndarray): Input features for a single data point. tree (tuple): The decision tree structure. Returns: @@ -130,10 +148,10 @@ def _predict_tree(self, x: np.ndarray, tree: tuple) -> int: if len(tree) == 1: return tree[0] feature, value, left, right = tree - if x[feature] <= value: - return self._predict_tree(x, left) + if data_point[feature] <= value: + return self._predict_tree(data_point, left) else: - return self._predict_tree(x, right) + return self._predict_tree(data_point, right) if __name__ == "__main__": From cb42dd683e5e00b7c9ba1bf6dbfb069fa3bea2ad Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 15:11:23 +0530 Subject: [PATCH 07/24] fixed errors --- machine_learning/random_forest_classifier.py | 37 ++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 7bd26f140951..5ffc8716f3d1 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -26,6 +26,13 @@ def fit(self, features: List[np.ndarray], labels: List[int]) -> None: Returns: None + + Examples: + >>> np.random.seed(42) + >>> X = np.random.rand(100, 2) + >>> y = (X[:, 0] + X[:, 1] > 1).astype(int) + >>> tree = DecisionTree(max_depth=3) + >>> tree.fit(X, y) """ self.tree = self._build_tree(features, labels, depth=0) @@ -42,6 +49,13 @@ def _build_tree( Returns: tuple: The decision tree structure. + + Examples: + >>> np.random.seed(42) + >>> X = np.random.rand(100, 2) + >>> y = (X[:, 0] + X[:, 1] > 1).astype(int) + >>> tree = DecisionTree(max_depth=3) + >>> tree._build_tree(X, y, depth=0) """ if depth == self.max_depth or len(np.unique(labels)) == 1: return (np.bincount(labels).argmax(),) @@ -116,6 +130,11 @@ def _calculate_gini(self, labels: List[int]) -> float: Returns: float: The Gini impurity. + + Examples: + >>> labels = [0, 0, 1, 1, 1] + >>> tree = DecisionTree(max_depth=3) + >>> tree._calculate_gini(labels) """ if len(labels) == 0: return 0 @@ -131,7 +150,16 @@ def predict(self, features: List[np.ndarray]) -> List[int]: Returns: List[int]: Predicted labels. + + Examples: + >>> np.random.seed(42) + >>> X = np.random.rand(100, 2) + >>> y = (X[:, 0] + X[:, 1] > 1).astype(int) + >>> tree = DecisionTree(max_depth=3) + >>> tree.fit(X, y) + >>> predictions = tree.predict([np.array([0.7, 0.3]), np.array([0.2, 0.8])]) """ + return [self._predict_tree(data_point, self.tree) for data_point in features] def _predict_tree(self, data_point: np.ndarray, tree: tuple) -> int: @@ -144,6 +172,15 @@ def _predict_tree(self, data_point: np.ndarray, tree: tuple) -> int: Returns: int: Predicted label. 
+ + Examples: + >>> np.random.seed(42) + >>> X = np.random.rand(100, 2) + >>> y = (X[:, 0] + X[:, 1] > 1).astype(int) + >>> tree = DecisionTree(max_depth=3) + >>> tree.fit(X, y) + >>> data_point = np.array([0.7, 0.3]) + >>> prediction = tree._predict_tree(data_point, tree.tree) """ if len(tree) == 1: return tree[0] From 750fbdbd3885d3ea161fece9f47ad71d81c890c5 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 15:15:21 +0530 Subject: [PATCH 08/24] fixed errors --- machine_learning/random_forest_classifier.py | 152 +++---------------- 1 file changed, 20 insertions(+), 132 deletions(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 5ffc8716f3d1..a2186f285e24 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -1,6 +1,5 @@ import numpy as np -from typing import Optional, List - +from typing import Optional class DecisionTree: """ @@ -16,180 +15,69 @@ class DecisionTree: def __init__(self, max_depth: Optional[int] = None) -> None: self.max_depth = max_depth - def fit(self, features: List[np.ndarray], labels: List[int]) -> None: + def fit(self, features, labels) -> None: """ Fit the decision tree to the training data. Parameters: - features (List[numpy.ndarray]): The input features. - labels (List[int]): The target labels. + features: The input features. + labels: The target labels. Returns: None - - Examples: - >>> np.random.seed(42) - >>> X = np.random.rand(100, 2) - >>> y = (X[:, 0] + X[:, 1] > 1).astype(int) - >>> tree = DecisionTree(max_depth=3) - >>> tree.fit(X, y) """ self.tree = self._build_tree(features, labels, depth=0) - def _build_tree( - self, features: List[np.ndarray], labels: List[int], depth: int - ) -> tuple: + def _build_tree(self, features, labels, depth) -> tuple: """ Recursively build the decision tree. Parameters: - features (List[numpy.ndarray]): The input features. - labels (List[int]): The target labels. - depth (int): The current depth of the tree. + features: The input features. + labels: The target labels. + depth: The current depth of the tree. Returns: tuple: The decision tree structure. 
- - Examples: - >>> np.random.seed(42) - >>> X = np.random.rand(100, 2) - >>> y = (X[:, 0] + X[:, 1] > 1).astype(int) - >>> tree = DecisionTree(max_depth=3) - >>> tree._build_tree(X, y, depth=0) """ - if depth == self.max_depth or len(np.unique(labels)) == 1: - return (np.bincount(labels).argmax(),) - - num_features = len(features[0]) - best_split_feature = None - best_split_value = None - best_split_score = np.inf - - for feature in range(num_features): - unique_values = np.unique(np.array(features)[:, feature]) - for value in unique_values: - left_mask = np.array(features)[:, feature] <= value - right_mask = np.array(features)[:, feature] > value - - if ( - len(np.array(labels)[left_mask]) == 0 - or len(np.array(labels)[right_mask]) == 0 - ): - continue - - left_score = self._calculate_gini(np.array(labels)[left_mask]) - right_score = self._calculate_gini(np.array(labels)[right_mask]) - weighted_score = ( - len(np.array(labels)[left_mask]) * left_score - + len(np.array(labels)[right_mask]) * right_score - ) / len(labels) - - if weighted_score < best_split_score: - best_split_score = weighted_score - best_split_feature = feature - best_split_value = value - - if best_split_feature is None: - return (np.bincount(labels).argmax(),) - - left_split = self._build_tree( - [ - np.array(features)[ - np.array(features)[:, best_split_feature] <= best_split_value - ] - ], - [ - np.array(labels)[ - np.array(features)[:, best_split_feature] <= best_split_value - ] - ], - depth + 1, - ) - right_split = self._build_tree( - [ - np.array(features)[ - np.array(features)[:, best_split_feature] > best_split_value - ] - ], - [ - np.array(labels)[ - np.array(features)[:, best_split_feature] > best_split_value - ] - ], - depth + 1, - ) - - return (best_split_feature, best_split_value, left_split, right_split) - - def _calculate_gini(self, labels: List[int]) -> float: + # Your existing _build_tree implementation + + def _calculate_gini(self, labels) -> float: """ Calculate the Gini impurity for a given set of labels. Parameters: - labels (List[int]): A list of labels. + labels: A list of labels. Returns: float: The Gini impurity. - - Examples: - >>> labels = [0, 0, 1, 1, 1] - >>> tree = DecisionTree(max_depth=3) - >>> tree._calculate_gini(labels) """ - if len(labels) == 0: - return 0 - p_i = np.bincount(labels) / len(labels) - return 1 - np.sum(p_i**2) + # Your existing _calculate_gini implementation - def predict(self, features: List[np.ndarray]) -> List[int]: + def predict(self, features) -> list: """ Make predictions for input features. Parameters: - features (List[numpy.ndarray]): The input features. + features: The input features. Returns: - List[int]: Predicted labels. - - Examples: - >>> np.random.seed(42) - >>> X = np.random.rand(100, 2) - >>> y = (X[:, 0] + X[:, 1] > 1).astype(int) - >>> tree = DecisionTree(max_depth=3) - >>> tree.fit(X, y) - >>> predictions = tree.predict([np.array([0.7, 0.3]), np.array([0.2, 0.8])]) + list: Predicted labels. """ - return [self._predict_tree(data_point, self.tree) for data_point in features] - def _predict_tree(self, data_point: np.ndarray, tree: tuple) -> int: + def _predict_tree(self, data_point, tree) -> int: """ Recursively traverse the decision tree to make predictions. Parameters: - data_point (numpy.ndarray): Input features for a single data point. - tree (tuple): The decision tree structure. + data_point: Input features for a single data point. + tree: The decision tree structure. Returns: int: Predicted label. 
- - Examples: - >>> np.random.seed(42) - >>> X = np.random.rand(100, 2) - >>> y = (X[:, 0] + X[:, 1] > 1).astype(int) - >>> tree = DecisionTree(max_depth=3) - >>> tree.fit(X, y) - >>> data_point = np.array([0.7, 0.3]) - >>> prediction = tree._predict_tree(data_point, tree.tree) """ - if len(tree) == 1: - return tree[0] - feature, value, left, right = tree - if data_point[feature] <= value: - return self._predict_tree(data_point, left) - else: - return self._predict_tree(data_point, right) - + # Your existing _predict_tree implementation if __name__ == "__main__": import doctest From 1e12e75dc3aae806e07c6e7549800cef9ef3ca67 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Oct 2023 09:45:59 +0000 Subject: [PATCH 09/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/random_forest_classifier.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index a2186f285e24..2fc2c10a4f91 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -1,6 +1,7 @@ import numpy as np from typing import Optional + class DecisionTree: """ Decision Tree classifier. @@ -79,6 +80,7 @@ def _predict_tree(self, data_point, tree) -> int: """ # Your existing _predict_tree implementation + if __name__ == "__main__": import doctest From be27873ead382b5b52659f6f4b4e58bce9db5e93 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 15:21:48 +0530 Subject: [PATCH 10/24] fixed errors --- machine_learning/random_forest_classifier.py | 78 +++++++++++++++++++- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 2fc2c10a4f91..8aedbe911eca 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -41,7 +41,69 @@ def _build_tree(self, features, labels, depth) -> tuple: Returns: tuple: The decision tree structure. 
""" - # Your existing _build_tree implementation + if depth == self.max_depth or len(np.unique(labels)) == 1: + return (np.bincount(labels).argmax(),) + + num_features = len(features[0]) + best_split_feature = None + best_split_value = None + best_split_score = np.inf + + for feature in range(num_features): + unique_values = np.unique(np.array(features)[:, feature]) + for value in unique_values: + left_mask = np.array(features)[:, feature] <= value + right_mask = np.array(features)[:, feature] > value + + if ( + len(np.array(labels)[left_mask]) == 0 + or len(np.array(labels)[right_mask]) == 0 + ): + continue + + left_score = self._calculate_gini(np.array(labels)[left_mask]) + right_score = self._calculate_gini(np.array(labels)[right_mask]) + weighted_score = ( + len(np.array(labels)[left_mask]) * left_score + + len(np.array(labels)[right_mask]) * right_score + ) / len(labels) + + if weighted_score < best_split_score: + best_split_score = weighted_score + best_split_feature = feature + best_split_value = value + + if best_split_feature is None: + return (np.bincount(labels).argmax(),) + + left_split = self._build_tree( + [ + np.array(features)[ + np.array(features)[:, best_split_feature] <= best_split_value + ] + ], + [ + np.array(labels)[ + np.array(features)[:, best_split_feature] <= best_split_value + ] + ], + depth + 1, + ) + right_split = self._build_tree( + [ + np.array(features)[ + np.array(features)[:, best_split_feature] > best_split_value + ] + ], + [ + np.array(labels)[ + np.array(features)[:, best_split_feature] > best_split_value + ] + ], + depth + 1, + ) + + return (best_split_feature, best_split_value, left_split, right_split) def _calculate_gini(self, labels) -> float: """ @@ -53,7 +115,10 @@ def _calculate_gini(self, labels) -> float: Returns: float: The Gini impurity. """ - # Your existing _calculate_gini implementation + if len(labels) == 0: + return 0 + p_i = np.bincount(labels) / len(labels) + return 1 - np.sum(p_i**2) def predict(self, features) -> list: """ @@ -78,7 +143,14 @@ def _predict_tree(self, data_point, tree) -> int: Returns: int: Predicted label. 
""" - # Your existing _predict_tree implementation + if len(tree) == 1: + return tree[0] + feature, value, left, right = tree + if data_point[feature] <= value: + return self._predict_tree(data_point, left) + else: + return self._predict_tree(data_point, right) + if __name__ == "__main__": From dcb71b0ce054837e8e45f75fada8c5451de93ae4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Oct 2023 09:52:38 +0000 Subject: [PATCH 11/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/random_forest_classifier.py | 1 - 1 file changed, 1 deletion(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 8aedbe911eca..4f0667a70dde 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -152,7 +152,6 @@ def _predict_tree(self, data_point, tree) -> int: return self._predict_tree(data_point, right) - if __name__ == "__main__": import doctest From 2ab8e3bb5784442cdb39a89702bce1da58ce5c0f Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 15:33:24 +0530 Subject: [PATCH 12/24] fixed errors after 37 --- machine_learning/random_forest_classifier.py | 39 ++++++++++---------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 4f0667a70dde..d3975c596f70 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -1,6 +1,5 @@ import numpy as np -from typing import Optional - +from typing import Optional, Tuple, Union class DecisionTree: """ @@ -10,36 +9,38 @@ class DecisionTree: max_depth (Optional[int]): Maximum depth of the tree. If None, the tree grows until pure nodes or min_samples_split is reached. Attributes: - tree (tuple): The decision tree structure. + tree (Union[Tuple[int], Tuple[int, float, 'DecisionTree', 'DecisionTree']]): The decision tree structure. """ def __init__(self, max_depth: Optional[int] = None) -> None: self.max_depth = max_depth - def fit(self, features, labels) -> None: + def fit(self, features: list[np.ndarray], labels: list[int]) -> None: """ Fit the decision tree to the training data. Parameters: - features: The input features. - labels: The target labels. + features (list[numpy.ndarray]): The input features. + labels (list[int]): The target labels. Returns: None """ self.tree = self._build_tree(features, labels, depth=0) - def _build_tree(self, features, labels, depth) -> tuple: + def _build_tree( + self, features: list[np.ndarray], labels: list[int], depth: int + ) -> Union[Tuple[int], Tuple[int, float, 'DecisionTree', 'DecisionTree']]: """ Recursively build the decision tree. Parameters: - features: The input features. - labels: The target labels. - depth: The current depth of the tree. + features (list[numpy.ndarray]): The input features. + labels (list[int]): The target labels. + depth (int): The current depth of the tree. Returns: - tuple: The decision tree structure. + Union[Tuple[int], Tuple[int, float, 'DecisionTree', 'DecisionTree']]: The decision tree structure. 
""" if depth == self.max_depth or len(np.unique(labels)) == 1: return (np.bincount(labels).argmax(),) @@ -105,12 +106,12 @@ def _build_tree(self, features, labels, depth) -> tuple: return (best_split_feature, best_split_value, left_split, right_split) - def _calculate_gini(self, labels) -> float: + def _calculate_gini(self, labels: list[int]) -> float: """ Calculate the Gini impurity for a given set of labels. Parameters: - labels: A list of labels. + labels (list[int]): A list of labels. Returns: float: The Gini impurity. @@ -120,25 +121,25 @@ def _calculate_gini(self, labels) -> float: p_i = np.bincount(labels) / len(labels) return 1 - np.sum(p_i**2) - def predict(self, features) -> list: + def predict(self, features: list[np.ndarray]) -> list[int]: """ Make predictions for input features. Parameters: - features: The input features. + features (list[numpy.ndarray]): The input features. Returns: - list: Predicted labels. + list[int]: Predicted labels. """ return [self._predict_tree(data_point, self.tree) for data_point in features] - def _predict_tree(self, data_point, tree) -> int: + def _predict_tree(self, data_point: np.ndarray, tree: Union[Tuple[int], Tuple[int, float, 'DecisionTree', 'DecisionTree']]) -> int: """ Recursively traverse the decision tree to make predictions. Parameters: - data_point: Input features for a single data point. - tree: The decision tree structure. + data_point (numpy.ndarray): Input features for a single data point. + tree (Union[Tuple[int], Tuple[int, float, 'DecisionTree', 'DecisionTree']]): The decision tree structure. Returns: int: Predicted label. From 29385d3d892995f6614830c3b03c919e49e006b8 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 15:33:39 +0530 Subject: [PATCH 13/24] fixed errors after 37 --- machine_learning/random_forest_classifier.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index d3975c596f70..bfa3687f7fdd 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -1,6 +1,7 @@ import numpy as np from typing import Optional, Tuple, Union + class DecisionTree: """ Decision Tree classifier. @@ -30,7 +31,7 @@ def fit(self, features: list[np.ndarray], labels: list[int]) -> None: def _build_tree( self, features: list[np.ndarray], labels: list[int], depth: int - ) -> Union[Tuple[int], Tuple[int, float, 'DecisionTree', 'DecisionTree']]: + ) -> Union[Tuple[int], Tuple[int, float, "DecisionTree", "DecisionTree"]]: """ Recursively build the decision tree. @@ -133,7 +134,11 @@ def predict(self, features: list[np.ndarray]) -> list[int]: """ return [self._predict_tree(data_point, self.tree) for data_point in features] - def _predict_tree(self, data_point: np.ndarray, tree: Union[Tuple[int], Tuple[int, float, 'DecisionTree', 'DecisionTree']]) -> int: + def _predict_tree( + self, + data_point: np.ndarray, + tree: Union[Tuple[int], Tuple[int, float, "DecisionTree", "DecisionTree"]], + ) -> int: """ Recursively traverse the decision tree to make predictions. 
From 9b35c927721ba3bf27f3498104ea9b454afcf467 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 15:40:12 +0530 Subject: [PATCH 14/24] fixed errors after 49 --- machine_learning/random_forest_classifier.py | 29 ++++---------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index bfa3687f7fdd..00161e29b6e5 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -1,5 +1,5 @@ import numpy as np -from typing import Optional, Tuple, Union +from typing import Optional, Union class DecisionTree: @@ -10,7 +10,7 @@ class DecisionTree: max_depth (Optional[int]): Maximum depth of the tree. If None, the tree grows until pure nodes or min_samples_split is reached. Attributes: - tree (Union[Tuple[int], Tuple[int, float, 'DecisionTree', 'DecisionTree']]): The decision tree structure. + tree (Union[tuple, tuple]): The decision tree structure. """ def __init__(self, max_depth: Optional[int] = None) -> None: @@ -23,15 +23,12 @@ def fit(self, features: list[np.ndarray], labels: list[int]) -> None: Parameters: features (list[numpy.ndarray]): The input features. labels (list[int]): The target labels. - - Returns: - None """ self.tree = self._build_tree(features, labels, depth=0) def _build_tree( self, features: list[np.ndarray], labels: list[int], depth: int - ) -> Union[Tuple[int], Tuple[int, float, "DecisionTree", "DecisionTree"]]: + ) -> Union[tuple, tuple]: """ Recursively build the decision tree. @@ -39,9 +36,6 @@ def _build_tree( features (list[numpy.ndarray]): The input features. labels (list[int]): The target labels. depth (int): The current depth of the tree. - - Returns: - Union[Tuple[int], Tuple[int, float, 'DecisionTree', 'DecisionTree']]: The decision tree structure. """ if depth == self.max_depth or len(np.unique(labels)) == 1: return (np.bincount(labels).argmax(),) @@ -113,9 +107,6 @@ def _calculate_gini(self, labels: list[int]) -> float: Parameters: labels (list[int]): A list of labels. - - Returns: - float: The Gini impurity. """ if len(labels) == 0: return 0 @@ -128,26 +119,16 @@ def predict(self, features: list[np.ndarray]) -> list[int]: Parameters: features (list[numpy.ndarray]): The input features. - - Returns: - list[int]: Predicted labels. """ return [self._predict_tree(data_point, self.tree) for data_point in features] - def _predict_tree( - self, - data_point: np.ndarray, - tree: Union[Tuple[int], Tuple[int, float, "DecisionTree", "DecisionTree"]], - ) -> int: + def _predict_tree(self, data_point: np.ndarray, tree: Union[tuple, tuple]) -> int: """ Recursively traverse the decision tree to make predictions. Parameters: data_point (numpy.ndarray): Input features for a single data point. - tree (Union[Tuple[int], Tuple[int, float, 'DecisionTree', 'DecisionTree']]): The decision tree structure. - - Returns: - int: Predicted label. + tree (Union[tuple, tuple]): The decision tree structure. 
""" if len(tree) == 1: return tree[0] From 68c378f5e2290c9554ca06c3ac5bfb3452fe0c99 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 15:47:48 +0530 Subject: [PATCH 15/24] fixed errors after 55 --- machine_learning/random_forest_classifier.py | 81 ++++++++++++++++---- 1 file changed, 66 insertions(+), 15 deletions(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 00161e29b6e5..29fbb2e3bb90 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -1,5 +1,5 @@ import numpy as np -from typing import Optional, Union +from typing import Optional, List class DecisionTree: @@ -10,32 +10,52 @@ class DecisionTree: max_depth (Optional[int]): Maximum depth of the tree. If None, the tree grows until pure nodes or min_samples_split is reached. Attributes: - tree (Union[tuple, tuple]): The decision tree structure. + tree (tuple): The decision tree structure. """ def __init__(self, max_depth: Optional[int] = None) -> None: self.max_depth = max_depth - def fit(self, features: list[np.ndarray], labels: list[int]) -> None: + def fit(self, features: List[np.ndarray], labels: List[int]) -> None: """ Fit the decision tree to the training data. Parameters: - features (list[numpy.ndarray]): The input features. - labels (list[int]): The target labels. + features (List[np.ndarray]): The input features. + labels (List[int]): The target labels. + + Returns: + None + + Examples: + >>> np.random.seed(42) + >>> X = np.random.rand(100, 2) + >>> y = (X[:, 0] + X[:, 1] > 1).astype(int) + >>> tree = DecisionTree(max_depth=3) + >>> tree.fit(X, y) """ self.tree = self._build_tree(features, labels, depth=0) def _build_tree( - self, features: list[np.ndarray], labels: list[int], depth: int - ) -> Union[tuple, tuple]: + self, features: List[np.ndarray], labels: List[int], depth: int + ) -> tuple: """ Recursively build the decision tree. Parameters: - features (list[numpy.ndarray]): The input features. - labels (list[int]): The target labels. + features (List[np.ndarray]): The input features. + labels (List[int]): The target labels. depth (int): The current depth of the tree. + + Returns: + tuple: The decision tree structure. + + Examples: + >>> np.random.seed(42) + >>> X = np.random.rand(100, 2) + >>> y = (X[:, 0] + X[:, 1] > 1).astype(int) + >>> tree = DecisionTree(max_depth=3) + >>> tree._build_tree(X, y, depth=0) """ if depth == self.max_depth or len(np.unique(labels)) == 1: return (np.bincount(labels).argmax(),) @@ -101,34 +121,65 @@ def _build_tree( return (best_split_feature, best_split_value, left_split, right_split) - def _calculate_gini(self, labels: list[int]) -> float: + def _calculate_gini(self, labels: List[int]) -> float: """ Calculate the Gini impurity for a given set of labels. Parameters: - labels (list[int]): A list of labels. + labels (List[int]): A list of labels. + + Returns: + float: The Gini impurity. + + Examples: + >>> labels = [0, 0, 1, 1, 1] + >>> tree = DecisionTree(max_depth=3) + >>> tree._calculate_gini(labels) """ if len(labels) == 0: return 0 p_i = np.bincount(labels) / len(labels) return 1 - np.sum(p_i**2) - def predict(self, features: list[np.ndarray]) -> list[int]: + def predict(self, features: List[np.ndarray]) -> List[int]: """ Make predictions for input features. Parameters: - features (list[numpy.ndarray]): The input features. + features (List[np.ndarray]): The input features. + + Returns: + List[int]: Predicted labels. 
+ + Examples: + >>> np.random.seed(42) + >>> X = np.random.rand(100, 2) + >>> y = (X[:, 0] + X[:, 1] > 1).astype(int) + >>> tree = DecisionTree(max_depth=3) + >>> tree.fit(X, y) + >>> predictions = tree.predict([np.array([0.7, 0.3]), np.array([0.2, 0.8])]) """ return [self._predict_tree(data_point, self.tree) for data_point in features] - def _predict_tree(self, data_point: np.ndarray, tree: Union[tuple, tuple]) -> int: + def _predict_tree(self, data_point: np.ndarray, tree: tuple) -> int: """ Recursively traverse the decision tree to make predictions. Parameters: data_point (numpy.ndarray): Input features for a single data point. - tree (Union[tuple, tuple]): The decision tree structure. + tree (tuple): The decision tree structure. + + Returns: + int: Predicted label. + + Examples: + >>> np.random.seed(42) + >>> X = np.random.rand(100, 2) + >>> y = (X[:, 0] + X[:, 1] > 1).astype(int) + >>> tree = DecisionTree(max_depth=3) + >>> tree.fit(X, y) + >>> data_point = np.array([0.7, 0.3]) + >>> prediction = tree._predict_tree(data_point, tree.tree) """ if len(tree) == 1: return tree[0] From beb0e1e7122977da8caa96df348cd6320a363d33 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 15:51:39 +0530 Subject: [PATCH 16/24] fixed errors after 55-2 --- machine_learning/random_forest_classifier.py | 27 ++++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 29fbb2e3bb90..7a4e5196d59d 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -1,5 +1,4 @@ import numpy as np -from typing import Optional, List class DecisionTree: @@ -7,22 +6,22 @@ class DecisionTree: Decision Tree classifier. Parameters: - max_depth (Optional[int]): Maximum depth of the tree. If None, the tree grows until pure nodes or min_samples_split is reached. + max_depth (int, optional): Maximum depth of the tree. If None, the tree grows until pure nodes or min_samples_split is reached. Attributes: tree (tuple): The decision tree structure. """ - def __init__(self, max_depth: Optional[int] = None) -> None: + def __init__(self, max_depth: int = None) -> None: self.max_depth = max_depth - def fit(self, features: List[np.ndarray], labels: List[int]) -> None: + def fit(self, features: list[np.ndarray], labels: list[int]) -> None: """ Fit the decision tree to the training data. Parameters: - features (List[np.ndarray]): The input features. - labels (List[int]): The target labels. + features (list of numpy.ndarray): The input features. + labels (list of int): The target labels. Returns: None @@ -37,14 +36,14 @@ def fit(self, features: List[np.ndarray], labels: List[int]) -> None: self.tree = self._build_tree(features, labels, depth=0) def _build_tree( - self, features: List[np.ndarray], labels: List[int], depth: int + self, features: list[np.ndarray], labels: list[int], depth: int ) -> tuple: """ Recursively build the decision tree. Parameters: - features (List[np.ndarray]): The input features. - labels (List[int]): The target labels. + features (list of numpy.ndarray): The input features. + labels (list of int): The target labels. depth (int): The current depth of the tree. 
Returns: @@ -121,12 +120,12 @@ def _build_tree( return (best_split_feature, best_split_value, left_split, right_split) - def _calculate_gini(self, labels: List[int]) -> float: + def _calculate_gini(self, labels: list[int]) -> float: """ Calculate the Gini impurity for a given set of labels. Parameters: - labels (List[int]): A list of labels. + labels (list of int): A list of labels. Returns: float: The Gini impurity. @@ -141,15 +140,15 @@ def _calculate_gini(self, labels: List[int]) -> float: p_i = np.bincount(labels) / len(labels) return 1 - np.sum(p_i**2) - def predict(self, features: List[np.ndarray]) -> List[int]: + def predict(self, features: list[np.ndarray]) -> list[int]: """ Make predictions for input features. Parameters: - features (List[np.ndarray]): The input features. + features (list of numpy.ndarray): The input features. Returns: - List[int]: Predicted labels. + list of int: Predicted labels. Examples: >>> np.random.seed(42) From 5d257d5f6e5a52bcdb0f3502cc0010900c3aba70 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 15:56:21 +0530 Subject: [PATCH 17/24] fixed errors after 55-3 --- machine_learning/random_forest_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 7a4e5196d59d..3ea7075068db 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -6,7 +6,7 @@ class DecisionTree: Decision Tree classifier. Parameters: - max_depth (int, optional): Maximum depth of the tree. If None, the tree grows until pure nodes or min_samples_split is reached. + max_depth (int): Maximum depth of the tree. If None, the tree grows until pure nodes or min_samples_split is reached. Attributes: tree (tuple): The decision tree structure. 
From ce678ae4dfff34cca6c418bd7aa943e5ab415a67 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 21:17:06 +0530 Subject: [PATCH 18/24] fixed errors after 55-4 --- machine_learning/random_forest_classifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 3ea7075068db..91bddfcdbd6a 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -158,6 +158,7 @@ def predict(self, features: list[np.ndarray]) -> list[int]: >>> tree.fit(X, y) >>> predictions = tree.predict([np.array([0.7, 0.3]), np.array([0.2, 0.8])]) """ + return [self._predict_tree(data_point, self.tree) for data_point in features] def _predict_tree(self, data_point: np.ndarray, tree: tuple) -> int: From 978b44a7eb1a70a4efd25d982dfa83d0ba3c2461 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 21:22:22 +0530 Subject: [PATCH 19/24] fixed errors after 55-5 --- machine_learning/random_forest_classifier.py | 1 - 1 file changed, 1 deletion(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 91bddfcdbd6a..3ea7075068db 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -158,7 +158,6 @@ def predict(self, features: list[np.ndarray]) -> list[int]: >>> tree.fit(X, y) >>> predictions = tree.predict([np.array([0.7, 0.3]), np.array([0.2, 0.8])]) """ - return [self._predict_tree(data_point, self.tree) for data_point in features] def _predict_tree(self, data_point: np.ndarray, tree: tuple) -> int: From c8b2076a7e2f154e6b8a14b61644c2133b71a4c8 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 21:30:39 +0530 Subject: [PATCH 20/24] fixed errors in line 9 --- machine_learning/random_forest_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 3ea7075068db..d4ac40ed3122 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -6,7 +6,7 @@ class DecisionTree: Decision Tree classifier. Parameters: - max_depth (int): Maximum depth of the tree. If None, the tree grows until pure nodes or min_samples_split is reached. + max_depth (int): Max tree depth; None for growth until pure nodes or min_samples_split. Attributes: tree (tuple): The decision tree structure. From 5a958d0ad824592f79e26d47ab0cd825b00253e3 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 21:33:48 +0530 Subject: [PATCH 21/24] fixed errors in line 9-2 --- machine_learning/random_forest_classifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index d4ac40ed3122..78abe52a3deb 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -6,7 +6,8 @@ class DecisionTree: Decision Tree classifier. Parameters: - max_depth (int): Max tree depth; None for growth until pure nodes or min_samples_split. + max_depth(int): Max tree depth; None for growth until pure nodes or min_samples. + Attributes: tree (tuple): The decision tree structure. 
From ca2b8e0c92ed09d172d76e3e00de748b7b3bc247 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 21:36:40 +0530 Subject: [PATCH 22/24] fixed errors in line 17 --- machine_learning/random_forest_classifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 78abe52a3deb..3e256dd460d6 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -1,3 +1,4 @@ +from typing import Optional import numpy as np @@ -13,7 +14,7 @@ class DecisionTree: tree (tuple): The decision tree structure. """ - def __init__(self, max_depth: int = None) -> None: + def __init__(self, max_depth: Optional[int] = None) -> None: self.max_depth = max_depth def fit(self, features: list[np.ndarray], labels: list[int]) -> None: From d966c1f7a4f5fb88415762bc70d41ddd241c5528 Mon Sep 17 00:00:00 2001 From: Rajkanwar Singh Date: Thu, 5 Oct 2023 21:38:37 +0530 Subject: [PATCH 23/24] fixed errors in line 1 and 2 --- machine_learning/random_forest_classifier.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index 3e256dd460d6..c856af8b076e 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -1,6 +1,5 @@ -from typing import Optional import numpy as np - +from typing import Optional class DecisionTree: """ From 87533f6da02d566fb9d61c33737a830d07f7cfd1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Oct 2023 16:09:15 +0000 Subject: [PATCH 24/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/random_forest_classifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/machine_learning/random_forest_classifier.py b/machine_learning/random_forest_classifier.py index c856af8b076e..d3b615e26074 100644 --- a/machine_learning/random_forest_classifier.py +++ b/machine_learning/random_forest_classifier.py @@ -1,6 +1,7 @@ import numpy as np from typing import Optional + class DecisionTree: """ Decision Tree classifier.
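A quick numeric check of the Gini impurity that the splits minimize, 1 - sum(p_i**2): the doctest examples added in PATCH 07 and PATCH 15 call `_calculate_gini` on `[0, 0, 1, 1, 1]` without showing the expected value, which works out as follows.

import numpy as np

labels = np.array([0, 0, 1, 1, 1])
p_i = np.bincount(labels) / len(labels)  # class probabilities [0.4, 0.6]
print(1 - np.sum(p_i**2))  # 1 - (0.4**2 + 0.6**2) = 0.48 (up to float rounding)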
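A note on the final state of the series: from PATCH 04 onward, `_build_tree`'s recursive calls wrap the filtered NumPy arrays in single-element lists (`[np.array(features)[...]]`), which turns the 2-D feature matrix into a 3-D array on the next recursion level and breaks both `np.bincount` and the column indexing; the `RandomForestClassifier` from PATCH 01 is also removed in PATCH 04 and never restored. Below is a minimal corrected sketch, not part of the patches, assuming the inputs stay a 2-D feature array and a 1-D integer label array throughout the recursion; the class name `DecisionTreeSketch` and the bagging loop at the bottom are illustrative stand-ins for the deleted pieces, not the author's code.

import numpy as np


class DecisionTreeSketch:
    """Corrected decision tree: the recursion passes plain ndarrays instead of
    wrapping the filtered arrays in single-element lists."""

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, features, labels):
        self.tree = self._build_tree(np.asarray(features), np.asarray(labels), 0)

    def _build_tree(self, features, labels, depth):
        if depth == self.max_depth or len(np.unique(labels)) == 1:
            return (np.bincount(labels).argmax(),)  # leaf: majority class label
        best_feature, best_value, best_score = None, None, np.inf
        for feature in range(features.shape[1]):
            for value in np.unique(features[:, feature]):
                mask = features[:, feature] <= value
                left, right = labels[mask], labels[~mask]
                if len(left) == 0 or len(right) == 0:
                    continue  # degenerate split
                score = (
                    len(left) * self._gini(left) + len(right) * self._gini(right)
                ) / len(labels)
                if score < best_score:
                    best_feature, best_value, best_score = feature, value, score
        if best_feature is None:
            return (np.bincount(labels).argmax(),)
        mask = features[:, best_feature] <= best_value
        return (
            best_feature,
            best_value,
            # no list wrapping: the subsets stay 2-D / 1-D ndarrays
            self._build_tree(features[mask], labels[mask], depth + 1),
            self._build_tree(features[~mask], labels[~mask], depth + 1),
        )

    @staticmethod
    def _gini(labels):
        p_i = np.bincount(labels) / len(labels)
        return 1 - np.sum(p_i**2)

    def predict(self, features):
        return [self._predict_tree(x, self.tree) for x in np.asarray(features)]

    def _predict_tree(self, x, tree):
        if len(tree) == 1:
            return tree[0]
        feature, value, left, right = tree
        return self._predict_tree(x, left if x[feature] <= value else right)


if __name__ == "__main__":
    # same demo data as PATCH 01, with a small hand-rolled bagging ensemble
    rng = np.random.default_rng(42)
    features = rng.random((100, 2))
    labels = (features[:, 0] + features[:, 1] > 1).astype(int)
    forest = [DecisionTreeSketch(max_depth=3) for _ in range(10)]
    for tree in forest:
        indices = rng.integers(0, len(features), len(features))  # bootstrap sample
        tree.fit(features[indices], labels[indices])
    votes = np.array([tree.predict([[0.7, 0.3], [0.2, 0.8]]) for tree in forest])
    print(np.apply_along_axis(lambda v: np.bincount(v).argmax(), 0, votes))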