-# CatBoost Classifier Example
+"""
+CatBoost Regressor Example.
+
+This script demonstrates the usage of the CatBoost Regressor for a simple regression task.
+CatBoost is a powerful gradient boosting library that handles categorical features automatically
+and is highly efficient.
+
+Make sure to install CatBoost using:
+    pip install catboost
+
+Contributed by: @AHuzail
+"""
+
 import numpy as np
-from matplotlib import pyplot as plt
-from sklearn.datasets import load_iris
-from sklearn.metrics import ConfusionMatrixDisplay
+from sklearn.datasets import load_boston
 from sklearn.model_selection import train_test_split
-from catboost import CatBoostClassifier
+from sklearn.metrics import mean_squared_error
+from catboost import CatBoostRegressor
 
 
-def data_handling(data: dict) -> tuple:
+def data_handling() -> tuple:
     """
-    Extracts the features and target values from the provided dataset.
-
-    Args:
-        data (dict): A dictionary containing the dataset's features and targets.
+    Loads and handles the dataset, splitting it into features and targets.
 
+    The Boston dataset is used as a regression example.
+
     Returns:
-        tuple: A tuple with features and targets.
+        tuple: A tuple of (features, target), where both are numpy arrays.
 
     Example:
-    >>> data_handling({'data':'[5.1, 3.5, 1.4, 0.2]', 'target': [0]})
-    ('[5.1, 3.5, 1.4, 0.2]', [0])
+    >>> features, target = data_handling()
+    >>> features.shape
+    (506, 13)
+    >>> target.shape
+    (506,)
     """
-    return data["data"], data["target"]
+    # Load the Boston dataset. Note: load_boston was deprecated in scikit-learn 1.0
+    # and removed in scikit-learn 1.2, so this only works on older releases; see the
+    # note after the diff for a drop-in alternative.
+    boston = load_boston()
+    features = boston.data
+    target = boston.target
+    return features, target
 
 
-def catboost(features: np.ndarray, target: np.ndarray) -> CatBoostClassifier:
+def catboost_regressor(features: np.ndarray, target: np.ndarray) -> CatBoostRegressor:
     """
-    Trains a CatBoostClassifier using the provided features and target.
+    Trains a CatBoostRegressor using the provided features and target values.
 
     Args:
-        features (np.ndarray): The input features for training the classifier.
-        target (np.ndarray): The target labels corresponding to the features.
+        features (np.ndarray): The input features for the regression model.
+        target (np.ndarray): The target values for the regression model.
 
     Returns:
-        CatBoostClassifier: A trained CatBoost classifier.
+        CatBoostRegressor: A trained CatBoost regressor model.
 
     Example:
-    >>> catboost(np.array([[5.1, 3.6, 1.4, 0.2]]), np.array([0]))
-    CatBoostClassifier(...)
+    >>> features, target = data_handling()
+    >>> model = catboost_regressor(features, target)
+    >>> isinstance(model, CatBoostRegressor)
+    True
     """
-    classifier = CatBoostClassifier(verbose=0)  # Suppressing verbose output
-    classifier.fit(features, target)
-    return classifier
+    regressor = CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6, verbose=0)
+    regressor.fit(features, target)
+    return regressor
 
 
 def main() -> None:
     """
-    Demonstrates the training and evaluation of a CatBoost classifier
-    on the Iris dataset, displaying a confusion matrix of the results.
+    Main function to run the CatBoost Regressor example.
 
-    The dataset is split into training and testing sets, the model is
-    trained on the training data, and then evaluated on the test data.
-    A normalized confusion matrix is displayed.
+    It loads the data, splits it into training and testing sets,
+    trains the regressor on the training data, and evaluates its performance
+    on the test data.
     """
-
-    # Load the Iris dataset
-    iris = load_iris()
-    features, targets = data_handling(iris)
+    # Load and split the dataset
+    features, target = data_handling()
     x_train, x_test, y_train, y_test = train_test_split(
-        features, targets, test_size=0.25
+        features, target, test_size=0.25, random_state=42
     )
 
-    # Train a CatBoost classifier
-    catboost_classifier = catboost(x_train, y_train)
-
-    # Display the confusion matrix for the test data
-    ConfusionMatrixDisplay.from_estimator(
-        catboost_classifier,
-        x_test,
-        y_test,
-        display_labels=iris["target_names"],
-        cmap="Blues",
-        normalize="true",
-    )
-    plt.title("Normalized Confusion Matrix - IRIS Dataset")
-    plt.show()
+    # Train CatBoost Regressor
+    regressor = catboost_regressor(x_train, y_train)
+
+    # Predict on the test set
+    predictions = regressor.predict(x_test)
+
+    # Evaluate the performance using Mean Squared Error
+    mse = mean_squared_error(y_test, predictions)
+    print(f"Mean Squared Error on Test Set: {mse:.4f}")
 
 
 if __name__ == "__main__":
     main()
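
A note on the dataset used above: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so the new script only runs against older scikit-learn releases. Below is a minimal sketch of an alternative data_handling() built on fetch_california_housing (an illustration, not part of this contribution); with it, the doctest shapes in the script would become (20640, 8) and (20640,).

from sklearn.datasets import fetch_california_housing


def data_handling() -> tuple:
    """
    Loads the California housing dataset and splits it into features and target.

    Returns:
        tuple: A tuple of (features, target), where both are numpy arrays.
    """
    # fetch_california_housing is the replacement scikit-learn suggests for load_boston
    housing = fetch_california_housing()
    features = housing.data  # shape (20640, 8)
    target = housing.target  # median house value, in units of 100,000 USD
    return features, target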